
IndexError: list index out of range (in a Reddit data crawler)

Asked by a Stack Overflow user on 2020-04-14 03:54:19
2 answers · 240 views · 0 followers · 0 votes

The following is expected to run without problems.

Reddit data crawler:

    import requests
    import re
    import praw
    from datetime import date
    import csv
    import pandas as pd
    import time
    import sys

    class Crawler(object):
        '''
            basic_url is the reddit site.
            headers is for requests.get method
            REX is to find submission ids.
        '''
        def __init__(self, subreddit="apple"):
            '''
                Initialize a Crawler object.
                    subreddit is the topic you want to parse. default is r"apple"
                basic_url is the reddit site.
                headers is for requests.get method
                REX is to find submission ids.
                submission_ids save all the ids of submission you will parse.
                reddit is an object created using praw API. Please check it before you use.
            '''
            self.basic_url = "https://www.reddit.com"
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
            self.REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
            self.subreddit = subreddit
            self.submission_ids = []
            self.reddit = praw.Reddit(client_id="your_id", client_secret="your_secret", user_agent="subreddit_comments_crawler")

        def get_submission_ids(self, pages=2):
            '''
                Collect all ids of submissions..
                One page has 25 submissions.
                page url: https://www.reddit.com/r/subreddit/?count25&after=t3_id
                    id(after) is the last submission from last page.
            '''
    #         This is page url.
            url = self.basic_url + "/r/" + self.subreddit

            if pages <= 0:
                return []

            text = requests.get(url, headers=self.headers).text
            ids = self.REX.findall(text)
            ids = list(map(lambda x: x[-6:], ids))
            if pages == 1:
                self.submission_ids = ids
                return ids

            count = 0
            after = ids[-1]
            for i in range(1, pages):
                count += 25
                temp_url = self.basic_url + "/r/" + self.subreddit + "?count=" + str(count) + "&after=t3_" + ids[-1]
                text = requests.get(temp_url, headers=self.headers).text
                temp_list = self.REX.findall(text)
                temp_list = list(map(lambda x: x[-6:], temp_list))
                ids += temp_list
                if count % 100 == 0:
                    time.sleep(60)
            self.submission_ids = ids
            return ids

        def get_comments(self, submission):
            '''
                Submission is an object created using praw API.
            '''
    #         Remove all "more comments".
            submission.comments.replace_more(limit=None)
            comments = []
            for each in submission.comments.list():
                try:
                    comments.append((each.id, each.link_id[3:], each.author.name, date.fromtimestamp(each.created_utc).isoformat(), each.score, each.body) )
                except AttributeError as e: # Some comments are deleted, we cannot access them.
    #                 print(each.link_id, e)
                    continue
            return comments

        def save_comments_submissions(self, pages):
            '''
                1. Save all the ids of submissions.
                2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
                3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
                4. Separately, save them to two csv file.
                Note: You can link them with submission_id.
                Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the default time span in this crawler.
            '''

            print("Start to collect all submission ids...")
            self.get_submission_ids(pages)
            print("Start to collect comments...This may cost a long time depending on # of pages.")
            submission_url = self.basic_url + "/r/" + self.subreddit + "/comments/"
            comments = []
            submissions = []
            count = 0
            for idx in self.submission_ids:
                temp_url = submission_url + idx
                submission = self.reddit.submission(url=temp_url)
                submissions.append((submission.name[3:], submission.num_comments, submission.score, submission.subreddit_name_prefixed, date.fromtimestamp(submission.created_utc).isoformat(), submission.title, submission.selftext))
                temp_comments = self.get_comments(submission)
                comments += temp_comments
                count += 1
                print(str(count) + " submissions have got...")
                if count % 50 == 0:
                    time.sleep(60)
            comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
            df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
            df_comments.to_csv("comments.csv")
            submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
            df_submission = pd.DataFrame(submissions, columns=submissions_fieldnames)
            df_submission.to_csv("submissions.csv")
            return df_comments


    if __name__ == "__main__":
        args = sys.argv[1:]
        if len(args) != 2:
            print("Wrong number of args...")
            exit()

        subreddit, pages = args
        c = Crawler(subreddit)
        c.save_comments_submissions(int(pages))

but I got:

    UserAir:scrape_reddit user$ python reddit_crawler.py apple 2
    Start to collect all submission ids...
    Traceback (most recent call last):
      File "reddit_crawler.py", line 127, in <module>
        c.save_comments_submissions(int(pages))
      File "reddit_crawler.py", line 94, in save_comments_submissions
        self.get_submission_ids(pages)
      File "reddit_crawler.py", line 54, in get_submission_ids
        after = ids[-1]
    IndexError: list index out of range

2 Answers

Stack Overflow user · Accepted answer

Answered on 2020-04-14 18:49:31

Erik's answer diagnoses the specific cause of this error, but more broadly, I think it's happening because you aren't using PRAW to its fullest. Your script imports requests and performs a lot of manual requests for things PRAW already has methods for. The whole point of PRAW is to keep you from having to write these requests that do things such as paginate a listing, so I recommend you take advantage of that.

For example, your get_submission_ids function (which scrapes the web version of Reddit and handles pagination) could be replaced by

def get_submission_ids(self, pages=2):
    return [
        submission.id
        for submission in self.reddit.subreddit(self.subreddit).hot(
            limit=25 * pages
        )
    ]

because the .hot() function does everything you're trying to do.
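As a side note: .hot() is only one of PRAW's listing generators, and swapping in a different sort is a small change. A minimal sketch, assuming the same Crawler class and credentials as above (the method names here are just illustrative):

def get_new_submissions(self, pages=2):
    # Same pattern as get_submissions below, but using the "new" listing instead of "hot".
    return list(self.reddit.subreddit(self.subreddit).new(limit=25 * pages))

def get_top_submissions(self, pages=2, time_filter="week"):
    # The "top" listing also accepts a time_filter such as "day", "week", or "all".
    return list(
        self.reddit.subreddit(self.subreddit).top(
            time_filter=time_filter, limit=25 * pages
        )
    )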

I would go a step further here and have this function return a list of Submission objects, because the rest of your code does things that are better done by interacting with PRAW's Submission objects. Here's that code (I've renamed the function to reflect its updated purpose):

def get_submissions(self, pages=2):
    return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

(I've updated this function to just return its result, because your version both returned the value and set it as self.submission_ids, unless pages was 0. That felt inconsistent, so I just have it return the value.)
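For illustration, a minimal usage sketch of the renamed method, assuming valid client_id and client_secret values have been filled in:

crawler = Crawler("apple")
submissions = crawler.get_submissions(pages=2)  # up to 25 * 2 = 50 "hot" submissions
for submission in submissions[:3]:
    # Submission objects expose id, title, score, etc. directly.
    print(submission.id, submission.title)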

Your get_comments function looks good.

Like get_submission_ids, the save_comments_submissions function does a lot of manual work that PRAW can handle. You construct a temp_url that has the full URL of a post and then use it to create a PRAW Submission object, but we can replace that by directly using the ones returned by get_submissions. You also have some calls to time.sleep(), which I removed because PRAW will automatically sleep the appropriate amounts for you. Lastly, I removed the return value of this function, because the point of the function is to save data to disk, not to return it to anything else, and the rest of the script doesn't use the return value. Here's the updated version of that function:

def save_comments_submissions(self, pages):
    """
        1. Save all the ids of submissions.
        2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
        3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
        4. Separately, save them to two csv file.
        Note: You can link them with submission_id.
        Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the default time span in this crawler.
    """

    print("Start to collect all submission ids...")
    submissions = self.get_submissions(pages)
    print(
        "Start to collect comments...This may cost a long time depending on # of pages."
    )
    comments = []
    pandas_submissions = []
    for count, submission in enumerate(submissions):
        pandas_submissions.append(
            (
                submission.name[3:],
                submission.num_comments,
                submission.score,
                submission.subreddit_name_prefixed,
                date.fromtimestamp(submission.created_utc).isoformat(),
                submission.title,
                submission.selftext,
            )
        )
        temp_comments = self.get_comments(submission)
        comments += temp_comments
        print(str(count) + " submissions have got...")

    comments_fieldnames = [
        "comment_id",
        "submission_id",
        "author_name",
        "post_time",
        "comment_score",
        "text",
    ]
    df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
    df_comments.to_csv("comments.csv")
    submissions_fieldnames = [
        "submission_id",
        "num_of_comments",
        "submission_score",
        "submission_subreddit",
        "post_date",
        "submission_title",
        "text",
    ]
    df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
    df_submission.to_csv("submissions.csv")

Here's an updated version of the whole script that uses PRAW fully:

from datetime import date
import sys


import pandas as pd
import praw


class Crawler:
    """
        basic_url is the reddit site.
        headers is for requests.get method
        REX is to find submission ids.
    """

    def __init__(self, subreddit="apple"):
        """
            Initialize a Crawler object.
                subreddit is the topic you want to parse. default is r"apple"
            basic_url is the reddit site.
            headers is for requests.get method
            REX is to find submission ids.
            submission_ids save all the ids of submission you will parse.
            reddit is an object created using praw API. Please check it before you use.
        """
        self.subreddit = subreddit
        self.submission_ids = []
        self.reddit = praw.Reddit(
            client_id="your_id",
            client_secret="your_secret",
            user_agent="subreddit_comments_crawler",
        )

    def get_submissions(self, pages=2):
        """
            Collect all submissions..
            One page has 25 submissions.
            page url: https://www.reddit.com/r/subreddit/?count25&after=t3_id
                id(after) is the last submission from last page.
        """
        return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

    def get_comments(self, submission):
        """
            Submission is an object created using praw API.
        """
        #         Remove all "more comments".
        submission.comments.replace_more(limit=None)
        comments = []
        for each in submission.comments.list():
            try:
                comments.append(
                    (
                        each.id,
                        each.link_id[3:],
                        each.author.name,
                        date.fromtimestamp(each.created_utc).isoformat(),
                        each.score,
                        each.body,
                    )
                )
            except AttributeError as e:  # Some comments are deleted, we cannot access them.
                #                 print(each.link_id, e)
                continue
        return comments

    def save_comments_submissions(self, pages):
        """
            1. Save all the ids of submissions.
            2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
            3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
            4. Separately, save them to two csv file.
            Note: You can link them with submission_id.
            Warning: According to the rule of Reddit API, the get action should not be too frequent. Safely, use the default time span in this crawler.
        """

        print("Start to collect all submission ids...")
        submissions = self.get_submissions(pages)
        print(
            "Start to collect comments...This may cost a long time depending on # of pages."
        )
        comments = []
        pandas_submissions = []
        for count, submission in enumerate(submissions):
            pandas_submissions.append(
                (
                    submission.name[3:],
                    submission.num_comments,
                    submission.score,
                    submission.subreddit_name_prefixed,
                    date.fromtimestamp(submission.created_utc).isoformat(),
                    submission.title,
                    submission.selftext,
                )
            )
            temp_comments = self.get_comments(submission)
            comments += temp_comments
            print(str(count) + " submissions have got...")

        comments_fieldnames = [
            "comment_id",
            "submission_id",
            "author_name",
            "post_time",
            "comment_score",
            "text",
        ]
        df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
        df_comments.to_csv("comments.csv")
        submissions_fieldnames = [
            "submission_id",
            "num_of_comments",
            "submission_score",
            "submission_subreddit",
            "post_date",
            "submission_title",
            "text",
        ]
        df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
        df_submission.to_csv("submissions.csv")


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 2:
        print("Wrong number of args...")
        exit()

    subreddit, pages = args
    c = Crawler(subreddit)
    c.save_comments_submissions(int(pages))

I realize that my answer here gets into code-review territory, but I hope it helps show some of what PRAW can do. Your "list index out of range" error would have been avoided by using pre-existing library code, so I do consider this a solution to your problem.

Votes: 3

Stack Overflow user

Answered on 2020-04-14 03:58:55

When my_list[-1] throws an IndexError, it means that my_list is empty:

>>> ids = []
>>> ids[-1]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: list index out of range
>>> ids = ['1']
>>> ids[-1]
'1'
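If you do want to keep the requests-based get_submission_ids, a defensive check before indexing avoids the crash when the regex finds nothing (for example, if Reddit serves HTML the pattern doesn't match). A minimal sketch, not part of the original code:

ids = self.REX.findall(text)
ids = list(map(lambda x: x[-6:], ids))
if not ids:
    # Nothing matched; bail out instead of hitting ids[-1] and raising IndexError.
    print("No submission ids found; check the regex or the response HTML.")
    return []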
Votes: 2
The original content of this page is provided by Stack Overflow.
Original link: https://stackoverflow.com/questions/61207061