1、items.py
import scrapy
class LearnscrapyItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
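A Scrapy item behaves like a dict with a fixed set of fields, so the two fields above are all the spider is allowed to fill in. A quick interactive sketch (the sample values are made up for illustration):

from learnscrapy.items import LearnscrapyItem

item = LearnscrapyItem()
item['name'] = ['Example headline']                 # hypothetical sample data
item['link'] = ['http://news.sohu.com/example.shtml']
print(dict(item))  # {'name': [...], 'link': [...]}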
2、settings.py
BOT_NAME = 'learnscrapy'
SPIDER_MODULES = ['learnscrapy.spiders']
NEWSPIDER_MODULE = 'learnscrapy.spiders'

ROBOTSTXT_OBEY = True
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'learnscrapy.middlewares.USERAGENT': 1,
}

ITEM_PIPELINES = {
    'learnscrapy.pipelines.LearnscrapyPipeline': 300,
}
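Note that the HTTPPROXY middleware defined in middlewares.py below is never registered here, so only the random User-Agent takes effect. If you also want random proxies, a sketch like the following would enable it (the priority values 123/125 are an assumption, not part of the original settings):

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'learnscrapy.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'learnscrapy.middlewares.USERAGENT': 1,
}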
3、middlewares.py
import random

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class HTTPPROXY(HttpProxyMiddleware):
    # Initialization; note that the default must be ip=''
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # Pick a random proxy from the pool for every request
        item = random.choice(IPPOOL)
        try:
            print("Current proxy IP: " + item["ipaddr"])
            request.meta["proxy"] = "http://" + item["ipaddr"]
        except Exception as e:
            print(e)
            pass


# Proxy IP pool
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "1222.94.128.49:8118"}
]
class USERAGENT(UserAgentMiddleware):
    # Initialization; note that the default must be user_agent=''
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random User-Agent from the pool for every request
        item = random.choice(UPPOOL)
        try:
            print("Current User-Agent: " + item)
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            print(e)
            pass


# User-Agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]
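To sanity-check the User-Agent middleware without starting a crawl, you can call it on a throwaway request. A minimal sketch, assuming it is run from the project root so the learnscrapy package is importable:

from scrapy.http import Request
from learnscrapy.middlewares import USERAGENT

req = Request('http://www.sohu.com/')
USERAGENT().process_request(req, spider=None)
print(req.headers.get('User-Agent'))  # one of the entries from UPPOOL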
4、pipelines.py

import pymysql
import json


class LearnscrapyPipeline(object):
    def __init__(self):
        # Database connection
        self.conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx',
                                    database='test', charset='utf8')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # name and link are parallel lists; insert one row per scraped link
        for j in range(0, len(item["name"])):
            nam = item["name"][j]
            lin = item["link"][j]
            sql = "insert into site(name,link) values(%s,%s)"
            self.cur.execute(sql, (nam, lin))
            self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
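The pipeline assumes a site table with name and link columns already exists in the test database. The original post does not show the schema, so the column types below are assumptions; a one-off helper like this would create it:

import pymysql

conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx',
                       database='test', charset='utf8')
cur = conn.cursor()
# Column types are an assumption; adjust them to your data.
cur.execute("CREATE TABLE IF NOT EXISTS site ("
            "id INT AUTO_INCREMENT PRIMARY KEY, "
            "name VARCHAR(255), "
            "link VARCHAR(255))")
conn.commit()
cur.close()
conn.close()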
5、spiders/test.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from learnscrapy.items import LearnscrapyItem


class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/']

    rules = (
        # Only follow links to news.sohu.com and hand them to parse_item
        Rule(LinkExtractor(allow=('http://news.sohu.com',), allow_domains=('sohu.com',)),
             callback='parse_item', follow=False),
    )
    def parse_item(self, response):
        i = LearnscrapyItem()
        i['name'] = response.xpath('//div[@class="news"]/p/a/text()').extract()
        i['link'] = response.xpath('//div[@class="news"]/p/a/@href').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i

6、main.py

from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'test'])
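Running python main.py from the project root is equivalent to running scrapy crawl test there. While debugging it can help to also dump the items to a file; scrapy's -o option can be passed straight through execute(), for example (the file name sites.json is just an illustration):

execute(['scrapy', 'crawl', 'test', '-o', 'sites.json'])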