image.png
image.png
Scrapy 核心的代码都在scrapy类库的scrapy/core文件夹下
image.png
(downloader 支持多种类型下载)
spider,pipeline,middleware 是自己编写的
image.png
...
...
# 此处为执行过程中1-2步,Engine拿到request后发送给Scheduler
def schedule(self, request, spider):
    """Hand a request over to the scheduler (steps 1-2 of the engine loop).

    Fires the ``request_scheduled`` signal, then asks the scheduler to
    enqueue the request; if the scheduler refuses to accept it, the
    ``request_dropped`` signal is fired instead.
    """
    self.signals.send_catch_log(
        signal=signals.request_scheduled, request=request, spider=spider,
    )
    accepted = self.slot.scheduler.enqueue_request(request)
    if not accepted:
        self.signals.send_catch_log(
            signal=signals.request_dropped, request=request, spider=spider,
        )
...
...
# 此处为执行过程中第三步,Engine从Scheduler中拿request
def _next_request_from_scheduler(self, spider):
    """Step 3 of the engine loop: pull the next request from the
    scheduler and start downloading it.

    Returns the download Deferred, or ``None`` when the scheduler
    currently has no request to hand out.
    """
    slot = self.slot
    request = slot.scheduler.next_request()
    # On first start-up the scheduler holds no requests yet, so this
    # returns None here and the engine instead falls back to reading
    # start_urls from the Spider.
    if not request:
        return
    d = self._download(request, spider)
    # Feed the downloader result (response or failure) back into the engine.
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    # Whatever happened, this request is finished — release its slot entry.
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    # Kick the engine's heartbeat so the next request gets scheduled.
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d
...
...
class Request(object_ref):
    # url: the URL to request
    # callback: callable invoked with the response to this request
    # method: HTTP method ('GET', 'POST', ...)
    # headers: request headers
    # body: request body
    # cookies: browser cookies; after an automated login Scrapy carries
    #     cookies over into subsequent requests automatically — this is
    #     implemented by the built-in middleware
    #     scrapy.downloadermiddlewares.cookies.CookiesMiddleware
    # meta: metadata dict; can be used to pass data along with the Request
    # encoding: page encoding, defaults to UTF-8
    # priority: scheduling priority inside the scheduler
    # dont_filter: if True, do not filter out duplicate requests
    # errback: callable invoked when the request fails
    #
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):
        """Initialize the request.  (Body elided in this excerpt.)"""
class Response(object_ref):
    """HTTP response delivered back to the spider.

    Attributes mirror the constructor arguments:
      url     -- URL the response was fetched from
      status  -- HTTP status code, default 200 (success)
      headers -- response headers sent by the server
      body    -- raw response body
      request -- the previously yielded Request this response answers
    """

    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = list(flags) if flags is not None else []
其子类有HtmlResponse,TextResponse,XmlResponse
from scrapy.http.response.text import TextResponse
class HtmlResponse(TextResponse):
    """HTML flavour of TextResponse; inherits all behaviour unchanged."""
class TextResponse(Response):
...
...
# Response内部已经引入了selector拱xpath,css方法调用
@property
def selector(self):
from scrapy.selector import Selector
if self._cached_selector is None:
self._cached_selector = Selector(self)
return self._cached_selector
# xpath 选择器
def xpath(self, query, **kwargs):
return self.selector.xpath(query, **kwargs)
# css 选择器
def css(self, query):
return self.selector.css(query)
...
...
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有