Packaged build: click to enter
Code:
Spider file
# -*- coding: utf-8 -*-
import scrapy
from practice.items import PracticeItem
from urllib import parse


class LolskinSpider(scrapy.Spider):
    name = 'lolskin'
    allowed_domains = ['lolskin.cn']
    start_urls = ['https://lolskin.cn/champions.html']
    csurl = 'https://lolskin.cn'  # site root, used to resolve relative links

    # Collect the links to every champion page
    def parse(self, response):
        item = PracticeItem()
        item['urls'] = response.xpath('//div[2]/div[1]/div/ul/li/a/@href').extract()
        for url in item['urls']:
            yield scrapy.Request(url=parse.urljoin(self.csurl, url), dont_filter=True, callback=self.bizhi)

    # Collect the skin links on each champion page
    def bizhi(self, response):
        skins = response.xpath('//td/a/@href').extract()
        for skin in skins:
            yield scrapy.Request(url=parse.urljoin(self.csurl, skin), dont_filter=True, callback=self.get_bzurl)

    # On each skin page, extract the wallpaper URLs and the skin name
    def get_bzurl(self, response):
        image_urls = response.xpath('//body/div[1]/div/a/@href').extract()
        image_name = response.xpath('//h1/text()').extract()
        # A plain dict is yielded here, so 'image_name' needs no field on PracticeItem
        yield {
            'image_urls': image_urls,
            'image_name': image_name
        }
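Before running the full crawl, the XPath selectors can be sanity-checked interactively with scrapy shell. A minimal check session (no output shown; the selectors are the ones used in parse and get_bzurl above, and they assume lolskin.cn's current page layout):
$ scrapy shell 'https://lolskin.cn/champions.html'
>>> # champion page links, as extracted in parse()
>>> response.xpath('//div[2]/div[1]/div/ul/li/a/@href').extract()[:3]
>>> # manually follow the first champion link and inspect its skin table
>>> fetch('https://lolskin.cn' + response.xpath('//div[2]/div[1]/div/ul/li/a/@href').extract()[0])
>>> response.xpath('//td/a/@href').extract()[:3]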
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PracticeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # titles = scrapy.Field()
    # yxpngs = scrapy.Field()
    urls = scrapy.Field()
    skin_name = scrapy.Field()   # skin name
    image_urls = scrapy.Field()  # wallpaper URLs for a skin
    images = scrapy.Field()      # filled in by ImagesPipeline after download
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re
from scrapy.pipelines.images import ImagesPipeline
import scrapy
# class PracticePipeline(object):
#     def __init__(self):
#         self.file = open('text.csv', 'a+')
#
#     def process_item(self, item, spider):
#         # os.chdir('lolskin')
#         # for title in item['titles']:
#         #     os.makedirs(title)
#         skin_name = item['skin_name']
#         skin_jpg = item['skin_jpg']
#         for i in range(len(skin_name)):
#             self.file.write(f'{skin_name[i]},{skin_jpg}\n')
#         self.file.flush()
#         return item
#
#     def down_bizhi(self, item, spider):
#         self.file.close()


class LoLPipeline(ImagesPipeline):
    # Pass the skin name along with each download request via meta
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_name': item['image_name']})

    # Customize the storage path and filename after download:
    # <champion>/<skin name>.jpg, where <champion> comes from the wallpaper URL
    def file_path(self, request, response=None, info=None):
        image_name = re.findall('/skin/(.*?)/', request.url)[0] + '/' + request.meta['image_name'][0] + '.jpg'
        return image_name
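From Scrapy 2.4 onward, media-pipeline methods such as file_path are called with an extra keyword-only item argument. If you run a newer Scrapy, an override with the updated signature keeps the same logic (a compatibility sketch):
    def file_path(self, request, response=None, info=None, *, item=None):
        # identical logic; the keyword-only `item` matches the Scrapy 2.4+ call signature
        return re.findall('/skin/(.*?)/', request.url)[0] + '/' + request.meta['image_name'][0] + '.jpg'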
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for practice project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
BOT_NAME = 'practice'
SPIDER_MODULES = ['practice.spiders']
NEWSPIDER_MODULE = 'practice.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'practice (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Set a download delay between requests
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'practice.middlewares.PracticeSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'practice.middlewares.PracticeDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'practice.pipelines.PracticePipeline': 300,
# 'scrapy.pipelines.images.ImagesPipeline': 1,
'practice.pipelines.LoLPipeline': 1
}
# Set the folder where downloaded wallpapers are stored
IMAGES_STORE = 'E:/Python/scrapy/practice/practice/LOLskin'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
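settings.py imports os but never uses it; one natural use is to build IMAGES_STORE relative to the project instead of hard-coding an absolute Windows path. A minimal sketch (keeping the LOLskin folder name from above; placing it next to settings.py is an assumption):
import os
# store wallpapers in a LOLskin folder alongside settings.py
IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'LOLskin')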
main.py
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'lolskin'])
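Running python main.py from the project root is equivalent to invoking scrapy crawl lolskin. If you prefer not to go through scrapy.cmdline, the same crawl can be started with Scrapy's documented CrawlerProcess API (a sketch, assuming it runs from the project root so settings.py is found):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('lolskin')                          # schedule the spider by name
process.start()                                   # block until the crawl finishes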