Here we take a certain Baidu novel site as the example (the link is in the comments section); other sites can be handled in much the same way. Open the site's home page, search for the novel you want to download, and click a chapter to jump to its content. At this point, check whether the page source actually contains the full text of the novel.
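A quick way to run this check programmatically is sketched below. The chapter URL and the snippet of visible text are hypothetical placeholders; if the snippet cannot be found in the raw HTML, the chapter text is being loaded by a separate request, which is exactly what the capture in the next step shows.

import requests

# Hypothetical chapter URL and a sentence copied from the rendered page.
chapter_url = 'https://example.com/novel/chapter/1'
visible_text = '第一章'

resp = requests.get(chapter_url)
resp.encoding = resp.apparent_encoding  # avoid mojibake when searching Chinese text
if visible_text in resp.text:
    print('Chapter text is in the page source; parse the HTML directly.')
else:
    print('Chapter text is loaded by a separate request; look for it in DevTools.')
resp.close()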
A careful reader will notice that the request URL captured in DevTools contains a run of seemingly garbled "%22" sequences. This is not garbage at all: it is the URL-encoded (percent-encoded) form of the double quote character (ASCII 0x22). For details, see the Baidu Baike entry "URL编码" (URL encoding).
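A minimal round-trip with Python's standard urllib.parse makes the encoding concrete (the JSON payload is just an illustration of the pattern seen in the captured URL):

from urllib.parse import quote, unquote

payload = '{"book_id":"4305593636"}'
encoded = quote(payload)            # '%7B%22book_id%22%3A%224305593636%22%7D'
print(encoded)
print(unquote(encoded) == payload)  # True: %22 decodes back to the double quote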
With the catalog request understood, get_child_url below calls the getCatalog endpoint and returns the cid of every chapter of the book:

def get_child_url(book_id):
    cids = []  # collect the cid of every chapter
    url = main_url + '/getCatalog?data={"book_id":"' + book_id + '"}'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Referer': url
    }
    resp = requests.get(url, headers=header)
    result = resp.json()
    resp.close()
    data = result['data']['novel']['items']  # chapter entries in the catalog
    for item in data:
        cids.append(item['cid'])
    return cids
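For reference, the parsing in get_child_url assumes a getCatalog response shaped roughly like the sketch below; only the key names actually accessed by the code come from the source, and the values are invented for illustration.

# Assumed shape of the getCatalog response; values are illustrative only.
example_catalog = {
    'data': {
        'novel': {
            'items': [
                {'cid': '1566855611'},
                {'cid': '1566855612'},
            ]
        }
    }
}
cids = [item['cid'] for item in example_catalog['data']['novel']['items']]
print(cids)  # ['1566855611', '1566855612']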
Each cid is then handed to download_one_page, which requests the chapter content from the getChapterContent endpoint and writes it to a text file named after the chapter title:

def download_one_page(book_id, title_id):
    data = {
        "book_id": book_id,
        "cid": f"{book_id}|{title_id}",
        "need_bookinfo": 1
    }  # parameters carried in the query string of the request
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    url = main_url + '/getChapterContent?data=' + json.dumps(data)
    resp = requests.get(url, headers=header)
    result = resp.json()
    resp.close()
    title = result['data']['novel']['chapter_title']
    content = result['data']['novel']['content']
    # write each chapter to its own file under the 小说/ directory (the directory must exist)
    with open(f'小说/{title}.txt', 'w', encoding='utf-8') as file:
        file.write(content)
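Before wiring in the thread pool, a single chapter can be fetched on its own to verify that everything works (the cid value below is a hypothetical placeholder; main_url must already be set and the 小说/ directory must exist):

download_one_page('4305593636', '1566855611')

If this produces one .txt file containing the chapter text, the per-chapter logic is correct and only the scheduling remains.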
Putting everything together, the complete script is:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import requests
from concurrent.futures import ThreadPoolExecutor


def get_child_url(book_id):
    """Fetch the catalog and return the cid of every chapter."""
    cids = []
    url = main_url + '/getCatalog?data={"book_id":"' + book_id + '"}'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Referer': url
    }
    resp = requests.get(url, headers=header)
    result = resp.json()
    resp.close()
    data = result['data']['novel']['items']
    for item in data:
        cids.append(item['cid'])
    return cids


def download_one_page(book_id, title_id):
    """Download one chapter and save it as 小说/<chapter title>.txt."""
    data = {
        "book_id": book_id,
        "cid": f"{book_id}|{title_id}",
        "need_bookinfo": 1
    }  # parameters carried in the query string of the request
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    url = main_url + '/getChapterContent?data=' + json.dumps(data)
    resp = requests.get(url, headers=header)
    result = resp.json()
    resp.close()
    title = result['data']['novel']['chapter_title']
    content = result['data']['novel']['content']
    with open(f'小说/{title}.txt', 'w', encoding='utf-8') as file:
        file.write(content)


if __name__ == '__main__':
    book_id = '4305593636'
    main_url = '网址'  # placeholder: replace with the site's base URL (link in the comments section)
    os.makedirs('小说', exist_ok=True)  # make sure the output directory exists
    cids = get_child_url(book_id)
    # create a thread pool and submit one download task per chapter
    with ThreadPoolExecutor(50) as t:
        for cid in cids:
            t.submit(download_one_page, book_id, cid)
    print('全部下载完毕!')
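One caveat with ThreadPoolExecutor.submit is that an exception raised inside a task is only stored on its Future, so a failed chapter can go unnoticed. A minimal sketch of collecting the futures and reporting failures, reusing the names from the script above:

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(50) as t:
    futures = {t.submit(download_one_page, book_id, cid): cid for cid in cids}
    for future in as_completed(futures):
        cid = futures[future]
        try:
            future.result()  # re-raises any exception from the worker thread
        except Exception as exc:
            print(f'Chapter {cid} failed: {exc}')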