如题:视频页面显示有60000多条弹幕,但是我只能爬取到25000条。
顺便想问一下,js包里面的各个字段都代表什么呢?
视频网址https://v.qq.com/x/cover/pkd7mm8nlor7sqv/m0020d4bocq.html
其中一个js包中的一条内容:{"commentid":"6129416678651013752","content":"今天我们","upcount":5,"isfriend":0,"isop":0,"isself":0,"timepoint":155,"headurl":"","opername":"","bb_bcolor":"","bb_head":"","bb_level":"","bb_id":"","rich_type":0,"uservip_degree":0,"content_style":""}
因为我看到第一个弹幕包的时间戳是15,所以把 time_stamp 的初始值设为15,之后每次请求 +=30。
下面附代码,求助!
from urllib.parse import urlencode
import requests
import re
import csv
import time
import random
def search(base_url, time_stamp, target_id='1379190863', timeout=10):
    """Request one danmu window and return the raw response body.

    Args:
        base_url: Endpoint prefix that already ends in ``?``; the encoded
            query string is appended to it directly.
        time_stamp: Value sent as the ``timestamp`` query parameter.
        target_id: Value sent as the ``target_id`` query parameter. Defaults
            to the id that was previously hard-coded here.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the status is anything else or the request itself fails.
    """
    # NOTE(review): the cookie below embeds what looks like a logged-in
    # session (uin/skey/vusession). It is kept unchanged so behavior does not
    # shift, but it should be loaded from configuration, not committed.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400 QQBrowser/9.7.12661.400',
        'accept': '*/*',
        # The original keys were written as 'accept - language' and
        # 'cache - control' (with spaces), which are not the real header
        # names. The equally malformed 'accept - encoding' entry is dropped
        # on purpose: requests then advertises only the encodings it is able
        # to decode, which is what effectively happened before.
        'accept-language': 'zh-CN,zh;q=0.8',
        'cache-control': 'max-age=0',
        'cookie':'cuid=9102028816; pgv_pvi=8393473024; RK=ITK8C31Rc1; tvfe_boss_uuid=047a68f022a12505; pac_uid=1_1179630150; ptui_loginuin=1179630150; ptisp=cnc; ptcz=4c8a92ec5e84326bd8926bc75a1413db138bc560b11fd2026cbb4652c51a10d9; uin=o1179630150; skey=@G4DR3fRZB; appid=3000501; login_time_init=1553069185; _video_qq_version=1.1; _video_qq_appid=3000501; _video_qq_login_time_init=1553069185; main_login=qq; vuserid=624788404; vusession=70de633e23002eb00000000012d275a5949283c2bd4e; next_refresh_time=4405; _video_qq_main_login=qq; _video_qq_vuserid=624788404; _video_qq_vusession=70de633e23002eb00000000012d275a5949283c2bd4e; _video_qq_next_refresh_time=4405; login_time_last=2019-3-20 16:42:40; o_cookie=1179630150; pgv_info=ssid=s3395872160; pgv_pvid=3092479592'
    }
    data = {
        'timestamp': time_stamp,
        'target_id': target_id,
    }
    # urlencode() turns the parameters into a query string that is appended
    # to the base URL.
    url = base_url + urlencode(data)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure instead of hiding it; callers still get None.
        print('request failed for timestamp {}: {}'.format(time_stamp, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_30s_danmu(text):
    """Extract the comments found in ``text`` and append them to the CSV file.

    Each comment becomes one row of ``douluo_test1_danmu.csv``:
    ``'<commentid>``, ``<minutes>:<seconds>``, ``<upcount>``, ``<content>``.

    Args:
        text: Raw response body to scan. ``None`` or an empty string is
            accepted and simply produces no rows.

    Returns:
        None. The only effect is the appended CSV rows; the file is not
        touched when no comment is found.
    """
    # A failed request yields None; re.findall would raise TypeError on it.
    if not text:
        return

    comment_ids = re.findall(r'"commentid":"(.*?)",', text)
    timepoints = re.findall(r'"timepoint":(.*?),', text)
    contents = re.findall(r'"content":"(.*?)",', text)
    upcounts = re.findall(r'"upcount":(.*?),"', text)

    # NOTE(review): the four lists are matched up purely by position, so this
    # assumes every comment object carries all four fields - confirm against
    # real responses.
    rows = []
    for comment_id, timepoint, upcount, content in zip(
            comment_ids, timepoints, upcounts, contents):
        seconds_total = int(timepoint)
        # Render the offset as "<minutes>:<seconds>" (seconds are not padded).
        offset = ':'.join((str(seconds_total // 60), str(seconds_total % 60)))
        # The leading apostrophe is kept from the original output format;
        # presumably it stops spreadsheet tools from mangling the long id.
        rows.append(['\'' + str(comment_id), offset, str(upcount), content])

    if not rows:
        return

    # Open the file once per call instead of once per row. newline='' avoids
    # blank lines between rows; 'a+' appends to whatever is already there.
    with open('douluo_test1_danmu.csv', 'a+', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(rows)
def main(start=15, step=30, max_empty_windows=5):
    """Crawl successive danmu windows and append their comments to the CSV.

    Args:
        start: First value sent as the request timestamp.
        step: Amount added to the timestamp after every request.
        max_empty_windows: Stop once this many consecutive requests have
            failed or returned no comment. Without such a limit the loop
            never ends.

    Returns:
        None.
    """
    base_url = 'https://mfm.video.qq.com/danmu?'
    time_stamp = start
    num = 1
    empty_streak = 0
    while empty_streak < max_empty_windows:
        print('第{}次爬取'.format(num))
        print('*'*50)
        text = search(base_url, time_stamp)
        print('*'*50)
        # search() returns None on failure, and a window may legitimately
        # hold no comments; only parse bodies that contain at least one.
        if text and '"commentid"' in text:
            empty_streak = 0
            get_30s_danmu(text)
        else:
            # NOTE(review): a failed window is skipped, not retried.
            empty_streak += 1
        time_stamp += step
        num += 1
        # Short random pause so the server is not hit in a tight loop.
        time.sleep(random.uniform(0.5, 1.5))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()