Bilibili(B站)是国内知名的视频分享平台,拥有海量的弹幕数据。弹幕是B站的核心特色之一,用户通过弹幕进行实时互动,这些数据对于分析视频热度、用户情感倾向等具有重要价值。
本文将介绍如何利用Python爬虫技术抓取Bilibili视频的弹幕数据,并使用WordCloud库生成词云,直观展示弹幕中的高频词汇。
B站的弹幕数据通常存储在XML文件中,每个视频对应一个弹幕文件(由 cid 决定)。我们需要:先获取视频的 cid(弹幕ID),再请求 https://comment.bilibili.com/{cid}.xml 下载对应的弹幕文件。
在B站的视频页面(如 https://www.bilibili.com/video/BV1xxxxxx )中,cid 通常可以通过以下方式获取:一是解析页面源码中 window.__playinfo__ 等包含 cid 的相关字段;二是调用官方API https://api.bilibili.com/x/web-interface/view?bvid=BV1xxxxxx 。本文采用 API方式 获取 cid,更加稳定。
下面是获取 cid 的示例代码:
import requests

# Browser-like User-Agent: Bilibili rejects requests carrying the default
# `python-requests` UA.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
}


def get_cid(bvid):
    """Resolve a Bilibili video's cid (danmaku id) from its BV id.

    Args:
        bvid: the video's BV identifier, e.g. "BV1GJ411x7h7".

    Returns:
        The cid found in the web API payload.

    Raises:
        Exception: on a non-200 HTTP status or an API-level error payload.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
    # timeout prevents the script from hanging forever on a stalled connection
    response = requests.get(url, headers=_HEADERS, timeout=10)
    if response.status_code != 200:
        raise Exception("Failed to fetch cid")
    payload = response.json()
    # The API returns HTTP 200 even for logical errors; "code" == 0 means OK,
    # otherwise "data" is absent and the original code would raise KeyError.
    if payload.get("code") != 0:
        raise Exception(
            f"Failed to fetch cid: API error {payload.get('code')} {payload.get('message')}"
        )
    return payload["data"]["cid"]


# Example: fetch the cid of video BV1GJ411x7h7
bvid = "BV1GJ411x7h7"  # replace with the target video's BV id
cid = get_cid(bvid)
print(f"视频的cid: {cid}")
B站的弹幕文件通常存储在 https://comment.bilibili.com/{cid}.xml ,我们需要请求该地址,解析XML并提取弹幕文本。
import xml.etree.ElementTree as ET


def fetch_danmaku(cid):
    """Download and parse the danmaku XML for *cid*.

    Each danmaku is a `<d>` element whose text is the comment body.

    Args:
        cid: the video's danmaku id (see get_cid).

    Returns:
        list[str]: one entry per `<d>` element.

    Raises:
        Exception: when the HTTP request does not return 200.
    """
    url = f"https://comment.bilibili.com/{cid}.xml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # timeout keeps a stalled connection from blocking the script forever
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        # The payload is well-formed XML, so use the stdlib XML parser
        # instead of an HTML-lenient BeautifulSoup("lxml") pass (this also
        # removes the bs4/lxml dependencies for this step).
        root = ET.fromstring(response.content)
        # element.text is None for an empty danmaku; normalise to ""
        return [d.text or "" for d in root.iter("d")]
    else:
        raise Exception("Failed to fetch danmaku")


# fetch the danmaku list
danmaku_list = fetch_danmaku(cid)
print(f"共获取 {len(danmaku_list)} 条弹幕")
弹幕可能包含无意义的符号、表情等,可以使用正则表达式过滤:
import re

# Precompiled patterns: drop everything that is neither a word character
# (letters, digits, underscore, CJK) nor whitespace, then collapse
# whitespace runs into single spaces.
_NON_WORD = re.compile(r'[^\w\s]')
_SPACES = re.compile(r'\s+')


def clean_text(text):
    """Clean one danmaku string: remove symbols and normalise whitespace."""
    stripped = _NON_WORD.sub('', text)
    collapsed = _SPACES.sub(' ', stripped)
    return collapsed.strip()
cleaned_danmaku = [clean_text(d) for d in danmaku_list]
使用 jieba 进行中文分词,并用 WordCloud 生成词云:
from wordcloud import WordCloud
import jieba
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np


def generate_wordcloud(text_list, output_path="wordcloud.png"):
    """Build a word-cloud image from a list of danmaku strings.

    The texts are joined, segmented with jieba, rendered with WordCloud,
    saved to *output_path*, and displayed in a matplotlib window.
    """
    # jieba takes one long string; WordCloud wants space-separated tokens
    corpus = " ".join(text_list)
    segmented = " ".join(jieba.cut(corpus))
    cloud = WordCloud(
        font_path="msyh.ttc",  # font with CJK glyphs (ships with Windows)
        width=800,
        height=600,
        background_color="white",
        max_words=200,
        collocations=False,  # disable bigram pairing so words are not doubled
    )
    cloud.generate(segmented)
    cloud.to_file(output_path)
    print(f"词云已生成: {output_path}")
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


# render the word cloud for the cleaned danmaku
generate_wordcloud(cleaned_danmaku)
import requests
from bs4 import BeautifulSoup
import re
from wordcloud import WordCloud
import jieba
import matplotlib.pyplot as plt
# --- Proxy configuration (16yun tunnel proxy) ---
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"
# Build the authenticated proxy URL once; both schemes tunnel through the
# same HTTP endpoint.
proxyMeta = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
proxies = {scheme: proxyMeta for scheme in ("http", "https")}
def get_cid(bvid):
    """Resolve a video's cid (danmaku id) from its BV id via the web API.

    Args:
        bvid: BV identifier, e.g. "BV1GJ411x7h7".

    Returns:
        The cid found in the API payload.

    Raises:
        Exception: wraps any network/HTTP/payload failure; the original
            exception is chained as ``__cause__``.
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # route through the configured tunnel proxy; timeout avoids hanging
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if response.status_code == 200:
            payload = response.json()
            # HTTP 200 does not imply success: the API signals logical
            # errors through the "code" field (0 == OK); without this check
            # the ["data"]["cid"] lookup fails with an opaque TypeError.
            if payload.get("code") != 0:
                raise Exception(f"API请求失败,状态码:{response.status_code}")
            return payload["data"]["cid"]
        else:
            raise Exception(f"API请求失败,状态码:{response.status_code}")
    except Exception as e:
        # preserve the original traceback via explicit exception chaining
        raise Exception(f"获取cid时出错:{str(e)}") from e
def fetch_danmaku(cid):
    """Download the danmaku XML for *cid* and return the comment texts.

    Args:
        cid: the video's danmaku id (from get_cid).

    Returns:
        list[str]: one entry per `<d>` element in the XML.

    Raises:
        Exception: wraps any network/HTTP/parse failure; the original
            exception is chained as ``__cause__``.
    """
    # stdlib XML parser: the payload is well-formed XML, so the
    # HTML-lenient BeautifulSoup("lxml") pass is unnecessary
    import xml.etree.ElementTree as ET

    url = f"https://comment.bilibili.com/{cid}.xml"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            # element.text is None for an empty danmaku; normalise to ""
            return [d.text or "" for d in root.iter("d")]
        else:
            raise Exception(f"弹幕请求失败,状态码:{response.status_code}")
    except Exception as e:
        # preserve the original traceback via explicit exception chaining
        raise Exception(f"获取弹幕时出错:{str(e)}") from e
def clean_text(text):
    """Strip punctuation/symbols and collapse whitespace in one danmaku."""
    # \w keeps letters, digits, underscore and CJK characters
    without_symbols = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()
def generate_wordcloud(text_list, output_path="wordcloud.png"):
    """Segment the danmaku with jieba, render a word cloud, save and show it."""
    # jieba takes one long string; WordCloud wants space-separated tokens
    segmented = " ".join(jieba.cut(" ".join(text_list)))
    cloud = WordCloud(
        font_path="msyh.ttc",  # font with CJK glyphs (ships with Windows)
        width=800,
        height=600,
        background_color="white",
        max_words=200,
        collocations=False,  # disable bigram pairing so words are not doubled
    )
    cloud.generate(segmented)
    cloud.to_file(output_path)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
if __name__ == "__main__":
    # Pipeline: BV id -> cid -> danmaku -> cleaned text -> word cloud.
    try:
        target_bvid = "BV1GJ411x7h7"  # replace with the target video's BV id
        video_cid = get_cid(target_bvid)
        print(f"成功获取视频CID: {video_cid}")
        raw_danmaku = fetch_danmaku(video_cid)
        print(f"共获取 {len(raw_danmaku)} 条弹幕")
        generate_wordcloud([clean_text(item) for item in raw_danmaku])
    except Exception as e:
        # top-level boundary: report and exit instead of a raw traceback
        print(f"程序运行出错: {str(e)}")
注意事项与扩展:
- 反爬:通过设置 headers 模拟浏览器访问,或使用代理IP。
- 批量采集:可以批量获取多个视频的 cid,从而批量获取弹幕数据。
- 情感分析:可以使用 SnowNLP 或 TextBlob 分析弹幕情感倾向。
- 可视化:可以使用 Pyecharts 生成交互式词云。
本文介绍了如何用Python爬取B站弹幕并生成词云,涉及:获取视频 cid、解析弹幕XML(BeautifulSoup)、中文分词(jieba)、词云生成(WordCloud)。该方法适用于视频分析、用户行为研究、热点话题挖掘等场景。读者可以进一步扩展,如结合机器学习进行弹幕分类或情感分析。
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有