Python浏览器爬虫

原创

esse LL

修改于 2025-03-08 23:27:20

180

1. 安装依赖

pip install requests beautifulsoup4 lxml selenium -i https://mirrors.aliyun.com/pypi/simple/

# 使用前导入
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup

# Selenium needs some configuration before it can drive Chrome and run JavaScript.
# Path to the Chrome browser executable.
chrome_binary_path = r'D:\Program Files\Chrome\Application\chrome.exe'
# Path to the ChromeDriver executable (kept in a `driver` folder).
chrome_driver_path = r'L:\driver\chromedriver.exe'

# Browser options.
chrome_options = Options()
chrome_options.binary_location = chrome_binary_path  # which Chrome binary to launch
for flag in (
    '--headless',  # headless mode: no browser window is shown
    '--disable-gpu',
    '--no-sandbox',
):
    chrome_options.add_argument(flag)

# Create the WebDriver from the driver service and the options above.
service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

启动后的提示：

import sys

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

try:
    # Load the QQ Music chart page.
    driver.get('https://y.qq.com/n/ryqq/toplist/60')

    # Wait explicitly (up to 10 s) for the song entries instead of sleeping a
    # fixed 5 s: a blind sleep is either too short or wasted time. The selector
    # targets the same <ul class="songlist__list"> / <li> elements that the
    # parsing step of this file searches for.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'ul.songlist__list li')
            )
        )
    except TimeoutException:
        # Best effort: still dump whatever was loaded so it can be inspected,
        # but report the timeout on stderr to keep stdout pure HTML.
        print('等待歌曲列表超时，仍输出当前页面源码。', file=sys.stderr)

    # Grab the rendered HTML.
    html_content = driver.page_source
    print(html_content)

finally:
    # Always shut the browser down, even if loading failed.
    driver.quit()

没有得到正确的结果，所以手动将页面的 HTML 源码复制到 chart.html 文件中，稍后使用 BeautifulSoup 解析。

对应这一部分内容：

2. 列表解析

使用 BeautifulSoup 解析 HTML，得到歌曲列表：

from bs4 import BeautifulSoup

# Path to the saved page source.
file_path = r'L:\driver\chart.html'

# Read the whole file into html_content.
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Build the soup object with the lxml parser.
soup = BeautifulSoup(html_content, 'lxml')

# Locate the song list; find() returns None when no such <ul> exists.
song_list = soup.find('ul', class_='songlist__list')

# Extract the details of every song entry.
if song_list is not None:
    songs = song_list.find_all('li')
    for song in songs:
        # Song title; falls back to 'N/A' when the element is missing.
        song_name_tag = song.find('span', class_='songlist__songname_txt')
        song_name = song_name_tag.get_text(strip=True) if song_name_tag else 'N/A'

        # Artist.
        author_tag = song.find('div', class_='songlist__artist')
        author = author_tag.get_text(strip=True) if author_tag else 'N/A'

        # Duration. Named `duration` rather than `time` so the stdlib `time`
        # module imported at the top of this file is not shadowed.
        duration_tag = song.find('div', class_='songlist__time')
        duration = duration_tag.get_text(strip=True) if duration_tag else 'N/A'

        print(f"歌曲: {song_name}, 歌手: {author}, 时长: {duration}")
else:
    print("未找到歌曲列表。")

参考效果：

3. 保存表格

提前安装依赖：

pip install pandas openpyxl -i https://mirrors.aliyun.com/pypi/simple/

from bs4 import BeautifulSoup
import pandas as pd

# Path to the saved page source.
file_path = r'L:\driver\chart.html'

# Read the whole file into html_content.
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Build the soup object with the lxml parser.
soup = BeautifulSoup(html_content, 'lxml')

# Locate the song list; find() returns None when no such <ul> exists.
song_list = soup.find('ul', class_='songlist__list')

# One dict per song; the keys become the spreadsheet column headers.
songs_data = []

# Extract the details of every song entry.
if song_list is not None:
    songs = song_list.find_all('li')
    for song in songs:
        # Song title; falls back to 'N/A' when the element is missing.
        song_name_tag = song.find('span', class_='songlist__songname_txt')
        song_name = song_name_tag.get_text(strip=True) if song_name_tag else 'N/A'

        # Artist.
        author_tag = song.find('div', class_='songlist__artist')
        author = author_tag.get_text(strip=True) if author_tag else 'N/A'

        # Duration. Named `duration` rather than `time` so the stdlib `time`
        # module imported at the top of this file is not shadowed.
        duration_tag = song.find('div', class_='songlist__time')
        duration = duration_tag.get_text(strip=True) if duration_tag else 'N/A'

        # Collect this song's record.
        songs_data.append({
            '歌曲': song_name,
            '歌手': author,
            '时长': duration,
        })
else:
    print("未找到歌曲列表。")

# Build a DataFrame and write it out, but only when something was extracted.
if songs_data:
    df = pd.DataFrame(songs_data)

    # Save as an .xlsx workbook (openpyxl engine), without the index column.
    df.to_excel('songs_data.xlsx', index=False, engine='openpyxl')

    print("保存成功。")
else:
    print("没有数据可写入。")

参考效果：