
Scraping Beijing bus data with Python

Original post by 弟大不用洗, published 2024-10-17 15:33:52

Code language: Python
# -*- coding: utf-8 -*-
# @Author :ZDHXN
# @File :beijingbus.py
# @Software : PyCharm
import csv
import urllib.request
from time import sleep

from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
# List that accumulates one row of data per bus route
all_data_list = []


# Collect the detail-page URL of every route listed on an index page
def get_page_url(urls):
    req = urllib.request.Request(urls, headers=headers)
    html = urllib.request.urlopen(req)
    soup = bs(html.read(), 'html.parser')
    lu = soup.find('div', class_='list clearfix')
    hrefs = lu.find_all('a')
    for k in hrefs:
        detail_url = urljoin(url, k['href'])  # url is the site root defined in __main__
        get_page_info(detail_url)
        sleep(1)  # throttle requests between detail pages


# Parse the target fields from a route's detail page
def get_page_info(urls):
    req = urllib.request.Request(urls, headers=headers)
    html = urllib.request.urlopen(req)
    soup = bs(html.read(), 'html.parser')
    # Use BeautifulSoup's select() method with CSS selectors
    # Route type
    line_type = soup.select('div.layout-left > div > div.info > h1 > a')[0].string
    try:
        # Total mileage (missing on some routes, hence the try/except)
        mileage = soup.select('div.layout-left > div.change-info.mb20')[0].string
    except IndexError:
        mileage = ""

    # Use BeautifulSoup's find()/find_all() methods for the remaining fields
    # Route name
    line_name = soup.find("h1", {"class": "title"}).a.previous_sibling.string
    info_list = soup.find("ul", {"class": "bus-desc"})
    # Operating hours
    run_time = info_list.find_all("li")[0].string
    # Reference fare
    ticket = info_list.find_all("li")[1].string
    # Operating company
    company = info_list.find_all("li")[2].text
    # Last updated
    update_last = info_list.find_all("li")[3].div.previous_sibling.string

    # Direction headers (往 outbound / 返 return) and the two station lists
    line_name_list = soup.find_all("div", {"class": "trip"})
    line_list = soup.find_all("div", {"class": "bus-lzlist mb15"})

    wang_line_list = []
    fan_line_list = []

    wang_line_name = ""
    fan_line_name = ""

    for i in range(len(line_list)):
        if i == 0:
            wang_line_list = line_list[0].find_all(["li"])
            wang_line_name = line_name + "(" + line_name_list[0].string + ")"
        else:
            fan_line_list = line_list[1].find_all(["li"])
            fan_line_name = line_name + "(" + line_name_list[1].string + ")"

    # Bus route, outbound direction (loop lines only have this one)
    wang_info = wang_line_name + "\n"
    # Bus route, return direction
    fan_info = fan_line_name + "\n"

    # Build the comma-separated station list for the outbound direction;
    # the last <li> is the terminus and is appended without a trailing comma.
    for i in range(len(wang_line_list)):
        if i != (len(wang_line_list) - 1):
            for k in wang_line_list[i].find_all("a"):
                # Skip anchors that carry a title attribute (not plain station links)
                if k.get('title'):
                    continue
                wang_info += k.string + ","
        else:
            wang_info += wang_line_list[i].string
    # Same for the return direction (empty for loop lines)
    if len(fan_line_list) != 0:
        for i in range(len(fan_line_list)):
            if i != (len(fan_line_list) - 1):
                for k in fan_line_list[i].find_all("a"):
                    if k.get('title'):
                        continue
                    fan_info += k.string + ","
            else:
                fan_info += fan_line_list[i].string

    result_list = [line_name, line_type, run_time, mileage, ticket, company, update_last, wang_info, fan_info]
    all_data_list.append(result_list)


# Store the data into MySQL (left as a placeholder by the author)
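# A minimal sketch of that step, not part of the original script: it assumes a
# local MySQL server with a database `bus` and a nine-column table `bus_info`
# matching result_list; the pymysql driver, table name and credentials below
# are illustrative assumptions.
def save_to_mysql(rows):
    # `rows` is all_data_list: one nine-element list per route
    import pymysql  # third-party driver, assumed installed via `pip install pymysql`

    conn = pymysql.connect(host='localhost', user='root', password='your_password',
                           database='bus', charset='utf8mb4')
    sql = ("INSERT INTO bus_info (line_name, line_type, run_time, mileage, ticket, "
           "company, update_last, wang_info, fan_info) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        conn.commit()
    finally:
        conn.close()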



if __name__ == '__main__':
    url = 'http://beijing.8684.cn'
    url_list = url + '/list%d'
    # Crawl index pages /list1 through /list9
    for k in range(1, 10):
        urls = url_list % k
        get_page_url(urls)

    # Write the results to a CSV file (the target directory must already exist)
    field_name = ["线路名称", "线路类型", "运行时间", "总里程", "参考票价", "公交公司", "最后更新", "公交路线-往(环形线默认为此项)", "公交路线-返"]
    path = "f:/data/test/bus_info.csv"
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(field_name)
        writer.writerows(all_data_list)

    # Store the results in a MySQL database
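    # Hypothetical call to the save_to_mysql() sketch above; enable it once the
    # assumed MySQL table exists.
    # save_to_mysql(all_data_list)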

Original statement: this article was published in the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.

If you believe it infringes your rights, please contact cloudcommunity@tencent.com to have it removed.
