# -*- coding: utf-8 -*-
# @Author :ZDHXN
# @File :beijingbus.py
# @Software : PyCharm
import csv
import urllib.request
from time import sleep
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
# Request headers: present a desktop Chrome User-Agent so 8684.cn serves the
# regular HTML page instead of blocking the default urllib client.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
# Accumulator for every scraped route row; written out as CSV in __main__.
all_data_list = []
# 获取公交路线详情页url
# Fetch one route-list page and visit every route detail link on it.
def get_page_url(urls):
    """Scrape a list page and dispatch each route link to ``get_page_info``.

    :param urls: absolute URL of a list page, e.g. ``http://beijing.8684.cn/list1``
    """
    req = urllib.request.Request(urls, headers=headers)
    with urllib.request.urlopen(req) as resp:
        soup = bs(resp.read(), 'html.parser')
    # Container holding the anchors for every bus line on this list page.
    container = soup.find('div', class_='list clearfix')
    if container is None:
        # Page layout changed or the page is empty — nothing to crawl here.
        return
    for anchor in container.find_all('a'):
        # Resolve the (usually root-relative) href against the current page
        # URL rather than a module-global base, so the function is self-contained.
        get_page_info(urljoin(urls, anchor['href']))
# 获取公交路线详情页目标数据信息
def _join_stops(stop_items):
    """Concatenate the stop names of one direction into a comma-separated string.

    The last <li> (a map/legend link on 8684.cn pages) is intentionally skipped,
    matching the site's markup. For a stop rendered as <a> tags, anchors that
    carry a ``title`` attribute are skipped.

    NOTE(review): for each untitled anchor this appends the *first* anchor's
    text (``anchors[0]``), reproducing the original scraper's behavior exactly;
    it looks like ``a.string`` may have been intended — confirm against live pages.
    """
    text = ""
    last = len(stop_items) - 1
    for idx, item in enumerate(stop_items):
        if idx == last:
            continue
        anchors = item.find_all(["a"])
        if anchors:
            for a in anchors:
                if a.get('title'):
                    continue
                text += anchors[0].string + ","
        else:
            text += item.string
    return text

# Scrape a single route detail page.
def get_page_info(urls):
    """Scrape one bus-route detail page and append its fields to ``all_data_list``.

    Collected fields: line name, line type (线路类型), run time (运行时间),
    mileage (总里程), ticket price (参考票价), operating company (公交公司),
    last-update date (最后更新), and the stop sequences of both directions
    (往 = outbound, 返 = return; circular lines only have the outbound list).

    :param urls: absolute URL of a route detail page
    """
    req = urllib.request.Request(urls, headers=headers)
    with urllib.request.urlopen(req) as resp:
        soup = bs(resp.read(), 'html.parser')
    # 线路类型: the breadcrumb anchor inside the page's <h1> header.
    line_type = soup.select('div.layout-left > div > div.info > h1 > a')[0].string
    try:
        # 总里程: some routes omit this block entirely, hence the fallback.
        mileage = soup.select('div.layout-left > div.change-info.mb20')[0].string
    except IndexError:
        mileage = ""
    # 线路名称: the text node immediately before the <a> in <h1 class="title">.
    line_name = soup.find("h1", {"class": "title"}).a.previous_sibling.string
    # The four <li> items of the description list, fetched once and indexed.
    items = soup.find("ul", {"class": "bus-desc"}).find_all("li")
    run_time = items[0].string      # 运行时间
    ticket = items[1].string        # 参考票价
    company = items[2].text         # 公交公司 (``.text`` flattens the nested <a>)
    update_last = items[3].div.previous_sibling.string  # 最后更新

    # Direction headers ("trip") and the matching stop lists, in page order:
    # index 0 = outbound (往), index 1 = return (返) when present.
    trips = soup.find_all("div", {"class": "trip"})
    directions = soup.find_all("div", {"class": "bus-lzlist mb15"})
    wang_stops, fan_stops = [], []
    wang_name, fan_name = "", ""
    if len(directions) >= 1:
        wang_stops = directions[0].find_all(["li"])
        wang_name = line_name + "(" + trips[0].string + ")"
    if len(directions) >= 2:
        fan_stops = directions[1].find_all(["li"])
        fan_name = line_name + "(" + trips[1].string + ")"

    # 公交路线-往 (circular lines default to this one) / 公交路线-返
    wang_info = wang_name + "\n" + _join_stops(wang_stops)
    fan_info = fan_name + "\n" + _join_stops(fan_stops)

    all_data_list.append([line_name, line_type, run_time, mileage, ticket,
                          company, update_last, wang_info, fan_info])
# Entry point: crawl list pages 1-9, then dump all rows to a CSV file.
if __name__ == '__main__':
    url = 'http://beijing.8684.cn'
    url_list = url + '/list%d'
    # The site paginates routes by leading digit: /list1 .. /list9.
    for k in range(1, 10):
        get_page_url(url_list % k)
    # Persist the scraped rows to CSV (utf-8-sig so Excel detects the encoding).
    field_name = ["线路名称", "线路类型", "运行时间", "总里程", "参考票价", "公交公司", "最后更新", "公交路线-往(环形线默认为此项)", "公交路线-返"]
    path = "f:/data/test/bus_info.csv"
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(field_name)
        writer.writerows(all_data_list)
    # TODO: the original author also intended to store the data in MySQL.
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。