# -*- coding: utf-8 -*-
"""
@author: sato
@file: sina_spider.py
@time: 2019-09-03 15:57
"""
import requests
import re
import multiprocessing
import os
class Spider(object):
    """Crawl the Sina news front page and save each headline article to ./html."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36'
        }
        # Sina news home page.  The daily headline links live under
        # id="wrap" > class="part_01 clearfix" > class="p_middle".
        self.base_url = 'https://news.sina.com.cn/'

    def get_news_list(self):
        """Fetch the front page and return the headline article URLs.

        Returns:
            A list of article link strings, or None when the expected
            HTML structure is not found on the page.

        Raises:
            Exception: when the front page request does not return 200/201.
        """
        # BUG FIX: the original passed ``self.headers`` positionally, which
        # binds to requests.get's ``params`` argument — the custom User-Agent
        # was never actually sent.  It must be the ``headers=`` keyword.
        res = requests.get(self.base_url, headers=self.headers)
        if res.status_code not in (200, 201):
            raise Exception('network error!')
        res.encoding = 'utf-8'
        part_01_clearfix = re.findall(
            r'<div class="part_01 clearfix">([\S\s]*?)'
            r'<div class="part_01 clearfix" data-sudaclick="blk_livevideo">',
            res.text)
        if not part_01_clearfix:
            return None
        p_middle = re.findall(
            r'<div class="p_middle">([\S\s]*?)<div class="p_right">',
            part_01_clearfix[0])
        if not p_middle:
            return None
        return re.findall(r'<a target="_blank" href="([\S\s]*?)"', p_middle[0])

    def rep_and_write(self, link):
        """Download one article page and save it as ./html/<title>.html.

        Args:
            link: absolute URL of the article to fetch.

        Raises:
            Exception: when the request does not return 200/201.
        """
        print(f'get data from {link}')
        ret = requests.get(url=link, headers=self.headers)
        if ret.status_code not in (200, 201):
            raise Exception(f'get {link} error!')
        ret.encoding = 'utf-8'
        content = ret.text
        title = re.findall(r'<title>([\S\s]*?)</title>', content)[0]
        # Replace characters that are illegal in file names on common
        # platforms — the original crashed on titles containing '/' or ':'.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        # Explicit encoding: the page was decoded as UTF-8 above, so write it
        # back as UTF-8 instead of relying on the platform default.
        with open(os.path.join('./html', safe_title + '.html'), 'w',
                  encoding='utf-8') as f:
            f.write(content)

    def run(self):
        """Fetch the headline list and download every article in parallel."""
        links = self.get_news_list()
        if not links:
            raise Exception('error!')
        # makedirs(..., exist_ok=True) is race-free vs. the exists()+mkdir()
        # pair the original used.
        os.makedirs('./html', exist_ok=True)
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        for link in links:
            # apply_async swallows worker exceptions; acceptable here because
            # one failed article should not abort the whole crawl.
            pool.apply_async(self.rep_and_write, (link,))
        pool.close()
        pool.join()
        print('done')
if __name__ == '__main__':
    # The guard is required: multiprocessing re-imports this module in child
    # processes on spawn-start platforms (Windows/macOS); without it every
    # worker would launch its own crawl recursively.
    spider = Spider()
    spider.run()