春节就不更新啦
本中年人翻看留言板的时候想起了年轻的时候,于是写了一份爬取QQ空间留言的代码。只是现在我们中年人好像不怎么用QQ了。
运行环境:Python 2.7
依赖库:selenium、BeautifulSoup
附加文件:chromedriver.exe
# encoding: utf-8
"""
Created on 2018-02-12
@author: flyrae
"""
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
# 登录QQ空间
def _has_element(driver, element_id):
    """Return True if an element with the given id can be located."""
    # Imported locally so the file's top-level import block stays untouched.
    from selenium.common.exceptions import WebDriverException
    try:
        driver.find_element_by_id(element_id)
    except WebDriverException:
        # Any driver-side lookup failure is treated as "not present", while
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return False
    return True


def _collect_page(driver, name, data):
    """Append the message texts and author names shown on the current page."""
    for c in driver.find_elements_by_css_selector('.cont'):
        data.append(c.text)
    for n in driver.find_elements_by_css_selector('.userinfo'):
        name.append(n.text)
        print(n.text)


def get_msg(qq, login_user='2806952949', login_password='要写密码哎'):
    """Scrape the Qzone message board of the given QQ number.

    Opens the message-board page in Chrome, logs in when the login box is
    shown, then walks every page of messages.

    Args:
        qq: QQ number whose message board is opened.
        login_user: QQ account typed into the login form; replace the default
            with your own account.
        login_password: Password typed into the login form; replace the
            default placeholder with your own password.

    Returns:
        A ``(name, data)`` tuple of two lists: the text of every
        ``.userinfo`` element and the text of every ``.cont`` element found.
        Both lists are empty when the owner-info element is not found
        (the original treats this as "no permission to view the space").
    """
    data = []
    name = []
    driver = webdriver.Chrome(r"chromedriver.exe")
    try:
        # Open the page to scrape.
        driver.get('http://user.qzone.qq.com/{}/334'.format(qq))
        # Wait 5 seconds, then decide whether a login is required by looking
        # for the login DIV.
        time.sleep(5)
        if _has_element(driver, 'login_div'):
            # The login DIV exists: simulate an account/password login.
            driver.switch_to.frame('login_frame')
            driver.find_element_by_id('switcher_plogin').click()
            driver.find_element_by_id('u').clear()
            driver.find_element_by_id('u').send_keys(login_user)
            driver.find_element_by_id('p').clear()
            driver.find_element_by_id('p').send_keys(login_password)
            driver.find_element_by_id('login_button').click()
            time.sleep(3)
        driver.implicitly_wait(3)

        # The space is considered accessible when the element with id
        # QM_OwnerInfo_Icon exists.
        if _has_element(driver, 'QM_OwnerInfo_Icon'):
            driver.switch_to.frame('app_canvas_frame')
            count = driver.find_elements_by_css_selector('.mod_pagenav_count')
            if count:
                # The page total is the last space-separated token of the
                # last pager element's text.
                pages = int(count[-1].text.split(' ')[-1])
            else:
                # NOTE(review): no pager element found; assuming a single
                # page instead of raising IndexError -- confirm.
                pages = 1
            _collect_page(driver, name, data)
            # The first page is already collected, so click "next" pages-1
            # times.
            for _ in range(1, pages):
                driver.find_element_by_link_text('下一页').click()
                time.sleep(3)
                _collect_page(driver, name, data)

        # Dump the session cookies as "name=value;" pairs. The original
        # format string had no placeholders and produced "=;" for each one.
        cookie_str = ''.join(
            "{}={};".format(c['name'], c['value'])
            for c in driver.get_cookies()
        )
        print('Cookies: ' + cookie_str)
    finally:
        # Always shut the browser down, even when scraping fails midway.
        driver.quit()
    return name, data
if __name__ == '__main__':
    # Script entry point: scrape the message board of the QQ number given on
    # the command line and write the result to msg_<qq>.txt.
    import io
    import sys

    # The original passed the builtin function `id` here, which was never
    # bound to a QQ number; take the target from the command line instead.
    if len(sys.argv) != 2:
        sys.exit('usage: python {} <qq_number>'.format(sys.argv[0]))
    qq = sys.argv[1]

    name, data = get_msg(qq)
    # io.open with an explicit encoding writes the same UTF-8 content on both
    # Python 2 and Python 3.
    with io.open('msg_' + qq + '.txt', 'w', encoding='utf-8') as f:
        for i, d in enumerate(data):
            # Keep every message even if fewer author names were scraped.
            author = name[i] if i < len(name) else u''
            f.write(u'{}\n'.format(i))
            f.write(author)
            f.write(u':\t')
            f.write(d)
            f.write(u'\n\n--------------------\n\n')
github链接: https://github.com/flyrae/QZONE_MSG
QQ空间,从心开始
提前祝大家春节快乐!
领取专属 10元无门槛券
私享最新 技术干货