前段时间写了一篇文章,介绍了用 Python 爬虫自动抓取百度 site: 命令结果的方案。但那个方案有两个问题:一是不稳定,二是只能判断是否收录,不清楚具体收录了多少个链接,无法达到持续关注收录数量是否增加的目的。于是改用 selenium 写了这个实现方案,可以精准监测收录数量。
import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time
# --- One-shot check: query Baidu's `site:` operator for a domain and push the
# indexed-page count to a WeCom (企业微信) group robot. ---
# chromedriver path is relative to the working directory.
myService = Service(r'./../chromedriver')
options = webdriver.ChromeOptions()
# options.add_argument('headless')  # enable to run without a visible browser window
myChrome = webdriver.Chrome(service=myService, options=options)
try:
    myChrome.implicitly_wait(10)
    domain = 'jentian.com'
    myChrome.get('https://www.baidu.com')
    # Type `site:<domain>` into Baidu's search box and submit.
    keywordInput = myChrome.find_element(By.ID, 'kw')
    keywordInput.send_keys('site:' + domain)
    searchBtn = myChrome.find_element(By.ID, 'su')
    searchBtn.click()
    time.sleep(3)  # give the results page time to redirect and render before parsing
    dom = etree.HTML(myChrome.page_source)
    # NOTE(review): this xpath targets Baidu's result-count banner and is fragile --
    # it breaks whenever Baidu changes its results-page markup.
    resultStringArr = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
    resultCount = 0
    if resultStringArr:
        # Banner text looks like "1,234": join the digit runs to drop the
        # thousands separators, e.g. "1,234" -> 1234.
        digitGroups = re.findall(r'\d+', resultStringArr[0])
        if digitGroups:
            resultCount = int(''.join(digitGroups))
    if resultCount > 0:
        msg = '百度已收录' + domain + ',收录数量:' + str(resultCount)
    else:
        msg = '百度未收录' + domain
    print('抓取完毕!!!', msg, '\n')
finally:
    # Always release the browser/driver process -- the original script leaked
    # the Chrome process on every run (quit() was never called).
    myChrome.quit()
qiWeiWebHook = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=**'  # replace with your own webhook URL
# `json=` serializes the payload and sets the Content-Type header in one step;
# the timeout keeps the script from hanging forever if the webhook is unreachable.
requests.post(qiWeiWebHook, json={'msgtype': 'text', 'text': {'content': msg}}, timeout=10)
#通过抓取某个域名的 site 指令结果,判断该域名是否已被百度收录的代码(定时轮询版)
import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time
def crawlBaiduPickupData(domain='jentian.com',
                         webhook_url='https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=*',
                         interval=3600):
    """Poll Baidu's ``site:`` results for *domain* forever and report the
    indexed-page count to a WeCom (企业微信) group robot after every check.

    Parameters
    ----------
    domain : str
        Domain to query with Baidu's ``site:`` operator.
    webhook_url : str
        WeCom group-robot webhook URL (replace the ``key=`` placeholder).
    interval : int
        Seconds to sleep between checks; defaults to one hour.

    Runs until interrupted; never returns. The Chrome driver is released
    on exit via ``finally`` (the original leaked the browser process).
    """
    myService = Service(r'./../chromedriver')  # path relative to the working directory
    options = webdriver.ChromeOptions()
    # options.add_argument('headless')  # enable to run without a visible browser window
    myChrome = webdriver.Chrome(service=myService, options=options)
    myChrome.implicitly_wait(10)
    try:
        while True:
            myChrome.get('https://www.baidu.com')
            # Submit the `site:<domain>` query through Baidu's search form.
            myChrome.find_element(By.ID, 'kw').send_keys('site:' + domain)
            myChrome.find_element(By.ID, 'su').click()
            time.sleep(3)  # give the results page time to redirect and render before parsing
            dom = etree.HTML(myChrome.page_source)
            # NOTE(review): fragile xpath -- breaks if Baidu changes its markup.
            banner = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
            resultCount = 0
            if banner:
                # Banner text like "1,234": join digit runs to drop separators.
                digits = re.findall(r'\d+', banner[0])
                if digits:
                    resultCount = int(''.join(digits))
            if resultCount > 0:
                msg = '百度已收录' + domain + ',收录数量:' + str(resultCount)
            else:
                msg = '百度未收录' + domain
            print('抓取完毕!!!', msg, '\n')
            # `json=` serializes and sets Content-Type; timeout avoids hanging
            # the loop forever on an unreachable webhook.
            requests.post(webhook_url, json={'msgtype': 'text', 'text': {'content': msg}}, timeout=10)
            time.sleep(interval)  # re-check once per interval (hourly by default)
    finally:
        # Release the Chrome process when the loop exits (e.g. KeyboardInterrupt).
        myChrome.quit()
# Start the hourly monitor only when run as a script, not when imported.
if __name__ == '__main__':
    crawlBaiduPickupData()