Why I wanted contact details for customers and target websites: if you scrape the sites directly, especially independent retail sites, what they leave behind is mostly a generic customer-service email. After browsing around, though, almost every independent site links to a Facebook Page, an Instagram profile, and other social pages, where the related information is much richer.
That suggests a scraping pipeline. First, pick a keyword plus region (e.g. sexy dress us); the independent sites ranking near the top clearly have decent operations, and their traffic speaks for itself. Then drop the big, well-known brands, which are not target customers at this stage, dedupe the rest, grab each site's Facebook Page URL, and finally pull the contact email from that Page.
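The brand-filtering and dedupe step isn't shown in the script further down, so here is a minimal sketch of how it could look; the blacklist domains and the `filter_hosts` helper are hypothetical examples, not from the original post:

```python
# Hypothetical sketch of the "drop big brands, then dedupe" step.
# The blacklist entries below are made-up examples.
BRAND_BLACKLIST = {'zara.com', 'hm.com', 'asos.com'}

def filter_hosts(hosts):
    seen = set()
    for host in hosts:
        host = host.lower()
        if host.startswith('www.'):
            host = host[4:]
        if host in BRAND_BLACKLIST or host in seen:
            continue  # skip well-known brands and duplicates
        seen.add(host)
        yield host
```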
First, throw together the headers:
```python
# -*- coding: utf-8 -*-
import requests
from urllib import parse
from lxml import etree
import re
import pymysql

headers = {
    "authority": "m.facebook.com",
    "method": "GET",
    "path": "/hopefashionuk/about/?ref=page_internal&mt_nav=1&_rdr",
    "scheme": "https",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    # your own logged-in Facebook session cookie goes here
    "cookie": "sb=WL-YWjMepfg59FZkaWkw6Ifs; datr=mL-YWrehi6IneIZo4XA04EMV; c_user=100001562125482; xs=11%3AGqYU38X8a4ye2g%3A2%3A1520302191%3A17032%3A11382; pl=n; dpr=3; m_pixel_ratio=3; fr=0hlOgwHibInYlCfeq.AWVg5fJCZVbOnlJWv-sHHMcEDiw.BalRTE.A6.Fqd.0.0.Bapyu4.AWWAJvNd; act=1520905198065%2F2; presence=EDvF3EtimeF1520905215EuserFA21B01562125482A2EstateFDutF1520905215899CEchFDp_5f1B01562125482F2CC; wd=375x812; x-referer=eyJyIjoiL2hvcGVmYXNoaW9udWsvYWJvdXQvP3JlZj1wYWdlX2ludGVybmFsJm10X25hdj0xIiwiaCI6Ii9wZy9ob3BlZmFzaGlvbnVrL3ZpZGVvcy8%2FcmVmPXBhZ2VfaW50ZXJuYWwmbXRfbmF2PTEiLCJzIjoibSJ9",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1"
}
params = {
    "ref": "page_internal",
    "mt_nav": "1"
}
```
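Since the whole scrape leans on that logged-in cookie, it's worth a quick smoke test before a full run. This check is my addition, not from the original post; `hopefashionuk` is simply the Page the sample `path` header above points at:

```python
# Smoke test (not in the original): confirm the cookie still returns a
# usable mobile "about" page before launching the full scrape.
import requests
resp = requests.get('https://m.facebook.com/hopefashionuk/about/',
                    headers=headers, params=params)
print(resp.status_code, 'mailto:' in resp.text)
```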
Get the target sites' URLs out of the search results:

```python
def fbscraw(keyword):
    qq = requests.session()  # implied by qq.close() at the end of the original
    page = 10
    while page <= 100:  # the loop bound was garbled in the original; assume the first ten result pages
        # NOTE: the search-request step was lost from the original post; this is a
        # minimal reconstruction assuming a Google results page parsed with lxml,
        # and the XPath selector below is an assumption.
        req = qq.get('https://www.google.com/search?q=' + parse.quote(keyword) + '&start=' + str(page))
        html = etree.HTML(req.text)
        urls = html.xpath('//h3/a/@href')
        for url in urls:
            if len(url) >= 1:
                proto, rest = parse.splittype(str(url))
                host, rest = parse.splithost(str(rest))
            else:
                continue  # the moment anything goes wrong, just skip to the next one
            try:
                fbreq = requests.get('http://' + str(host))
            except:
                continue
            else:
                refb = re.compile(r'facebook.com/(pages/[A-Za-z0-9@._/-]+|[A-Za-z0-9@._/-]+)"')
                fb = refb.findall(fbreq.text)  # pull the Facebook Page path out of the shop's homepage
                if len(fb) >= 1:
                    page_name = fb[0].replace('/', '')
                    db = pymysql.connect('localhost', 'root', 'root', '', charset='utf8mb4')
                    cur = db.cursor()
                    sql = '''SELECT * FROM ... '''  # elided in the original: check whether page_name is already stored
                    cur.execute(sql)
                    data = cur.fetchone()
                    if data == None:
                        insert_sql = '''INSERT INTO ...'''  # elided in the original: insert the new page_name
                        cur.execute(insert_sql)
                    cur.close()
                    db.close()
                else:
                    continue
        page += 10
    qq.close()
```
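To make the `refb` pattern concrete, here is what it captures from a typical homepage footer link; the sample HTML is made up for illustration:

```python
import re

# Illustration only: the sample markup is invented.
sample = '<a href="https://www.facebook.com/hopefashionuk/">Facebook</a>'
refb = re.compile(r'facebook.com/(pages/[A-Za-z0-9@._/-]+|[A-Za-z0-9@._/-]+)"')
print(refb.findall(sample))  # ['hopefashionuk/'] -> 'hopefashionuk' after stripping '/'
```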
Now scrape the contact info from each Facebook Page:

```python
def scrawemail():
    db = pymysql.connect('localhost', 'root', 'root', '', charset='utf8mb4')
    cur = db.cursor()
    sql = '''SELECT * FROM ...'''  # elided in the original: read back the stored page names
    cur.execute(sql)
    cj = cur.fetchall()
    for i in cj:
        url = 'https://m.facebook.com/' + str(i[2]) + '/about/'
        qq = requests.session()
        qq.proxies = {'https': '127.0.0.1:1087'}  # local proxy so Facebook is reachable
        headers.update({'path': '/' + str(i[2]) + '/about/?ref=page_internal&mt_nav=1&_rdr'})  # refresh the header's path
        try:
            req = qq.get(url, headers=headers, params=params)
        except:
            continue
        else:
            html = req.text
            html2 = re.compile(r'mailto:(\S*)"').findall(html)  # this is the key target of the whole script
            if len(html2) >= 1:
                mail = html2[0].replace('%40', '@')  # the replace arguments were garbled in the source; assume a URL-encoded '@'
                update_sql = '''UPDATE FBEMAIL ... '''  # elided in the original: store the email for this page
                cur.execute(update_sql)
            else:
                update_sql = '''UPDATE FBEMAIL SET .... '''  # elided in the original: mark the page as having no email
                cur.execute(update_sql)
            qq.close()
            continue
    cur.close()
    db.close()

if __name__ == '__main__':
    keyword = input('Enter a keyword, e.g. sexy dress us: ')
    fbscraw(keyword)
    scrawemail()
```
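One cheap robustness win: mailto links are often URL- or HTML-encoded, so decoding both forms before storing avoids garbage addresses. A small helper along those lines (my addition, using only the standard library):

```python
import html
import re
from urllib import parse

def extract_emails(page_html):
    # Pull addresses out of mailto: links, undoing URL- and HTML-encoding.
    raw = re.findall(r'mailto:([^"\']+)', page_html)
    return [html.unescape(parse.unquote(m)) for m in raw]

print(extract_emails('<a href="mailto:info%40example.com">'))  # ['info@example.com']
```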
I wrote this while still learning Python, so the code is lazy and not robust; whatever cases I didn't think of, I simply and shamelessly ignored...
If you can see more details worth handling, technical optimizations, or a better approach altogether, please share your advice.