python打造批量关键词排名查询工具
Posted 68xi
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python打造批量关键词排名查询工具相关的知识,希望对你有一定的参考价值。
自己做站点的时候,都看看收录和关键词排名什么的,所以打造的这个批量关键词查询工具。
# encoding: utf-8
"""Batch Baidu keyword-ranking query tool.

Reads keywords from a text file (one per line), queries Baidu search for each
keyword, extracts the root domain of every organic top-10 result, and appends
a per-domain "visibility score" summary to score.csv.

NOTE(review): this block was recovered from a garbled scrape (smart quotes,
collapsed lines, Python 2 syntax) and reconstructed as runnable Python 3.
The original ``__main__`` body was truncated mid-line after the input prompt,
so the final call to ``get_score`` is a best-effort reconstruction — confirm
against the original source if available.
"""
import io
import random
import re
import time
import urllib.parse

try:
    import pycurl
except ImportError:  # keep the module importable without pycurl installed
    pycurl = None

try:
    from bs4 import BeautifulSoup
except ImportError:  # optional, mirrors the pycurl fallback; fails at use time
    BeautifulSoup = None

# Estimated click share (percent) for each organic ranking position 1-10.
score = {1: 28.56, 2: 19.23, 3: 10.20, 4: 8.14, 5: 7.50,
         6: 5.72, 7: 4.01, 8: 4.41, 9: 5.53, 10: 6.70}


def root_domain(url):
    """Return the registrable root domain of *url*.

    Baidu's own properties are returned unchanged so their subdomain stays
    visible in the report.  Known two-level Chinese suffixes (``.com.cn``
    etc.) keep three labels; everything else keeps the last two.  Returns
    ``'-'`` on any parse failure.
    """
    if 'baidu.com' in url:
        return url
    try:
        url = url.replace('http://', '')
        for suffix in ('.com.cn', '.org.cn', '.net.cn', '.gov.cn'):
            if suffix in url:
                # keep the last three dot-separated labels
                return re.search(r'^(.*?\..*?)*([^.]+?\.[^.]+?\.[^.]+)',
                                 url).group(2)
        # keep the last two dot-separated labels
        return re.search(r'^(.*?\..*?)*([^.]+?\.[^.]+)', url).group(2)
    except Exception:
        return '-'


def curl(url, debug=False, **kwargs):
    """Fetch *url* with pycurl and return the response body as text.

    Retries forever on any error unless *debug* is true, in which case the
    exception is re-raised.  Extra pycurl options may be passed as keyword
    arguments named after ``pycurl`` constants (e.g. ``TIMEOUT=30``).
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 5.1; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 '
        '(Khtml, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/38.0.2125.122 Safari/537.36',
    ]
    agent = random.choice(user_agents)  # one random UA per call, as before
    while True:
        try:
            buf = io.BytesIO()
            c = pycurl.Curl()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.TIMEOUT, 60)
            c.setopt(pycurl.ENCODING, 'gzip')
            c.setopt(pycurl.USERAGENT, agent)
            c.setopt(pycurl.NOSIGNAL, True)
            c.setopt(pycurl.WRITEFUNCTION, buf.write)
            for name, value in kwargs.items():
                c.setopt(getattr(pycurl, name), value)
            c.perform()
            c.close()
            return buf.getvalue().decode('utf-8', 'ignore')
        except Exception:
            if debug:
                raise
            continue  # best-effort retry loop, preserved from the original


def get_baidudata(keyword, rn):
    """Fetch the Baidu SERP for *keyword* and return the organic result divs.

    *rn* is the number of results requested per page.  If Baidu serves a
    CAPTCHA page, sleep ten minutes and retry until a normal page comes back.
    """
    search_url = 'http://www.baidu.com/s?wd=%s&rn=%d' % (
        urllib.parse.quote(keyword), rn)
    pagetext = curl(search_url)  # raw SERP HTML
    while 'http://verify.baidu.com' in pagetext:
        print("查询过程出现验证码,休息10分钟", keyword)
        time.sleep(600)
        pagetext = curl(search_url)
    soup = BeautifulSoup(pagetext, 'html.parser')
    # The trailing space in the class string is intentional: it matches
    # Baidu's exact class attribute for organic (non-ad) results.
    return soup.find_all("div", attrs={'class': 'result c-container '})


def get_rank_data(keyword, rn):
    """Return ``{root_domain: [position scores]}`` for *keyword*'s top results."""
    items = {}
    for result in get_baidudata(keyword, rn):
        links = result.find_all("a", attrs={'class': 'c-showurl'})
        if not links:
            continue
        match = re.search(r'([a-zA-Z0-9.-]+)', links[0].text)
        host = root_domain(match.group(1))
        # Baidu numbers organic results 1..rn via the div's id attribute.
        rank = int(result['id'])
        items.setdefault(host, []).append(score[rank])
    return items


def get_keywords(filename):
    """Read *filename* and return its lines, stripped, as a keyword list."""
    with open(filename, 'r') as kwfile:
        return [line.strip() for line in kwfile]


def get_all_data(filename, rn):
    """Merge per-keyword rank scores into one ``{domain: [scores]}`` mapping."""
    merged = {}
    for i, kw in enumerate(get_keywords(filename), 1):
        print(i, kw)  # progress indicator
        for host, scores in get_rank_data(kw, rn).items():
            merged.setdefault(host, []).extend(scores)
    return merged


def get_score(filename, rn):
    """Append per-domain keyword counts and scores to ``score.csv``.

    Columns: host, number of ranking keywords, average score, total score.
    """
    data = get_all_data(filename, rn)
    with open('score.csv', 'a+') as fh:
        fh.write('host,kws,average_score,host_score\n')
        for host, ranks in data.items():
            if host is None:
                host = 'error page'
            kws = len(ranks)           # number of keywords the domain ranks for
            host_score = sum(ranks)    # total visibility score
            average_score = host_score / kws
            fh.write('%s,%s,%s,%s\n' % (host, kws, average_score, host_score))


if __name__ == "__main__":
    # NOTE(review): the original entry point was cut off in the scrape right
    # after this prompt; calling get_score with rn=10 (top-10 results, the
    # range covered by the score table) is the most plausible intent.
    filename = input("请输入包含关键词的文件名:")
    get_score(filename, 10)
以上是关于python打造批量关键词排名查询工具的主要内容,如果未能解决你的问题,请参考以下文章
使用Python Requests伪装成浏览器请求百度360获取关键词批量排名
四十二 Python分布式爬虫打造搜索引擎Scrapy精讲—elasticsearch(搜索引擎)的mget和bulk批量操作