dogedoge浏览器爬取标题
Posted by qxh-beijing2016
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了dogedoge浏览器爬取标题相关的知识,希望对你有一定的参考价值。
# coding: utf-8
"""Crawl dogedoge search results for a keyword and store them in MySQL.

For each result page the script extracts the title and displayed URL of every
hit, derives the domain and an MD5 fingerprint of the URL, and inserts one row
per hit into the ``weixintb`` table.  Paging continues until the result page no
longer contains the "next page" element.

NOTE(review): MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD are used by
``save`` but are not defined in this snippet; they must be provided by the
surrounding module before ``save`` is called.
"""
import datetime
import hashlib
import logging
import sys

import lxml
import pymysql
import requests
from lxml import etree

# The original ``reload(sys); sys.setdefaultencoding('utf-8')`` hack is gone:
# the only place that relied on it (hashing the URL) now encodes explicitly.

logger = logging.getLogger(__name__)

SEARCH_URL = 'https://www.dogedoge.com/results'
# Seconds to wait for the search engine before giving up on a request.
REQUEST_TIMEOUT = 10

RESULT_XPATH = ('//div[@class="result results_links_deep highlight_d '
                'result--url-above-snippet"]')
TITLE_XPATH = './div/h2/a//text()'
URL_XPATH = './div/div/div/a/span//text()'
NEXT_PAGE_XPATH = '//div[@id="rld-2"]'

INSERT_SQL = (
    "insert into weixintb(md5,keyword,title,url,`date`,`domain`, browser) "
    "value(%s, %s, %s, %s, %s, %s,%s)"
)


def _extract_domain(url):
    """Return the host part of a displayed result URL.

    Works for URLs with a scheme (``https://host/path``) and without one
    (``host/path``).  Only an actual ``://`` separator is treated as a scheme,
    so hosts or paths that merely contain the letters "http" are handled too.
    """
    if '://' in url:
        url = url.split('://', 1)[1]
    return url.split('/')[0]


def search_data(kw, n):
    """Fetch result page ``n`` for keyword ``kw``, save its hits, report paging.

    Args:
        kw: Search keyword.
        n: 1-based page number; the ``p`` query parameter is only sent for
            pages after the first.

    Returns:
        The non-empty list of "next page" nodes when another page exists,
        otherwise an empty string (falsy), which ``main`` uses as its stop
        signal.
    """
    # Build the query once so each page costs exactly one request, and let
    # requests percent-encode the keyword.
    params = {'q': kw}
    if n > 1:
        params['p'] = n
    res = requests.get(SEARCH_URL, params=params, timeout=REQUEST_TIMEOUT)
    con = etree.HTML(res.text)

    ll = []
    for node in con.xpath(RESULT_XPATH):
        title = ''.join(node.xpath(TITLE_XPATH))
        url = ''.join(node.xpath(URL_XPATH))
        ll.append({
            'keywd': kw,
            'domain': _extract_domain(url),
            'title': title,
            # md5 needs bytes; encode explicitly instead of relying on the
            # interpreter's default encoding.
            'md5': hashlib.md5(url.encode('utf-8')).hexdigest(),
            'url': url,
            'searcher': 'dogedoge',
        })
    save(ll)

    # xpath() returns an empty list (it does not raise) when the element is
    # missing, so test the result instead of catching an exception.
    next_page = con.xpath(NEXT_PAGE_XPATH)
    if not next_page:
        print('没有下一页了')
        return ''
    return next_page


def main(kw):
    """Crawl every result page for ``kw``, starting at page 1."""
    n = 1
    while search_data(kw, n):
        n += 1


def save(ll):
    """Insert the collected result items into the ``weixintb`` table.

    Each row is committed on its own so that one failing insert does not
    discard the rows that succeeded; a failing row is logged and rolled back.

    Args:
        ll: List of item dicts with the keys ``md5``, ``keywd``, ``title``,
            ``url``, ``domain`` and ``searcher``.
    """
    if not ll:
        # Nothing to store; do not open a connection for an empty page.
        return

    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    try:
        cursor = db.cursor()
        try:
            for item in ll:
                try:
                    # Insert the row.
                    cursor.execute(
                        INSERT_SQL,
                        (item['md5'], item['keywd'], item['title'],
                         item['url'], datetime.datetime.now(),
                         item['domain'], item['searcher']))
                    # Commit the statement.
                    db.commit()
                except Exception:
                    # Best effort: record the failure and continue with the
                    # remaining rows instead of dropping it silently.
                    logger.exception('failed to insert %r', item.get('url'))
                    db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if an insert blew up.
        db.close()


if __name__ == '__main__':
    main('爬取关键词')
以上是关于dogedoge浏览器爬取标题的主要内容，如果未能解决你的问题，请参考以下文章。