python trump.py
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python trump.py相关的知识,希望对你有一定的参考价值。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-03-14 21:01:10
# Project: amnews
from pyspider.libs.base_handler import *
import json
from urllib.parse import urlencode
import time
from hashlib import md5
import pymysql
class Mysql():
    """Small pymysql wrapper used to persist crawled news items.

    On construction it opens a connection to the ``news`` database on
    localhost and keeps one cursor around; rows are written to the table
    named by ``self.table``.
    """

    def __init__(self):
        """Open the database connection and create the shared cursor."""
        self.conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='news',
                                    charset='utf8')
        self.cursor = self.conn.cursor()
        self.table = 'news'

    def insert(self, item):
        """Insert ``item`` as a new row unless its title is already stored.

        Args:
            item: dict mapping column names to values. Its keys become the
                column list of the INSERT statement, so they must be
                column names of ``self.table``.

        Returns:
            None. Nothing is written when the item has no (or an empty)
            ``title``, or when a row with the same title already exists.
        """
        title = item.get('title')
        if not title:
            return
        # The title is scraped from a remote page, so it is bound through a
        # driver placeholder instead of being spliced into the statement;
        # titles containing quotes (e.g. "Trump's ...") used to break the
        # query. Only identifiers (table/column names) are interpolated.
        sql = 'select 1 from ' + self.table + ' where title = %s limit 1'
        print(sql)
        # execute() returns the number of rows the statement produced.
        if self.cursor.execute(sql, (title,)):
            print('Exists', item)
            return
        keys = ", ".join(item.keys())
        values = ", ".join(['%s'] * len(item))
        sql_query = "insert into %s (%s) values (%s)" % (self.table, keys, values)
        print(sql_query)
        print(item.values())
        self.cursor.execute(sql_query, list(item.values()))
        self.conn.commit()
class Handler(BaseHandler):
    """pyspider handler that crawls Politico search results for "trump".

    Flow: ``on_start`` fetches the first search page, ``index_page`` queues
    every listed story plus the next result page, ``detail_page`` extracts
    one story into a dict, and ``on_result`` stores that dict via ``Mysql``.
    """
    crawl_config = {
    }
    start_url = 'http://www.politico.com/search/1?q=trump&adv=true&start=10%2F01%2F2016&end=11%2F30%2F2016&c=0000014b-324d-d4f3-a3cb-f3ff415e0035'
    # Request headers captured from a browser session; sent with the first
    # search-page request only.
    headers = {
        'Cookie': 'optimizelyEndUserId=oeu1489498886259r0.85759288896717; utag_vnum=1492090888643&vn=1; utag_vi=1489498888643; _cb_ls=1; __qca=P0-1345596598-1489498890595; optimizelySegments=%7B%22536712581%22%3A%22search%22%2C%22538992278%22%3A%22gc%22%2C%22539001624%22%3A%22none%22%2C%22553840423%22%3A%22false%22%7D; optimizelyBuckets=%7B%7D; utag_main=v_id:015acd0d59bb001dda61cbc50abd04078002907000bd0$_sn:1$_ss:0$_pn:10%3Bexp-session$_st:1489500897982$ses_id:1489498888635%3Bexp-session$_prevpage:site%20search%20advanced%3Bexp-1489502697989; _cp_pt=site search; utag_invisit=true; utag_vs=10; utag_dslv=1489499097995; utag_dslv_s=Less than 1 day; __atuvc=10%7C11; __atuvs=58c7f307760abfe4009; s_cc=true; _cb=DOGMwfDu4tbdBHkHi_; _chartbeat2=.1489498889778.1489499098910.1.DhfM-OC-eYVLBU8JDrDhVPJcCjOS_7; _cb_svref=https%3A%2F%2Fwww.google.co.jp%2F; _mkto_trk=id:966-KHF-533&token:_mch-politico.com-1489498891126-55352; s_vi=[CS]v1|2C63F986052C3D7F-400000C0200009B0[CE]; optimizelyPendingLogEvents=%5B%5D; s_fid=7D02211BE49149CB-37EEE9007DE547E9; SC_LINKS=%5B%5BB%5D%5D; s_sq=allbrittonpolitico2%3D%2526c.%2526a.%2526activitymap.%2526page%253Dsite%252520search%252520advanced%2526link%253D1%2526region%253DglobalWrapper%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253Dsite%252520search%252520advanced%2526pidt%253D1%2526oid%253Dhttp%25253A%25252F%25252Fwww.politico.com%25252Fsearch%25252F1%25253Fq%25253Dtrump%252526adv%25253Dtrue%252526start%25253D10%2525252F01%2525252F2016%252526end%25253D11%2525252F30%2525252F2016%252526c%25253D0000014b%2526ot%253DA; _chartbeat5=851,4900,%2Fsearch,http%3A%2F%2Fwww.politico.com%2Fsearch%2F1%3Fq%3Dtrump%26adv%3Dtrue%26start%3D10%252F01%252F2016%26end%3D11%252F30%252F2016%26c%3D0000014b-324d-d4f3-a3cb-f3ff415e0035,DXvxN-CE5Lm8Dv0uIBvDWFXDRCPS9,,c,BZ6AT-68tVHDZrzfLCeW7w8D6C4s9,politico.com,',
        'Host': 'www.politico.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    # NOTE: created while the class body executes, so the MySQL connection
    # is opened as soon as this script is loaded.
    mysql = Mysql()
    proxy = '127.0.0.1:9743'

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the first search-result page, rendered with JavaScript."""
        self.crawl(self.start_url, callback=self.index_page, headers=self.headers, fetch_type='js', proxy=self.proxy)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every story linked from a result page, then the next page."""
        for each in response.doc('.content-groupset.pos-alpha .story-frag-list li header h3 a').items():
            self.crawl(each.attr.href, callback=self.detail_page, proxy=self.proxy)
        next_url = response.doc('#globalWrapper div.contextual > a:last-child').attr.href
        print(next_url)
        # No pagination link matched (presumably the last result page):
        # there is nothing further to queue.
        if next_url:
            self.crawl(next_url, callback=self.index_page, fetch_type='js', proxy=self.proxy)

    @staticmethod
    def _normalize_published(raw):
        """Rewrite a trailing UTC offset such as ``-0500`` as ``:00.000-05:00``.

        Assumes the page's ``datetime`` attribute looks like
        ``YYYY-MM-DDTHH:MM-0500`` (TODO confirm against live pages). Only the
        offset at the end of the string is rewritten, so a ``-05`` month or
        day is left alone and offsets other than ``-05`` are handled too.

        Args:
            raw: the attribute value, or None when the element is missing.

        Returns:
            The rewritten timestamp; ``raw`` unchanged when it is empty/None
            or does not end in ``HH:MM`` followed by an offset.
        """
        import re  # function-scope import: the file header does not import re

        if not raw:
            return raw
        return re.sub(r'([T ]\d{2}:\d{2})([+-]\d{2}):?(\d{2})$', r'\1:00.000\2:\3', raw)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one story page into a dict of column values.

        Returns:
            dict with keys ``site_url``, ``uuid`` (md5 hex digest of the URL),
            ``title``, ``author``, ``main_img_url``, ``text``, ``published``,
            ``crawled`` and ``comments`` (always 0).
        """
        url = response.url
        print(url)
        title = response.doc('title').text()
        print(title)
        author = response.doc('#globalWrapper div.summary > footer > p.byline > span > a').text()
        print(author)
        image = response.doc('#globalWrapper article div.content.layout-story.sticky-wrapper > section.content-groupset.pos-omega .fig-graphic img').attr('src')
        print(image)
        text = response.doc('.story-text p').text()
        print(text)
        # attr() yields None when no <time> element matches; the helper
        # passes that through instead of raising AttributeError.
        published = self._normalize_published(
            response.doc('#globalWrapper div.content.layout-story.sticky-wrapper > section.content-groupset.pos-omega p.timestamp > time').attr('datetime'))
        comments = 0
        print(published)
        print(comments)
        data = {
            'site_url': url,
            'uuid': md5(url.encode('utf-8')).hexdigest(),
            'title': title,
            'author': author,
            'main_img_url': image,
            'text': text,
            'published': published,
            # NOTE(review): the +08:00 suffix is hard-coded while the clock
            # value is local time - correct only on a UTC+8 host; confirm.
            'crawled': time.strftime("%Y-%m-%dT%H:%M:%S.000+08:00", time.localtime()),
            'comments': comments
        }
        print(data)
        return data

    def on_result(self, data):
        """Store a non-empty result dict in MySQL."""
        if data:
            self.mysql.insert(data)
以上是关于python trump.py的主要内容,如果未能解决你的问题,请参考以下文章