python trump.py

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python trump.py相关的知识,希望对你有一定的参考价值。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-03-14 21:01:10
# Project: amnews

from pyspider.libs.base_handler import *
import json
from urllib.parse import urlencode
import time
from hashlib import md5
import pymysql

class Mysql():
    """Thin wrapper around a PyMySQL connection used to store crawled items.

    Opens a connection to the local ``news`` database on construction and
    exposes :meth:`insert`, which writes one item into ``self.table`` unless
    a row with the same title is already present.
    """

    def __init__(self):
        # NOTE(review): connection parameters (including the password) are
        # hard-coded here; consider moving them to configuration.
        self.conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='news',
                                    charset='utf8')
        self.cursor = self.conn.cursor()
        self.table = 'news'

    def insert(self, item):
        """Insert *item* as a new row unless its title is already stored.

        Args:
            item: Mapping of column name -> value. Its keys are used verbatim
                as column names, so they must come from trusted code, not
                from scraped content.

        Returns:
            None. Items whose ``title`` is missing or empty are skipped.
        """
        title = item.get('title')
        if not title:
            return

        # The title is scraped text and may contain quotes; always pass it as
        # a bound parameter instead of splicing it into the SQL string.
        # Identifiers (table name) cannot be bound, but they are set by this
        # class rather than by external input.
        sql = 'select 1 from %s where title = %%s limit 1' % self.table
        print(sql)
        # execute() result is used as a truthy "row found" indicator.
        result = self.cursor.execute(sql, (title,))
        if result:
            print('Exists', item)
            return

        keys = ", ".join(item.keys())
        values = ", ".join(['%s'] * len(item))
        sql_query = "insert into %s (%s) values (%s)" % (self.table, keys, values)
        print(sql_query)
        print(item.values())
        self.cursor.execute(sql_query, list(item.values()))
        self.conn.commit()

class Handler(BaseHandler):
    """pyspider handler that walks a politico.com search listing.

    ``on_start`` fetches ``start_url``; ``index_page`` schedules every story
    link on a result page plus the next result page; ``detail_page`` extracts
    the article fields; ``on_result`` hands each extracted item to ``Mysql``.
    """

    crawl_config = {
    }

    start_url = 'http://www.politico.com/search/1?q=trump&adv=true&start=10%2F01%2F2016&end=11%2F30%2F2016&c=0000014b-324d-d4f3-a3cb-f3ff415e0035'

    # Request headers sent with the initial search request only.
    # NOTE(review): the Cookie value looks like a captured browser session and
    # has presumably expired -- confirm whether it is still needed.
    headers = {
        'Cookie':'optimizelyEndUserId=oeu1489498886259r0.85759288896717; utag_vnum=1492090888643&vn=1; utag_vi=1489498888643; _cb_ls=1; __qca=P0-1345596598-1489498890595; optimizelySegments=%7B%22536712581%22%3A%22search%22%2C%22538992278%22%3A%22gc%22%2C%22539001624%22%3A%22none%22%2C%22553840423%22%3A%22false%22%7D; optimizelyBuckets=%7B%7D; utag_main=v_id:015acd0d59bb001dda61cbc50abd04078002907000bd0$_sn:1$_ss:0$_pn:10%3Bexp-session$_st:1489500897982$ses_id:1489498888635%3Bexp-session$_prevpage:site%20search%20advanced%3Bexp-1489502697989; _cp_pt=site search; utag_invisit=true; utag_vs=10; utag_dslv=1489499097995; utag_dslv_s=Less than 1 day; __atuvc=10%7C11; __atuvs=58c7f307760abfe4009; s_cc=true; _cb=DOGMwfDu4tbdBHkHi_; _chartbeat2=.1489498889778.1489499098910.1.DhfM-OC-eYVLBU8JDrDhVPJcCjOS_7; _cb_svref=https%3A%2F%2Fwww.google.co.jp%2F; _mkto_trk=id:966-KHF-533&token:_mch-politico.com-1489498891126-55352; s_vi=[CS]v1|2C63F986052C3D7F-400000C0200009B0[CE]; optimizelyPendingLogEvents=%5B%5D; s_fid=7D02211BE49149CB-37EEE9007DE547E9; SC_LINKS=%5B%5BB%5D%5D; s_sq=allbrittonpolitico2%3D%2526c.%2526a.%2526activitymap.%2526page%253Dsite%252520search%252520advanced%2526link%253D1%2526region%253DglobalWrapper%2526pageIDType%253D1%2526.activitymap%2526.a%2526.c%2526pid%253Dsite%252520search%252520advanced%2526pidt%253D1%2526oid%253Dhttp%25253A%25252F%25252Fwww.politico.com%25252Fsearch%25252F1%25253Fq%25253Dtrump%252526adv%25253Dtrue%252526start%25253D10%2525252F01%2525252F2016%252526end%25253D11%2525252F30%2525252F2016%252526c%25253D0000014b%2526ot%253DA; _chartbeat5=851,4900,%2Fsearch,http%3A%2F%2Fwww.politico.com%2Fsearch%2F1%3Fq%3Dtrump%26adv%3Dtrue%26start%3D10%252F01%252F2016%26end%3D11%252F30%252F2016%26c%3D0000014b-324d-d4f3-a3cb-f3ff415e0035,DXvxN-CE5Lm8Dv0uIBvDWFXDRCPS9,,c,BZ6AT-68tVHDZrzfLCeW7w8D6C4s9,politico.com,',
        'Host':'www.politico.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    # Created once when the class body is executed, so the database
    # connection is opened at import time and shared by all handler calls.
    mysql = Mysql()
    # Proxy address passed to every crawl request.
    proxy = '127.0.0.1:9743'

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the first search-result page (JS-rendered, with headers)."""
        self.crawl(self.start_url, callback=self.index_page, headers=self.headers, fetch_type='js', proxy=self.proxy)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule every story on a result page and follow the next page.

        Args:
            response: pyspider response for a search-result page.
        """
        for each in response.doc('.content-groupset.pos-alpha .story-frag-list li header h3 a').items():
            self.crawl(each.attr.href, callback=self.detail_page, proxy=self.proxy)

        next_url = response.doc('#globalWrapper div.contextual > a:last-child').attr.href
        print(next_url)

        # When the selector matches nothing there is no next link and
        # ``next_url`` is None; crawling None would fail, so stop paginating.
        if next_url:
            self.crawl(next_url, callback=self.index_page, fetch_type='js', proxy=self.proxy)

    @staticmethod
    def _normalize_published(raw):
        """Rewrite a trailing ``+HHMM``/``-HHMM`` offset as ``:00.000+HH:MM``.

        Only the final five characters are examined, so date components such
        as a day or month of ``05`` are never touched. Values that are empty,
        None, or do not end in a numeric offset are returned unchanged.
        """
        if not raw:
            return raw
        offset = raw[-5:]
        if len(offset) == 5 and offset[0] in '+-' and offset[1:].isdigit():
            return '%s:00.000%s:%s' % (raw[:-5], offset[:3], offset[3:])
        return raw

    @config(priority=2)
    def detail_page(self, response):
        """Extract one article into the dict that ``on_result`` stores.

        Args:
            response: pyspider response for an article page.

        Returns:
            dict with keys ``site_url``, ``uuid`` (MD5 hex digest of the URL),
            ``title``, ``author``, ``main_img_url``, ``text``, ``published``,
            ``crawled`` and ``comments``.
        """
        url = response.url
        print(url)
        title = response.doc('title').text()
        print(title)
        author = response.doc('#globalWrapper div.summary > footer > p.byline > span > a').text()
        print(author)
        image = response.doc('#globalWrapper article div.content.layout-story.sticky-wrapper > section.content-groupset.pos-omega .fig-graphic img').attr('src')
        print(image)
        text = response.doc('.story-text p').text()
        print(text)
        # The attribute is None when the <time> element is missing; the
        # helper tolerates that instead of raising AttributeError.
        published = self._normalize_published(
            response.doc('#globalWrapper div.content.layout-story.sticky-wrapper > section.content-groupset.pos-omega p.timestamp > time').attr('datetime')
        )
        comments = 0
        print(published)
        print(comments)
        data = {
            'site_url': url,
            'uuid': md5(url.encode('utf-8')).hexdigest(),
            'title': title,
            'author': author,
            'main_img_url': image,
            'text': text,
            'published': published,
            'crawled': time.strftime("%Y-%m-%dT%H:%M:%S.000+08:00", time.localtime()),
            'comments': comments
        }
        print(data)
        return data

    def on_result(self, data):
        """Persist a non-empty result through the shared ``Mysql`` instance."""
        if data:
            self.mysql.insert(data)

以上是关于python trump.py的主要内容,如果未能解决你的问题,请参考以下文章:

001--python全栈--基础知识--python安装

Python代写,Python作业代写,代写Python,代做Python

Python开发

Python,python,python

Python 介绍

Python学习之认识python