How can a Python crawler scrape Zhihu topics?

Posted by 赤尔宝丁


I'm building Guandian (guandn.com), whose "rooms" are similar to Zhihu's topics, so I needed a way to crawl Zhihu's topic list. After quite a bit of fiddling I finally got it working. The code below is written in Python (it targets Python 2: urllib2, MySQLdb); if you don't know Python you'll have to learn it on your own, otherwise just read the code, it is fully usable.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crawler for http://www.guandn.com/
__author__ = 'haoning'

import urllib
import urllib2
import time
import re
import json
import uuid
import platform
import os
import sys
import cookielib
import MySQLdb as mdb
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding( "utf-8" )

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    # note: this is the author's own session cookie; replace it with your own logged-in Zhihu cookie
    'Cookie': '__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 20) # a proxy should probably be added here
        html = response.read()
        return html
    except:
        print "timeout"
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) # a proxy should probably be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})

        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e
        

def get_extension(name):  
    where = name.rfind('.')
    if where!=-1:
        return name[where:len(name)]
    return None


def which_platform():
    sys_str = platform.system()
    return sys_str

def GetDateString():
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername 

def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + classify
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + classify
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None 
    except Exception,e:
        print "kk",e
    return None 

def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,15)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        if filename is None: # the date folder could not be created, skip this image
            return None
        #print "filename",filename
        try:
            if "e82bab09c_xs" not in str(url): # skip this particular image URL
                if not os.path.exists(filename):
                    file_object = open(filename, 'w+b')
                    file_object.write(dataimg)
                    file_object.close()
                    return GetDateString() + '/' + classify + "/" + name
                else:
                    print "file exist"
                    return None
        except IOError,e1:
            print "e1=",e1
            pass
    except Exception as e:
        print "eee",e
        pass
    return None # if the image could not be downloaded, fall back to the original site's link
    
                
def get_topis(top_id, topic_name):
    # page through a topic category via Zhihu's TopicsPlazzaListV2 endpoint,
    # which returns HTML fragments in the 'msg' field of a JSON response
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    top_id = str(top_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + top_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            data = urllib.urlencode(values)
            request = urllib2.Request(url, data, headers)
            response = urllib2.urlopen(request)
            html = response.read().decode('utf-8')
            if html is None:
                return
            json_str = json.loads(html)
            ms = json_str['msg']
            if len(ms) < 5:
                break
            msg = ms[0]
            #print msg
            soup = BeautifulSoup(str(msg))
            blks = soup.find_all('div', {'class': 'blk'})
            for blk in blks:
                page = blk.find('a').get('href')
                if page is not None:
                    node = page.replace("/topic/", "")
                    print node, page
        except urllib2.URLError, e:
            print "error is", e
            pass
                

def work():
    #getTopics() # fetch the topic categories (run once to populate classify_new)
    curr.execute('select data_id,name from classify_new')
    results = curr.fetchall()
    for r in results:
        data_id=r[0]
        name=r[1]
        get_topis(data_id,name)
        
if __name__ == '__main__':
    i=0
    while i< 40:
        work()
        i=i+1

  

A word about the database: I won't attach a dump here. Just look at the fields used in the code and create the table yourself, it really is that simple. I use MySQL; build the schema to fit your own needs.
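For reference, here is a minimal sketch of what the classify_new table could look like, inferred purely from the columns the script reads and writes (id, data_id, name); the column types and lengths are my own assumptions, not the author's original schema, so adjust them to your needs.

# -*- coding: utf-8 -*-
# Minimal sketch of the classify_new table used by the crawler above.
# Column types/lengths are assumptions inferred from the code, not the author's schema.
# Assumes a MySQL database named 'zhihu' already exists.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
curr.execute("""
    CREATE TABLE IF NOT EXISTS classify_new (
        id      INT AUTO_INCREMENT PRIMARY KEY,  -- checked by getTopics() to avoid duplicate inserts
        data_id VARCHAR(32)  NOT NULL,           -- Zhihu topic category id (the li data-id attribute)
        name    VARCHAR(255) NOT NULL            -- topic category name (the li text)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()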

If anything is unclear, come find me on quzhuanpan (去转盘网), which I also developed; the QQ group number is kept up to date there. I won't post a QQ number here, to avoid having this post blocked by the system.
