A user's perspective: Does Zhihu owe its success to crawler technology or to an understanding of human nature?

Posted by 计算网



Why has Zhihu succeeded?

Because Zhihu satisfies users' desire to share, and at the same time meets the very human need to build a reputation.

Zhihu taps into one of humanity's better instincts: sharing. People crave sharing; having an interesting story and no one to tell it to is as uncomfortable as wearing a beautiful new dress and having nobody ask where you bought it. Keso has said that he is so active on Zhihu because "the Chinese internet has done far too little to meet the needs of people like me," and Zhihu gives exactly those people a high-quality stage for sharing.


Of course, sharing presupposes high-quality questions, ideally ones that provoke the urge to answer, to the point where not answering feels uncomfortable. Sharing also brings a return: it builds reputation. The more you answer, the more knowledgeable you appear and the higher your standing grows. This speaks to the top level of Maslow's hierarchy of needs, the need for self-actualization. The hierarchy, proposed by the Jewish-American humanistic psychologist Abraham Maslow, divides human needs into five levels; in my view, human needs go beyond five levels and deserve far more fine-grained analysis.

That is one view. The other view holds that Zhihu's success rests on crawler technology, as the crawler below illustrates:

 

 

#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # true division
from Queue import Queue
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb  # the module name is case-sensitive
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {
    'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With':'XMLHttpRequest',
    'Referer':'https://www.zhihu.com/topics',
    'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}

DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'

queue = Queue()  # queue of topics waiting to be processed
nodeSet = set()
keywordSet = set()
stop = 0
offset = -20
level = 0
maxLevel = 7
counter = 0
base = ""

conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()

def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None

def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name) VALUES(%s,%s)', (data_id, name))
        conn.commit()
    except Exception as e:
        print "get topic error", e


def get_extension(name):
    where = name.rfind('.')
    if where != -1:
        return name[where:len(name)]
    return None

def which_platform():
    sys_str = platform.system()
    return sys_str


def GetDateString():
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername


def makeDateFolder(par, classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception, e:
        print "kk", e
    return None

def download_img(url, classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):  # this particular image is skipped; the caller treats True as "no avatar to store"
                return True
            if not os.path.exists(filename):
                file_object = open(filename, 'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print "file exists"
                return None
        except IOError, e1:
            print "e1=", e1
            pass
    except Exception as e:
        print "eee", e
        pass
    return None  # if the download failed, fall back to the original site's image URL

def getChildren(node, name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = '父话题'
        node_name = soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
        topic_cla = soup.find('div', {'class' : 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all('a', {'class' : 'zm-item-tag'})  # collect all child-topic links
                if u'子话题' in p_ch:
                    for a in aList:
                        token = a.get('data-token')
                        a = str(a).replace('\n', '').replace('\t', '').replace('\r', '')
                        start = str(a).find('>')
                        end = str(a).rfind('</a>')
                        new_node = str(str(a)[start + 1:end])
                        curr.execute('select id from rooms where name=%s', (new_node,))  # make sure the name is unique first
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e

def getContent(n, name, p, top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is unique first
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
            pic_path = soup.find('a', {'id' : 'zh-avartar-edit-form'}).find('img').get('src')
            description = soup.find('div', {'class' : 'zm-editable-content'})
            if description is not None:
                description = description.text

            if (u"未归类" in title or u"根话题" in title):  # still allow these into the database, but avoid an endless loop
                description = None

            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if (tag_path is not None) or tag_path == True:
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: topic id 2 ("杂谈")
                curr.execute('select id from rooms where name=%s', (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is unique first
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", x)  # current time
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    # this record is now qualified to go into the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit immediately, otherwise later lookups cannot find the parent node
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e

def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, name, p = queue.get()  # pop the node at the head of the queue
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the popped node's children
            conn.commit()
        except Exception as e:
            print "what's wrong", e

def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass

def get_topis(data_id, name, top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode('utf-8')
                json_str = json.loads(html)
                ms = json_str['msg']
                if len(ms) < 5:
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e

            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all('div', {'class' : 'blk'})
                for blk in blks:
                    page = blk.find('a').get('href')
                    if page is not None:
                        node = page.replace("/topic/", "")  # store more seed topics
                        parent = name
                        ne = blk.find('strong').text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the node at the head of the queue
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                            conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError, e:
            print "error is", e
            pass


if __name__ == '__main__':
    i = 0
    while i < 400:
        new_work()
        i = i + 1

A quick note on the database: I am not attaching a schema dump here. Work out the tables from the fields used in the code and create them yourself; it really is that simple. I used MySQL, so build whatever fits your own needs (a rough sketch of one possible schema follows below).
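Since the original author leaves the schema to the reader, here is a minimal sketch of tables that would satisfy the SELECT and INSERT statements in the crawler above. The column names come from those statements; the types, lengths, and defaults are my own assumptions and should be adapted to your needs.

# A minimal, assumed schema for the tables the crawler touches.
# Column types/lengths are guesses based on how each field is used in the code.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()

# filled by getTopics() with top-level topic categories
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255)
)""")

# read by new_work(); status=1 marks rows that still need to be crawled
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new_copy (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255),
    status TINYINT DEFAULT 1
)""")

# read by work(), which the __main__ loop never calls; included for completeness
curr.execute("""CREATE TABLE IF NOT EXISTS classify (
    id INT AUTO_INCREMENT PRIMARY KEY,
    node VARCHAR(32),
    parent VARCHAR(255),
    name VARCHAR(255),
    status TINYINT DEFAULT 1
)""")

# where getContent() stores every crawled topic
curr.execute("""CREATE TABLE IF NOT EXISTS rooms (
    id INT AUTO_INCREMENT PRIMARY KEY,
    father_id INT,
    name VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id INT,
    room_avatar VARCHAR(255),
    is_pass TINYINT,
    has_index TINYINT,
    reason_id INT
)""")

conn.commit()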



(This article was edited by 栾玲, editor-in-chief of 中国计算网. Please credit the source when reprinting.)
