Getting Started with Web Crawlers, Case 3: Scraping Merchant Information from Dianping (大众点评)
Posted by 可爱的熊乖乖
pyspider:http://demo.pyspider.org/
CSS selectors: http://www.w3school.com.cn/cssref/css_selectors.asp
Beautiful Soup:http://beautifulsoup.readthedocs.io/zh_CN/latest/
Regular expressions: http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
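These links cover the three extraction tools the script below mixes freely: pyspider's response.doc (a PyQuery object that accepts CSS selectors), Beautiful Soup for tag lookup, and regular expressions applied to a tag's HTML. A minimal standalone sketch of how they combine; the HTML fragment and names are invented for illustration, not the real Dianping markup:

# -*- coding: utf-8 -*-
# Standalone illustration of the three tools used by the spider below.
from pyquery import PyQuery          # pyspider's response.doc is a PyQuery object
from bs4 import BeautifulSoup
import re

html = '''
<div class="pic"><a href="/shop/123">Demo Shop</a></div>
<h1 class="shop-name">
    Demo Shop <a class="branch J-branch">Demo Branch<i class="icon i-arrow"></i></a>
</h1>
'''

# CSS selector, as in response.doc('DIV.pic>A') inside pyspider
doc = PyQuery(html)
for a in doc('div.pic>a').items():
    print(a.attr.href)                        # -> /shop/123

# Beautiful Soup picks out a single tag...
soup = BeautifulSoup(html, 'html.parser')
name = soup.find('h1', class_='shop-name')

# ...and a regex is then applied to the tag's HTML, exactly as the spider does with str(name)
matches = re.findall(r'<h1 class="shop-name">[\s]+(.*)', str(name))
print(matches[0] if matches else '-')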
Goals of this post:
http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7
1. Scrape the store information for every shop of the 一鸣真鲜奶吧 chain
2. Scrape all the reviews for each shop
3. Save the scraped content to a database (not implemented in the script; a sketch follows the code below)
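Before the full script, it helps to see the bare pyspider shape it follows: a single Handler class, @every scheduling on_start, self.crawl queuing URLs with a callback, and @config(age=...) controlling how long a fetched page is treated as fresh (age is in seconds). A stripped-down skeleton, where the URL and selector are placeholders rather than the ones used below:

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}               # options shared by every crawl (headers, proxy, ...)

    @every(minutes=24 * 60)         # re-run the entry point once a day
    def on_start(self):
        self.crawl('http://example.com/list', callback=self.index_page)   # placeholder URL

    @config(age=10 * 24 * 60 * 60)  # a fetched page stays fresh for 10 days (seconds)
    def index_page(self, response):
        # response.doc is a PyQuery object, so CSS selectors work directly
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        # whatever a callback returns is handed to on_result (and shown in the web UI)
        return {'url': response.url, 'title': response.doc('title').text()}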
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-06-07 07:40:58
# Project: dazhongdianping

from pyspider.libs.base_handler import *
from bs4 import BeautifulSoup
from pymongo import MongoClient   # imported for goal 3 (saving results); the save itself is not shown
import base64                     # not used in this snippet
import re

# module-level counters used as record ids
id = 0
count = 0
number = 0


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # entry point: Dianping search results for the 一鸣真鲜奶吧 chain
        self.crawl('http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7',
                   callback=self.local_page)

    @config(age=2 * 24 * 60)   # note: pyspider's age is given in seconds
    def local_page(self, response):
        # save_local is assumed to be defined elsewhere; it is not shown in the original post
        self.save_local('remark', response.url, response.doc)
        # one link per shop in the search results
        for each in response.doc('DIV.pic>A').items():
            self.crawl(each.attr.href, callback=self.index_page)
        # next page of search results
        for each in response.doc('A.next').items():
            self.crawl(each.attr.href, callback=self.local_page)

    @config(age=3 * 24 * 60)
    def index_page(self, response):
        global number
        # shop information
        for each in response.doc('DIV#basic-info').items():
            number += 1
            info = {}
            tmp = BeautifulSoup(str(each), 'html.parser')
            name = tmp.find('h1', class_='shop-name')
            # shop id
            info['itemid'] = number
            # shop name
            if re.findall(r'<h1 class="shop-name">[\s]+(.*)', str(name)):
                info['name'] = re.findall(r'<h1 class="shop-name">[\s]+(.*)', str(name))[0]
            else:
                info['name'] = '-'
            # branch name
            if re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>', str(name)):
                info['branch'] = re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>', str(name))[0]
            else:
                info['branch'] = '-'
            info['basic_info'] = []
            basic_info = tmp.find('div', class_='brief-info')
            if basic_info:
                # star rating
                star = basic_info.span.get('class')[1]
                info['level'] = int(re.findall(r'mid-str(.*)', str(star))[0]) * 1.0 / 10
                print(info['level'])
                for td in basic_info.find_all('span', class_='item'):
                    info['basic_info'].append(td.string.encode('utf-8'))
            else:
                info['level'] = '-'
            # district
            region = tmp.find('span', itemprop='locality region')
            # street address
            address = tmp.find('span', class_='item', itemprop='street-address')
            if region:
                info['region'] = region.string.encode('utf-8')
            else:
                info['region'] = '-'
            if address:
                info['address'] = address.string.encode('utf-8').strip()
            else:
                info['address'] = '-'
            # telephone
            tel = tmp.find('p', class_='expand-info tel')
            if tel:
                info['telephone'] = tel.find('span', class_='item').string.encode('utf-8')
            else:
                info['telephone'] = '-'
            # info would be written to the database here (goal 3; not shown in the original post)
        # "all reviews" link
        if response.doc('P.comment-all>A'):
            for each in response.doc('P.comment-all>A').items():
                self.crawl(each.attr.href, callback=self.detail_page_all)
        # otherwise the current page already shows all the reviews
        else:
            self.crawl(response.url, callback=self.detail_page)

    @config(age=4 * 24 * 60)
    def detail_page(self, response):
        global id
        each = BeautifulSoup(str(response.doc), 'html.parser')
        # reviews shown on the shop page itself
        tmp = each.find_all('li', class_='comment-item')
        for tr in tmp:
            res = {}
            id += 1
            # review id
            res['itemid'] = id
            # user name
            if tr.find('p', class_='user-info'):
                res['user'] = tr.find('p', class_='user-info').a.string.encode('utf-8')
            else:
                res['user'] = '-'
            res['comment'] = {}
            # review time
            date = tr.find('div', class_='misc-info')
            res['time'] = date.find('span', class_='time').string.encode('utf-8')
            # shop block inside the review
            info = tr.find('p', class_='shop-info')
            # overall rating
            star = info.span.get('class')[1]
            res['level'] = int(re.findall(r'sml-str(.*)', str(star))[0]) * 1.0 / 10
            # taste / environment / service scores, split on the fullwidth colon
            if info.find_all('span', class_='item'):
                for thing in info.find_all('span', class_='item'):
                    thing = thing.string.encode('utf-8').split('：')
                    res['comment'][thing[0]] = thing[1]
            # average price per person
            if info.find('span', class_='average'):
                res['price'] = info.find('span', class_='average').string.encode('utf-8').split('：')[1]
            else:
                res['price'] = '-'
            # full (expanded) review text
            content = tr.find('div', class_='info J-info-all Hide')
            if content:
                res['content'] = content.p.string.encode('utf-8')
            else:
                if tr.find('div', class_='info J-info-short'):
                    res['content'] = tr.find('div', class_='info J-info-short').p.string.encode('utf-8').strip()
                else:
                    res['content'] = '-'
            # res would be written to the database here (goal 3; not shown in the original post)

    @config(age=4 * 24 * 60)
    def detail_page_all(self, response):
        global count
        # full review list page
        for each in response.doc('DIV.comment-list').items():
            each = BeautifulSoup(str(each), 'html.parser')
            tmp = each.find_all('li')
            for tr in tmp:
                res = {}
                count += 1
                # review id
                res['itemid'] = count
                # star rating
                star = tr.find('div', class_='content')
                if star:
                    rank = star.span.get('class')[1]
                    res['level'] = int(re.findall(r'irr-star(.*)', str(rank))[0]) * 1.0 / 10
                else:
                    continue
                # review time
                date = tr.find('div', class_='misc-info')
                res['time'] = date.find('span', class_='time').string.encode('utf-8')
                # user name
                name = tr.find('div', class_='pic')
                if name:
                    res['user'] = name.find('p', class_='name').string.encode('utf-8')
                else:
                    res['user'] = '-'
                # taste / environment / service scores
                res['comment'] = {}
                page = tr.find('div', class_='comment-rst')
                if page:
                    info = re.findall('class="rst">(.*)<em class="col-exp">(.*)</em></span>', str(page))
                    if info:
                        for td in info:
                            res['comment'][td[0]] = td[1].strip('(').strip(')')
                # group-buy review link, if any
                group = tr.find('div', class_='comment-txt')
                if group.find('a', target='blank'):
                    res['shopping_group'] = group.find('a', target='blank').string.encode('utf-8')
                else:
                    res['shopping_group'] = '-'
                # average price per person
                price = tr.find('span', class_='comm-per')
                if price:
                    res['price'] = price.string.encode('utf-8')
                else:
                    res['price'] = '-'
                # short review text
                if tr.find('div', class_='J_brief-cont'):
                    tmp = str(tr.find('div', class_='J_brief-cont'))
                    res['content'] = re.findall(r'<div class="J_brief-cont">([\w\W]*)</div>', tmp)[0].strip()
                else:
                    res['content'] = '-'
                # res would be written to the database here (goal 3; not shown in the original post)
        # next page of reviews
        for each in response.doc('A.NextPage').items():
            self.crawl(each.attr.href, callback=self.detail_page_all)
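Goal 3 above (saving to a database) is the part the script leaves out: index_page, detail_page and detail_page_all build the info/res dicts but never persist them, even though MongoClient is already imported. One straightforward option is a small pymongo helper module; the connection string and the database/collection names below are assumptions for this sketch, not taken from the original post.

# -*- coding: utf-8 -*-
# Possible persistence layer for the scraped dicts; host, database and
# collection names are made up for this sketch.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['dianping']                 # hypothetical database name


def save_shop(info):
    # one shop record, as built in index_page
    db['shops'].insert_one(info)


def save_comment(res):
    # one review record, as built in detail_page / detail_page_all
    db['comments'].insert_one(res)


if __name__ == '__main__':
    # tiny usage example with a fabricated record
    save_comment({'itemid': 1, 'user': '-', 'level': 4.5, 'content': 'demo'})

Inside the spider these helpers would be called at the points marked "would be written to the database here"; alternatively, each callback can return its dict and Handler.on_result can be overridden to do the insert.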