Scraping Dianping (大众点评) with Scrapy

Posted by 风吹白杨的安妮


I've been craving barbecue lately and wanted to find out where Shenzhen's best barbecue is, so I started writing a crawler. Dianping's listing pages are static, but the site has anti-scraping measures, so I set up countermeasures in settings.py and middlewares.py.

Settings:

# -*- coding: utf-8 -*-

# Scrapy settings for dazhong project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dazhong'

SPIDER_MODULES = ['dazhong.spiders']
NEWSPIDER_MODULE = 'dazhong.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
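# If the crawl stalls because robots.txt disallows these pages,
# this may need to be set to False.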

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 10
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dazhong.middlewares.DazhongSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in UserAgentMiddleware (note the module is
    # "downloadermiddlewares", plural) and use the random-UA middleware instead
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'dazhong.middlewares.MyUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dazhong.pipelines.DazhongPipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MY_USER_AGENT = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36']
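
With only one entry in MY_USER_AGENT, the random.choice in the middleware below always returns the same string, so nothing actually rotates. For real rotation, list several user agents; the extra strings here are just illustrative examples:

MY_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36',
    # Illustrative additions; any real browser UA string works here
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4',
]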

 

Items:

import scrapy

class DazhongItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    location = scrapy.Field()
    people = scrapy.Field()
    money = scrapy.Field()
    taste = scrapy.Field()
    envir = scrapy.Field()
    taste_score = scrapy.Field()
    service = scrapy.Field()
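
A DazhongItem works like a dict with a fixed set of keys, which is why the spider below can fill it field by field. A minimal sketch (the values are made up):

item = DazhongItem()
item['name'] = '某烤肉店'   # only declared fields are allowed; anything else raises KeyError
item['money'] = '88'
print(dict(item))           # {'name': '某烤肉店', 'money': '88'}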

 

Spider:

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from dazhong.items import DazhongItem

class DzSpider(scrapy.Spider):
    name = 'dz'
    allowed_domains = ['www.dianping.com']
    first_url = 'http://www.dianping.com/shenzhen/ch10/g114'
    last_url = 'p'

    def start_requests(self):
        # Listing pages are numbered .../g114p1 through .../g114p44
        for i in range(1, 45):
            url = self.first_url + self.last_url + str(i)
            yield Request(url, self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        for site in soup.find_all('div', class_='txt'):
            item = DazhongItem()
            try:
                item['name'] = site.find('div', class_='tit').find('h4').get_text()
                item['location'] = site.find('div', class_='tag-addr').find('span', class_='addr').get_text()
                item['people'] = site.find('div', class_='comment').find('a').find('b').get_text()
                item['money'] = site.find('div', class_='comment').find_all('a')[1].find('b').get_text()
                item['taste'] = site.find('div', class_='tag-addr').find('a').find('span').get_text()
                item['envir'] = site.find('span', class_='comment-list').find_all('span')[1].find('b').get_text()
                item['taste_score'] = site.find('span', class_='comment-list').find_all('span')[0].find('b').get_text()
                item['service'] = site.find('span', class_='comment-list').find_all('span')[2].find('b').get_text()
                yield item
            except (AttributeError, IndexError):
                # Entries missing any of the fields (e.g. no environment
                # or service score) are skipped; see the note at the end
                continue
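
To sanity-check the BeautifulSoup selectors against a live listing page before running the whole crawl, scrapy shell is convenient. A sketch (the page structure may well have changed since this was written):

scrapy shell "http://www.dianping.com/shenzhen/ch10/g114p1"
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(response.text, 'lxml')
>>> [s.find('h4').get_text() for s in soup.find_all('div', class_='txt') if s.find('h4')]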

 

Pipeline:

from openpyxl import Workbook

class DazhongPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row
        self.ws.append(['店铺名称','地点','评论人数','平均消费','口味','环境评分','口味评分','服务评分'])

    def process_item(self, item, spider):
        # Flatten the item into one row and append it to the sheet
        line = [item['name'], item['location'], item['people'], item['money'],
                item['taste'], item['envir'], item['taste_score'], item['service']]
        self.ws.append(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls this automatically when the spider finishes;
        # save the workbook once here instead of after every item
        self.wb.save('dazhong.xlsx')
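
If an Excel file isn't a hard requirement, Scrapy's built-in feed exports can dump the items to CSV or JSON with no custom pipeline at all:

scrapy crawl dz -o dazhong.csv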

 

Middlewares:

import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class MyUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the UA pool from MY_USER_AGENT in settings.py
        return cls(user_agent=crawler.settings.get('MY_USER_AGENT'))

    def process_request(self, request, spider):
        # Stamp every outgoing request with a randomly chosen User-Agent
        request.headers['User-Agent'] = random.choice(self.user_agent)
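
To confirm the middleware really is setting the header, one quick check is to fetch an echo service from inside the project (a sketch; httpbin.org simply reflects the request headers back):

scrapy shell "http://httpbin.org/headers"
>>> print(response.text)   # the User-Agent shown should come from MY_USER_AGENT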

 

Shops with no environment or service score data are simply skipped; there's no point in scraping them.

The results:

[Screenshot of the scraped results omitted.]

Decided to go eat at 姜虎东 (Kang Ho-dong Baekjeong).
