Web Crawler Final Project

Posted by 117李智濠


# -*- coding:utf-8 -*-
# Third-party libraries
import re

import scrapy
from scrapy.spiders import Spider
from lxml import etree
import jieba

from BoKeYuan.items import BokeyuanItem


class BlogYuanSpider(Spider):
    name = 'blog_yuan'
    start_urls = ['https://www.cnblogs.com/']
    extra = '/#p{}'

    def start_requests(self):
        yield scrapy.Request(self.start_urls[0], callback=self.parse)

    @staticmethod
    def get_num(response):
        # Read the highest page number from the "last page" pager link.
        html = response.body
        selector = etree.HTML(html)
        page_num = int(selector.xpath('string(//a[@class="p_200 last"])'))
        return page_num

    @staticmethod
    def get_info(response):
        # Extract the article body, strip all whitespace, and record the URL.
        html = response.body
        item = BokeyuanItem()
        selector = etree.HTML(html)
        i = selector.xpath('string(//div[@class="blogpost-body"])')
        info = re.sub(r'\s+', '', i)
        item['info'] = info
        item['url'] = response.url
        # Tokenize with jieba, strip punctuation, and count each token.
        d = {}
        text = ' '.join(jieba.lcut(item['info']))
        t = re.sub(r"[,':/).;}({]", '', text).split()
        for v in t:
            d[v] = item['info'].count(v)
        # Print the 20 most frequent words.
        e = sorted(d.items(), key=lambda x: x[1], reverse=True)
        for pair in e[:20]:
            print(pair)
        yield item

    def get_page_url(self, response):
        # Follow every article link on a listing page.
        selector = etree.HTML(response.body)
        page_url = selector.xpath('//a[@class="titlelnk"]/@href')
        for p in page_url:
            yield scrapy.Request(url=p, callback=self.get_info)

    def parse(self, response):
        # Depth 1: the home page. Follow its article links, then queue
        # every remaining listing page up to the maximum page number.
        html = response.body
        selector = etree.HTML(html)
        page_url = selector.xpath('//a[@class="titlelnk"]/@href')
        for p in page_url:
            yield scrapy.Request(url=p, callback=self.get_info)
        page_num = self.get_num(response)
        for n in range(2, page_num):
            yield scrapy.Request(self.start_urls[0] + self.extra.format(n),
                                 callback=self.get_page_url)

Starting from the entry URL, the spider collects the article links on the home page and reads the largest page number from the pager. It then iterates over the remaining listing pages and the article links on each one, giving a crawl of depth 3. Every article page yields an item, and the top 20 words by frequency are printed.
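
One caveat: the frequency loop above counts occurrences with str.count, which matches substrings (so a short token is also counted inside every longer token containing it). A minimal alternative sketch using collections.Counter over the jieba tokens counts whole tokens only:

# A minimal sketch: token-level frequency with collections.Counter,
# instead of substring counting via str.count.
from collections import Counter

import jieba

def top_words(text, n=20):
    # Keep tokens longer than one character to skip punctuation noise.
    words = [w for w in jieba.lcut(text) if len(w) > 1]
    return Counter(words).most_common(n)

if __name__ == '__main__':
    print(top_words('博客园是一个面向开发者的知识分享社区,博客园创建于2004年。'))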


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class BokeyuanItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    info = Field()
    url = Field()

This class declares the item fields (info and url) through which the scraped values are passed from the spider to the pipeline.
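
A quick sketch of how the fields behave: only declared keys may be assigned, so a typo fails fast with a KeyError instead of silently creating a new field. (The commented assignment below is hypothetical, for illustration only.)

from BoKeYuan.items import BokeyuanItem

item = BokeyuanItem()
item['info'] = 'cleaned article text'
item['url'] = 'https://www.cnblogs.com/example/p/1.html'
# item['title'] = '...'   # would raise KeyError: field not declared
print(dict(item))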

# -*- coding: utf-8 -*-

# Scrapy settings for BoKeYuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BoKeYuan'

SPIDER_MODULES = ['BoKeYuan.spiders']
NEWSPIDER_MODULE = 'BoKeYuan.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BoKeYuan (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BoKeYuan.middlewares.BokeyuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'BoKeYuan.middlewares.BokeyuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BoKeYuan.pipelines.BokeyuanPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

This settings file configures the default request headers and a 3-second download delay; it also obeys robots.txt and registers the word-cloud pipeline.
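
For reference, a minimal sketch of a gentler alternative: enable the AutoThrottle extension (shown commented out above) so the delay adapts to observed server latency. These are standard Scrapy settings; the USER_AGENT contact URL is a placeholder, not a real address.

USER_AGENT = 'BoKeYuan (+https://example.com/contact)'  # identify the crawler politely
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10            # ceiling when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for one request in flight per server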

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Third-party libraries
import os
import re

import jieba
from wordcloud import WordCloud, STOPWORDS
# scipy.misc.imread was removed in SciPy 1.2; imageio.imread is the usual replacement.
from imageio import imread


class BokeyuanPipeline(object):
    def process_item(self, item, spider):
        # Name the output image after the article's URL slug.
        name = item['url'].split('/')[-1].replace('.html', '')
        p = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(p, 'pic')
        if not os.path.exists(path):
            os.mkdir(path)  # os.mkdir returns None, so keep the path string separate
        # Strip whitespace and common Chinese/Western punctuation.
        info = re.sub(r'[\s、()（）{}_,.。“”;!?]', '', item['info'])
        text = ' '.join(jieba.lcut(info))
        background_image = imread(os.path.join(p, 'ju.PNG'))
        wc = WordCloud(
            width=500,
            height=500,
            margin=2,
            background_color='white',  # background color
            mask=background_image,     # shape mask image
            font_path=r'C:\Windows\Fonts\STZHONGS.TTF',  # a Chinese font is required, or characters render as boxes
            max_words=2000,            # maximum number of words displayed
            stopwords=STOPWORDS,       # stop-word list
            max_font_size=150,         # largest font size
            random_state=42            # number of random color schemes
        )
        wc.generate_from_text(text)
        wc.to_file(os.path.join(path, '{}.jpg'.format(name)))
        return item

The pipeline tokenizes each article with the jieba library and renders the result as a word-cloud image named after the article.
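
For clarity, a self-contained sketch of the same jieba + WordCloud flow, runnable outside Scrapy; the sample text, output filename, and font path are assumptions to adjust for your system:

# A minimal sketch of the jieba + WordCloud flow used by the pipeline.
import jieba
from wordcloud import WordCloud

text = '博客园是一个面向开发者的知识分享社区。'  # sample text (assumption)
tokens = ' '.join(jieba.lcut(text))

wc = WordCloud(
    width=500,
    height=500,
    background_color='white',
    font_path='C:\\Windows\\Fonts\\simhei.ttf',  # any Chinese-capable font (assumed path)
    max_words=2000,
)
wc.generate_from_text(tokens)
wc.to_file('cloud.jpg')  # hypothetical output name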
