Scraping Python job listings with Scrapy


  • Use the Scrapy framework to scrape Python job postings from 51job (前程无忧).
  1. Create a cmd file, star.cmd, with the following commands (the resulting project layout is sketched just below):
scrapy startproject Jobs
cd Jobs
scrapy genspider Job51Spider www.51job.com
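Running star.cmd should leave a project skeleton roughly like the one below (the exact files can vary a little between Scrapy versions); Job51Spider.py is the spider created by the genspider command:

Jobs/
    scrapy.cfg
    Jobs/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            Job51Spider.py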
  2. Open the Jobs project in your editor.
  • Open /spiders/Job51Spider.py and write the following:
# -*- coding: utf-8 -*-
import json
import re
import time

from scrapy import Spider, Request
import requests

from Jobs.items import Job51Item


class Job51spiderSpider(Spider):
    name = 'Job51Spider'
    allowed_domains = ['51job.com']  # allow subdomains such as search.51job.com
    start_urls = ['http://www.51job.com/']

    # Configure the search city and the search keyword
    kw = 'python'
    sou_url = 'https://search.51job.com/list/{city_code},000000,0000,00,9,99,{kw},2,1.html'
    # JS file mapping city codes to city names
    city_codings_url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319'

    def start_requests(self):
        # fetch the city-code mapping and iterate over the cities
        cities = self.get_url_citycods()
        forcity = list(cities)[:2]  # only crawl the first two cities here; drop the slice to crawl all
        for city in forcity:
            yield Request(
                self.sou_url.format(city_code=cities[city], kw=self.kw),
                callback=self.parse_jobs,
                meta={'city': city}
            )

    def parse_jobs(self, response):
        city = response.meta['city']
        els = response.css('.dw_table .el')[1:]  # skip the header row of the listing table
        for el in els:
            item = Job51Item()
            item['soucity'] = city
            item['pname'] = el.css('span a::text').extract_first().strip()
            item['purl'] = el.css('span a::attr(href)').extract_first().strip()
            item['cname'] = el.css('span.t2 a::text').extract_first().strip()
            item['curl'] = el.css('span.t2 a::attr(href)').extract_first().strip()
            item['address'] = el.css('span.t3::text').extract_first().strip()
            item['pay'] = el.css('span.t4::text').extract_first()
            item['retime'] = el.css('span.t5::text').extract_first().strip()
            yield item

        next_page = response.css('.bk a::text')[-1].extract().strip()
        if next_page == '下一页':  # '下一页' = "next page"
            next_url = response.css('.bk a::attr(href)')[-1].extract().strip()
            yield Request(url=next_url, callback=self.parse_jobs, dont_filter=True, meta={'city': city})

    # Fetch the city codes from 51job's area js and return a {city name: code} dict
    def get_url_citycods(self):
        area_text = requests.get(self.city_codings_url).text
        ss = re.search('({.*})', area_text, re.S)  # grab the object literal from the js file
        st = ss.group(1)
        st_dict = json.loads(st)
        # swap keys and values into a {city name: code} dict,
        # keeping only the major cities whose codes contain '0000'
        in_dict = {}
        for k in st_dict:
            if '0000' in k:
                in_dict[st_dict[k]] = k
        with open('city_big.json', 'wt', encoding='utf-8') as fs:
            json.dump(in_dict, fs, indent=4, ensure_ascii=False)
        return in_dict

    def parse(self, response):
        # default callback, unused here: parsing happens in parse_jobs
        pass
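The city-code extraction in get_url_citycods can also be tried on its own, outside Scrapy. A minimal sketch, assuming (as the spider does) that area_array_c.js still contains a JSON-compatible object literal mapping city codes to names:

import json
import re

import requests

url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319'
text = requests.get(url).text
# extract the object literal and parse it as JSON
codes = json.loads(re.search('({.*})', text, re.S).group(1))
# keep only the major cities/provinces whose codes contain '0000', as the spider does
majors = {name: code for code, name in codes.items() if '0000' in code}
print(list(majors.items())[:5])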
  • In items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class Job51Item(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # city used in the search
    soucity = Field()
    # job title
    pname = Field()
    # job detail URL
    purl = Field()
    # company name
    cname = Field()
    # company page URL
    curl = Field()
    # work location
    address = Field()
    # salary
    pay = Field()
    # posting date
    retime = Field()
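
Note that Scrapy Item objects behave like dicts, which is why the pipeline below can simply call dict(item) before inserting into MongoDB. A small illustration with made-up values:

from Jobs.items import Job51Item

item = Job51Item(pname='Python developer', pay='15-25k')  # hypothetical values for illustration
item['cname'] = 'Example Co.'
print(dict(item))  # e.g. {'pname': 'Python developer', 'pay': '15-25k', 'cname': 'Example Co.'}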
    
  • In pipelines.py, store the scraped items in MongoDB:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient

class Job51Pipeline(object):
    
    job51s = 'job51'  # MongoDB collection name

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
                mongo_uri = crawler.settings.get('MONGO_URI'),
                mongo_db = crawler.settings.get('MONGO_DB')
            )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.job51s].insert_one(dict(item))
        return item 
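After a crawl has run, the inserted documents can be inspected directly with pymongo. A quick sketch, assuming the MONGO_URI and MONGO_DB values configured below and the 'job51' collection name used by the pipeline:

from pymongo import MongoClient

client = MongoClient('localhost')
db = client['jobsconnection']
print(db['job51'].count_documents({}))  # number of scraped positions
print(db['job51'].find_one())           # one sample document
client.close()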
  • Finally, configure settings.py (the pipeline must also be enabled via ITEM_PIPELINES):
FEED_EXPORT_ENCODING = 'utf-8'
MONGO_URI = 'localhost'
MONGO_DB = 'jobsconnection'
# enable the MongoDB pipeline defined above
ITEM_PIPELINES = {'Jobs.pipelines.Job51Pipeline': 300}
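
With the pipeline enabled, the crawl can be started from the project root (the spider name comes from the name attribute of the spider class); the optional -o flag additionally writes a JSON export using the FEED_EXPORT_ENCODING setting above:

scrapy crawl Job51Spider -o jobs.json

Depending on the Scrapy version and the site's robots.txt, you may also need to adjust ROBOTSTXT_OBEY and USER_AGENT in settings.py.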

==For learning and reference purposes only==
