爬取拉勾网示例

Posted Neither Candidate

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取拉勾网示例相关的知识,希望对你有一定的参考价值。

爬取需求分析

# Step 1: open the login page and read X_Anti_Forge_Token / X_Anti_Forge_Code
#   1. URL:     https://passport.lagou.com/login/login.html
#   2. Method:  GET
#   3. Headers: User-Agent
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                 },
                 )

# Both values are embedded in the page source as `NAME = '...'` assignments;
# [0] takes the first match (raises IndexError if the page layout changed).
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# Step 2: log in
#   1. URL:     https://passport.lagou.com/login/login.json
#   2. Method:  POST
#   3. Headers:
#        cookie (carried by the session)
#        User-Agent
#        Referer: https://passport.lagou.com/login/login.html
#        X-Anit-Forge-Code: 53165984
#        X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
#        X-Requested-With: XMLHttpRequest
#      (the header names are spelled "Anit" in the captured request, while the
#       page variables read in step 1 are spelled "Anti")
#   4. Body:
#        isValidate: true
#        username: 18611453110
#        password: 70621c64832c4d4d66a47be6150b4a8e
#        request_form_verifyCode: ''
#        submit: ''
r2 = session.post('https://passport.lagou.com/login/login.json',
                  headers={
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                      'Referer': 'https://passport.lagou.com/login/login.html',
                      'X-Anit-Forge-Code': X_Anti_Forge_Code,
                      'X-Anit-Forge-Token': X_Anti_Forge_Token,
                      'X-Requested-With': 'XMLHttpRequest'
                  },
                  data={
                      "isValidate": True,
                      'username': '18611453110',
                      'password': '70621c64832c4d4d66a47be6150b4a8e',
                      'request_form_verifyCode': '',
                      'submit': ''
                  }
                  )

# Step 3: authorize (grant the service ticket)
#   1. URL:     https://passport.lagou.com/grantServiceTicket/grant.html
#   2. Method:  GET
#   3. Headers:
#        User-Agent
#        Referer: https://passport.lagou.com/login/login.html

r3 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     'Referer': 'https://passport.lagou.com/login/login.html',
                 }
                 )


# Step 4: verify the login by fetching a page that requires it
r4 = session.get('https://www.lagou.com/resume/myresume.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                 }
                 )

# 第五步:筛选职位信息
# 请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
# 请求方法:GET
# 请求头:
# User-Agent
# 请求参数:
# gj:3年及以下
# px:default
# yx:25k-50k
# city:北京


#第六步,详细的筛选出职位信息条件
#请求参数
# params={
#     'gj': '3年及以下',
#     'px': 'default',
#     'yx': '25k-50k',
#     'city': '北京',
#     'needAddtionalResult': False,
#     'isSchoolJob': 0
# }

#第七步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
    # 请求url:详情页地址
    # 请求方式:GET
    # 请求头:User-Agent
    r7=session.get(company_link,
                headers={
                    ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘,
                }
                )
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r7.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r7.text, re.S)[0]



    #第八步:投递简历
    #请求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    #请求方式:POST
    #请求头:
        #Referer:详情页地址
        #User-agent
        #X-Anit-Forge-Code:53165984
        #X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        #X-Requested-With:XMLHttpRequest
    #请求体:
    # positionId:职位ID
    # type:1
    # force:true

    session.post(‘https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json‘,
                 headers={
                     ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘,
                     ‘Referer‘: company_link,
                     ‘X-Anit-Forge-Code‘: X_Anti_Forge_Code,
                     ‘X-Anit-Forge-Token‘: X_Anti_Forge_Token,
                     ‘X-Requested-With‘: ‘XMLHttpRequest‘
                 },
                 data={
    ‘positionId‘:positionId,
    ‘type‘:1,
    ‘force‘:True
                 }
                 )
    print(‘%s 投递成功‘ %(companyShortName))

 

技术分享图片
import requests
import re
from urllib.parse import urlencode
from pprint import pprint  # noqa: F401 -- kept from the original; handy for dumping JSON while debugging

# NOTE(review): the credentials are hard-coded exactly as in the original
# example; load them from the environment or a config file before real use.
USERNAME = "18611453110"
PASSWORD = "70621c64832c4d4d66a47be6150b4a8e"
KEYWORD = "java高级开发"

# Sent with every request; set once on the session instead of per call.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
)

LOGIN_PAGE_URL = "https://passport.lagou.com/login/login.html"
LOGIN_API_URL = "https://passport.lagou.com/login/login.json"
GRANT_URL = "https://passport.lagou.com/grantServiceTicket/grant.html"
RESUME_URL = "https://www.lagou.com/resume/myresume.html"
SEARCH_API_URL = "https://www.lagou.com/jobs/positionAjax.json"
DELIVER_URL = "https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json"

# Output template for one search hit; the leading spaces are part of the text.
POSITION_SUMMARY = '''
    详情连接:%s
    公司名:%s
    职位名:%s
    薪资:%s
    '''

# The pages embed both values as `NAME = '...'` assignments in their source.
_TOKEN_RE = re.compile(r"X_Anti_Forge_Token = '(.*?)'", re.S)
_CODE_RE = re.compile(r"X_Anti_Forge_Code = '(.*?)'", re.S)


def extract_forge_tokens(html):
    """Return ``(token, code)`` read from a page's X_Anti_Forge_* assignments.

    Args:
        html: Page source as text.

    Returns:
        A ``(X_Anti_Forge_Token, X_Anti_Forge_Code)`` tuple of strings.

    Raises:
        ValueError: If either assignment is missing from ``html``.
    """
    token_match = _TOKEN_RE.search(html)
    code_match = _CODE_RE.search(html)
    if token_match is None or code_match is None:
        raise ValueError("X_Anti_Forge_Token / X_Anti_Forge_Code not found in page")
    return token_match.group(1), code_match.group(1)


def build_search_url(keyword):
    """Return the listing-page URL for ``keyword`` (used as the Referer)."""
    # urlencode yields "k=<escaped keyword>"; only the escaped part is wanted.
    escaped = urlencode({"k": keyword}, encoding="utf-8").split("=")[-1]
    return "https://www.lagou.com/jobs/list_" + escaped


def login(session):
    """Log ``session`` in and report whether the resume page shows the user.

    Performs the four requests of the login flow: fetch the login page for the
    anti-forgery values, post the credentials, request the service-ticket
    grant, then fetch the resume page as a check.

    Returns:
        True if ``USERNAME`` appears in the resume page text, else False.
    """
    login_page = session.get(LOGIN_PAGE_URL)
    token, code = extract_forge_tokens(login_page.text)

    session.post(
        LOGIN_API_URL,
        headers={
            "Referer": LOGIN_PAGE_URL,
            # Header names are spelled "Anit" (sic), unlike the page variables.
            "X-Anit-Forge-Code": code,
            "X-Anit-Forge-Token": token,
            "X-Requested-With": "XMLHttpRequest",
        },
        data={
            "isValidate": True,
            "username": USERNAME,
            "password": PASSWORD,
            "request_form_verifyCode": "",
            "submit": "",
        },
    )
    session.get(GRANT_URL, headers={"Referer": LOGIN_PAGE_URL})

    resume_page = session.get(RESUME_URL)
    logged_in = USERNAME in resume_page.text
    print(logged_in)
    return logged_in


def search_positions(session, keyword):
    """Return the list of position dicts for ``keyword`` from the search API.

    A plain GET of the listing page with the filter parameters yields no job
    data because the page loads its results through an AJAX call, so the JSON
    endpoint is queried directly with the listing page as Referer.
    """
    response = session.post(
        SEARCH_API_URL,
        headers={"Referer": build_search_url(keyword)},
        data={
            "first": True,
            "pn": 1,
            "kd": keyword,
        },
        params={
            "gj": "3年及以下",
            "px": "default",
            "yx": "15k-25k",
            "city": "北京",
            "needAddtionalResult": False,
            "isSchoolJob": 0,
        },
    )
    return response.json()["content"]["positionResult"]["result"]


def deliver_resume(session, position):
    """Print a summary of ``position`` and post the resume to it.

    Args:
        session: A logged-in requests session.
        position: One entry of the search result; the keys ``positionId``,
            ``companyShortName``, ``positionName`` and ``salary`` are read.
    """
    position_id = position["positionId"]
    company_link = "https://www.lagou.com/jobs/{pos_id}.html".format(pos_id=position_id)
    company_short_name = position["companyShortName"]
    print(POSITION_SUMMARY % (
        company_link,
        company_short_name,
        position["positionName"],
        position["salary"],
    ))

    # Each detail page carries its own anti-forgery values for the POST below.
    detail_page = session.get(company_link)
    token, code = extract_forge_tokens(detail_page.text)

    session.post(
        DELIVER_URL,
        headers={
            "Referer": company_link,
            "X-Anit-Forge-Code": code,
            "X-Anit-Forge-Token": token,
            "X-Requested-With": "XMLHttpRequest",
        },
        data={
            "positionId": position_id,
            "type": 1,
            "force": True,
        },
    )
    # NOTE(review): printed without inspecting the response, so this only
    # confirms that the request was sent.
    print("%s 投递成功" % (company_short_name))


def main():
    """Log in, search for ``KEYWORD`` and deliver the resume to every hit."""
    session = requests.session()
    session.headers["User-Agent"] = USER_AGENT
    login(session)
    for position in search_positions(session, KEYWORD):
        deliver_resume(session, position)


if __name__ == "__main__":
    main()
代码示例

 

以上是关于爬取拉勾网示例的主要内容,如果未能解决你的问题,请参考以下文章

Python 爬取拉勾网python职位信息

python3 爬取拉勾网1

爬取拉勾网

scrapy 爬取拉勾网

通俗易懂的分析如何用Python实现一只小爬虫,爬取拉勾网的职位信息

python无所不能,爬虫几分钟爬取拉勾网,实现数据可视化!