2020学习05 爬虫，修改了一些bug

Posted 2021-03-12 xcl666

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了2020学习05 爬虫，修改了一些bug相关的知识，希望对你有一定的参考价值。

上一版爬虫代码没有对信件类型进行分类，而且爬取到的数据中有大量记录缺少回复时间和回复内容。

对代码进行优化后，

得到如下数据：

技术图片

只显示部分数据，可以看到爬取的完整度基本完好。

代码如下：

#coding:utf-8
import requests
from lxml import etree
import time
import pymysql
import datetime
import urllib
import json
from IPython.core.page import page

# Module-level MySQL connection, created at import time and passed to db()
# by zixunTiqu() for every insert.
# NOTE(review): host, user and password are hard-coded here — consider loading
# them from configuration or the environment instead.
conn = pymysql.connect(
        host="localhost",
        user="root",
        port=3306,
        password="123456",
        database="bjxj")
# NOTE(review): not referenced anywhere in the visible code — confirm it is
# unused before removing.
gg=2950

def db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply):
    """Insert one letter (and its reply, if any) as a row of table ``aaa``.

    Args:
        conn: Open DB-API connection; must provide ``cursor()`` and ``commit()``.
        reqcontent: Text of the letter.
        reqname: Title of the letter.
        reqtime: Creation time of the letter.
        resname: Identifier of the answering organisation.
        restime: Reply time; stored only when the letter has a reply.
        rescontent: Text of the reply.
        reqtype: Letter type label.
        isreply: Reply flag. A value equal to ``False`` is stored as ``0``
            together with an empty reply time; anything else is stored as
            ``1`` together with ``restime``.

    Raises:
        Whatever the driver raises from ``execute`` or ``commit``; the cursor
        is closed before the exception propagates.
    """
    # Keep the equality test (rather than ``not isreply``) so that only
    # False/0 count as "no reply", exactly as before.
    if isreply == False:  # noqa: E712
        reply_flag, reply_time = 0, ''
    else:
        reply_flag, reply_time = 1, restime

    cursor = conn.cursor()
    try:
        cursor.execute(
            "INSERT INTO aaa (reqcontent,reqname,reqtime,resname,rescontent,reqtype,isreply,restime) VALUES (%s,%s,%s,%s,%s,%s,%s,%s);",
            [reqcontent, reqname, reqtime, resname, rescontent, reqtype, reply_flag, reply_time])
        conn.commit()
    finally:
        # Release the cursor even when the insert or the commit fails.
        cursor.close()

# Letter type label (as returned in ``letter_type``) -> the two path segments
# used to build the detail-page URL in zixunTiqu().
# NOTE(review): 'suggesDetail' is kept exactly as in the original; presumably it
# matches the site's endpoint name — verify before "correcting" the spelling.
_LETTER_ENDPOINTS = {
    '咨询': ('consult', 'consultDetail'),
    '建议': ('suggest', 'suggesDetail'),
    '投诉': ('complain', 'complainDetail'),
}


def shijinOU(json1, url, i):
    """Fetch one page of the letter list and process every letter on it.

    POSTs ``json1`` (serialised as JSON) to ``url``, parses the JSON response
    and, for each entry of its ``mailList``, calls zixunTiqu() with the
    letter's id and summary fields. Entries whose ``letter_type`` is not one
    of the known labels are printed but otherwise skipped.

    Args:
        json1: Query payload (a JSON-serialisable dict).
        url: URL of the list endpoint.
        i: Page index; only printed for progress tracking.

    Raises:
        requests.RequestException: On network errors or when the server does
            not answer within the timeout.
        ValueError: When the response body is not valid JSON.
    """
    print(i)
    head = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'method': 'POST',
            'Content-Type': 'application/json;charset=UTF-8',
            }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.post(url, data=json.dumps(json1), headers=head, timeout=30)
    print("Status code:", r.status_code)
    new_data = json.loads(r.content.decode("utf-8"))

    # Iterate over what the server actually returned instead of assuming six
    # entries, so a short (or empty) last page no longer raises IndexError.
    for letter in new_data.get('mailList') or []:
        print(letter)
        endpoint = _LETTER_ENDPOINTS.get(letter['letter_type'])
        if endpoint is None:
            continue
        lettertype, lettertype1 = endpoint
        zixunTiqu(
            letter['original_id'],
            letter['letter_title'],
            letter['create_date'],
            letter['org_id'],
            letter['isReply'],
            letter['letter_type'],
            lettertype,
            lettertype1)

def zixunTiqu(AH, reqname, reqtime, resname, isreply, reqtype, lettertype, lettertype1):
    """Fetch one letter's detail page, extract its text and reply, and store it.

    The letter text is read from the page's ``Description`` meta tag, the
    reply time from the first text node of the reply-time ``div`` and the
    reply text from the reply ``div``. The result is written through db()
    using the module-level connection.

    Args:
        AH: The letter's original id (string), appended to the detail URL.
        reqname: Letter title, passed through to db().
        reqtime: Letter creation time, passed through to db().
        resname: Answering organisation id, passed through to db().
        isreply: Reply flag, passed through to db().
        reqtype: Letter type label, passed through to db().
        lettertype: First path segment of the detail URL (e.g. ``consult``).
        lettertype1: Second path segment of the detail URL
            (e.g. ``consultDetail``).

    Raises:
        requests.RequestException: On network errors or when the server does
            not answer within the timeout.
    """
    head = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'method': 'GET'
        }
    url2 = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.'+lettertype+'.'+lettertype1+'.flow?originalId='+AH

    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url2, headers=head, timeout=30)
    html1 = etree.HTML(r.content.decode("utf-8"))

    # Fall back to an empty string instead of raising IndexError when the page
    # carries no Description meta tag.
    description = html1.xpath('head/meta[@name="Description"]/@content')
    reqcontent = description[0] if description else ''

    # The reply-time div is matched both with and without a trailing space in
    # its class attribute; the variant with the space takes precedence.
    restime1 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2 "]//text()')
    restime2 = html1.xpath('//div[@class="col-xs-12 col-sm-3 col-md-3 my-2"]//text()')
    print(restime1)
    reply_times = restime1 or restime2

    if reply_times:
        restime = reply_times[0]
        rescontent = html1.xpath('string(//div[@class="col-xs-12 col-md-12 column p-4 text-muted my-3"])').strip()
    else:
        print("未回答")
        restime = ''
        rescontent = ''

    print(rescontent)
    db(conn, reqcontent, reqname, reqtime, resname, restime, rescontent, reqtype, isreply)

if __name__ == '__main__':
    # Crawl the first 100 list pages, six letters per page, then release the
    # module-level database connection.
    list_url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext"
    page_size = 6
    try:
        for i in range(0, 100):
            print('***************************************************')
            # Named ``payload`` / computed inline so the ``page`` imported from
            # IPython at the top of the file is no longer rebound.
            payload = {"PageCond/begin": page_size * i,
                       "PageCond/length": page_size,
                       "PageCond/isCount": "true",
                       "keywords": "", "orgids": "",
                       "startDate": "", "endDate": "",
                       "letterType": "", "letterStatue": ""
                       }
            shijinOU(payload, list_url, i)
    finally:
        # Close the connection even when the crawl aborts with an exception.
        conn.close()

html1 = etree.HTML(html)

总结：对于页面元素内容的提取，可以先通过 html1 = etree.HTML(html) 将 HTML 文本解析成可以用 XPath 定位的元素树，再通过 XPath 表达式定位并得到元素的值。

以上是关于2020学习05 爬虫，修改了一些bug的主要内容，如果未能解决你的问题，请参考以下文章