爬取全国的城市路口数量

Posted 空城机

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取全国的城市路口数量相关的知识,希望对你有一定的参考价值。

在图吧本地生活中https://poi.mapbar.com/

python写方法:

import requests
import re
from lxml import etree
from Roadtest01 import respon
import pymysql
import random
import json
from bs4 import BeautifulSoup

allroads = 0

#ip代理
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = 'http': proxy_ip
    return proxies

#数据库提交
def serMySql(n,name,loads):
    db = pymysql.connect('localhost', 'root', '**********', 'cityroads')
    cursor = db.cursor()
    # name = 'cityroad_nums'
    # sql = 'select * from '+name
    sql = 'insert into cityroad_nums(city_id,city_name,cityroad_nums) values(%s,%s,%s);'

    try:
        value = (n,name,loads)
        print(value)
        cursor.execute(sql,value)
        # print("****")
        db.commit()
        # print("成功")
        # reslist = cursor.fetchall()
        # for row in reslist:
        #     print('%d--%s--%s--%s' % (row[0], row[1], row[3],row[4]))
        # return reslist
    except:
        print('失败')
        # 如果提交失败,回到上一次的数据
        db.rollback()
    cursor.close()
    db.close()

#查询路口数量
def parse(text):
    if text!="":
        global allroads
        #初始化,标准化
        html = etree.HTML(text)
        #提取我们想要的信息 需要写xpath语法
        # 城市名称
        # citys = html.xpath('//li[@class="logo"]/a[@class="logo_a"]/text()')
        # print(citys)
        #路口信息
        roads = html.xpath('//div[@class="sortC"]/dl/dd/a/text()')
        # print(roads)
        # print(len(roads))
        allroads += len(roads)

#访问所有的城市
def getName(url):
    global allroads
    # 请求头,告诉服务器这是浏览器
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
    ]
    # header = 
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    header = 
        'User-Agent': random.choice(user_agent_list)
    
    #伪造cookie
    cookies = dict(uuid='b18f0e70-8705-470d-bc4b-09a8da617e15',
                   UM_distinctid='15d188be71d50-013c49b12ec14a-3f73035d-100200-15d188be71ffd')
    #ip代理:
    headers = 
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    
    url0 = 'http://www.xicidaili.com/nn/'
    ip_list = get_ip_list(url0, headers=headers)
    proxies = get_random_ip(ip_list)

    # 模拟浏览器发送HTTP请求
    response = requests.get(url, headers=header,cookies=cookies,proxies=proxies)
    # print(response.text)
    # 初始化,标准化
    html = etree.HTML(response.text)
    names = html.xpath('//dl[@id="city_list"]/dd/a/text()')
    # print(names)
    urls = html.xpath('//dl[@id="city_list"]/dd/a/@href')
    # print(urls)
    n = 0
    print(n)
    for name, href in zip(names, urls):
        href += 'GA0/'
        name = str(name)
        # print(name, href)
        # print(name, allroads)
        n += 1
        print(n,name)
#这里是之前爬取到327我的IP被封了,然后继续爬
        if  n>327:
            text = respon(href)
            # 查询路口数量
            parse(text)
            print(name, allroads)
            # print(type(n),type(name),type(allroads))
            serMySql(n,name,allroads)
        # print(type(href))

        allroads = 0

url = 'https://poi.mapbar.com/'
getName(url)
print("结束")

# serMySql(275,"图木舒克市",201)

 

python检查更新

因为在爬取的过程中遇到了网页中某个城市的路口数量有多页的,这些也的跳转按钮是伪类元素,所以通过更改网页URL来循环遍历

import requests
import re
from lxml import etree
from Roadtest01 import respon
import pymysql
import random
from bs4 import BeautifulSoup

allroads = 0

#ip代理
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = 'http': proxy_ip
    return proxies

#数据库更新
def serMySql(n,loads):
    db = pymysql.connect('localhost', 'root', '*********', 'cityroads')
    cursor = db.cursor()
    # name = 'cityroad_nums'
    # sql = 'select * from '+name
    sql = 'update cityroad_nums set cityroad_nums = %s where city_id = %s;'

    try:
        value = (loads,n)
        print(value)
        cursor.execute(sql,value)
        print("****")
        db.commit()
        print("成功")
        # reslist = cursor.fetchall()
        # for row in reslist:
        #     print('%d--%s--%s--%s' % (row[0], row[1], row[3],row[4]))
        # return reslist
    except:
        print('失败')
        # 如果提交失败,回到上一次的数据
        db.rollback()
    cursor.close()
    db.close()

#检查SQL数据表中cityroad_nums为0的城市
def checkRoads():
    db = pymysql.connect('localhost', 'root', '********', 'cityroads')
    cursor = db.cursor()
    #在数据库中检查是路口数否有为0的城市,然后再重新爬取对比
    # sql = 'select city_id from cityroad_nums where cityroad_nums=0 and city_id>285;'
    #因为存在伪元素,所以需要把路口9000以上的城市再去做一次排查路口数量
    sql = 'select city_id from cityroad_nums where cityroad_nums>9000'
    try:
        cursor.execute(sql)
        # print("成功")
        reslist = cursor.fetchall()
        # print(type(reslist))
        # for row in reslist:
        #     print('%d' % row[0])
        return reslist
    except:
        print('失败')
        # 如果提交失败,回到上一次的数据
        db.rollback()
    cursor.close()
    db.close()

#查询路口数量
def parse(text):
    if text!="":
        global allroads
        #初始化,标准化
        html = etree.HTML(text)
        #提取我们想要的信息 需要写xpath语法
        # 城市名称
        # citys = html.xpath('//li[@class="logo"]/a[@class="logo_a"]/text()')
        # print(citys)
        #路口信息
        roads = html.xpath('//div[@class="sortC"]/dl/dd/a/text()')
        # print(roads)
        # print(len(roads))
        allroads += len(roads)


#访问所有的城市
def getName(url):
    global allroads
    # 请求头,告诉服务器这是浏览器
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
    ]
    # header = 
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    header = 
        'User-Agent': random.choice(user_agent_list)
    
    #伪造cookie
    cookies = dict(uuid='b18f0e70-8705-470d-bc4b-09a8da617e15',
                   UM_distinctid='15d188be71d50-013c49b12ec14a-3f73035d-100200-15d188be71ffd')
    #ip代理:
    headers = 
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    
    url0 = 'http://www.xicidaili.com/nn/'
    ip_list = get_ip_list(url0, headers=headers)
    proxies = get_random_ip(ip_list)

    # 模拟浏览器发送HTTP请求
    response = requests.get(url, headers=header,cookies=cookies,proxies=proxies)
    # print(response.text)
    # 初始化,标准化
    html = etree.HTML(response.text)
    names = html.xpath('//dl[@id="city_list"]/dd/a/text()')
    # print(names)
    urls = html.xpath('//dl[@id="city_list"]/dd/a/@href')
    # print(urls)
    n = 0
    # print(n)
    for name, href in zip(names, urls):
        href += 'GA0/'
        name = str(name)
        # print(name, href)
        # print(name, allroads)
        n += 1
        lists = checkRoads()
        for row in lists:
            if n == row[0]:

                #城市的英文名
                ename = re.findall(r"com/(.+?)/GA0/",href)
                print(n, name, href,ename[0])
                for i in range(1, 5):
                    # print(n)
                    cityUrl(ename[0],i)
                print(allroads)
                # text = respon(href)
                # # 查询路口数量
                # parse(text)
                # print(n,name,allroads)
                serMySql(n, allroads)

        allroads = 0

def cityUrl(ename,n):
    url = f'https://poi.mapbar.com/ename/GA0_n/'
    print(url)
    text = respon(url)
    parse(text)

url = 'https://poi.mapbar.com/'
getName(url)
print("结束")

# lists = checkRoads()
# for row in lists:
#     print('%d' % row[0])

 

最终爬取的效果

城市名称和城市的中路口数量,以及该城市爬取的网址

 

 

 

 

 

 

 

一起学习,一起进步 -.- ,如有错误,可以发评论 

以上是关于爬取全国的城市路口数量的主要内容,如果未能解决你的问题,请参考以下文章

2455 繁忙的都市

bzoj1083

vijos--繁华的都市

智慧城市交通的要素:路口监管可视化系统的解决方案

vijos 1190 繁忙的都市

最小生成树——繁忙的都市