第二天

Posted 2021-05-13 韦德·沃兹

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了第二天相关的知识，希望对你有一定的参考价值。

数据的爬取

爬取高德 POI 中全国范围、细化到县级行政区的景点数据

import json
import math
import os
import time

import pandas as pd
import requests

import sheet

 8 def read_poi():
 9 
10     #各下级行政区的代码，若是嫌逐个复制麻烦可以通过读取文件的方式实现，此处不进行讲解
11     # arr= [\'120100\',\'120101\',\'120102\',\'120103\',\'120104\',\'120105\',\'120106\',\'1201010\',\'120111\',\'120112\',\'120113\',\'120114\',\'120115\',\'120116\',\'120117\',\'120118\',\'120119\']
12     #API的URL，在这里进行了结构化处理
13     arr = sheet.read_excel()
14     url1="https://restapi.amap.com/v3/place/text?key=807c3aaf8b58a288aa83b28d11c817e4&keywords=景区&types=风景名胜&city="
15     url2="&output=JSON&children=&offset=20&page="
16     url3="&extensions=all"
17     #用于储存数据
18     x=[]
19     #用于计数
20     num=0
21 
22     #循环各下级行政区进行POI检索
23     for i in range(0,len(arr)):
24         #当前行政区
25         city=arr[i]
26         #因为官方对API检索进行了45页限制，所以只要检索到45页即可
27         for page in range(1,46):
28             #若该下级行政区的POI数量达到了限制，则警告使用者，之后考虑进行POI类型切分
29             if page==45:
30                 print("警告！！POI检索可能受到限制！！")
31             #构造URL
32             thisUrl=url1+city+url2+str(page)+url3
33             #获取POI数据
34             data=requests.get(thisUrl)
35             #转为JSON格式
36             s=data.json()
37 
38             #解析JSON
39             aa = s["pois"]
40 
41             #若解析的JSON为空，即当前行政区的数据不够45页（即没有达到限制），返回
42             if len(aa)==0:
43                 break
44             #对每条POI进行存储
45             for k in range(0,len(aa)):
46                 b={}
47                 b["name"]=aa[k]["name"]
48                 b["type"]=aa[k]["type"]
49                 b["address"]=aa[k].get("address")
50                 b["adname"]=aa[k]["adname"]
51                 b["locationleft"]=str(aa[k]["location"].split(",")[0])
52                 b["locationright"]=str(aa[k]["location"].split(",")[1])
53                 x.append(b)
54                 num+=1
55                 print("爬取了 "+str(num)+" 条数据")
56             time.sleep(0.5)
57 
58 
59     #将数据结构化存储至规定目录的CSV文件中
60     result = json.dumps(x, sort_keys=True, indent=2)
61     with open(\'./jingqu/datapoi.json\',\'w\',encoding=\'utf-8\') as file:
62         for i in result:
63                 file.write(i)
64         print(\'数据已写入json文件...\')

import xlrd
import datetime
from datetime import date
def read_excel(path=r'E:\poi.xlsx'):
    """Return the values of the second column of the workbook's first sheet.

    Args:
        path: Workbook to open; defaults to the path the script always used.

    Returns:
        The list produced by ``col_values(1)`` on the first sheet.

    Prints the workbook's sheet names as a side effect.
    """
    # NOTE(review): xlrd 2.0+ reads only .xls files; opening an .xlsx workbook
    # presumably requires xlrd < 2.0 — confirm the installed version.
    wb = xlrd.open_workbook(path)
    print(wb.sheet_names())
    sheet1 = wb.sheet_by_index(0)
    # Column index 1 is the second column (indices are zero-based).
    return sheet1.col_values(1)
# NOTE(review): these two statements run whenever this module is imported, not
# only when it is executed directly, so the workbook is read and the column is
# printed at import time.
col= read_excel()
print(col)

存入数据库

import mysql.connector
import json
import time
import datetime

import null as null

# Load the scraped POI records once, at import time; savaDataSql() reads the
# module-level name `data`. The encoding is given explicitly so the read does
# not depend on the platform's default locale.
with open('./jingqu/datapoi.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def du_sql():
    """Insert the loaded POI records into MySQL and commit the transaction.

    Opens a connection to the local ``scence`` database, passes a cursor to
    ``savaDataSql`` and commits. The cursor and the connection are always
    closed, even when inserting or committing fails.
    """
    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="scence",
        auth_plugin="mysql_native_password"
    )
    try:
        dbpath = mydb.cursor()
        try:
            savaDataSql(dbpath)
            mydb.commit()
        finally:
            dbpath.close()
    finally:
        mydb.close()
def savaDataSql(dbpath, records=None):
    """Insert POI records into the ``scence1`` table through a cursor.

    Args:
        dbpath: Cursor-like object; only ``execute(sql, params)`` is used.
        records: Iterable of dicts with the keys ``name``, ``type``,
            ``address``, ``adname``, ``locationleft`` and ``locationright``.
            Defaults to the module-level ``data``.

    Insertion stops at the first record that fails: the record's name and the
    error are printed, and rows already executed are left for the caller to
    commit. No exception is raised for a failing record.
    """
    if records is None:
        records = data
    cur = dbpath
    sql = "INSERT INTO scence1 (name,type,address,adname,locationleft,locationright) values (%s,%s,%s,%s,%s,%s)"
    for each in records:
        # Bound before the try so the handler can always print it.
        name = None
        try:
            name = each['name']
            poi_type = each['type']
            address = each['address']
            # A missing (None) or empty address is stored as an empty string.
            if address is None or len(address) == 0:
                address = ''
            var = (name, poi_type, address, each['adname'],
                   each['locationleft'], each['locationright'])
            cur.execute(sql, var)
        except Exception as exc:
            # Best-effort import: report the failing record and stop, without
            # swallowing KeyboardInterrupt/SystemExit as a bare except would.
            print(name)
            print(repr(exc))
            return

main.py

# Step 1: scrape the POIs; read_poi() writes ./jingqu/datapoi.json.
import poi
poi.read_poi()
# Step 2: load the scraped file into MySQL.
# NOTE(review): presumably daosqlscene is the database script that opens
# ./jingqu/datapoi.json at import time — if so, this import must stay after
# read_poi() has written the file. Confirm before moving it to the top.
import daosqlscene
daosqlscene.du_sql()

以上是关于第二天的主要内容，如果未能解决你的问题，请参考以下文章