第二天
Posted 韦德·沃兹
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第二天相关的知识,希望对你有一定的参考价值。
数据的爬取
爬取高德地图 POI 数据:全国范围、细至县级行政区的景点数据
import requests
import pandas as pd
import json
import time
import math
import os
import sheet

# Amap (Gaode) place-text search endpoint and the fixed query parameters.
# NOTE(review): the API key is hard-coded in source; consider loading it from
# configuration or an environment variable instead.
_AMAP_URL = "https://restapi.amap.com/v3/place/text"
_AMAP_BASE_PARAMS = {
    "key": "807c3aaf8b58a288aa83b28d11c817e4",
    "keywords": "景区",
    "types": "风景名胜",
    "output": "JSON",
    "children": "",
    "offset": 20,
    "extensions": "all",
}
# The original code stops at page 45 because of the API's paging limit.
_MAX_PAGE = 45
_OUTPUT_PATH = './jingqu/datapoi.json'


def _normalize_adcode(value):
    """Return a district code cell as a clean string.

    Spreadsheet readers may hand back numeric cells as floats (120100.0);
    those are converted to their integer text form so they can be used as
    the ``city`` query parameter.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def _poi_to_record(poi):
    """Extract the stored fields from one POI dict of the API response."""
    location = poi.get("location")
    if not isinstance(location, str):
        # A missing or non-string location yields empty coordinates.
        location = ""
    left, _, right = location.partition(",")
    return {
        "name": poi["name"],
        "type": poi["type"],
        "address": poi.get("address"),
        "adname": poi["adname"],
        "locationleft": left,
        "locationright": right,
    }


def read_poi():
    """Crawl scenic-spot POIs for every district code and dump them to JSON.

    District codes come from ``sheet.read_excel()``. For each code the
    place-text API is paged through (20 results per page, at most 45 pages)
    until an empty page is returned. All collected records are written to
    ``./jingqu/datapoi.json``.

    A failed request or an error payload for one district is reported and
    that district is abandoned, so records gathered so far are still saved.
    """
    # District codes; a hard-coded list such as ['120100', '120101', ...]
    # could be used here instead of reading the spreadsheet.
    arr = sheet.read_excel()
    # Collected POI records.
    x = []
    # Running count of stored records.
    num = 0

    # Query every district in turn.
    for raw_city in arr:
        city = _normalize_adcode(raw_city)
        if not city:
            continue
        for page in range(1, _MAX_PAGE + 1):
            # Reaching the last allowed page means the district may have more
            # POIs than the API will return; warn so the query can be split
            # by POI type later.
            if page == _MAX_PAGE:
                print("警告!!POI检索可能受到限制!!")
            params = dict(_AMAP_BASE_PARAMS, city=city, page=page)
            try:
                data = requests.get(_AMAP_URL, params=params, timeout=10)
                data.raise_for_status()
                s = data.json()
            except (requests.RequestException, ValueError) as exc:
                print("请求失败,跳过行政区 " + city + ": " + str(exc))
                break

            aa = s.get("pois")
            if aa is None:
                # Error payloads carry no "pois" list.
                print("接口返回异常,跳过行政区 " + city + ": " + str(s.get("info")))
                break
            # An empty page means this district has no further results.
            if not aa:
                break
            # Store every POI of this page.
            for poi in aa:
                x.append(_poi_to_record(poi))
                num += 1
            print("爬取了 " + str(num) + " 条数据")
            # Pause between requests to stay polite to the API.
            time.sleep(0.5)

    # Persist the structured data; create the target directory if needed.
    os.makedirs(os.path.dirname(_OUTPUT_PATH), exist_ok=True)
    with open(_OUTPUT_PATH, 'w', encoding='utf-8') as file:
        json.dump(x, file, sort_keys=True, indent=2)
    print('数据已写入json文件...')
import xlrd
import datetime
from datetime import date


def read_excel(path=r'E:\poi.xlsx'):
    """Return the values of the second column of the workbook's first sheet.

    Args:
        path: Workbook to open. Defaults to the location used by the
            original script.

    Returns:
        The list produced by ``col_values(1)`` for the first sheet.

    The sheet names are printed as a quick sanity check of the workbook.
    """
    # NOTE(review): recent xlrd releases only read legacy .xls files; confirm
    # the installed version can open this .xlsx workbook.
    wb = xlrd.open_workbook(path)
    print(wb.sheet_names())
    sheet1 = wb.sheet_by_index(0)
    return sheet1.col_values(1)


# Run the demo only when executed directly, so importing this module does not
# read the workbook and print its contents as a side effect.
if __name__ == "__main__":
    col = read_excel()
    print(col)
存入数据库
import mysql.connector
import json
import time
import datetime
# NOTE(review): this looks like an accidental IDE auto-import; nothing in this
# module uses it. Confirm and remove if the package is not required.
import null as null

# The crawled POI records are loaded once at import time.
with open('./jingqu/datapoi.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


def du_sql():
    """Insert the loaded POI records into the ``scence`` MySQL database.

    Opens a connection, delegates the inserts to ``savaDataSql`` and commits.
    The cursor and connection are always closed, even if an insert or the
    commit raises.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="scence",
        auth_plugin="mysql_native_password"
    )
    try:
        dbpath = mydb.cursor()
        try:
            savaDataSql(dbpath)
            # Rows inserted before a reported failure are still committed.
            mydb.commit()
        finally:
            dbpath.close()
    finally:
        mydb.close()


def savaDataSql(dbpath):
    """Insert every record of the module-level ``data`` via the given cursor.

    Args:
        dbpath: An open database cursor.

    Insertion stops at the first record that is malformed or rejected by the
    database; the name of that record and the error are printed.
    """
    cur = dbpath
    sql = "INSERT INTO scence1 (name,type,address,adname,locationleft,locationright) values (%s,%s,%s,%s,%s,%s)"
    # Pre-bound so the error report works even if the first record fails.
    name = None
    try:
        for each in data:
            name = each['name']
            poi_type = each['type']
            # A missing (None) or empty address is stored as an empty string.
            address = each['address'] or ''
            adname = each['adname']
            locationleft = each['locationleft']
            locationright = each['locationright']
            var = (name, poi_type, address, adname, locationleft, locationright)
            cur.execute(sql, var)
    except (KeyError, mysql.connector.Error) as exc:
        print(name)
        print(repr(exc))
main.py
# Step 1: crawl the POI data and write ./jingqu/datapoi.json.
from poi import read_poi
read_poi()

# Step 2: load the JSON file into MySQL. The import is deliberately placed
# after the crawl because daosqlscene reads the JSON file when it is imported.
from daosqlscene import du_sql
du_sql()
以上是关于第二天的主要内容,如果未能解决你的问题,请参考以下文章