python3 爬取汽车之家所有车型操作步骤
Posted 康仔☆
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3 爬取汽车之家所有车型操作步骤相关的知识,希望对你有一定的参考价值。
题记:
互联网上关于使用python3去爬取汽车之家的汽车数据(主要是汽车基本参数,配置参数,颜色参数,内饰参数)的教程已经非常多了,但大体的方案分两种:
1.解析出汽车之家某个车型的网页,然后正则表达式匹配出混淆后的数据对象与混淆后的js,并对混淆后的js使用pyv8进行解析返回正常字符,然后通过字符与数据对象进行匹配,具体方法见这位园友,传送门:https://www.cnblogs.com/my8100/p/js_qichezhijia.html (感谢这位大神前半部分的思路)
2.解析出汽车之家某个车型的网页,然后正则表达式匹配出混淆后的数据对象与混淆后的js,针对混淆后的js进行手动匹配,因为混淆的js大概分为8大类(无参数 返回常量,无参数 返回函数,参数等于返回值函数,无参数 返回常量,无参数 返回常量中间无混淆代码,字符串拼接时使用无参常量,字符串拼接时使用返回参数的函数),然后通过正则表达式解析出8类内容并进行逐个替换,最终也会返回一个带有顺序的字符串,将这个字符串与前边的数据对象再次替换,最终数据对象中的所有span都会被替换成中文,具体操作见园友的地址,传送门:https://www.cnblogs.com/dyfblog/p/6753251.html (感谢这位大神前半部分的思路)
不过鉴于作者技术有限,上述的两种方案,我都没有完整地执行完成,哪怕花了一周的时间也没有,但是没有办法,谁让我是一个很爱钻牛角尖的人呢,下面提出我自己琢磨出来的方案,流程上稍微有点复杂,但是稳扎稳打,还是可以爬出来的,好了话不多说了,贴出步骤;
1.获取所有车型的网页,保存到本地:
import bs4
import requests as req
# Step 1: download the configuration page of every car series.


def series_id_from_href(href):
    """Extract the series id from a link found on a brand listing page.

    The URL fragment (everything from ``#``) is dropped, the text after the
    first ``.cn`` is kept, and all ``/`` characters are removed.

    Args:
        href: The ``href`` attribute value of a series link.

    Returns:
        The extracted id, or ``None`` when the link contains no ``.cn`` or
        nothing is left after stripping the slashes.
    """
    path = str(href).split("#")[0]
    marker = path.find(".cn")
    if marker == -1:
        return None
    series_id = path[marker + 3:].replace("/", "")
    return series_id or None


def mainMethod(letters=None, timeout=30):
    """Download the configuration page of each series and save it on drive D.

    For every letter the brand listing page is fetched, each ``<li>`` that has
    an ``<h4>`` link is resolved to a series id, and the matching page under
    ``https://car.autohome.com.cn/config/series/`` is written to
    ``d:\\autoHome\\html\\<series id>``.

    Args:
        letters: Iterable of brand-index letters to crawl. Defaults to
            ``T`` through ``Z``, the range used by the original script; pass
            ``"ABCDEFGHIJKLMNOPQRSTUVWXYZ"`` to crawl every letter.
        timeout: Seconds to wait for each HTTP request.
    """
    if letters is None:
        letters = [chr(code) for code in range(ord("T"), ord("Z") + 1)]
    firstSite = "https://www.autohome.com.cn/grade/carhtml/"
    secondSite = "https://car.autohome.com.cn/config/series/"
    suffix = ".html"
    for letter in letters:
        requestUrl = firstSite + letter + suffix
        print(requestUrl)
        try:
            resp = req.get(requestUrl, timeout=timeout)
            resp.raise_for_status()
        except req.RequestException as exc:
            # One unreachable listing page must not abort the whole crawl.
            print("request failed: " + requestUrl + " (" + str(exc) + ")")
            continue
        # The listing page is decoded as GBK, the series page as UTF-8.
        soup = bs4.BeautifulSoup(str(resp.content, "gbk"), "html.parser")
        con = 0
        for item in soup.find_all("li"):
            heading = item.h4
            if heading is None or heading.a is None:
                continue
            her = series_id_from_href(heading.a.attrs.get("href", ""))
            if her is None:
                continue
            secSite = secondSite + her + suffix
            print("secSite=" + secSite)
            try:
                page = req.get(secSite, timeout=timeout)
                page.raise_for_status()
            except req.RequestException as exc:
                print("request failed: " + secSite + " (" + str(exc) + ")")
                continue
            text = str(page.content, encoding="utf-8")
            print(letter)
            # "w" rather than "a": running the script again must replace the
            # saved page, not append a second copy to it.
            with open("d:\\autoHome\\html\\" + her, "w", encoding="utf-8") as fil:
                fil.write(text)
            con += 1
        # Number of series pages saved for this letter.
        print(con)


if __name__ == "__main__":
    mainMethod()
2.解析出每个车型的关键js并拼装成一个html,保存到本地。
import os
import re
# Step 2: pull the obfuscation JavaScript out of every saved series page and
# wrap it in a small standalone HTML file.

# Stand-ins for the browser objects the obfuscation scripts touch.
# document.createElement returns an object whose sheet.insertRule appends
# every inserted rule to the global `rules` string, separated by '#'.
JS_PRELUDE = (
    "var rules = '2';"
    "var document = {};"
    "function getRules(){return rules}"
    "document.createElement = function() {"
    " return {"
    " sheet: {"
    " insertRule: function(rule, i) {"
    " if (rules.length == 0) {"
    " rules = rule;"
    " } else {"
    " rules = rules + '#' + rule;"
    " }"
    " }"
    " }"
    " }"
    "};"
    "document.querySelectorAll = function() {"
    " return {};"
    "};"
    "document.head = {};"
    "document.head.appendChild = function() {};"
    "var window = {};"
    "window.decodeURIComponent = decodeURIComponent;"
)

HTML_HEAD = (
    "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
    "<head></head><body> <script type='text/javascript'>"
)
HTML_TAIL = " document.write(rules)</script></body></html>"

# Raw string so the backslashes reach the regex engine unchanged.
OBFUSCATION_JS_RE = re.compile(r"(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)")


def build_style_html(page_text):
    """Build the HTML document that replays a page's obfuscation scripts.

    Every ``(function(xx..._)...(document);`` snippet found in *page_text* is
    appended, in order, after the prelude above; the final statement writes
    the collected ``rules`` string into the document.

    Args:
        page_text: Full text of a saved series page.

    Returns:
        The HTML document as a string. When no snippet matches, it contains
        only the prelude and the ``document.write(rules)`` call.
    """
    snippets = OBFUSCATION_JS_RE.findall(page_text)
    return HTML_HEAD + JS_PRELUDE + "".join(snippets) + HTML_TAIL


if __name__ == "__main__":
    print("Start...")
    rootPath = "D:\\autoHome\\html\\"
    for file in os.listdir(rootPath):
        print("fileName==" + file.title())
        with open(rootPath + file, "r", encoding="utf-8") as src:
            text = src.read()
        print("fileName==" + file.title())
        # "w" rather than "a": a second run must not append another document
        # to the one already generated.
        with open("D:\\autoHome\\newhtml\\" + file + ".html", "w", encoding="utf-8") as out:
            out.write(build_style_html(text))
3.解析出每个车型的数据json,比如 var config、var option、var bag、var innerbag 等,但我只解析了基本信息跟配置信息,其他的无所谓。
import os
import re
# Step 3: extract the JSON data assignments of every series page and save
# them locally.

# One pattern per assignment, searched in this order. The config pattern used
# to be '(.*?){1,};'; that nested quantifier matches the same text (up to the
# first ';' on the line) but backtracks exponentially when no ';' follows.
JSON_VAR_PATTERNS = (
    re.compile(r"var config = (.*?);"),
    re.compile(r"var option = (.*?)};"),
    re.compile(r"var bag = (.*?);"),
)


def extract_json_vars(page_text):
    """Return the ``var config``/``var option``/``var bag`` statements of a page.

    Args:
        page_text: Full text of a saved series page.

    Returns:
        A list with the complete matched statement (``var ... ;``) for each
        assignment that was found, in the order config, option, bag.
        Assignments that do not occur are left out.
    """
    statements = []
    for pattern in JSON_VAR_PATTERNS:
        match = pattern.search(page_text)
        if match is not None:
            statements.append(match.group(0))
    return statements


if __name__ == "__main__":
    print("Start...")
    rootPath = "D:\\autoHome\\html\\"
    for file in os.listdir(rootPath):
        print("fileName==" + file.title())
        with open(rootPath + file, "r", encoding="utf-8") as src:
            text = src.read()
        print("fileName==" + file.title())
        statements = extract_json_vars(text)
        for statement in statements:
            print(statement)
        # "w" rather than "a": a second run must not append the same data to
        # the file again.
        with open("D:\\autoHome\\json\\" + file, "w", encoding="utf-8") as out:
            out.write("".join(statements))
4.生成样式文件,保存到本地。
import os
from selenium import webdriver
# Step 4: let a browser execute the HTML files generated in step 2, capture
# what they render, and save the result locally.
class Crack():
    """Holds the Chrome WebDriver instance used to render the generated pages."""

    def __init__(self, keyword, username, passod):
        """Start Chrome through the chromedriver at a fixed local path.

        Args:
            keyword: Accepted but not used.
            username: Accepted but not used.
            passod: Accepted but not used.
        """
        self.url = 'https://www.baidu.com'
        # Raw string: the path contains backslash sequences such as \w and \c
        # that are invalid escapes in a normal string literal.
        self.browser = webdriver.Chrome(r'E:\work\ChromePortable\App\Google Chrome\chromedriver.exe')
if __name__ == "__main__":
    # Render every generated HTML file and store the text of its <body>.
    lists = os.listdir("D:/autoHome/newHtml/")
    crack = Crack('测试公司', '17610177519', '17610177519')
    try:
        for fil in lists:
            target = "D:\\autoHome\\content\\" + fil
            file = os.path.exists(target)
            if file:
                # Already rendered by an earlier run.
                print('文件已经解析。。。' + str(file))
                continue
            print(fil)
            crack.browser.get("file:///D:/autoHome/newHtml/" + fil + "")
            # Generic locator form; find_element_by_tag_name no longer exists
            # in current Selenium releases.
            text = crack.browser.find_element("tag name", "body")
            print(text.text)
            # Each output file is closed as soon as it is written. Previously
            # only the last one was closed, and the close raised NameError
            # when every file had been skipped.
            with open(target, "a", encoding="utf-8") as f:
                f.write(text.text)
    finally:
        # quit() ends the driver session even when a page fails to load.
        crack.browser.quit()
5.读取样式文件,匹配数据文件,生成正常数据文件
import os
import re
# Step 5: match the style files against the JSON data files and produce the
# de-obfuscated data files.

SPAN_RE = re.compile(r"<span(.*?)></span>")
CLASS_RE = re.compile(r"'(.*?)'")
QUOTED_RE = re.compile(r'"(.*?)"')


def replace_obfuscated_spans(json_text, style_text, verbose=False):
    """Replace each empty ``<span class='...'>`` with its CSS ``content`` text.

    For every ``<span ...></span>`` in *json_text* the first single-quoted
    attribute value is taken as the class name. *style_text* is searched for
    ``<class>::before { content:...}`` and the double-quoted text inside that
    rule replaces every ``<span class='<class>'></span>`` occurrence.

    Spans without a quoted class, classes with no matching rule, and rules
    without a double-quoted value are left untouched.

    Args:
        json_text: Data text containing the placeholder spans.
        style_text: Rendered style rules produced in step 4.
        verbose: Print each matched span, class and style value.

    Returns:
        *json_text* with all resolvable spans replaced.
    """
    result = json_text
    for attrs in SPAN_RE.findall(json_text):
        if verbose:
            print("匹配到的span=>>" + attrs)
        class_match = CLASS_RE.search(attrs)
        if class_match is None or not class_match.group(1):
            continue
        class_name = class_match.group(1)
        if verbose:
            print("匹配到的class==>" + class_name)
        # Escape the class name so it is matched literally.
        rule = re.search(re.escape(class_name) + r"::before \{ content:(.*?)\}", style_text)
        if rule is None:
            continue
        if verbose:
            print("匹配到的样式值=" + rule.group(1))
        value = QUOTED_RE.search(rule.group(1))
        if value is None:
            continue
        result = result.replace("<span class='" + class_name + "'></span>", value.group(1))
    return result


if __name__ == "__main__":
    rootPath = "D:\\autoHome\\json\\"
    for json_s in os.listdir(rootPath):
        print(json_s.title())
        # Read the JSON data file.
        with open(rootPath + json_s, "r", encoding="utf-8") as src:
            jso = src.read()
        # Read the style file; the plain file name is used because title()
        # would change the case of any letters in it.
        spansPath = "D:\\autoHome\\content\\" + json_s + ".html"
        with open(spansPath, "r", encoding="utf-8") as src:
            content = src.read()
        print(content)
        jso = replace_obfuscated_spans(jso, content, verbose=True)
        print(jso)
        # "w" rather than "a": a second run must not append the data again.
        with open("D:\\autoHome\\newJson\\" + json_s, "w", encoding="utf-8") as out:
            out.write(jso)
6.到前五步已经可以看到json数据文件都已经是混淆前的了,说明已经爬取成功了。
7.读取数据文件,生成excel
import json
import os
import re
import xlwt
# Step 7: read the de-obfuscated data files and build the Excel workbook.

CONFIG_RE = re.compile(r"var config = (.*?);")
OPTION_RE = re.compile(r"var option = (.*?);var")


def parse_car_items(text):
    """Parse the basic and configuration parameters of one data file.

    The last ``var config`` and ``var option`` assignments in *text* are
    decoded as JSON. Every item of ``result.paramtypeitems[0].paramitems`` and
    ``result.configtypeitems[0].configitems`` contributes one entry mapping
    its ``name`` to the list of ``value`` fields in its ``valueitems``. The
    ``var bag`` assignment is not read because none of its data is exported.

    Args:
        text: Content of a data file produced in step 5.

    Returns:
        A dict ``{parameter name: [value per model, ...]}``, or ``None`` when
        an assignment is missing, is not valid JSON, or lacks the expected
        structure.
    """
    configs = CONFIG_RE.findall(text)
    options = OPTION_RE.findall(text)
    if not configs or not options:
        return None
    try:
        config = json.loads(configs[-1])
        option = json.loads(options[-1])
        configItem = config['result']['paramtypeitems'][0]['paramitems']
        optionItem = option['result']['configtypeitems'][0]['configitems']
        carItem = {}
        for car in list(configItem) + list(optionItem):
            carItem[car['name']] = [ca['value'] for ca in car['valueitems']]
    except (ValueError, KeyError, IndexError, TypeError):
        return None
    return carItem


if __name__ == "__main__":
    rootPath = "D:\\autoHome\\newJson\\"
    workbook = xlwt.Workbook(encoding='ascii')  # create the workbook
    worksheet = workbook.add_sheet('汽车之家')  # create the sheet
    # Parameter name -> column index. Columns start at 1 and a column is added
    # the first time a name is seen, so files with different parameter sets
    # still write each value under its own header.
    columnOf = {}
    startRow = 1  # row 0 holds the header
    for file in os.listdir(rootPath):
        print("fileName==" + file.title())
        with open(rootPath + file, "r", encoding="utf-8") as src:
            text = src.read()
        print("++++++++++++++++++++++\n")
        print("---------------------\n")
        carItem = parse_car_items(text)
        # The model-name list decides how many rows this file contributes.
        names = None if carItem is None else carItem.get('车型名称')
        if names is None:
            # Record the file and keep going, so one bad file cannot abort the
            # run before the workbook is saved.
            with open("D:\\autoHome\\异常数据\\exception.txt", "a", encoding="utf-8") as log:
                log.write(file.title() + "\n")
            continue
        for name in carItem:
            if name not in columnOf:
                columnOf[name] = len(columnOf) + 1
                worksheet.write(0, columnOf[name], name)
        # Compute the first and last row of this file.
        endRowNum = startRow + len(names)
        for row in range(startRow, endRowNum):
            print(row)
            index = row - startRow
            for name, values in carItem.items():
                if index >= len(values):
                    # This parameter has fewer values than there are models.
                    continue
                cell = str(values[index])
                print(cell, end='|')
                worksheet.write(row, columnOf[name], cell)
        startRow = endRowNum
    workbook.save('d:\\autoHome\\Mybook.xls')
8.最后打开excel文件,给你们看看。
数据量大概有8300的样子。以后买车就用这个参考了。
以上是关于python3 爬取汽车之家所有车型操作步骤的主要内容,如果未能解决你的问题,请参考以下文章