Day535. Web Scraper Parsing - Python
Posted by 阿昌喜欢吃黄桃
Parsing
1. XPath
- Install the lxml library
pip install lxml -i https://pypi.douban.com/simple
- Import lxml.etree
from lxml import etree
- etree.parse(): parse a local file
html_tree = etree.parse('XX.html')
- etree.HTML(): parse a server response
html_tree = etree.HTML(response.read().decode('utf-8'))
- html_tree.xpath(xpath_expression): run an XPath query against the tree (a minimal sketch follows)
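A minimal sketch combining the two entry points, assuming a hypothetical local file demo.html:

from lxml import etree

# Local file: pass an HTMLParser so ordinary (non-XHTML) pages parse cleanly
html_tree = etree.parse('demo.html', etree.HTMLParser())

# Server response: build the tree from the decoded bytes instead
# html_tree = etree.HTML(response.read().decode('utf-8'))

texts = html_tree.xpath('//li/a/text()')     # text content of matched nodes
hrefs = html_tree.xpath('//li/a/@href')      # attribute values
first = html_tree.xpath('//li[@id="l1"]')    # predicate on an attribute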
Example: scraping images from 站长素材 (sc.chinaz.com)
# @Author: Achang
# @Time: 2022/2/11 16:27
# @File: lxml_解析站长素材图片
# @Project: 爬虫基础
# https://sc.chinaz.com/tupian/shuaigetupian.html
import urllib.request
import lxml.etree

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
}


def create_request(page):
    # The first page has no page suffix; later pages use "_<page>.html"
    if page == 1:
        url = "https://sc.chinaz.com/tupian/shuaigetupian.html"
    else:
        url = "https://sc.chinaz.com/tupian/shuaigetupian_" + str(page) + ".html"
    print(url)
    requestObj = urllib.request.Request(url=url, headers=headers)
    return requestObj


def get_context(requestObj):
    context = urllib.request.urlopen(requestObj)
    result = context.read().decode('utf-8')
    return result


def down_load(context):
    tree = lxml.etree.HTML(context)
    # The real image URL is lazy-loaded via @src2; @src only holds a placeholder
    name_list = tree.xpath("//div[@id='container']//a/img/@alt")
    pic_url_list = tree.xpath("//div[@id='container']//a/img/@src2")
    for index in range(len(name_list)):
        # Note: the ./img/ directory must already exist
        urllib.request.urlretrieve(url="https:" + pic_url_list[index],
                                   filename="./img/" + name_list[index] + '.jpg')


if __name__ == '__main__':
    start_page = int(input("Enter start page: "))
    end_page = int(input("Enter end page: "))
    for page in range(start_page, end_page + 1):
        requestObj = create_request(page)
        context = get_context(requestObj)
        down_load(context)
2. JsonPath
- Install via pip
pip install jsonpath
- Using jsonpath
obj = json.load(open('file.json', 'r', encoding='utf-8'))
ret = jsonpath.jsonpath(obj, 'jsonpath expression')
- Comparing JsonPath with lxml (XPath) syntax
- Data source
"store":
"book": [
"category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
,
"category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
,
"category": "fiction",
"author": "Herman Melville",
"title": "Moby Dick",
"isbn": "0-553-21311-3",
"price": 8.99
,
"category": "fiction",
"author": "J. R. R. Tolkien",
"title": "The Lord of the Rings",
"isbn": "0-395-19395-8",
"price": 22.99
],
"bicycle":
"color": "red",
"price": 19.95
- JsonPath vs. XPath, expression by expression (see the mapping below)
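For the store document above, the classic XPath-to-JSONPath correspondence from Goessner's original JSONPath article is:

XPath                 JSONPath                 Result
/store/book/author    $.store.book[*].author   authors of all books
//author              $..author                all authors
/store/*              $.store.*                everything in the store
/store//price         $.store..price           all prices
//book[3]             $..book[2]               the third book
//book[last()]        $..book[(@.length-1)]    the last book
//book[position()<3]  $..book[:2]              the first two books
//book[isbn]          $..book[?(@.isbn)]       books with an ISBN
//book[price<10]      $..book[?(@.price<10)]   books cheaper than 10
//*                   $..*                     every node

And a minimal sketch querying it with the jsonpath package, assuming the document is saved as store.json:

import json
import jsonpath

obj = json.load(open('store.json', 'r', encoding='utf-8'))

print(jsonpath.jsonpath(obj, '$..author'))               # all authors
print(jsonpath.jsonpath(obj, '$.store.book[*].title'))   # all book titles
print(jsonpath.jsonpath(obj, '$..book[?(@.price<10)]'))  # filter: price < 10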
- Example: reading the Taopiaopiao (淘票票) city API with jsonpath
import json
import jsonpath
import urllib.request

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'cookie': 'miid=536765677517889060; t=78466542de5dbe84715c098fa2366f87; cookie2=11c90be2b7bda713126ed897ab23e35d; v=0; _tb_token_=ee5863e335344; cna=jYeFGkfrFXoCAXPrFThalDwd; xlly_s=1; tfstk=cdlVBIX7qIdVC-V6pSNwCDgVlVEAa8mxXMa3nx9gjUzPOZeuYsAcXzbAiJwAzG2c.; l=eBxbMUncLj6r4x9hBO5aourza77T6BAb4sPzaNbMiInca6BOT3r6QNCnaDoy7dtjgtCxretPp0kihRLHR3xg5c0c07kqm0JExxvO.; isg=BHBwrClf5nUOJrpxMvRIOGsqQT7CuVQDlydQ-WrHREsaJRDPEsmVk5EbfS1FtQzb',
    'referer': 'https://dianying.taobao.com/',
    'content-type': 'text/html;charset=UTF-8'
}


def create_request():
    res_obj = urllib.request.Request(
        url="https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1644570795658_173&jsoncallback=jsonp174&action=cityAction&n_s=new&event_submit_doGetAllRegion=true",
        headers=headers)
    return res_obj


def get_context(req_obj):
    resp = urllib.request.urlopen(req_obj)
    origin_context = resp.read().decode('utf-8')
    # The endpoint returns JSONP ("jsonp174({...})"); strip the callback wrapper
    result = origin_context.split('jsonp174(')[1].split(')')[0]
    return result


def download_and_parse(context):
    with open('jsonpath_淘票票案例.json', 'w', encoding='utf-8') as fp:
        fp.write(context)


def parse_json():
    obj = json.load(open('jsonpath_淘票票案例.json', mode='r', encoding='utf-8'))
    # $..regionName matches every regionName key, at any depth
    region_name_list = jsonpath.jsonpath(obj, '$..regionName')
    print(region_name_list)
    print(len(region_name_list))


if __name__ == '__main__':
    req_obj = create_request()
    context = get_context(req_obj)
    download_and_parse(context)
    parse_json()
- Example: scraping city information from Boss直聘 (zhipin.com)
import urllib.request
import jsonpath
import json

url = 'https://www.zhipin.com/wapi/zpgeek/common/data/citysites.json'
resp = urllib.request.urlopen(url)
context = resp.read().decode('utf-8')
# $..name collects every city/district name in the nested site tree
result = jsonpath.jsonpath(json.loads(context), '$..name')
print(result)
3. BeautifulSoup
- Basic introduction: BeautifulSoup (the bs4 package) is, like lxml, an HTML/XML parser; its object interface is more forgiving to write, at some cost in parsing speed.
- Installation and creation
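A minimal creation sketch; it is installed with pip install bs4, and the HTML string and demo.html file here are hypothetical:

from bs4 import BeautifulSoup

html_string = '<div id="main"><a class="item" href="/a">A</a><a class="item" href="/b">B</a></div>'

# From a string, e.g. a decoded server response; 'lxml' picks the lxml parser backend
soup = BeautifulSoup(html_string, 'lxml')

# From a local file
# soup = BeautifulSoup(open('demo.html', encoding='utf-8'), 'lxml')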
- Node location
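A sketch of the common locating calls, reusing a small hypothetical document:

from bs4 import BeautifulSoup

html = '<div id="main"><a class="item" href="/a">A</a><a class="item" href="/b">B</a></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.find('a'))                 # first matching tag
print(soup.find('a', class_='item'))  # first tag filtered by attribute
print(soup.find_all('a'))             # every matching tag, as a list
print(soup.select('#main a.item'))    # CSS selector, also a list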
- Node information
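And a sketch of reading information back out of a located node:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a id="link" href="/a">hello</a>', 'lxml')
tag = soup.find('a')

print(tag.get_text())   # 'hello' - the text content
print(tag.name)         # 'a' - the tag name
print(tag.attrs)        # {'id': 'link', 'href': '/a'} - attribute dict
print(tag.get('href'))  # '/a' - a single attribute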
- Example: scraping Starbucks menu images
from bs4 import BeautifulSoup
import urllib.request

url = 'https://www.starbucks.com.cn/menu/'
resp = urllib.request.urlopen(url)
context = resp.read().decode('utf-8')
soup = BeautifulSoup(context, 'lxml')

# Each product preview is a div.preview.circle inside the menu grid
obj = soup.select("ul[class='grid padded-3 product'] div[class='preview circle']")
for item in obj:
    # The image URL is embedded in the inline style: background-image: url("...")
    completePicUrl = 'https://www.starbucks.com.cn' + item.attrs.get('style').split('url("')[1].split('")')[0]
    print(completePicUrl)
- Example: scraping sector links from Stockstar (quote.stockstar.com)
from bs4 import BeautifulSoup
import urllib.request

url = 'http://quote.stockstar.com/'
# This page is encoded in gb2312, not utf-8
context = urllib.request.urlopen(url).read().decode('gb2312')
soup = BeautifulSoup(context, 'lxml')

# Avoid shadowing the built-in name `list`
link_list = soup.select('#datalist2 .align_left a')
for item in link_list:
    print(item.get_text())