python yyds
Posted J哥。
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python yyds相关的知识,希望对你有一定的参考价值。
爬取:http://www.5uz.net/xiaochengxu/list/201338/
目标:抓取每个应用的名称、标签及内容,其中有不少细节需要注意。
# import glob
import requests
from lxml import etree
# import csv
# from concurrent.futures import ThreadPoolExecutor
# f = open("数据.csv" , mode='w' , encoding= 'utf-8 ')
# csvwriter = csv.writer(f)
def main():
    """Scrape the list page and print details of every linked detail page.

    Fetches the list page, collects the ``href`` of the ``<a>`` inside each
    ``<li>`` entry, then requests every linked page and prints its heading
    text, the company field and the article text.
    """
    # Local import so the snippet stays self-contained.
    from urllib.parse import urljoin

    url = 'http://www.5uz.net/xiaochengxu/list/201338/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'no-cache'
    }
    # Without a timeout a stalled server would hang the script forever.
    timeout = 10

    # One session for every request: the headers are sent on the detail
    # pages too, and the connection is reused and closed on exit.
    with requests.session() as session:
        session.headers.update(headers)

        resp = session.get(url, timeout=timeout)
        resp.encoding = 'utf-8'
        html = etree.HTML(resp.text)
        lis = html.xpath('/html/body/div[10]/div[1]/div[1]/div/div/li')

        hrefs = []
        for li in lis:
            # Do NOT use xpath(...)[-1].strip('/') here: strip() removes the
            # '/' characters from both ends of the URL and breaks the link.
            hrefs += li.xpath('./a/@href')

        for href in hrefs:
            # urljoin leaves absolute links untouched and resolves relative ones.
            sub_resp = session.get(urljoin(url, href), timeout=timeout)
            # Decode the detail page the same way as the list page.
            sub_resp.encoding = 'utf-8'
            subpage = etree.HTML(sub_resp.text)

            names = subpage.xpath('/html/body/div[10]/div[1]/div[1]/div/div[1]/div[1]/div[1]/h1/text()')
            # A page without the expected heading yields an empty name instead
            # of an IndexError; surrounding whitespace (e.g. \r\n) is stripped.
            name = names[0].strip() if names else ''
            # normalize-space() returns a single string; [3:] drops its first
            # three characters (presumably a label prefix -- verify on the page).
            company_name = subpage.xpath('normalize-space(//div[3]/span[2]/text())')[3:]
            desc_content = subpage.xpath('normalize-space(//*[@id="article"]/p/text())')
            print(name, ':', company_name, '介绍', desc_content)


if __name__ == '__main__':
    main()
# 处理 \\r\\n
# 把数据晒出来,然后用replace('\\r\\n','')把不需要的符号去掉
# passage1 = re.sub("</?\\w+[^>]*>", "", str(passage))
#
# passage2 = passage1.replace('\\\\r', '\\r').replace('\\\\n', ' \\n').replace('\\\\t', '\\t').replace(']', '').replace('[',
# '').replace(
# ' ', ' ')
#
# print(passage2)
# name1 = ''.join(name).replace('\\r\\n','')
# @Python-学生 name1 = ''.join(name).replace('\\r\\n','')
# html.xpath('normalize-space(//div//tr/td[@class="inquiry_intitleb"]/span/text())'
简约版的过程思路:
import requests
from lxml import etree
def main():
    """Print the heading text of every detail page linked from the list page.

    Simplified walkthrough: fetch the list page, collect the ``href`` of the
    ``<a>`` inside each ``<li>`` entry, request each linked page and print the
    XPath result list for its ``<h1>`` text.
    """
    # Local import so the snippet stays self-contained.
    from urllib.parse import urljoin

    url = 'http://www.5uz.net/xiaochengxu/list/201338/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'no-cache'
    }
    # Without a timeout a stalled server would hang the script forever.
    timeout = 10

    # Use the session for every request so the headers also reach the detail
    # pages, and close it when done.
    with requests.session() as session:
        session.headers.update(headers)

        resp = session.get(url, timeout=timeout)
        resp.encoding = 'utf-8'
        html = etree.HTML(resp.text)
        lis = html.xpath('/html/body/div[10]/div[1]/div[1]/div/div/li')

        hrefs = []
        for li in lis:
            hrefs += li.xpath('./a/@href')

        for href in hrefs:
            # urljoin leaves absolute links untouched and resolves relative ones.
            sub_resp = session.get(urljoin(url, href), timeout=timeout)
            # Decode the detail page the same way as the list page.
            sub_resp.encoding = 'utf-8'
            subpage = etree.HTML(sub_resp.text)
            # xpath() returns a list of text nodes; it is printed as-is.
            name = subpage.xpath('/html/body/div[10]/div[1]/div[1]/div/div[1]/div[1]/div[1]/h1/text()')
            print(name)


if __name__ == '__main__':
    main()
获取图片链接:
import requests
from lxml import etree
# Fetch the list page and print the image ``src`` values of each <li> entry.
url = 'http://www.5uz.net/xiaochengxu/list/201338/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'no-cache',
}

resp = requests.get(url=url, headers=headers)
# The response is decoded as UTF-8 before parsing.
resp.encoding = 'utf-8'
html = etree.HTML(resp.text)

# One <li> per list entry; each iteration prints that entry's list of srcs.
lis = html.xpath('/html/body/div[10]/div[1]/div[1]/div/div/li')
for entry in lis:
    sources = entry.xpath('./a/img/@src')
    print(sources)
以上是关于 Python yyds 的主要内容,如果未能解决你的问题,请参考以下文章:
YYDS!Dexplot:一行 Python 代码轻松绘制统计图表!
# yyds干货盘点 # 盘点一份JS逆向代码转换为Python代码的教程