JD.com Crawler with Word Cloud Visualization
I. Preparation: find the key information we need in the HTML
Target URL: https://search.jd.com/Search?keyword=shouji&enc=utf-8&wq=shouji&pvid=a1727a28a24544829b30ef54d049feae
A page parameter can be added to the target URL and changed to move through the result pages; a short sketch of that follows.
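A minimal sketch of the paging, assuming the same search endpoint accepts a page query parameter (as the full script below does) and that a generic User-Agent header is enough to get a response:

```python
import requests

# Assumed: JD's search endpoint takes a `page` query parameter, as in the full script below.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url_template = 'https://search.jd.com/Search?keyword=shouji&enc=utf-8&wq=shouji&page={p}'

for p in range(1, 4):  # fetch the first few result pages
    resp = requests.get(url_template.format(p=p), headers=headers, timeout=20)
    resp.encoding = resp.apparent_encoding
    print(p, resp.status_code, len(resp.text))
```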
From each search result we then follow the link to the product's detail page.
On the phone detail page we look for the information we need, e.g. the name, price, and sales figures; a minimal sketch of that extraction is shown below.
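The jqzoom main-img selector and the p.3.cn price endpoint in this sketch are the ones used in the full script below, and the item URL is a hypothetical example; JD may change the page structure or the endpoint at any time:

```python
import json
import re

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
item_url = 'https://item.jd.com/100012043978.html'  # hypothetical example product URL

sku = re.sub(r'\D', '', item_url)  # the product ID is the digit run in the URL

# The price is not in the static HTML; the full script reads it from a JSON endpoint
# and drops the trailing character of the response before parsing.
price_resp = requests.get('https://p.3.cn/prices/mgets?skuIds={}'.format(sku),
                          headers=headers, timeout=20)
price = json.loads(price_resp.text[0:-1])[0]['p']

page = requests.get(item_url, headers=headers, timeout=20)
page.encoding = page.apparent_encoding
soup = BeautifulSoup(page.text, 'lxml')
name = soup.find('div', class_='jqzoom main-img').find('img').get('alt')

print(sku, name, price)
```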
That completes the preparation.
II. Writing the code
1. Importing the libraries
2. The functions: a. requesting the pages, b. processing the data, c. designing the word cloud (see the sketch after this list)
3. Showing the result
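For step c, here is a minimal sketch of the jieba segmentation plus WordCloud rendering, assuming the scraped product names have already been saved to 11.txt and that the mask image 640.png and the font font/yunmo.ttf used by the full script are in place; the output file name is just a placeholder:

```python
import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud

text = open('11.txt', 'r', encoding='utf-8').read()  # scraped product names
tokens = ' '.join(jieba.cut(text))                    # WordCloud expects space-separated tokens

mask = np.array(Image.open('640.png'))                # shape of the cloud (assumed to exist)
wc = WordCloud(font_path='font/yunmo.ttf',            # a font that can render Chinese
               background_color='white',
               mask=mask,
               max_words=1000)
wc.generate(tokens)
wc.to_file('wordcloud_preview.png')                   # placeholder output name
```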
Full code
```python
import csv
import json
import os
import re

import jieba
import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,'
              '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}

# Remove any leftover output file from a previous run
if os.path.exists("./存储文件/JDphone.xlsx"):
    os.remove("./存储文件/JDphone.xlsx")


# Collect the detail-page URL of every phone in the search results
def get_list():
    # placeholder link to filter out
    invalidLink = 'javascript:;'
    result = []
    _url = 'https://search.jd.com/Search?keyword=手机&suggest=1.rem.0.V03--38s0&wq=手机&page={p}&s=1&click=0'
    for i in range(1):  # number of result pages to crawl
        url = _url.format(p=str(i))
        session = requests.session()
        session.headers = header
        res = session.get(url=url, timeout=20)
        res.encoding = res.apparent_encoding
        html = res.text
        soup = BeautifulSoup(html, 'lxml')
        _result = soup.find('ul', class_='gl-warp clearfix').find_all('a')
        for k in _result:
            try:
                link = k.get('href')
            except Exception:
                link = ''
            if link is not None:
                # skip placeholder and duplicate links
                if link == invalidLink:
                    pass
                elif 'https:' + link in result:
                    pass
                else:
                    result.append('https:' + link)
    return result


# Scrape each detail page and store the data
def save_data(href_list):
    filename = '手机信息.csv'  # output spreadsheet
    outputfile = open(filename, 'w', newline='', encoding='utf-8')  # newline='' avoids blank rows
    # csv.writer takes a file handle and uses a comma as the default delimiter
    csv_writer = csv.writer(outputfile, dialect='excel')
    urls = href_list
    total = len(urls)
    count = 1
    for url in urls:
        print('正在爬取:' + str(count) + '/' + str(total))
        count += 1
        try:
            commodity_id = re.sub(r'\D', '', url)  # product ID: keep only the digits in the URL
            session = requests.session()
            session.headers = header
            priceUrl = 'https://p.3.cn/prices/mgets?skuIds={}'.format(commodity_id)
            priceres = session.get(url=priceUrl)
            jsons = json.loads(priceres.text[0:-1])
            commodity_price = jsons[0]['p']  # product price
            res = session.get(url=url, timeout=20)
            res.encoding = res.apparent_encoding
            html = res.text
            soup = BeautifulSoup(html, 'lxml')
            result_1 = soup.find('div', class_='jqzoom main-img').find_all('img')[0]
            commodity_name = result_1.get('alt')  # product name
            csv_writer.writerow([commodity_id, commodity_price, commodity_name, url])
            f = open('11.txt', 'a', encoding='utf-8')
            f.write(commodity_name + '\n')
            f.close()
        except Exception:
            pass
    outputfile.close()


# Build the word cloud from the collected product names
def ciyuntu():
    # 1. read the collected text
    text = open('11.txt', 'r', encoding='utf-8').read()
    print(text)
    # 2. segment it with jieba
    cut_text = jieba.cut(text)
    # 3. join the tokens with spaces
    result = ' '.join(cut_text)
    print(result)
    # 4. generate the word cloud
    image1 = Image.open(r'640.png')  # mask image
    Mask = np.array(image1)
    wc = WordCloud(
        font_path='font/yunmo.ttf',  # font path (must support Chinese)
        background_color='white',    # background colour
        width=1600,
        height=800,
        max_font_size=25,            # largest font size
        min_font_size=5,
        mask=Mask,                   # mask image
        max_words=1000
    )
    wc.generate(result)
    wc.to_file('词云图12.png')  # save the image
    print('词云图生成完成-----')


if __name__ == '__main__':
    href_list = get_list()
    save_data(href_list)
    ciyuntu()
```
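When the script runs it produces three artifacts in the working directory: 手机信息.csv with one row per product (ID, price, name, detail-page URL), 11.txt with the accumulated product names, and 词云图12.png, the rendered word cloud. Note the prerequisites baked into ciyuntu(): the mask image 640.png and the font file font/yunmo.ttf must already exist, and the price lookup depends on JD's p.3.cn endpoint, which may change or throttle requests over time.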