Python高级应用程序设计任务要求
Posted 王忠达
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python高级应用程序设计任务要求相关的知识,希望对你有一定的参考价值。
一、主题式网络爬虫设计方案(15分)
1.主题式网络爬虫名称:小米官网爬虫手机类型价格爬虫
2.主题式网络爬虫爬取的内容与数据特征分析:爬取小米、荣耀、华为、vivo、三星等品牌手机的商品名称和价格;名称为文本型数据,价格为数值型数据
3.主题式网络爬虫设计方案概述(包括实现思路与技术难点)
本次设计方案主要依靠BeautifulSoup库对目标页面进行信息的爬取采集,对数据进行清洗,最后将结果打印出来
实现思路:获取小米官网HTML页面,使用BeautifulSoup解析页面并提取数据,将数据保存到列表中,最后打印出来数据
技术难点:对于数据的清洗还有各个价格之间的对应关系。
实现思路:获取各品牌手机商品列表的HTML页面,爬取数据,使用列表进行数据存储、读取,最后打印出来数据并用matplotlib绘图展示
二、主题页面的结构特征分析(15分)
1.主题页面的结构特征:http://detail.zol.com.cn/cell_phone_index/subcate57_34645_list_1.html
打开小米官网,通过右击鼠标查看网页源代码,找到对应要爬取的信息
2.HTML页面解析
使用BeautifulSoup进行网页页面解析,通过观察发现我想要获取的内容是在“div”标签下的“a”标签中。
3.节点(标签)查找方法与遍历方法
(必要时画出节点树结构)
查找:get函数,find。
遍历:for循环嵌套
三、网络爬虫程序设计(60分)
爬虫程序主体要包括以下各部分,要附源代码及较详细注释,并在每部分程序后面提供输出结果的截图。
1.数据爬取与采集
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import threading

# Request headers sent with every page fetch so the request looks like it
# comes from a desktop Chrome browser.
# NOTE(review): the cookie below is a hard-coded logged-in session value
# (it contains "login=true"); it should not live in source control and
# should be confirmed as expired/rotated.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': 'cna=WLxnFpWWi2YCAd5YmGBzb1LE; lid=%E5%A2%A8%E6%83%9C%E5%A6%82%E9%A3%8E; mbk=d104fe4feee1e4c8; enc=ZCXWltgoZbBKllIe42s2UMcdQrPHmbPRvsr5bu64hsyhih2chiIXNMdBlKbSjBosRRqbW8Ba58RiIkOj5bUr1Q%3D%3D; tk_trace=1; t=416ebaf372aac9e714d2411257bebe66; tracknick=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; lgc=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; _tb_token_=e33db43b7fe30; cookie2=130ad5a94570e50984de0fa8439d8b65; dnk=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; uc1=cookie21=VFC%2FuZ9ainBZ&cookie14=UoTbm8RWp827BA%3D%3D&pas=0&existShop=false&lng=zh_CN&cookie15=WqG3DMC9VAQiUQ%3D%3D&tag=8&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; uc3=nk2=p2MwXab0cT8%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dByus1oAedGs7HXvs%3D&id2=UNDUK%2FSwTIuBMQ%3D%3D; _l_g_=Ug%3D%3D; uc4=nk4=0%40pVXnDf4QgAF6OsvRnr8f86t9pQ%3D%3D&id4=0%40UgckEyzfCeaEbCy9LaVJ3V%2BC1%2B2o; unb=3004348014; cookie1=AVcQal%2F7P9z%2B8EjUWhp7%2BQvoVbt%2Fz5oUDaF9k92YW%2BU%3D; login=true; cookie17=UNDUK%2FSwTIuBMQ%3D%3D; _nk_=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; sg=%E9%A3%8E4e; csg=8ac18de6; l=dBORoGnuqd-_KXXvBOCanurza77OjIRYouPzaNbMi_5Zl6L6H_QOkUgh7Fp6cjWft4TB4dH2-sp9-etkiepTY-cHtBU4RxDc.; isg=BLi41LSjEe7kQn1tu6bgpcSKiWZKIRyr208sQPIpC_OmDVj3mjFoOukrxUUYW9SD'
}

# Shared lock taken by every plotting function so that only one of them
# touches pyplot's global state at a time.
gLock = threading.Lock()


def get_bar(name, list):
    """Draw and show a bar chart of one value per brand.

    Args:
        name: Sequence of brand labels used for the x axis.
        list: Sequence of bar heights, one per entry in ``name``.  The
            parameter shadows the builtin ``list``; the name is kept so
            existing callers are unaffected.
    """
    # `with` guarantees the lock is released even if plotting raises;
    # otherwise the other plotting threads would block forever.
    with gLock:
        plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # font able to render the Chinese labels
        plt.title('各品牌手机部分平均价格')
        plt.xlabel('品牌')
        plt.ylabel('价格')
        colors = ['yellow', 'red', 'blue', 'green', 'orange']
        # Plot the values that were passed in, not a module-level global.
        plt.bar(name, list, alpha=0.8, color=colors)
        plt.show()


# The per-brand scraper functions share the same body; only the URL that
# is passed in differs.
def get_xiaomi(url):
    """Fetch a product listing page and extract product names and prices.

    Every ``div`` with class ``product`` is inspected: the price is read
    from the ``title`` attribute of its first ``em`` element and the name
    from the text of its second ``a`` element that has
    ``target="_blank"``, with newlines removed.  Each pair is printed as
    it is found.

    Args:
        url: Address of the listing page to download.

    Returns:
        A ``(name_list, price_list)`` tuple of parallel lists; prices are
        floats.

    Raises:
        IndexError, KeyError: If a product lacks the expected elements.
        ValueError: If a price string cannot be converted to ``float``.
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    price_list = []
    name_list = []
    # A timeout keeps a stalled connection from hanging the whole script.
    txt = requests.get(url, headers=header, timeout=10).text
    bs = BeautifulSoup(txt, 'html.parser')
    for product in bs.find_all('div', class_="product"):
        price = product.find_all('em')[0]['title']
        name = product.find_all('a', attrs={'target': "_blank"})[1].text.replace('\n', '')
        print(price, name)
        price = float(price)
        name_list.append(name)
        price_list.append(price)
    return name_list, price_list
2.对数据进行清洗和处理
def _scrape_name_price_lists(url):
    """Download a listing page and return parallel name and price lists.

    Every ``div`` with class ``product`` is inspected: the price is read
    from the ``title`` attribute of its first ``em`` element and the name
    from the text of its second ``a`` element that has
    ``target="_blank"``, with newlines removed.  Each pair is printed as
    it is found.
    """
    price_list = []
    name_list = []
    # A timeout keeps a stalled connection from hanging the whole script.
    txt = requests.get(url, headers=header, timeout=10).text
    bs = BeautifulSoup(txt, 'html.parser')
    for product in bs.find_all('div', class_="product"):
        price = product.find_all('em')[0]['title']
        name = product.find_all('a', attrs={'target': "_blank"})[1].text.replace('\n', '')
        print(price, name)
        name_list.append(name)
        price_list.append(float(price))
    return name_list, price_list


def get_rongyao(url):
    """Scrape product names and prices from the listing page at ``url``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A ``(name_list, price_list)`` tuple of parallel lists; prices are
        floats.

    Raises:
        IndexError, KeyError: If a product lacks the expected elements.
        ValueError: If a price string cannot be converted to ``float``.
    """
    return _scrape_name_price_lists(url)


def get_huawei(url):
    """Scrape product names and prices from the listing page at ``url``.

    Args:
        url: Address of the listing page to download.

    Returns:
        A ``(name_list, price_list)`` tuple of parallel lists; prices are
        floats.

    Raises:
        IndexError, KeyError: If a product lacks the expected elements.
        ValueError: If a price string cannot be converted to ``float``.
    """
    return _scrape_name_price_lists(url)
3.文本分析(可选):jieba分词、wordcloud可视化
4.数据分析与可视化
def get_figure(name1, list1, name2, list2, name3, list3, name4, list4, name5, list5):
    """Plot five price series as line charts in one figure, save and show it.

    Each ``nameN``/``listN`` pair describes one brand.  Only the length of
    ``nameN`` is used: the x values are ``range(len(nameN))`` and the y
    values are ``listN``.  The figure is written to '多品牌价格图.png' and
    then displayed.

    Args:
        name1..name5: Sequences whose lengths give the number of x points.
        list1..list5: Price sequences plotted on the y axis.
    """
    # (subplot position, title, names, prices, line format or None for default)
    panels = [
        (321, '小米价格图', name1, list1, None),
        (322, '荣耀价格图', name2, list2, 'r'),
        (323, '华为价格图', name3, list3, 'g'),
        (324, 'vivo价格图', name4, list4, 'y'),
        (325, '三星价格图', name5, list5, 'b'),
    ]
    # `with` releases the lock even if plotting raises.
    with gLock:
        plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # font able to render the Chinese labels
        fig = plt.figure()
        for position, title, names, prices, line_format in panels:
            ax = fig.add_subplot(position)
            ax.set_title(title)
            # Every panel labels its own axes; previously the second panel's
            # x label was applied to the third panel by mistake.
            ax.set_xlabel('品牌')
            ax.set_ylabel('价格')
            x_positions = range(len(names))
            if line_format is None:
                ax.plot(x_positions, prices)
            else:
                ax.plot(x_positions, prices, line_format)
        plt.savefig('多品牌价格图.png')
        plt.show()


def price_bar(list1, list2, list3, list4, list5):
    """Show a bar chart counting products per price band across five lists.

    A price falls into the first band whose upper bound it does not
    exceed (500, 1000, 2000, 3000, 4000, 5000, 6000); anything above 6000
    is counted in the final '6000+' band.

    Args:
        list1..list5: Sequences of numeric prices.
    """
    upper_bounds = [500, 1000, 2000, 3000, 4000, 5000, 6000]
    xlist = ['500', '1000', '2000', '3000', '4000', '5000', '6000', '6000+', ]
    ylist = [0] * len(xlist)
    for prices in (list1, list2, list3, list4, list5):
        for price in prices:
            for band, bound in enumerate(upper_bounds):
                if price <= bound:
                    ylist[band] += 1
                    break
            else:
                # No bound matched: the price belongs to the open-ended band.
                ylist[-1] += 1
    colors = ['purple', 'pink', 'red', 'yellow', 'orange', 'blue', 'silver', 'green']
    # `with` releases the lock even if plotting raises.
    with gLock:
        plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # font able to render the Chinese labels
        plt.xlabel('价位')
        plt.ylabel('品种数')
        plt.bar(xlist, ylist, color=colors)
        plt.show()
5.数据持久化
if __name__ == '__main__':
    # Search-result pages on list.tmall.com, one per brand, in the order
    # Xiaomi, Honor, Huawei, vivo, Samsung.  Swap the URLs to scrape
    # different listings.
    urllist = [
        'https://list.tmall.com/search_product.htm?q=%D0%A1%C3%D7%CA%D6%BB%FA&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton',
        'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000723.1.3e5e17e7HMiPUP&&active=2&from=rs_1_key-top-s&q=%C8%D9%D2%AB%CA%D6%BB%FA',
        'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000723.1.19364d7ePqKoRA&&active=2&from=rs_1_key-top-s&q=%BB%AA%CE%AA%CA%D6%BB%FA',
        'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000723.2.4cd110b21pk5Js&&active=2&from=rs_1_key-top-s&q=vivo+%CA%D6%BB%FA',
        'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000723.6.159e3278ogZpdE&&active=2&from=rs_1_key-top-s&q=%C8%FD%D0%C7+%CA%D6%BB%FA',
    ]

    # Scrape each brand; every call returns (names, prices).
    name1, list1 = get_xiaomi(urllist[0])
    name2, list2 = get_rongyao(urllist[1])
    name3, list3 = get_huawei(urllist[2])
    name4, list4 = get_vivo(urllist[3])
    name5, list5 = get_sanxing(urllist[4])

    # Average price per brand, in the same order as `name` below.  The
    # module-level name `list6` is kept because code outside this block
    # may refer to it.
    list6 = [
        np.average(list1),
        np.average(list2),
        np.average(list3),
        np.average(list4),
        np.average(list5),
    ]
    name = ['小米', '荣耀', '华为', 'vivo', '三星']

    # One thread per chart; the plotting functions take a shared lock, so
    # the charts are drawn one at a time.
    # NOTE(review): Matplotlib documents pyplot as not thread-safe and many
    # GUI backends expect to run on the main thread - confirm this works
    # with the backend in use.
    t1 = threading.Thread(target=get_bar, args=(name, list6))
    t2 = threading.Thread(
        target=get_figure,
        args=(name1, list1, name2, list2, name3, list3, name4, list4, name5, list5),
    )
    t3 = threading.Thread(target=price_bar, args=(list1, list2, list3, list4, list5))

    # Start the three threads in the same order as before.
    t1.start()
    t2.start()
    t3.start()
完整代码:
import requests from bs4 import BeautifulSoup import matplotlib.pyplot as plt import numpy as np import threading header={ \'user-agent\': \'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36\', \'cookie\': \'cna=WLxnFpWWi2YCAd5YmGBzb1LE; lid=%E5%A2%A8%E6%83%9C%E5%A6%82%E9%A3%8E; mbk=d104fe4feee1e4c8; enc=ZCXWltgoZbBKllIe42s2UMcdQrPHmbPRvsr5bu64hsyhih2chiIXNMdBlKbSjBosRRqbW8Ba58RiIkOj5bUr1Q%3D%3D; tk_trace=1; t=416ebaf372aac9e714d2411257bebe66; tracknick=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; lgc=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; _tb_token_=e33db43b7fe30; cookie2=130ad5a94570e50984de0fa8439d8b65; dnk=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; uc1=cookie21=VFC%2FuZ9ainBZ&cookie14=UoTbm8RWp827BA%3D%3D&pas=0&existShop=false&lng=zh_CN&cookie15=WqG3DMC9VAQiUQ%3D%3D&tag=8&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; uc3=nk2=p2MwXab0cT8%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dByus1oAedGs7HXvs%3D&id2=UNDUK%2FSwTIuBMQ%3D%3D; _l_g_=Ug%3D%3D; uc4=nk4=0%40pVXnDf4QgAF6OsvRnr8f86t9pQ%3D%3D&id4=0%40UgckEyzfCeaEbCy9LaVJ3V%2BC1%2B2o; unb=3004348014; cookie1=AVcQal%2F7P9z%2B8EjUWhp7%2BQvoVbt%2Fz5oUDaF9k92YW%2BU%3D; login=true; cookie17=UNDUK%2FSwTIuBMQ%3D%3D; _nk_=%5Cu58A8%5Cu60DC%5Cu5982%5Cu98CE; sg=%E9%A3%8E4e; csg=8ac18de6; l=dBORoGnuqd-_KXXvBOCanurza77OjIRYouPzaNbMi_5Zl6L6H_QOkUgh7Fp6cjWft4TB4dH2-sp9-etkiepTY-cHtBU4RxDc.; isg=BLi41LSjEe7kQn1tu6bgpcSKiWZKIRyr208sQPIpC_OmDVj3mjFoOukrxUUYW9SD\' } gLock = threading.Lock() #引入解锁和上锁的类 def get_bar(name,list): gLock.acquire() #上锁 plt.rcParams[\'font.sans-serif\'] = [\'Microsoft YaHei\'] #显示中文字体 plt.title(\'各品牌手机部分平均价格\') #标题 plt.xlabel(\'品牌\') #x轴标签 plt.ylabel(\'价格\') #同上 colors=[\'yellow\',\'red\',\'blue\',\'green\',\'orange\'] #设置颜色 plt.bar(name,list6,alpha=0.8,color=colors) #开始绘图 plt.show() #展示绘图结果 gLock.release() #解锁 \'\'\'以下五个函数的代码部分是相似的,换句话说是一样的,但是他们传入的参数是不同的,不一一注释了\'\'\' def get_xiaomi(url): price_list=[] #价格列表 name_list=[] #名称 txt = requests.get(url, headers=header).text 
#获取网页内容,携带请求头进行伪装爬虫 bs = BeautifulSoup(txt, \'html.parser\') #设置解析方式 for product in bs.find_all(\'div\', class_="product"): #提取数据,初步筛选信息 price = product.find_all(\'em\')[0][\'title\'] #获取价格 name = product.find_all(\'a\', attrs={\'target\': "_blank"})[1].text.replace(\'\\n\', \'\') #获取名称,并对民称进行处理 print(price,name) #打印名称和价格 price=float(price) #将字符型价格改为浮点型价格,强制转化 name_list.append(name) #将名字和价格添加进列表 price_list.append(price) return name_list,price_list def get_rongyao(url): price_list=[] name_list=[] txt = requests.get(url, headers=header).text bs = BeautifulSoup(txt, \'html.parser\') for product in bs.find_all(\'div\', class_="product"): price = product.find_all(\'em\')[0][\'title\'] name = product.find_all(\'a\', attrs={\'target\': "_blank"})[1].text.replace(\'\\n\', \'\') print(price,name) price=float(price) name_list.append(name) price_list.append(price) return name_list,price_list def get_huawei(url): price_list = [] name_list = [] txt = requests.get(url, headers=header).text bs = BeautifulSoup(txt, \'html.parser\') for product in bs.find_all(\'div\', class_="product"): price = product.find_all(\'em\')[0][\'title\'] name = product.find_all(\'a\', attrs={\'target\': "_blank"})[1].text.replace(\'\\n\', \'\') print(price, name) price = float(price) name_list.append(name) price_list.append(price) return name_list, price_list def get_vivo(url): price_list=[] name_list=[] txt = requests.get(url, headers=header).text bs = BeautifulSoup(txt, \'html.parser\') for product in bs.find_all(\'div\', class_="product"): price = product.find_all(\'em\')[0][\'title\'] name = product.find_all(\'a\', attrs={\'target\': "_blank"})[1].text.replace(\'\\n\', \'\') print(price,name) price=float(price) name_list.append(name) price_list.append(price) return name_list,price_list def get_sanxing(url): price_list=[] name_list=[] txt = requests.get(url, headers=header).text bs = BeautifulSoup(txt, \'html.parser\') for product in bs.find_all(\'div\', class_="product"): price = 
product.find_all(\'em\')[0][\'title\'] name = product.find_all(\'a\', attrs={\'target\': "_blank"})[1].text.replace(\'\\n\', \'\') print(price,name) price=float(price) name_list.append(name) price_list.append(price) return name_list,price_list def get_figure(name1,list1,name2,list2,name3,list3,name4,list4,name5,list5): gLock.acquire() \'\'\'绘图中,显示中文字体\'\'\' plt.rcParams[\'font.sans-serif\'] = [\'Microsoft YaHei\'] fig=plt.figure() \'\'\'分配子图\'\'\' ax1=fig.add_subplot(321) ax2=fig.add_subplot(322) ax3=fig.add_subplot(323) ax4=fig.add_subplot(324) ax5=fig.add_subplot(325) name1 = range(len(name1)) name2 = range(len(name2)) name3 = range(len(name3)) name4 = range(len(name4)) name5 = range(len(name5)) \'\'\'绘制各个子图,并且设置各个子图的x,y,轴标签,以及标题\'\'\' ax1.set_title(\'小米价格图\') ax1.set_xlabel(\'品牌\') ax1.set_ylabel(\'价格\') ax1.plot(name1,list1) ax2.set_title(\'荣耀价格图\') ax3.set_xlabel(\'品牌\') ax2.set_ylabel(\'价格\') ax2.plot(name2, list2,\'r\') ax3.set_title(\'华为价格图\') ax3.set_xlabel(\'品牌\') ax3.set_ylabel(\'价格\') ax3.plot(name3, list3,\'g\') ax4.set_title(\'vivo价格图\') ax4.set_xlabel(\'品牌\') ax4.set_ylabel(\'价格\') ax4.plot(name4, list4,\'y\') ax5.set_title(\'三星价格图\') ax5.set_xlabel(\'品牌\') ax5.set_ylabel(\'价格\') ax5.plot(name5, list5,\'b\') plt.savefig(\'多品牌价格图.png\') #保存绘图结果 plt.show() #展示绘图结果 gLock.release() #解锁 def price_bar(list1,list2,list3,list4,list5): gLock.acquire() #上锁 plt.rcParams[\'font.sans-serifPython高级应用程序设计任务要求