xpath爬取新浪天气
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用 XPath 爬取新浪天气的相关知识,希望对你有一定的参考价值。
参考资料:
http://cuiqingcai.com/1052.html
http://cuiqingcai.com/2621.html
http://www.cnblogs.com/jixin/p/5131040.html
完整代码:
# -*- coding:utf-8 -*-
"""Print the Sina weather forecast for a city, scraped with lxml XPath queries.

The script asks for a city name in pinyin, downloads
http://weather.sina.com.cn/<city> and prints one line per forecast day.
It is written to run unchanged on Python 2 and Python 3.
"""
import contextlib

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen
    import urllib.request as urllib2

try:  # Python 2
    string_types = basestring
    read_input = raw_input
except NameError:  # Python 3
    string_types = str
    read_input = input

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
)
HEADERS = {'User-Agent': USER_AGENT}
BASE_URL = 'http://weather.sina.com.cn/'

# The script prints ten forecast rows; PM2.5 and air-quality values are only
# available for the first seven of them.
FORECAST_DAYS = 10
AIR_QUALITY_DAYS = 7


def change_list(lis):
    """Merge half-day forecast entries into one entry per day.

    ``lis`` is either a list of strings (the icon ``alt`` texts), whose pairs
    are joined with u'转', or a list of objects with a ``.text`` attribute
    (the time ``<span>`` elements), whose texts are joined with u'到'.

    A 20-entry list is merged pairwise into 10 entries.  A 19-entry list
    (the current day sometimes has only one entry) keeps its first entry
    alone and merges the remaining 18 pairwise.  Any other length yields
    an empty list.
    """
    if len(lis) not in (19, 20):
        return []

    # Decide the mode from the data itself rather than by comparing against
    # module-level variables, so the function works on any input list.
    if isinstance(lis[0], string_types):
        values, joiner = list(lis), u'转'
    else:
        values, joiner = [item.text for item in lis], u'到'

    if len(values) == 19:
        merged, paired = [values[0]], values[1:]
    else:
        merged, paired = [], values
    merged.extend(
        paired[i] + joiner + paired[i + 1] for i in range(0, len(paired), 2)
    )
    return merged


def fetch_page(city):
    """Download the forecast page for ``city`` and return the response body."""
    req = urllib2.Request(BASE_URL + city, headers=HEADERS)
    # closing() releases the connection on both Python 2 and Python 3.
    with contextlib.closing(urllib2.urlopen(req)) as response:
        return response.read()


def _join_fields(*fields):
    """Join fields with single spaces, like a Python 2 ``print a, b``."""
    return u' '.join(u'%s' % field for field in fields)


def main():
    """Prompt for a city, scrape its forecast page and print the forecast.

    Raises IndexError when the page does not contain the expected elements.
    """
    # lxml is imported here so the helpers above stay importable without it.
    from lxml import etree

    user_input = read_input("请输入你想查询天气的城市的拼音,如beijing\n")
    # Surrounding whitespace would end up inside the request URL.
    html = etree.HTML(fetch_page(user_input.strip()))

    note1 = html.xpath('//*[@class="wt_tt0_note"]')
    note2 = html.xpath('//*[@class="wt_tt0_note"]/..')
    dates = html.xpath('//*[@class="wt_fc_c0_i_date"]')
    days = html.xpath('//*[@class="wt_fc_c0_i_date"]/following-sibling::*[1]')
    icons = change_list(
        html.xpath('//*[@class="wt_fc_c0_i_icons clearfix"]/img/@alt')
    )
    times = change_list(html.xpath('//*[@class="wt_fc_c0_i_times"]/span'))
    temps = html.xpath('//*[@class="wt_fc_c0_i_temp"]')
    tips = html.xpath('//*[@class="wt_fc_c0_i_tip"]')
    ls = html.xpath('//*[@class="l"]')
    rs = html.xpath('//*[@class="r"]')

    print(_join_fields(note1[0].text, note2[0].text))
    for i in range(FORECAST_DAYS):
        fields = [
            dates[i].text, days[i].text, times[i], icons[i],
            temps[i].text, tips[i].text,
        ]
        if i < AIR_QUALITY_DAYS:
            fields.append(u'PM2.5:' + ls[i].text)
            fields.append(u'空气质量:' + rs[i].text)
        print(_join_fields(*fields))


if __name__ == "__main__":
    main()
以上是关于使用 XPath 爬取新浪天气的主要内容,如果未能解决你的问题,请参考以下文章: