麦田厦门小区信息数据爬取
Posted venvive
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了麦田厦门下区信息数据爬取相关的知识,希望对你有一定的参考价值。
刚开始爬取的时候没有用 headers 把请求伪装成浏览器,导致我的 IP 被麦田北京站和福州站封禁了;还好后来发现了原因,而且厦门小区站点还没被我弄坏,代码如下:
# -*- coding: utf-8 -*-
"""Scrape residential-community (小区) listing data from Maitian Xiamen (xm.maitian.cn)."""
import requests
from bs4 import BeautifulSoup

page_url = "http://xm.maitian.cn/xqall"
# Browser-like headers: the site bans clients whose requests look automated
# (see the author's note above the code — bare requests got the IP banned).
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "http://xm.maitian.cn/esfall",
    "Connection": "keep-alive",
    "Content-Type": "text/plain; charset=utf-8",
}


def get_communities_url():
    """Fetch the community index page and scrape every community's detail page.

    Returns:
        list[dict]: one dict of detail fields per community
        (see ``get_target_info`` for the keys).

    Raises:
        Exception: re-raised after logging if the index request fails.
    """
    all_data = []
    try:
        # timeout keeps a hung connection from blocking the whole run
        response = requests.get(url=page_url, headers=headers, timeout=10)
    except Exception:
        print("请求连接错误")
        raise  # bare raise preserves the original traceback
    soup = BeautifulSoup(response.text, "lxml")
    list_wrap = soup.find("div", "list_wrap")
    for tag_li in list_wrap.find_all("li"):
        href = tag_li.h1.a["href"]
        # detail pages live at the same host; swap the index path for the href
        new_url = page_url.replace("/xqall", href)
        dict_data = get_target_info(new_url)
        if dict_data:
            all_data.append(dict_data)
    return all_data


def get_target_info(new_url):
    """Scrape one community detail page into a flat dict of display strings.

    Args:
        new_url: absolute URL of the community's detail page.

    Returns:
        dict: keys community_avg, unsold_homes, rent_homes, business_circle,
        developers, area, property_company, industry_fee, built_year,
        total_houses, green_rates, cover_area, total_built, product_rates.

    Raises:
        Exception: re-raised after logging if the detail request fails.
    """
    info = {}  # renamed from `dict` — never shadow the builtin
    try:
        response = requests.get(url=new_url, headers=headers, timeout=10)
    except Exception:
        print("请求连接错误")
        raise
    soup = BeautifulSoup(response.text, "lxml")
    home_main = soup.find("section", "home_main")
    ps = home_main.find_all("p")
    # average price of the community
    info["community_avg"] = ps[0].b.string.strip()
    ems = ps[1].find_all("em")
    # homes currently for sale / for rent ("套" is the counter word appended for display)
    info["unsold_homes"] = ems[0].a.string + "套"
    info["rent_homes"] = ems[1].a.string + "套"
    # business circle (neighborhood) and developer
    info["business_circle"] = ps[2].label.string
    info["developers"] = ps[2].em.string
    home_details = soup.find("ul", "home_details")
    # three <li> columns, each holding three labelled <p>/<em> pairs
    for tag_li in home_details.find_all("li"):
        p = tag_li.find_all("p")
        if tag_li["class"] == ["li_left"]:
            info["area"] = p[0].em.string              # building area
            info["property_company"] = p[1].em.string  # property-management company
            info["industry_fee"] = p[2].em.string      # property-management fee
        elif tag_li["class"] == ["li_center"]:
            info["built_year"] = p[0].em.string        # year built
            info["total_houses"] = p[1].em.string      # total number of homes
            info["green_rates"] = p[2].em.string       # greening rate
        elif tag_li["class"] == ["li_right"]:
            info["cover_area"] = p[0].em.string        # land area
            info["total_built"] = p[1].em.string       # total number of buildings
            info["product_rates"] = p[2].em.string     # plot (floor-area) ratio
    return info


if __name__ == "__main__":
    data_all = get_communities_url()
    print(data_all)
以上是关于麦田厦门小区信息数据爬取的主要内容,如果未能解决你的问题,请参考以下文章
数字中心荣获“厦门大数据安全开放创新应用大赛·交通专题”算法赛一等奖