python 爬虫启航2.0
Posted 远离人类,加入硅基
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫启航2.0相关的知识,希望对你有一定的参考价值。
文章解析:
1.正则表达式解析
2.BeautifulSoup:BeautifulSoup 会把 HTML 文档转换成一个复杂的树形结构,它的每一个节点都是一个 Python 对象,获取网页内容就是提取这些对象内容的过程。提取方法可以归为三类:1)遍历文档树 2)搜索文档树 3)CSS 选择器
# -*- coding: utf-8 -*-
# @Time : 2018/11/28 17:23
# @Author : Bo
# @Email : [email protected]
# @File : re_spider.py
# @Software: PyCharm
import requests
import re
from bs4 import BeautifulSoup
from lxml import etree
def get_title_re_spider():
    """Fetch the blog home page and print the post titles matched by regex.

    Downloads ``http://www.santostang.com/`` with a browser-like
    ``user-agent`` header, captures the link text inside every
    ``<h1 class="post-title">`` heading with ``re.findall`` and prints the
    resulting list of strings.

    Returns:
        None. The titles are only printed.
    """
    url = "http://www.santostang.com/"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    }
    r = requests.get(url, headers=headers, timeout=10)
    html = r.text
    # The pattern has to be delimited by plain ASCII quotes (typographic
    # quotes are a syntax error); a raw string keeps the regex readable.
    # Both groups are non-greedy so each match stops at the first closing tag.
    title_list = re.findall(
        r'<h1 class="post-title"><a href=.*?>(.*?)</a></h1>', html
    )
    print(title_list)
def beautifulsoup_spider():
    """Print the numbered post titles of the blog home page via BeautifulSoup.

    Requests ``http://www.santostang.com/``, parses the response with the
    ``html.parser`` backend, finds every ``<h1 class="post-title">`` element
    and prints the stripped text of its first ``<a>`` child, numbered from 1.
    """
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    }
    response = requests.get(
        "http://www.santostang.com/", headers=request_headers, timeout=10
    )
    soup = BeautifulSoup(response.text, "html.parser")
    headings = soup.find_all("h1", class_="post-title")
    # Numbering is 1-based in the printed message.
    for position, heading in enumerate(headings, start=1):
        title = heading.a.text.strip()
        print("第 %s篇文章的标题是:%s" % (position, title))
def beautiful_methods():
    """Demonstrate BeautifulSoup tree navigation and CSS selectors.

    Requests ``http://www.santostang.com/``, parses it with ``html.parser``,
    evaluates a few tree-navigation expressions on the ``<header>`` element
    (their results are not used further) and prints the result of two CSS
    selector queries.
    """
    page_url = "http://www.santostang.com/"
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    }
    response = requests.get(page_url, headers=request_headers, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(soup.prettify())  # pretty-print the parsed markup

    # --- Navigating the tree ---
    header_h1 = soup.header.h1  # a specific tag node
    header_children = soup.header.contents  # direct children of <header>
    # Original author's note: the tag nodes sit at the odd indices.
    first_header_tag = soup.header.contents[1]
    # print(header_h1)
    # print(header_children)
    # print(first_header_tag)

    # Direct children only (one level down):
    # for child in soup.body.children:
    #     print(child)

    # Every descendant, at any depth:
    # for child in soup.body.descendants:
    #     print(child)

    # Parent of a node:
    # a_tag = soup.header.div.a
    # a_parent = a_tag.parent
    # print(a_parent)

    # --- Searching the tree: find() and find_all() ---

    # --- CSS selectors (these can also be used to search the document) ---
    descendant_matches = soup.select("header h1")
    print(descendant_matches)
    child_matches = soup.select("header > h1")
    print(child_matches)
# Parsing the page with lxml
def lxml_spider():
    """Print the post titles of the blog home page using an lxml XPath query.

    Requests ``http://www.santostang.com/``, builds an element tree with
    ``etree.HTML`` and prints the list of text nodes selected by an absolute
    XPath that points at the title links.
    """
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    }
    response = requests.get(
        "http://www.santostang.com/", headers=request_headers, timeout=10
    )
    tree = etree.HTML(response.text)
    # Absolute path from the document root down to the text of each title link.
    title_xpath = "/html/body/div[1]/div/div[1]/article/header/h1/a/text()"
    titles = tree.xpath(title_xpath)
    print(titles)
# Hands-on project: scrape second-hand housing listings from Anjuke
def second_house_spider():
    """Scrape one Anjuke listing page, print each listing and log addresses.

    Requests the Weihai/Gaoqu second-hand housing search page, parses it with
    BeautifulSoup's ``lxml`` backend and, for every ``<li class="list-item">``
    element, extracts the title, total price, unit price, room layout, area,
    floor, address and tags. Each field is printed, and every address is
    appended to ``b.txt`` (UTF-8) in the current working directory.

    Returns:
        None.

    Raises:
        AttributeError: if a listing lacks one of the expected elements
            (``find`` returns ``None`` in that case).
        IndexError: if the details row has fewer child nodes than expected.
    """
    url = "https://weihai.anjuke.com/sale/gaoqu/?from=SearchBar"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0",
    }
    r = requests.get(url, headers=headers, timeout=10)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(r.text, "lxml")
    house_list = soup.find_all("li", class_="list-item")

    # Open the output file once instead of once per listing. Note that this
    # creates b.txt even when the page yields no listings.
    with open("b.txt", "a+", encoding="utf-8") as f:
        for house in house_list:
            name = house.find("div", class_="house-title").a.text.strip()
            price = house.find("span", class_="price-det").text.strip()
            price_area = house.find("span", class_="unit-price").text.strip()

            # The first details row holds layout, area and floor; look it up
            # once and reuse it for all three fields.
            details = house.find("div", class_="details-item")
            no_room = details.span.text.strip()
            area = details.contents[3].text
            floor = details.contents[5].text

            address = house.find("span", class_="comm-address").text.strip()
            # The address text contains two non-breaking spaces followed by a
            # line break and indentation between its parts; collapse that run
            # into a single space.
            address = re.sub(r"\xa0\xa0\n\s*", " ", address)

            tag_list = house.find_all("span", class_="item-tags")
            tag = [i.text for i in tag_list]

            f.write(address)

            print(name)
            print(price)
            print(price_area)
            print(no_room)
            print(area)
            print(floor)
            print(address)
            print(tag)
if __name__ == "__main__":
    # Only the lxml demo is enabled; uncomment one of the calls below to run
    # a different demo instead.
    # get_title_re_spider()
    # beautifulsoup_spider()
    # beautiful_methods()
    lxml_spider()
学习网址:
https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/#id27
以上是关于python 爬虫启航2.0的主要内容,如果未能解决你的问题,请参考以下文章
牛逼!StarRocks 2.0正式发布,新一年,新启航,新极速!
Django学习《玩转Django 2.0》PDF+代码分析
Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段