Web scraping: extracting content with requests and Beautiful Soup
This post walks through a small crawler that uses requests to download pages and Beautiful Soup to pull the title and body text out of each one, then writes the result to a text file.
import requests
import time
from bs4 import BeautifulSoup


class getContents():

    # Fetch an HTML page; return "" on any failure.
    def getHTMLText(self, url):
        try:
            kv = {'user-agent': 'Mozilla/5.0'}
            r = requests.get(url, headers=kv)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except Exception:
            return ""

    # Extract the title from the first <h3> tag.
    def getTitle(self, html):
        try:
            soup = BeautifulSoup(html, 'html.parser')
            return soup.h3.string
        except Exception:
            return ""

    # Extract the body text: try <span> tags first, then fall back to
    # <font> and finally <div> tags.
    def getContent(self, html):
        try:
            soup = BeautifulSoup(html, 'html.parser')
            p = soup.find_all('span')
            if len(p) == 0:
                p = soup.find_all('font')
            if len(p) == 0:
                p = soup.find_all('div')
            print(len(p))
            s = ""
            for i in p:
                if i.string is None:
                    continue
                s = s + i.text
            print(s)
            return s
        except Exception:
            return ""

    # Write the extracted content to <filename>.txt.
    def write(self, content, filename):
        try:
            filename = filename + '.txt'
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            print("written successfully")
        except Exception:
            print("write failed")


def main():
    # Read the URL list from a space-separated file.
    with open("urlneimenggu.txt", 'r', encoding="utf-8") as f:
        urls = f.read().split(' ')
    address = getContents()
    for url in urls:
        print(url)
        html = address.getHTMLText(url)
        # Retry every 5 seconds until the page is fetched successfully.
        while html == "":
            print("waiting....")
            time.sleep(5)
            html = address.getHTMLText(url)
        title = address.getTitle(html)
        content = address.getContent(html)
        address.write(content, title)


if __name__ == "__main__":
    main()
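For reference, here is a minimal, self-contained sketch of the extraction logic used in getContent(): it exercises the same span → font → div fallback on an inline HTML snippet, so it runs without any network access. The snippet and its tags are made up for illustration and are not taken from the pages the script actually crawls.

# Hypothetical HTML used only to demonstrate the tag fallback in getContent().
from bs4 import BeautifulSoup

sample_html = """
<html><body>
  <h3>Example title</h3>
  <div><font>First paragraph of the body.</font></div>
  <div><font>Second paragraph of the body.</font></div>
</body></html>
"""

soup = BeautifulSoup(sample_html, "html.parser")
tags = soup.find_all("span")          # no <span> here, so fall back...
if len(tags) == 0:
    tags = soup.find_all("font")      # ...to <font>, which matches
if len(tags) == 0:
    tags = soup.find_all("div")       # ...and finally to <div>

text = "".join(t.text for t in tags if t.string is not None)
print(soup.h3.string)   # Example title
print(text)             # concatenated paragraph text

Note that main() expects urlneimenggu.txt to contain URLs separated by single spaces, since the file contents are split on ' '.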