python3爬虫-快速入门-爬取图片和标题
Posted SZUhg
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3爬虫-快速入门-爬取图片和标题相关的知识,希望对你有一定的参考价值。
直接上代码,先来个爬取豆瓣图片的,大致思路就是发送请求-得到响应数据-储存数据,原理的话可以先看看这个
https://www.cnblogs.com/sss4/p/7809821.html
import os  # used to create the output folder
import requests  # send HTTP requests / receive responses
from bs4 import BeautifulSoup  # parse the HTML responses


def GetHtmlText(url):
    """Fetch *url* and return its body decoded as UTF-8 text, or '' on any request failure."""
    try:
        # timeout so a stalled server cannot hang the whole crawl
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raise for HTTP 4xx/5xx
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:  # only network/HTTP errors — a bare except would hide real bugs
        return ''


def main(pages):
    """Crawl *pages* pages of a douban celebrity photo gallery and save every image
    as 1.jpg, 2.jpg, ... under ./爬的图片/ in the current working directory."""
    # os.path.join instead of hand-built '\\' separators — portable and correct
    filepath = os.path.join(os.getcwd(), '爬的图片')
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    fnum = 1  # running output-file counter, continuous across pages
    for page in range(pages):
        # each gallery page holds 30 photos; 'start' is the photo offset
        url = ("https://movie.douban.com/celebrity/1048000/photos/?type=C&start="
               + str(page * 30) + '&sortby=like&size=a&subtype=a')
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')
        # the photo thumbnails live in <ul class="poster-col3 clearfix">
        uls = soup.find_all('ul', class_="poster-col3 clearfix")
        for ul in uls:
            for img in ul.find_all('img'):
                imgurl = img['src']
                # .content is the raw image bytes
                imgcontent = requests.get(imgurl, timeout=10).content
                filename = str(fnum) + '.jpg'
                with open(os.path.join(filepath, filename), 'wb') as wf:
                    wf.write(imgcontent)
                fnum += 1


if __name__ == '__main__':
    main(9)
再来个爬取标题类的
import requests
from bs4 import BeautifulSoup

# Scrape article titles and links from the jianshu front page,
# print them, and save them to a text file.
url = "http://www.jianshu.com"
# Browser-like User-Agent so the site does not reject us as a bot.
headers = {'User-Agent': 'SE 2.X MetaSr 1.0'}
# timeout so a stalled server cannot hang the script
page = requests.get(url=url, headers=headers, timeout=10)
page_bf = BeautifulSoup(page.text, 'html.parser')
# print(page_bf.prettify())
titles = page_bf.find_all('a', 'title')  # anchors whose class is "title"
for title in titles:
    # .string is None when the anchor contains nested markup — skip those
    # instead of crashing on the concatenation below
    if title.string is None:
        continue
    print(title.string)
    print('http://www.jianshu.com' + title.get('href'))
with open(r"D:\untitled\爬虫爬到的标题.txt", "w", encoding='utf-8') as file:
    for title in titles:
        if title.string is None:
            continue
        file.write(title.string + '\n')
        file.write("http://www.jianshu.com" + title.get('href') + '\n\n')
这个是下载小说的---(别人的代码)
from bs4 import BeautifulSoup
import requests, sys


class downloader(object):
    """Download the novel at biqukan.com/1_1094 chapter by chapter."""

    def __init__(self):
        self.server = "http://www.biqukan.com/"        # site root, prefixed to relative chapter links
        self.target = "http://www.biqukan.com/1_1094"  # table-of-contents page of the novel
        self.name = []   # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Parse the table of contents and fill self.name / self.urls / self.nums."""
        req = requests.get(url=self.target, timeout=10)
        html = req.text
        # explicit parser: without one bs4 guesses and emits a warning,
        # and the guess can differ between machines
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # the first 15 links are the "latest chapters" box — duplicates, skip them
        self.nums = len(a[15:])
        for each in a[15:]:
            self.name.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Fetch one chapter page and return its plain text."""
        req = requests.get(url=target, timeout=10)
        bf = BeautifulSoup(req.text, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # the page indents paragraphs with runs of 8 &nbsp; — turn those into blank lines
        return texts[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self, name, path, text):
        """Append one chapter (title + body) to the output file at *path*."""
        with open(path, "a", encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


dl = downloader()
dl.get_download_url()
print("开始下载")
for i in range(dl.nums):
    dl.writer(dl.name[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
    # progress in percent — the original printed the raw fraction (e.g. "0.500%")
    sys.stdout.write("  已下载:%.3f%%" % float(i / dl.nums * 100) + '\r')
    sys.stdout.flush()
# original misspelled the title as 《一年永恒》
print('《一念永恒》下载完成')
以上是关于python3爬虫-快速入门-爬取图片和标题的主要内容,如果未能解决你的问题,请参考以下文章