python3爬虫-快速入门-爬取图片和标题
Posted SZUhg
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3爬虫-快速入门-爬取图片和标题相关的知识,希望对你有一定的参考价值。
直接上代码,先来个爬取豆瓣图片的,大致思路就是发送请求-得到响应数据-储存数据,原理的话可以先看看这个
https://www.cnblogs.com/sss4/p/7809821.html
import os  # used to create the output folder
import requests  # send HTTP requests / receive responses
from bs4 import BeautifulSoup  # parse the HTML responses


def GetHtmlText(url):
    """Fetch *url* and return its body decoded as UTF-8 text, or '' on any request failure."""
    try:
        # timeout so a stalled server cannot hang the whole crawl
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # raise for HTTP 4xx/5xx
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:  # only network/HTTP errors — a bare except would hide real bugs
        return ''


def main(pages):
    """Crawl *pages* pages of a douban celebrity photo gallery and save every image
    as 1.jpg, 2.jpg, ... under ./爬的图片/ in the current working directory."""
    # os.path.join instead of hand-built '\\' separators — portable and correct
    filepath = os.path.join(os.getcwd(), '爬的图片')
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    fnum = 1  # running output-file counter, continuous across pages
    for page in range(pages):
        # each gallery page holds 30 photos; 'start' is the photo offset
        url = ("https://movie.douban.com/celebrity/1048000/photos/?type=C&start="
               + str(page * 30) + '&sortby=like&size=a&subtype=a')
        html = GetHtmlText(url)
        soup = BeautifulSoup(html, 'html.parser')
        # the photo thumbnails live in <ul class="poster-col3 clearfix">
        uls = soup.find_all('ul', class_="poster-col3 clearfix")
        for ul in uls:
            for img in ul.find_all('img'):
                imgurl = img['src']
                # .content is the raw image bytes
                imgcontent = requests.get(imgurl, timeout=10).content
                filename = str(fnum) + '.jpg'
                with open(os.path.join(filepath, filename), 'wb') as wf:
                    wf.write(imgcontent)
                fnum += 1


if __name__ == '__main__':
    main(9)
再来个爬取标题类的
import requests
from bs4 import BeautifulSoup

# Scrape article titles and links from the jianshu front page,
# print them, and save them to a text file.
url = "http://www.jianshu.com"
# Browser-like User-Agent so the site does not reject us as a bot.
headers = {'User-Agent': 'SE 2.X MetaSr 1.0'}
# timeout so a stalled server cannot hang the script
page = requests.get(url=url, headers=headers, timeout=10)
page_bf = BeautifulSoup(page.text, 'html.parser')
# print(page_bf.prettify())
titles = page_bf.find_all('a', 'title')  # anchors whose class is "title"
for title in titles:
    # .string is None when the anchor contains nested markup — skip those
    # instead of crashing on the concatenation below
    if title.string is None:
        continue
    print(title.string)
    print('http://www.jianshu.com' + title.get('href'))
with open(r"D:\untitled\爬虫爬到的标题.txt", "w", encoding='utf-8') as file:
    for title in titles:
        if title.string is None:
            continue
        file.write(title.string + '\n')
        file.write("http://www.jianshu.com" + title.get('href') + '\n\n')
这个是下载小说的---(别人的代码)
from bs4 import BeautifulSoup
import requests, sys


class downloader(object):
    """Download the novel at biqukan.com/1_1094 chapter by chapter."""

    def __init__(self):
        self.server = "http://www.biqukan.com/"        # site root, prefixed to relative chapter links
        self.target = "http://www.biqukan.com/1_1094"  # table-of-contents page of the novel
        self.name = []   # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Parse the table of contents and fill self.name / self.urls / self.nums."""
        req = requests.get(url=self.target, timeout=10)
        html = req.text
        # explicit parser: without one bs4 guesses and emits a warning,
        # and the guess can differ between machines
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        # the first 15 links are the "latest chapters" box — duplicates, skip them
        self.nums = len(a[15:])
        for each in a[15:]:
            self.name.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Fetch one chapter page and return its plain text."""
        req = requests.get(url=target, timeout=10)
        bf = BeautifulSoup(req.text, 'html.parser')
        texts = bf.find_all('div', class_='showtxt')
        # the page indents paragraphs with runs of 8 &nbsp; — turn those into blank lines
        return texts[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self, name, path, text):
        """Append one chapter (title + body) to the output file at *path*."""
        with open(path, "a", encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


dl = downloader()
dl.get_download_url()
print("开始下载")
for i in range(dl.nums):
    dl.writer(dl.name[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
    # progress in percent — the original printed the raw fraction (e.g. "0.500%")
    sys.stdout.write("  已下载:%.3f%%" % float(i / dl.nums * 100) + '\r')
    sys.stdout.flush()
# original misspelled the title as 《一年永恒》
print('《一念永恒》下载完成')
以上是关于python3爬虫-快速入门-爬取图片和标题的主要内容,如果未能解决你的问题,请参考以下文章