Simple Web-Scraping Operations
Posted by greenduck
Today I studied some simple Python web-scraping operations.
1. Creating a directory and writing a file:
import os

def mkdir(path):
    if os.path.exists(path):  # check first, otherwise makedirs may raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)  # create the directory
        print("Done.")

def write(path, text):
    with open(path, "w+") as file:  # write the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
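A side note: if you only need the directory to exist and don't care about printing a message, os.makedirs can do the existence check itself via exist_ok (available since Python 3.2), which also avoids the race between checking and creating. A minimal sketch; ensure_dir is just an illustrative name:

import os

def ensure_dir(path):
    # exist_ok=True makes this a no-op when the directory already exists
    os.makedirs(path, exist_ok=True)

ensure_dir("test")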
2. Fetching a site's page source (if it is reachable):
from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  # fetch the URL
    html.encoding = "utf-8"  # Chinese-language page, so set the charset explicitly
    soup = BeautifulSoup(html.text, "lxml")  # parse the HTML with BeautifulSoup
    print(soup.prettify())  # pretty-print the source (formatting only, no data is removed)

if __name__ == "__main__":
    main()
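One thing the snippet above skips is error handling: requests does not raise an exception for a 404 page, it just returns it. A minimal sketch of a more defensive fetch, assuming the same URL:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.baidu.com", timeout=10)
response.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx status
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "lxml")
print(soup.title)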
3. Extracting specific tag elements from a page's source (if it is reachable):
import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  # all sorts of elements can be read this way
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
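Beyond dotted access like soup.title (which only returns the first match), the same soup can be queried selectively. A minimal sketch; the post-title class is a hypothetical placeholder, so substitute whatever the target page actually uses:

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://www.cnblogs.com/sgh1023").text, "lxml")

# find_all returns every matching tag; .get() reads an attribute safely
for a in soup.find_all("a"):
    print(a.get("href"), a.get_text(strip=True))

# CSS selectors also work, via select(); "a.post-title" is a made-up selector
titles = soup.select("a.post-title")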
4. Downloading a single image (if it is reachable):
import os
import requests

tot = 0
path = "save"

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save(content):
    global tot
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb") as file:
        file.write(content)
    tot = tot + 1

def download_image(url):  # download the image; success is not guaranteed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done !")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
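One caveat: response.content loads the whole file into memory, which is fine for a small logo but wasteful for large files. A minimal sketch of a streamed variant; download_stream is an illustrative helper name, not part of the original code:

import requests

def download_stream(url, filename):
    # stream=True fetches the body in chunks instead of all at once
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

download_stream("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png", "baidu.png")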
5. Downloading all the images on a page:
import os
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save(content):
    global tot
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb") as file:
        file.write(content)
    tot = tot + 1

######################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to pose as Chrome (this part is borrowed code)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        response = requests.get(url)
        return response.status_code == 200
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):
        response = requests.get(url)
        save(response.content)
        print("Done !")
    else:
        print("Unavailable !")

######################################################################
def process(base, src):  # resolve relative and protocol-relative src values against the page URL
    return urljoin(base, src)

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(url, i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()
Of course, the arguments you pass to find_all depend on the page you are scraping; a few common variants are sketched below.
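For instance, find_all can filter by attribute presence, CSS class, or a regular expression. A minimal sketch against a toy HTML string; the class name pic is a made-up placeholder:

import re
from bs4 import BeautifulSoup

html = '<div class="pic"><img src="/a.png" alt="logo"><img data-src="/b.png"></div>'
soup = BeautifulSoup(html, "lxml")

print(soup.find_all("img", {"src": True}))              # only <img> tags that have a src attribute
print(soup.find_all("div", class_="pic"))               # filter by CSS class
print(soup.find_all("img", src=re.compile(r"\.png$")))  # filter by a regex on the attribute value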