Python爬虫爬取ECVA论文标题作者链接
Posted 杨传伟
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫爬取ECVA论文标题作者链接相关的知识,希望对你有一定的参考价值。
1 import re
2 import requests
3 from bs4 import BeautifulSoup
4 import lxml
5 import traceback
6 import time
7 import json
8 from lxml import etree
def get_paper():
    """Scrape paper metadata from the ECVA papers index.

    Fetches https://www.ecva.net/papers.php and builds one record per paper:
    [title, source_link, authors, download_link, pdfinfo_link].  Prints the
    link/list sizes and the first record for inspection.

    Returns:
        None.  Results are only printed (same as the original script).
    """
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # BUG FIX: the original called requests.get(url, headers) — the dict was
    # passed positionally as `params` (query string), so the User-Agent
    # header was never actually sent.  It must be the `headers=` keyword.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # Final result set: one [title, source_link] row per <dt class="ptitle">.
    res = []
    for dt in soup.find_all('dt', class_='ptitle'):
        anchor = dt.find('a')
        title = anchor.text
        sourcelink = "https://www.ecva.net/" + anchor['href']
        res.append([title, sourcelink])

    # Under div#content the <dd> elements alternate: authors first, then the
    # block holding the <div class="link2"> pdf/info links.
    all_dd = soup.find('div', id='content').find_all('dd')
    author = []
    link_res = []
    for index, item in enumerate(all_dd):
        if index % 2 == 0:
            author.append(item)
        else:
            link_res.append(item.find_all('div', class_='link2'))
    print(len(link_res))

    # Parse the download / pdfinfo links.  The original alternated a flag
    # across the flattened items and referenced `pdfinfo_text` before it was
    # ever assigned (NameError on the first iteration); index the two link2
    # divs of each paper directly and guard against missing ones instead.
    download = []
    pdfinfo = []
    for links in link_res:
        # First link2 div is the pdf download, second the pdf info page.
        if len(links) >= 1:
            download.append("https://www.ecva.net/" + links[0].find('a')['href'])
        if len(links) >= 2:
            pdfinfo.append(links[1].find('a')['href'])
    print(len(download))
    print(len(pdfinfo))
    print("------------------------------")

    # BUG FIX: the original appended author[0]/download[0]/pdfinfo[0] to
    # EVERY row, so all papers got the first paper's author and links.
    # Use the matching index, tolerating ragged lists.
    for i in range(len(res)):
        res[i].append(author[i] if i < len(author) else None)
        res[i].append(download[i] if i < len(download) else None)
        res[i].append(pdfinfo[i] if i < len(pdfinfo) else None)

    # Guard: printing res[0] on an empty page would raise IndexError.
    if res:
        print(res[0])
    return
102
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == '__main__':  # idiom fix: dropped the redundant C-style parentheses
    get_paper()
以上是关于Python爬虫爬取ECVA论文标题作者链接的主要内容,如果未能解决你的问题,请参考以下文章
23个Python爬虫开源项目代码:爬取微信淘宝豆瓣知乎微博等
Python爬虫开源项目代码,爬取微信淘宝豆瓣知乎新浪微博QQ去哪网等 代码整理