Python爬虫爬取ECVA论文标题作者链接
Posted 杨传伟
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫爬取ECVA论文标题作者链接相关的知识,希望对你有一定的参考价值。
1 import re
2 import requests
3 from bs4 import BeautifulSoup
4 import lxml
5 import traceback
6 import time
7 import json
8 from lxml import etree
def get_paper():
    """Scrape paper metadata from the ECVA papers index.

    Fetches https://www.ecva.net/papers.php and builds one record per paper:
    [title, source_link, authors, download_link, pdfinfo_link].  Prints the
    link/list sizes and the first record for inspection.

    Returns:
        None.  Results are only printed (same as the original script).
    """
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # BUG FIX: the original called requests.get(url, headers) — the dict was
    # passed positionally as `params` (query string), so the User-Agent
    # header was never actually sent.  It must be the `headers=` keyword.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # Final result set: one [title, source_link] row per <dt class="ptitle">.
    res = []
    for dt in soup.find_all('dt', class_='ptitle'):
        anchor = dt.find('a')
        title = anchor.text
        sourcelink = "https://www.ecva.net/" + anchor['href']
        res.append([title, sourcelink])

    # Under div#content the <dd> elements alternate: authors first, then the
    # block holding the <div class="link2"> pdf/info links.
    all_dd = soup.find('div', id='content').find_all('dd')
    author = []
    link_res = []
    for index, item in enumerate(all_dd):
        if index % 2 == 0:
            author.append(item)
        else:
            link_res.append(item.find_all('div', class_='link2'))
    print(len(link_res))

    # Parse the download / pdfinfo links.  The original alternated a flag
    # across the flattened items and referenced `pdfinfo_text` before it was
    # ever assigned (NameError on the first iteration); index the two link2
    # divs of each paper directly and guard against missing ones instead.
    download = []
    pdfinfo = []
    for links in link_res:
        # First link2 div is the pdf download, second the pdf info page.
        if len(links) >= 1:
            download.append("https://www.ecva.net/" + links[0].find('a')['href'])
        if len(links) >= 2:
            pdfinfo.append(links[1].find('a')['href'])
    print(len(download))
    print(len(pdfinfo))
    print("------------------------------")

    # BUG FIX: the original appended author[0]/download[0]/pdfinfo[0] to
    # EVERY row, so all papers got the first paper's author and links.
    # Use the matching index, tolerating ragged lists.
    for i in range(len(res)):
        res[i].append(author[i] if i < len(author) else None)
        res[i].append(download[i] if i < len(download) else None)
        res[i].append(pdfinfo[i] if i < len(pdfinfo) else None)

    # Guard: printing res[0] on an empty page would raise IndexError.
    if res:
        print(res[0])
    return
102
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == '__main__':  # idiom fix: dropped the redundant C-style parentheses
    get_paper()
以上是关于Python爬虫爬取ECVA论文标题作者链接的主要内容,如果未能解决你的问题,请参考以下文章
23个Python爬虫开源项目代码:爬取微信淘宝豆瓣知乎微博等
Python爬虫开源项目代码,爬取微信淘宝豆瓣知乎新浪微博QQ去哪网等 代码整理