Scraping CNKI (知网)
Posted by wukai66
The script below scrapes article metadata from CNKI (kns.cnki.net) search results. Do not use it for commercial purposes.
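Everything below routes its `requests` traffic through an Abuyun dynamic proxy tunnel, configured at the top of the script. Before a long run it is worth confirming the tunnel credentials actually work; a minimal sanity check, with placeholder credentials and httpbin.org standing in as an arbitrary IP-echo service:

```python
import requests

# Placeholder credentials: substitute a real Abuyun tunnel user/pass.
proxy = "http://PROXY_USER:PROXY_PASS@http-dyn.abuyun.com:9020"
proxies = {"http": proxy, "https": proxy}

# If the tunnel rejects the credentials this raises (or returns an error body);
# otherwise it prints the exit IP the proxy assigned to the request.
print(requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5).text)
```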
The full script follows. `cookie_request` drives a headless Chrome through the CNKI search form, clicks into the result iframe, and harvests the two session cookies (`ASP.NET_SessionId` and `SID_kns`) that the later `requests` calls need:

```python
import re
import time

import parsel
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

session = requests.session()

# Abuyun proxy tunnel and its credentials
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
proxyUser = "xxxx"
proxyPass = "xxxxx"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {'http': proxyMeta, 'https': proxyMeta}


# Get the session cookies from the search result page
def cookie_request(search):
    chrome_options = Options()
    chrome_options.add_argument('--window-size=1920,1080')  # window size
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    url = "https://kns.cnki.net/kns/brief/default_result.aspx"
    driver.get(url)
    driver.find_element_by_xpath('//*[@id="txt_1_value1"]').send_keys(search)
    # driver.switch_to.frame("iframeResult")
    driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
    frame = driver.find_element_by_xpath('//*[@id="iframeResult"]')  # locate the result iframe
    driver.switch_to.frame(frame)
    time.sleep(2)  # the frame refreshes at this point
    driver.find_element_by_xpath('//*[@id="J_ORDER"]/tbody/tr[1]/td/table/tbody/tr/td[2]/div[1]/a[1]').click()
    cookies_dic = {}
    for dict1 in driver.get_cookies():
        name = dict1['name']
        value = dict1['value']
        cookies_dic[name] = value
    # print(cookies_dic)
    driver.quit()
    NET_SessionId = cookies_dic.get('ASP.NET_SessionId')
    SID_kns = cookies_dic.get('SID_kns')
    cookie = f"ASP.NET_SessionId={NET_SessionId}; SID_kns={SID_kns};"
    headers = {
        "Referer": "https://kns.cnki.net/kns/brief/default_result.aspx",
        # only these two cookies are needed, e.g.:
        # "Cookie": "ASP.NET_SessionId=kvxz1ynkhwhzb0gqetuvderq; SID_kns=123106;",
        "Cookie": cookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    return headers
```
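One caveat: `cookie_request` uses Selenium's pre-4.0 API (`find_element_by_xpath` and the `chrome_options=` keyword), both removed in Selenium 4. A minimal sketch of the same cookie harvest against the Selenium 4 API; the element ids and XPaths are carried over from the script above, not re-verified against the current CNKI page:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Sketch only: cookie_request() ported to Selenium 4.
def get_cnki_cookies(search):
    opts = Options()
    opts.add_argument('--headless')
    opts.add_argument('--window-size=1920,1080')
    driver = webdriver.Chrome(options=opts)  # options=, not chrome_options=
    try:
        driver.get("https://kns.cnki.net/kns/brief/default_result.aspx")
        driver.find_element(By.XPATH, '//*[@id="txt_1_value1"]').send_keys(search)
        driver.find_element(By.XPATH, '//*[@id="btnSearch"]').click()
        driver.switch_to.frame(driver.find_element(By.XPATH, '//*[@id="iframeResult"]'))
        return {c['name']: c['value'] for c in driver.get_cookies()}
    finally:
        driver.quit()
```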
`requests_detail` fetches a detail page through the proxy and pulls its metadata fields out with XPath; `requests_list` walks the result list page by page, extracting title, authors, source, date, and database from each row before handing off to the detail fetch:

```python
# Fetch and parse one detail page
def requests_detail(url, title, authors, publication, timestamp, database):
    try:
        res = session.get(url, proxies=proxies, timeout=5)
    except Exception:
        return
    data = parsel.Selector(res.text)
    place = data.xpath('//div[@class="orgn"]/span/a/text()')  # author affiliation
    if place:
        place = place[0].extract()
    else:
        place = None
    abstract_list = data.xpath('//*[@id="ChDivSummary"]/text()')
    abstract = ""
    if abstract_list:
        for part in abstract_list.extract():
            abstract = abstract + part
    keywords_list = data.xpath('//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')
    keywords = ""
    if keywords_list:
        for keyword in keywords_list.extract():
            keywords = keywords + keyword.strip()
    classno = data.xpath('//label[@id="catalog_ZTCLS"]/parent::p/text()')  # classification number
    if classno:
        classno = classno[0].extract()
    else:
        classno = None
    publicationpic = data.xpath('//div[@class="cover"]/a/img/@src')  # cover image (often not retrievable)
    if publicationpic:
        publicationpic = publicationpic[0].extract()
    else:
        publicationpic = None
    publicationen = data.xpath('//div[@class="sourinfo"]/p[2]/a/text()')  # English journal title
    if publicationen:
        publicationen = publicationen[0].extract()
    else:
        publicationen = None
    publicationdate = data.xpath('//div[@class="sourinfo"]/p[3]/a/text()')  # publication date
    if publicationdate:
        publicationdate = publicationdate[0].extract()
    else:
        publicationdate = None
    publication_title = data.xpath('//div[@class="sourinfo"]/p[1]/a/text()')  # Chinese journal title
    if publication_title:
        publication_title = publication_title[0].extract()
    else:
        publication_title = None
    issn = data.xpath('//div[@class="sourinfo"]/p[4]/text()')  # ISSN
    if issn:
        issn = issn[0].extract().replace('ISSN:', '').strip()
    else:
        issn = None
    core = data.xpath('//div[@class="sourinfo"]/p[5]/text()')  # core-journal flag
    if core:
        core = core[0].extract()
    else:
        core = None
    dict1 = {
        'title': title, 'authors': authors, 'publication': publication,
        'timestamp': timestamp, 'database': database, 'place': place,
        'abstract': abstract, 'keywords': keywords, 'classno': classno,
        'publicationpic': publicationpic, 'publicationen': publicationen,
        'publicationdate': publicationdate, 'publication_title': publication_title,
        'issn': issn, 'core': core, 'href': url,
    }
    print(dict1)


# Walk the result list pages
def requests_list(count, search):
    headers = cookie_request(search)
    # earlier experiments with the query parameters:
    # datas = {
    #     # "pagename": "ASP.brief_default_result_aspx",
    #     # "isinEn": "1",
    #     # "dbPrefix": "CFLS",
    #     # "ConfigFile": "SCDBINDEX.xml",
    #     "keyValue": "肾结石"
    # }
    # url = "https://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_default_result_aspx&isinEn=1&dbPrefix=SCDB&keyValue=%E8%82%BE%E7%BB%93%E7%9F%B3&S=1&sorttype="
    # url = "https://kns.cnki.net/kns/brief/brief.aspx?&pagename=ASP.brief_default_result_aspx&isinEn=1&dbPrefix=SCDB&S=1&sorttype="
    for i in range(1, count):
        url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage=%s&RecordsPerPage=20&QueryID=11&ID=&turnpage=1&tpagemode=L&dbPrefix=CFLS&Fields=&DisplayMode=listmode&PageName=ASP.brief_default_result_aspx&t=1&" % i
        try:
            res = session.get(url, headers=headers, proxies=proxies, timeout=10)
        except Exception:
            continue
        data = parsel.Selector(res.text)
        table = data.xpath('//table[@class="GridTableContent"]')
        if table:
            tr_list = table.xpath('//tr').extract()
            tr_list = tr_list[7:27]  # the 20 result rows
            for tr in tr_list:
                data1 = parsel.Selector(str(tr))
                title = data1.xpath('//a[@class="fz14"]/text()')[0].extract()  # title
                href = data1.xpath('//a[@class="fz14"]/@href')[0].extract()  # detail link
                res1 = re.search(r"FileName=(.*?)&", href)
                filename = res1.group(1)
                href = "https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CJFDAUTO&filename=%s" % filename
                author_list = data1.xpath('//td/a[@class="KnowledgeNetLink"]')
                authors = None
                if author_list:
                    authors = ""
                    for author_html in author_list.extract():
                        data2 = parsel.Selector(str(author_html))
                        author = data2.xpath('//a/text()')[0].extract().strip()
                        authors = authors + f"{author};"
                else:
                    author_list = data1.xpath('//td[@class="author_flag"]/text()')
                    if author_list:
                        authors = author_list[0].extract().strip()
                if not authors:
                    author_list = data1.xpath('//td[@class="author_flag"]/a/text()')
                    if author_list:
                        authors = ""
                        for author in author_list.extract():
                            authors = authors + f"{author};"
                publication = data1.xpath('//tr/td[4]/a/text()')[0].extract()  # source journal
                timestamp = data1.xpath('//tr/td[5]/text()')[0].extract().strip()  # publication date
                database = data1.xpath('//tr/td[6]/text()')[0].extract().strip()  # database
                requests_detail(href, title, authors, publication, timestamp, database)
        else:
            print("cookie check failed!", i)
            # session = requests.session()
            # cookie_request()
            continue
    return True
```
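Most of `requests_detail` repeats the same pattern: run an XPath, take the first match if there is one, otherwise fall back to `None`. parsel's `SelectorList.get()` returns exactly that (and `getall()` returns all matches), so the field extraction could be collapsed considerably. A sketch over a few of the fields, taking the same `parsel.Selector` the function already builds:

```python
# Sketch: the "first match or None" dance via parsel's get()/getall().
def extract_source_info(data):
    return {
        'place': data.xpath('//div[@class="orgn"]/span/a/text()').get(),
        'abstract': ''.join(data.xpath('//*[@id="ChDivSummary"]/text()').getall()),
        'publication_title': data.xpath('//div[@class="sourinfo"]/p[1]/a/text()').get(),
        'publicationen': data.xpath('//div[@class="sourinfo"]/p[2]/a/text()').get(),
        'publicationdate': data.xpath('//div[@class="sourinfo"]/p[3]/a/text()').get(),
    }
```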
`page` reads the total hit count from the pager cell to work out how many list pages to request, and `main` ties everything together:

```python
# Read the total result count and derive the page count
def page(search):
    headers = cookie_request(search)
    url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage=1&RecordsPerPage=20&QueryID=11&ID=&turnpage=1&tpagemode=L&dbPrefix=CFLS&Fields=&DisplayMode=listmode&PageName=ASP.brief_default_result_aspx&isinEn=1&"
    res = session.get(url, headers=headers)
    data = parsel.Selector(res.text)
    try:
        page = data.xpath('//div[@class="pagerTitleCell"]/text()')[0].extract().strip()
        # strip the surrounding "找到 ... 条结果" ("found ... results") text
        page = page.replace('找到', '').replace('条结果', '').replace('，', '').strip()
        page = int(page)  # total number of results
        print("Total: %s records" % page)
        page = int(page / 20) + 2  # 20 results per page
        return page
    except Exception:
        return False


def main(search):
    count = page(search)
    if count:
        requests_list(count, search)
    else:
        print("Failed to get the cookies, please run again!")


if __name__ == '__main__':
    search = input("Enter a search keyword: ").strip()
    main(search)
```
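As written, each scraped record ends its life in `print(dict1)`. If the run should produce a reusable dataset, appending every record as one JSON object per line is a small change; a sketch, with `cnki_results.jsonl` as a hypothetical output filename:

```python
import json

# Sketch: persist each record as one JSON object per line (JSONL).
def save_record(record, path="cnki_results.jsonl"):
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

# In requests_detail, replace print(dict1) with:
# save_record(dict1)
```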