Scraping Foreign-Language Journal Article Metadata with Python (Machinery & Instrumentation Industry)

Posted by 右介


Target: NSTL (National Science and Technology Library) — metadata for all 2017 journal articles in the Machinery & Instrumentation Industry category.

The code is fairly rough; please bear with it.

Step 1: Scrape all journal links

#coding=utf-8

import time
from selenium import webdriver
from lxml import etree
from pymongo import MongoClient

client = MongoClient("IP", 27017)
db = client["nstl"]
collection=db["journal_urls"]
db.authenticate("","")
 
driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
driver.get("https://www.nstl.gov.cn/facade/search/clcSearch.do?&lan=eng&clc=TH")

html = driver.page_source
tree = etree.HTML(html)
count = int(tree.xpath("//span[@id='totalPages1']/text()")[0])

# 47 pages in total
for i in range(count):

    html = driver.page_source
    tree = etree.HTML(html)

    # Extract all journal links on the current page and store them
    table = tree.xpath("//div[@class='s2listtd2']/span/a/@href")
    for j in table:
        bson = {}
        bson["url"] = j
        collection.insert(bson)

    # Terminate on the last page (i == 46 when there are 47 pages)
    if i==(count-1):
        break

    # Click the next-page button
    driver.find_element_by_xpath('//div[@id="page"]/div//a[text()="%s"]' % str(i+2)).click()

    # Wait until the page has actually changed, then leave the while loop
    while True:
        time.sleep(1)
        if driver.page_source!=html:
            break

driver.close()
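
To sanity-check step 1, you can compare how many links were stored against how many are distinct, since journal rows can repeat across pages. A minimal sketch (my addition, not part of the original post), reusing the placeholder connection settings from above:

#coding=utf-8
from pymongo import MongoClient

client = MongoClient("IP", 27017)
db = client["nstl"]
db.authenticate("", "")
collection = db["journal_urls"]

# Compare total stored links against the number of distinct urls
total = collection.count()
unique = len(collection.distinct("url"))
print("stored: %d, unique: %d" % (total, unique))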

Step 2: Scrape the links to every 2017 article in each journal

#coding=utf-8
import requests
from pymongo import MongoClient
from lxml import etree
from selenium import webdriver
import time

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1=db["journal_urls"]
collection2=db["journalArticle2017_urls"]
db.authenticate("","")
driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")
# Loop over all journal links
for item in collection1.find({}, {"url":1, "_id":0}):
    driver.get(item["url"][29:-4])
    html = driver.page_source
    tree = etree.HTML(html)
    # If 2018 articles are present, click to expand the 2017 list
    table_2018 = tree.xpath("//div[@id='year_2018']")
    if table_2018!=[]:
        driver.find_element_by_xpath("//div[@id='year_2017']").click()
        time.sleep(1)
        driver.find_element_by_xpath("//div[@id='volumeUl_2017']/div[@class='ltreebom2']").click()
    # Count the 2017 issues and loop over them
    table = tree.xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3']/a")
    for i in range(1, len(table)+1):
        wen_html = driver.page_source
        wen_tree = etree.HTML(wen_html)
        # Collect all article links in the current issue
        # (parse the freshly loaded source, not the stale tree from above)
        wen_table = wen_tree.xpath("//div[@class='s2listtd2']/a/@href")
        for j in wen_table:
            bson = {}
            bson["url"] = j
            collection2.insert(bson)
        # Exit the loop after the last issue
        if i==len(table):
            break
        # Click to open the next issue
        try:
            driver.find_element_by_xpath("//div[@id='volumeUl_2017']//div[@class='ltreebom3'][%s]"%str(i+1)).click()
        except:
            break
        # Wait until the click has taken effect
        while True:
            time.sleep(1)
            if driver.page_source!=wen_html:
                break

driver.close()
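
If this script gets interrupted and re-run, the same article links will be inserted a second time. One way to guard against that (my addition, a sketch on top of the collections above) is a unique index on the url field; note that creating it fails if duplicates already exist, so dedupe first in that case:

#coding=utf-8
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient("IP", 27017)
db = client["nstl"]
db.authenticate("", "")
collection2 = db["journalArticle2017_urls"]

# A unique index makes MongoDB reject duplicate urls outright
collection2.create_index("url", unique=True)

# Inserts of already-seen links then raise DuplicateKeyError,
# which the scraping loop can simply catch and skip
try:
    collection2.insert({"url": "http://example"})  # hypothetical link
except DuplicateKeyError:
    pass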

Step 3: Fetch the HTML source of each article's detail page

#coding=utf-8
import requests
from pymongo import MongoClient
from lxml import etree
from selenium import webdriver
import time

client = MongoClient("IP", 27017)
db = client["nstl"]
collection=db["journalArticle2017_urls"]
collection1=db["journalArticle2017_codes"]
db.authenticate("","")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

# Loop over all articles and build the detail-page URL
for item in collection.find({}, {"url":1, "_id":0}):

    url = "https://www.nstl.gov.cn/facade/search/toFullView.do?checkedSEQNO="+item["url"][23:-11]+"&subDocType="+item["url"][-8:-3]

    # # Alternative: fetch the page source with requests.post
    # for i in range(100):
    #     try:
    #         result = requests.post(url, verify = False)
    #     except:
    #         time.sleep(1)
    #         continue

    #     html = result.text
    #     if html:
    #         break
    
    # Load the page in the browser and poll until the source stops
    # changing, i.e. the dynamically rendered article data has arrived
    driver.get(url)
    html = ""
    for i in range(100):
        time.sleep(1)
        if driver.page_source != html:
            html = driver.page_source
        else:
            break

    # Store the page source
    bson = {}
    bson["html"] = driver.page_source
    collection1.insert(bson)

driver.close()
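
The sleep-and-compare loop works, but Selenium also ships explicit waits. Below is a sketch of the same fetch using WebDriverWait, keyed on the title span that step 4 parses; treat it as an illustration, not the original post's method:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def fetch_fullview(driver, url, timeout=30):
    # Navigate, then block until the title span of the full-view
    # page has been rendered by the page's JavaScript
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, "//span[@name='title']"))
    )
    return driver.page_source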

Step 4: Parse the saved page sources

#coding=utf-8
from pymongo import MongoClient
from lxml import etree

client = MongoClient("IP", 27017)
db = client["nstl"]
collection1 = db["journalArticle2017_codes"]
collection2 = db["journalArticle2017_data"]
db.authenticate("","")

# Field labels as they appear on the full-view page: author affiliation, journal name,
# ISSN, publication year, volume, issue, start page, end page, page count,
# CLC class number, keywords, language, abstract
zzdw, km, ma, cbn, j, q, qy, zy, zys, flh, gjc, yz, wz = u"【作者单位】:", u"【刊名】:", u"【ISSN】:", u"【出版年】:", u"【卷】:", u"【期】:", u"【起页】:", u"【止页】:", u"【总页数】:", u"【分类号】:", u"【关键词】:", u"【语种】:", u"【文摘】:"

# Loop over all saved page sources and parse the metadata fields
for item in collection1.find({}, {"html":1, "_id":0}):
    html = item["html"]
    tree = etree.HTML(html)

    title = tree.xpath("//span[@name='title']/text()")
    author = tree.xpath("//a[starts-with(@href,'javascript:searchByAuthor')]/text()")
    organization = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zzdw)
    journal_name = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%km)
    issn = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%ma)
    publication_year = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%cbn)
    volume = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%j)
    issue = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%q)
    page_start = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%qy)
    page_end = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zy)
    page_count = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%zys)
    clc = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%flh)
    keywords = tree.xpath("//div[text()='%s']/following-sibling::*/span/a/text()"%gjc)
    language = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%yz)
    summary = tree.xpath("//div[text()='%s']/following-sibling::*/text()"%wz)

    # author and keywords can have several values, so keep the full lists
    dc = {}
    dc["title"] = title[0]
    if author: dc["author"] = author
    if organization: dc["organization"] = organization[0]
    if journal_name: dc["journal_name"] = journal_name[0]
    if issn: dc["issn"] = issn[0]
    if publication_year: dc["publication_year"] = publication_year[0]
    if volume: dc["volume"] = volume[0]
    if issue: dc["issue"] = issue[0]
    if page_start: dc["page_start"] = page_start[0]
    if page_end: dc["page_end"] = page_end[0]
    if page_count: dc["page_count"] = page_count[0]
    if clc: dc["clc"] = clc[0]
    if keywords: dc["keywords"] = keywords
    if language: dc["language"] = language[0]
    if summary: dc["summary"] = summary[0]

    collection2.insert(dc)
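
With journalArticle2017_data filled in, the records can be pulled out for analysis. A minimal sketch (my addition) that dumps the scalar fields to CSV, assuming Python 2 like the rest of the post; the output filename is made up:

#coding=utf-8
import csv
from pymongo import MongoClient

client = MongoClient("IP", 27017)
db = client["nstl"]
db.authenticate("", "")
collection2 = db["journalArticle2017_data"]

# Scalar fields as stored by step 4 (author/keywords are lists, skipped here)
fields = ["title", "journal_name", "issn", "publication_year",
          "volume", "issue", "page_start", "page_end", "language"]

with open("articles2017.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for doc in collection2.find({}, {"_id": 0}):
        # Missing fields become empty cells; encode utf-8 for Python 2 csv
        writer.writerow([unicode(doc.get(k, u"")).encode("utf-8") for k in fields])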

 
