Extracting paper information from MongoDB into MySQL to speed up statistics

1. Create the MySQL table alzheimer with the columns pmc_id, journal, title, abstract, namestr, authorinfor, pub_year, union_kwd_str, and reference_str.

# -*- coding: utf-8 -*-
import pymysql
import json

def input_from_json(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
        return data

conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python'
        )
cur = conn.cursor()

def createdb():
    cur.execute('create table alzheimer(id int AUTO_INCREMENT, PRIMARY KEY (id))')
    cur.execute('alter table alzheimer add pmc_id int')
    cur.execute('alter table alzheimer add journal text')
    cur.execute('alter table alzheimer add title text')
    cur.execute('alter table alzheimer add abstract text')
    cur.execute('alter table alzheimer add namestr text')
    cur.execute('alter table alzheimer add authorinfor text')
    cur.execute('alter table alzheimer add pub_year varchar(5)')
    cur.execute('alter table alzheimer add union_kwd_str text')
    cur.execute('alter table alzheimer add reference_str text')

createdb()
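
The same schema can also be declared in one statement instead of a CREATE followed by nine ALTERs; a minimal sketch of the equivalent DDL (the column list matches createdb above):

def createdb_oneshot():
    #equivalent schema in a single round trip; columns match createdb()
    cur.execute('''create table alzheimer(
        id int AUTO_INCREMENT,
        pmc_id int,
        journal text,
        title text,
        abstract text,
        namestr text,
        authorinfor text,
        pub_year varchar(5),
        union_kwd_str text,
        reference_str text,
        PRIMARY KEY (id))''')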

#everything above is the real setup; the helpers below were used for testing
def addnewcloumn():
    cur.execute('alter table test add transactor varchar(10)')
    cur.execute('alter table ad add hasid varchar(10)')

def addtomysql():
    idlist = input_from_json('id_list.json')['idList']
    values = []
    for i in range(len(idlist)):
        values.append((i, idlist[i]))

    cur.executemany('insert into ad values(%s,%s)', values)


def updatenewcol():
    idlist = input_from_json('id_list.json')['idList']
    values = []
    for i in range(len(idlist)):
        values.append(('yes', i))
    cur.executemany('update ad set hasid = (%s) where id = (%s)', values)

def selectpmcid():
    sql = "SELECT pmc_id FROM ad"
    a = cur.execute(sql)  #number of rows matched
    print a
    b = cur.fetchmany(a)  #b holds all 7887 pmc_id rows

    print b[0][0], b[2][0]
    print type(b[0][0])   #int

def addnewcolunm():
    cur.execute('alter table ad add journal text')
    cur.execute('alter table ad add title text')
    cur.execute('alter table ad add abstract text')
    cur.execute('alter table ad add namestr text')
    cur.execute('alter table ad add authorinfor text')
    cur.execute('alter table ad add pub_year varchar(5)')
    cur.execute('alter table ad add union_kwd_str text')
    cur.execute('alter table ad add reference_str text')

def inserttest():
    cur.execute('create table test2(id int AUTO_INCREMENT, PRIMARY KEY (id))')
    cur.execute('alter table test2 add pmc_id int')
    cur.execute('alter table test2 add title text')

def inserttest2():
    values = []
    for i in range(10):
        values.append((i, 'hello' + str(i)))
    cur.executemany('insert into test2(pmc_id,title) values(%s,%s)', values)

   
conn.commit()   
cur.close()
conn.close()

2. Pull the records out of MongoDB and insert them into MySQL. There are about 120,000 documents, and some malformed records cannot be inserted, so the job is run in segments. Timing showed that INSERT is nearly ten times faster than UPDATE, so the final approach creates a brand-new empty table and fills it with INSERT.

Known issue: the reference information is extracted incorrectly; fixing it is the next piece of work (one possible direction is sketched after the script below).
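
Before the full script, a minimal sketch of how the segmented insertion could be automated: commit in fixed-size chunks and roll back only the failing chunk, instead of hand-picking slice boundaries after each crash. The chunk size and the skip-on-failure policy are assumptions, not part of the original run; the table and columns match the script below.

CHUNK_SIZE = 1000  #assumed batch size, not from the original run

def insert_in_chunks(cur, conn, rows):
    #commit each chunk separately so one bad record only costs its own chunk
    sql = ('insert into alzheimer(pmc_id,journal,title,abstract,namestr,'
           'authorinfor,pub_year,union_kwd_str,reference_str) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    for start in range(0, len(rows), CHUNK_SIZE):
        chunk = rows[start:start + CHUNK_SIZE]
        try:
            cur.executemany(sql, chunk)
            conn.commit()
        except pymysql.MySQLError:
            conn.rollback()  #assumed policy: skip the failing chunk and keep going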

from pymongo import MongoClient
from bs4 import BeautifulSoup
from nltk.tokenize import MWETokenizer
import re
import pymysql
import time

# create the connection
client = MongoClient('localhost', 27017)
db = client['ad']       #or db = client.ad
collection = db['xml']  #get the collection

mongodb_to_mysql_list = []
num = 0
#slice boundaries used for the segmented runs ("failed" = record could not be inserted):
#first  [:27810]
#second [27811:60000]
#third  [60000:100000]   100000 failed
#[100001:]  the 9859th failed, i.e. absolute index 109861
#[100001:109860]
begin = time.time()
for item in collection.find()[109861:]:
    pmc_xml = item['xml']
    pmc_id = item['pmc_id'].encode('utf-8')
    pmc_id = int(pmc_id)
    soup = BeautifulSoup(pmc_xml, "lxml")  #without "lxml" BeautifulSoup emits a warning (not an error)

    print num
    #find the journal
    journal = soup.find('journal-title')
    if journal != None:
        journal = journal.text.encode('utf-8')
    else:
        journal = ''
    
    #find the article title
    title = soup.find('article-title')
    if title != None:
        title = title.text.encode('utf-8')
    else:
        title = ''

    #show the authors
    authornamelist = soup.find_all('contrib')
    namestr = ''
    for x in authornamelist:
        if x.surname != None:
            name = x.surname.text
            if x.surname.next_sibling != None:
                if x.surname.next_sibling.next_sibling != None:
                    #the given name usually follows the surname element
                    name = name + ' ' + x.surname.next_sibling.next_sibling.text
            namestr = namestr + name.encode('utf-8') + ','

    #show the first author's affiliation
    authorinfor = soup.find('aff')
    if authorinfor != None:
        authorinfor = authorinfor.text.encode('utf-8')
    else:
        authorinfor = ''

    #show the publication year (first <year> element found)
    pub_year = soup.find('year')
    if pub_year != None:
        pub_year = pub_year.text.encode('utf-8')
    else:
        pub_year = ''
    
    #show the abstract; the else branch also covers a missing <abstract>,
    #which the original left unset and would crash on
    a = soup.find('abstract')
    if a != None and a.p != None:
        abstract = a.p.text.encode('utf-8')
    else:
        abstract = ''

    #show the keywords; MWETokenizer joins each multi-word keyword with underscores
    kwdlist = soup.find_all('kwd')
    union_kwd_str = ''
    for x in kwdlist:
        kwd = x.text.lower().encode('utf-8')
        kwdstr = re.sub("\"|,|\.", "", kwd)
        kwd = tuple(kwdstr.split())
        tokenizer = MWETokenizer([kwd])
        union_kwd = tokenizer.tokenize(kwdstr.split())
        if union_kwd != []:
            union_kwd_str = union_kwd_str + union_kwd[0] + ','

    #show the reference ids
    pub_id_list = soup.find_all('pub-id')
    reference_idlist = []
    reference_str = ''
    for x in pub_id_list:
        if x != None:
            reference = x.text.encode('utf-8')
            reference_idlist.append(reference)
            reference_str = reference_str + reference + ','

    mongodb_to_mysql_list.append((pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str))

    num += 1

print num
end1 = time.time() - begin
print end1


conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python'
        )
cur = conn.cursor()
#cur.executemany('update ad set journal=(%s),title=(%s),abstract=(%s),namestr=(%s),authorinfor=(%s),pub_year=(%s),union_kwd_str=(%s),reference_str=(%s) where pmc_id = (%s)', mongodb_to_mysql_list)
#update: ~99 s per 100 rows
cur.executemany('insert into alzheimer(pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)', mongodb_to_mysql_list)
#insert: ~8.5 s per 100 rows

#cur.executemany('insert into test2(pmc_id,title) values(%s,%s)', values)
conn.commit()
cur.close()
conn.close()

end2 = time.time() - begin
print end2
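
On the reference problem noted in step 2: find_all('pub-id') sweeps up every pub-id in the document, including the article's own identifiers in article-meta, and mixes pmid/doi/pmc id types. One possible fix is to search only inside the ref-list element and keep a single id type. A sketch; restricting to pub-id-type='pmid' is an assumption about the desired output, not something the original script specifies:

def extract_reference_ids(soup):
    #only look inside the bibliography so the article's own ids are excluded
    ref_list = soup.find('ref-list')
    if ref_list is None:
        return ''
    ids = [x.text.encode('utf-8')
           for x in ref_list.find_all('pub-id', attrs={'pub-id-type': 'pmid'})]
    return ','.join(ids)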

 
