提取mongodb中论文的信息,填入mysql,加快统计速度
Posted: life is tough, so are you
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了提取mongodb中论文的信息,填入mysql,加快统计速度相关的知识,希望对你有一定的参考价值。
1.创建mysql的alzheimer表,包括pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str信息。
# -*- coding: utf-8 -*-
"""Create the MySQL ``alzheimer`` table and helper routines for filling it.

Columns: pmc_id, journal, title, abstract, namestr, authorinfor,
pub_year, union_kwd_str, reference_str (plus an auto-increment id).
"""
import json

import pymysql


def input_from_json(filename):
    """Read *filename* and return the parsed JSON object."""
    # ``with`` guarantees the file handle is closed even if parsing fails.
    with open(filename, 'r') as f:
        return json.loads(f.read())


# Module-level connection shared by every helper below.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='python',
)
cur = conn.cursor()


def createdb():
    """Create the ``alzheimer`` table: auto-increment id + one column per field."""
    cur.execute('create table alzheimer(id int AUTO_INCREMENT,PRIMARY KEY (id))')
    # TEXT columns because titles/abstracts/reference lists can be long.
    for ddl in (
        'alter table alzheimer add pmc_id int',
        'alter table alzheimer add journal text',
        'alter table alzheimer add title text',
        'alter table alzheimer add abstract text',
        'alter table alzheimer add namestr text',
        'alter table alzheimer add authorinfor text',
        'alter table alzheimer add pub_year varchar(5)',
        'alter table alzheimer add union_kwd_str text',
        'alter table alzheimer add reference_str text',
    ):
        cur.execute(ddl)


createdb()


# up is all the test -- everything below targets the older ``ad``/``test`` tables.
def addnewcloumn():
    """Add scratch columns used during testing (misspelled name kept for callers)."""
    cur.execute('alter table test add transactor varchar(10)')
    cur.execute('alter table ad add hasid varchar(10)')


def addtomysql():
    """Bulk-insert (index, pmc_id) pairs from id_list.json into ``ad``."""
    idlist = input_from_json('id_list.json')['idList']
    values = list(enumerate(idlist))
    # executemany batches the whole list instead of one INSERT per row.
    cur.executemany('insert into ad values(%s,%s)', values)


def updatenewcol():
    """Set hasid='yes' for every row of ``ad`` whose id appears in id_list.json."""
    idlist = input_from_json('id_list.json')['idList']
    values = [('yes', i) for i in range(len(idlist))]
    cur.executemany('update ad set hasid = (%s) where id = (%s)', values)


def selectpmcid():
    """Fetch and print a few pmc_id values from ``ad`` as a sanity check."""
    rowcount = cur.execute('SELECT pmc_id FROM ad ')
    print(rowcount)
    rows = cur.fetchmany(rowcount)  # rows has 7887 entries
    print(rows[0][0], rows[2][0])
    print(type(rows[0][0]))  # int


def addnewcolunm():
    """Add the paper-metadata columns to the legacy ``ad`` table."""
    for ddl in (
        'alter table ad add journal text',
        'alter table ad add title text',
        'alter table ad add abstract text',
        'alter table ad add namestr text',
        'alter table ad add authorinfor text',
        'alter table ad add pub_year varchar(5)',
        'alter table ad add union_kwd_str text',
        'alter table ad add reference_str text',
    ):
        cur.execute(ddl)


def inserttest():
    """Create a small ``test2`` table used to time insert statements."""
    cur.execute('create table test2(id int AUTO_INCREMENT,PRIMARY KEY (id))')
    cur.execute('alter table test2 add pmc_id int')
    cur.execute('alter table test2 add title text')


def inserttest2():
    """Insert ten dummy rows into ``test2``."""
    values = [(i, 'hello' + str(i)) for i in range(10)]
    cur.executemany('insert into test2(pmc_id,title) values(%s,%s)', values)


conn.commit()
cur.close()
conn.close()
2.从mongodb中获取信息并且填入mysql,因为有12万条,中间可能有不正确的数据无法填入,分段插入。实际结果表明,insert语句比update语句快了将近十倍不止。最终选择新建一个完全为空的表,采用insert进行插入。
问题:引用信息的提取仍有问题,这是接下来要进行的工作。
# -*- coding: utf-8 -*-
"""Extract paper metadata from MongoDB-stored PMC XML and bulk-insert into MySQL.

Measured: INSERT is roughly 10x faster than UPDATE here, so rows go into
the empty ``alzheimer`` table with a single executemany.
"""
import re
import time

import pymysql
from bs4 import BeautifulSoup
from nltk.tokenize import MWETokenizer
from pymongo import MongoClient

# --- MongoDB connection --------------------------------------------------
client = MongoClient('localhost', 27017)
db = client['ad']        # same as client.ad
collection = db['xml']

mongodb_to_mysql_list = []
num = 0
# Processed in segments because a few of the ~120k documents fail to parse:
# first [:27810]  second [27811:60000]  third [60000:100000] (100000 cant)
# [100001:] 9859 cant, 109861 cant -> [100001:109860]
begin = time.time()
for item in collection.find()[109861:]:
    pmc_id = int(item['pmc_id'])
    # "lxml" silences BeautifulSoup's default-parser warning.
    soup = BeautifulSoup(item['xml'], 'lxml')
    print(num)

    # journal name
    journal = soup.find('journal-title')
    journal = journal.text if journal is not None else ''

    # article title
    title = soup.find('article-title')
    title = title.text if title is not None else ''

    # author list, formatted "Surname Given,Surname Given,..."
    namestr = ''
    for contrib in soup.find_all('contrib'):
        if contrib.surname is not None:
            name = contrib.surname.text
            sib = contrib.surname.next_sibling
            # the given name sits two siblings after <surname> in PMC XML
            if sib is not None and sib.next_sibling is not None:
                name = name + ' ' + sib.next_sibling.text
            namestr = namestr + name + ','

    # first author's affiliation
    authorinfor = soup.find('aff')
    authorinfor = authorinfor.text if authorinfor is not None else ''

    # publication (received) year
    pub_year = soup.find('year')
    pub_year = pub_year.text if pub_year is not None else ''

    # abstract -- BUG FIX: initialize first, so a document without an
    # <abstract> (or without <p>) doesn't reuse the previous iteration's
    # value (or raise NameError on the first iteration).
    abstract = ''
    a = soup.find('abstract')
    if a is not None and a.p is not None:
        abstract = a.p.text

    # keywords: underscore-join multi-word keywords, comma-separate them
    union_kwd_str = ''
    for x in soup.find_all('kwd'):
        kwdstr = re.sub(r'"|,|\.', '', x.text.lower())
        tokenizer = MWETokenizer([tuple(kwdstr.split())])
        union_kwd = tokenizer.tokenize(kwdstr.split())
        if union_kwd != []:
            union_kwd_str = union_kwd_str + union_kwd[0] + ','

    # reference ids (NOTE(review): reference extraction is known to be
    # imperfect -- listed as follow-up work)
    reference_str = ''
    for x in soup.find_all('pub-id'):
        if x is not None:
            reference_str = reference_str + x.text + ','

    mongodb_to_mysql_list.append((pmc_id, journal, title, abstract, namestr,
                                  authorinfor, pub_year, union_kwd_str,
                                  reference_str))
    num += 1

print(num)
end1 = time.time() - begin
print(end1)

# --- MySQL bulk insert ---------------------------------------------------
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='python',
)
cur = conn.cursor()
# update variant: ~99s per 100 rows; insert: ~8.5s per 100 rows.
# cur.executemany('update ad set journal=(%s),title=(%s),abstract=(%s),namestr=(%s),authorinfor=(%s),pub_year=(%s),union_kwd_str=(%s),reference_str=(%s) where pmc_id = (%s)', mongodb_to_mysql_list)
cur.executemany(
    'insert into alzheimer(pmc_id,journal,title,abstract,namestr,'
    'authorinfor,pub_year,union_kwd_str,reference_str) '
    'values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
    mongodb_to_mysql_list)
conn.commit()
cur.close()
conn.close()
end2 = time.time() - begin
print(end2)