统计中国,美国,世界排名前50的关键词并进行比较
Posted life is tough,so are you
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了统计中国,美国,世界排名前50的关键词并进行比较相关的知识,希望对你有一定的参考价值。
1. 获取中国所有关键词
import pymysql
import json

# Papers whose author info mentions China and that have a non-empty keyword
# string.  `AND` replaces the MySQL-only `&&` spelling, which is deprecated.
# pmc_id is still selected, but only the keyword column is exported.
SQL = (
    "SELECT union_kwd_str,pmc_id FROM alzheimer "
    "where authorinfor like '%china%' AND union_kwd_str != ''"
)


def output_to_json(data, filename):
    """Write *data* to *filename* as JSON and return the JSON text.

    The ``with`` block closes the file, so no explicit ``close()`` is needed.
    """
    text = json.dumps(data)
    with open(filename, 'w') as handle:
        handle.write(text)
    return text


def main():
    """Export the keyword strings of all matching papers to a JSON file."""
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()
        try:
            # execute() returns the number of rows the query produced.
            a = cursor.execute(SQL)
            print(a)
            b = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # First column of every row is the comma-separated keyword string.
    abstract_list = [row[0] for row in b]

    output_data = {
        'country': "china",
        'count': a,
        'keyword': abstract_list,
    }
    output_to_json(output_data, '1203_china_kwd.json')


if __name__ == '__main__':
    main()
选出中国排名前50的关键词
import re
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    with open(filename, 'r') as handle:
        return json.load(handle)


def count_word(path):
    """Return a dict mapping each keyword to its number of occurrences.

    *path* is a JSON file whose ``'keyword'`` entry is a list of
    comma-separated keyword strings.
    """
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            result[word] = result.get(word, 0) + 1
    return result


def sort_by_count(d):
    """Return an OrderedDict of *d*'s items, highest count first.

    The sort is stable, so keywords with equal counts keep their
    original relative order.
    """
    return collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))


def top_keywords(sorted_counts, limit=50):
    """Return the first *limit* ``(name, value)`` pairs of *sorted_counts*.

    Underscores in keyword names are replaced by spaces.  When fewer than
    *limit* keywords exist, all of them are returned.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(re.sub("_", " ", key), value) for key, value in leading]


def main():
    """Write the top-50 Chinese keywords with counts, and the bare name list."""
    file_name = "1203_china_kwd.json"
    top = top_keywords(sort_by_count(count_word(file_name)))
    top_china_kwd_list = [name for name, _ in top]

    # Comma-separated JSON objects without enclosing brackets, as before.
    # join() avoids a trailing comma when there are fewer than 50 keywords.
    entries = ','.join(
        json.dumps({'name': name, 'value': value}) for name, value in top
    )
    with open('1203_top15_china_kwd.json', 'w') as fobj2:
        fobj2.write(entries)

    # Always written, so the list file is valid JSON even when fewer than
    # 50 keywords exist.
    with open('1204_top50_china_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'china_kwd': top_china_kwd_list}))


if __name__ == '__main__':
    main()
2.获取美国的所有关键词,并做统计,与中国的统计代码相似,下一步工作是整合代码。
import pymysql
import json

# Papers whose author info mentions USA and that have a non-empty keyword
# string.  `AND` replaces the MySQL-only `&&` spelling, which is deprecated.
# pmc_id is still selected, but only the keyword column is exported.
SQL = (
    "SELECT union_kwd_str,pmc_id FROM alzheimer "
    "where authorinfor like '%USA%' AND union_kwd_str != ''"
)


def output_to_json(data, filename):
    """Write *data* to *filename* as JSON and return the JSON text.

    The ``with`` block closes the file, so no explicit ``close()`` is needed.
    """
    text = json.dumps(data)
    with open(filename, 'w') as handle:
        handle.write(text)
    return text


def main():
    """Export the keyword strings of all matching papers to a JSON file."""
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()
        try:
            # execute() returns the number of rows the query produced.
            a = cursor.execute(SQL)
            print(a)
            b = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # First column of every row is the comma-separated keyword string.
    abstract_list = [row[0] for row in b]

    output_data = {
        'country': "USA",
        'count': a,
        'keyword': abstract_list,
    }
    output_to_json(output_data, '1204_USA_kwd.json')


if __name__ == '__main__':
    main()
美国前50的关键词
import re
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    with open(filename, 'r') as handle:
        return json.load(handle)


def count_word(path):
    """Return a dict mapping each keyword to its number of occurrences.

    *path* is a JSON file whose ``'keyword'`` entry is a list of
    comma-separated keyword strings.
    """
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            result[word] = result.get(word, 0) + 1
    return result


def sort_by_count(d):
    """Return an OrderedDict of *d*'s items, highest count first.

    The sort is stable, so keywords with equal counts keep their
    original relative order.
    """
    return collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))


def top_keywords(sorted_counts, limit=50):
    """Return the first *limit* ``(name, value)`` pairs of *sorted_counts*.

    Underscores in keyword names are replaced by spaces.  When fewer than
    *limit* keywords exist, all of them are returned.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(re.sub("_", " ", key), value) for key, value in leading]


def main():
    """Write the top-50 USA keywords with counts, and the bare name list."""
    file_name = "1204_USA_kwd.json"
    top = top_keywords(sort_by_count(count_word(file_name)))
    top_USA_kwd_list = [name for name, _ in top]

    # Comma-separated JSON objects without enclosing brackets, as before.
    # join() avoids a trailing comma when there are fewer than 50 keywords.
    entries = ','.join(
        json.dumps({'name': name, 'value': value}) for name, value in top
    )
    with open('1204_top50_USA_kwd.json', 'w') as fobj2:
        fobj2.write(entries)

    # Always written, so the list file is valid JSON even when fewer than
    # 50 keywords exist.
    with open('1204_top50_USA_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'USA_kwd': top_USA_kwd_list}))


if __name__ == '__main__':
    main()
3. 获取世界的所有关键词,并选出前50的关键词
import pymysql
import json

# Every paper that has a non-empty keyword string, regardless of country.
# pmc_id is still selected, but only the keyword column is exported.
SQL = "SELECT union_kwd_str,pmc_id FROM alzheimer where union_kwd_str != ''"


def output_to_json(data, filename):
    """Write *data* to *filename* as JSON and return the JSON text.

    The ``with`` block closes the file, so no explicit ``close()`` is needed.
    """
    text = json.dumps(data)
    with open(filename, 'w') as handle:
        handle.write(text)
    return text


def main():
    """Export the keyword strings of all matching papers to a JSON file."""
    conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
    )
    try:
        cursor = conn.cursor()
        try:
            # execute() returns the number of rows the query produced.
            a = cursor.execute(SQL)
            print(a)
            b = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # First column of every row is the comma-separated keyword string.
    abstract_list = [row[0] for row in b]

    output_data = {
        'country': "world",
        'count': a,
        'keyword': abstract_list,
    }
    output_to_json(output_data, '1203_world_kwd.json')


if __name__ == '__main__':
    main()
世界前50关键词
import re
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    with open(filename, 'r') as handle:
        return json.load(handle)


def count_word(path):
    """Return a dict mapping each keyword to its number of occurrences.

    *path* is a JSON file whose ``'keyword'`` entry is a list of
    comma-separated keyword strings.
    """
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            result[word] = result.get(word, 0) + 1
    return result


def sort_by_count(d):
    """Return an OrderedDict of *d*'s items, highest count first.

    The sort is stable, so keywords with equal counts keep their
    original relative order.
    """
    return collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))


def top_keywords(sorted_counts, limit=50):
    """Return the first *limit* ``(name, value)`` pairs of *sorted_counts*.

    Underscores in keyword names are replaced by spaces.  When fewer than
    *limit* keywords exist, all of them are returned.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(re.sub("_", " ", key), value) for key, value in leading]


def main():
    """Write the top-50 world keywords with counts, and the bare name list."""
    file_name = "1203_world_kwd.json"
    top = top_keywords(sort_by_count(count_word(file_name)))
    top_world_kwd_list = [name for name, _ in top]

    # Comma-separated JSON objects without enclosing brackets, as before.
    # join() avoids a trailing comma when there are fewer than 50 keywords.
    entries = ','.join(
        json.dumps({'name': name, 'value': value}) for name, value in top
    )
    with open('1203_top15_world_kwd.json', 'w') as fobj2:
        fobj2.write(entries)

    # Always written, so the list file is valid JSON even when fewer than
    # 50 keywords exist.
    with open('1204_top50_world_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'world_kwd': top_world_kwd_list}))


if __name__ == '__main__':
    main()
4. 比较中国与美国的关键词有哪些相同,以及中国与世界的研究热点有哪些相同。
import json

china_path = '1204_top50_china_kwd_list.json'
world_path = '1204_top50_world_kwd_list.json'
USA_path = '1204_top50_USA_kwd_list.json'


def input_from_json(filename):
    """Load and return the JSON document stored in *filename*."""
    with open(filename, 'r') as handle:
        return json.load(handle)


def common_keywords(first, second):
    """Return the keywords present in both lists, sorted alphabetically.

    Sorting makes the output reproducible; plain set iteration order is not.
    """
    return sorted(set(first) & set(second))


def main():
    """Print the keywords China shares with the world and with the USA."""
    china_kwd_list = input_from_json(china_path)['china_kwd']
    world_kwd_list = input_from_json(world_path)['world_kwd']
    USA_kwd_list = input_from_json(USA_path)['USA_kwd']

    china_world_same_kwd = common_keywords(china_kwd_list, world_kwd_list)
    for kwd in china_world_same_kwd:
        print(kwd)
    print(len(china_world_same_kwd))

    print('\n')

    china_USA_same_kwd = common_keywords(china_kwd_list, USA_kwd_list)
    for kwd in china_USA_same_kwd:
        print(kwd)
    # Report the size of the China/USA overlap; the original printed the
    # China/world count a second time here.
    print(len(china_USA_same_kwd))


if __name__ == '__main__':
    main()
以上是关于统计中国,美国,世界排名前50的关键词并进行比较的主要内容,如果未能解决你的问题,请参考以下文章