json格式的文本处理
Posted 炫云云
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了json格式的文本处理相关的知识,希望对你有一定的参考价值。
JSON(JavaScript Object Notation)是一种轻量级的数据交换格式。第一次接触到它是在进行服务器端接口测试的时候。现在很多服务器返回的结果都是json格式,主要是因为它比较容易解析和生成。JSON格式的数据本质上是一种被格式化了的字符串。
编码
json.dumps()把一个Python对象编码,转换成Json字符串。
-
dumps操作的是字符串
-
dump操作的是文件流
import json
# Sample record with `postag`, `text` and `spo_list` fields.
data = {
    "postag": [
        {"word": "内容", "pos": "n"},
        {"word": "简介", "pos": "n"},
        {"word": "《", "pos": "w"},
        {"word": "宜兴紫砂图典", "pos": "nw"},
        {"word": "》", "pos": "w"},
        {"word": "由", "pos": "p"},
        {"word": "故宫出版社", "pos": "nt"},
        {"word": "出版", "pos": "v"},
    ],
    "text": "内容简介《宜兴紫砂图典》由故宫出版社出版",
    "spo_list": [
        {
            "predicate": "出版社",
            "object_type": "出版社",
            "subject_type": "书籍",
            "object": "故宫出版社",
            "subject": "宜兴紫砂图典",
        },
    ],
}
# With default arguments json.dumps escapes non-ASCII characters as \uXXXX.
data2 = json.dumps(data)
print(data2)
{"postag": [{"word": "\\u5185\\u5bb9", "pos": "n"}, {"word": "\\u7b80\\u4ecb", "pos": "n"},
{"word": "\\u300a", "pos": "w"}, {"word": "\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178", "pos": "nw"},
{"word": "\\u300b", "pos": "w"}, {"word": "\\u7531", "pos": "p"},
{"word": "\\u6545\\u5bab\\u51fa\\u7248\\u793e", "pos": "nt"}, {"word": "\\u51fa\\u7248", "pos": "v"}],
"text": "\\u5185\\u5bb9\\u7b80\\u4ecb\\u300a\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178\\u300b
\\u7531\\u6545\\u5bab\\u51fa\\u7248\\u793e\\u51fa\\u7248", "spo_list":
[{"predicate": "\\u51fa\\u7248\\u793e", "object_type": "\\u51fa\\u7248\\u793e",
"subject_type": "\\u4e66\\u7c4d", "object": "\\u6545\\u5bab\\u51fa\\u7248\\u793e",
"subject": "\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178"}]}
sort_keys =True:是告诉编码器按照字典排序(a到z)输出。如果是字典类型的python对象,就把关键字按照字典排序。
indent:参数根据数据格式缩进显示,读起来更加清晰。
separators:是分隔符的意思,参数是一个二元组 (item_separator, key_separator),分别为不同dict项之间的分隔符和dict项内key和value之间的分隔符;例如 separators=(',', ':') 会把 , 和 : 后面的空格都去掉。
# Sort the keys alphabetically and pretty-print with a 2-space indent.
# ensure_ascii=False keeps the Chinese text as-is instead of \uXXXX escapes.
data3 = json.dumps(
    data,
    ensure_ascii=False,
    indent=2,
    sort_keys=True,
)
print(data3)
{
"postag": [
{
"pos": "n",
"word": "内容"
},
{
"pos": "n",
"word": "简介"
},
{
"pos": "w",
"word": "《"
},
{
"pos": "nw",
"word": "宜兴紫砂图典"
},
{
"pos": "w",
"word": "》"
},
{
"pos": "p",
"word": "由"
},
{
"pos": "nt",
"word": "故宫出版社"
},
{
"pos": "v",
"word": "出版"
}
],
"spo_list": [
{
"object": "故宫出版社",
"object_type": "出版社",
"predicate": "出版社",
"subject": "宜兴紫砂图典",
"subject_type": "书籍"
}
],
"text": "内容简介《宜兴紫砂图典》由故宫出版社出版"
}
# Compact output: separators=(item_separator, key_separator) with no spaces.
data4 = json.dumps(
    data,
    ensure_ascii=False,
    separators=(',', ':'),
)
print(data4)
{"postag":[{"word":"内容","pos":"n"},{"word":"简介","pos":"n"},{"word":"《","pos":"w"},{"word":"宜兴紫砂图典","pos":"nw"},{"word":"》","pos":"w"},
{"word":"由","pos":"p"},{"word":"故宫出版社","pos":"nt"},{"word":"出版","pos":"v"}],"text":"内容简介《宜兴紫砂图典》由故宫出版社出版","spo_list":
[{"predicate":"出版社","object_type":"出版社","subject_type":"书籍","object":"故宫出版社","subject":"宜兴紫砂图典"}]}
解码
json.loads()把Json格式字符串解码,转换成Python对象。
-
loads操作的是字符串
-
load操作的是文件流
# Decode the JSON string held in data3 back into Python objects and print them.
print(json.loads(data3))
{'postag': [{'pos': 'n', 'word': '内容'}, {'pos': 'n', 'word': '简介'}, {'pos': 'w', 'word': '《'}, {'pos': 'nw', 'word': '宜兴紫砂图典'}, {'pos': 'w', 'word': '》'},
{'pos': 'p', 'word': '由'}, {'pos': 'nt', 'word': '故宫出版社'}, {'pos': 'v', 'word': '出版'}], 'spo_list': [{'object': '故宫出版社', 'object_type': '出版社',
'predicate': '出版社', 'subject': '宜兴紫砂图典', 'subject_type': '书籍'}], 'text': '内容简介《宜兴紫砂图典》由故宫出版社出版'}
读与写
# Write JSON data to a file: json.dump serialises straight into the file object.
# (The original snippet passed an undefined name `d1`; `data` is the object
# defined earlier in this article.)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)
# Read the JSON data back: json.load parses the file content as one document.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
tqdm
tqdm中的tqdm()是实现进度条美化的基本方法,在for循环体中用tqdm()包裹指定的迭代器或range()即可,下面是个简单的例子:
import json
from tqdm import tqdm
import codecs
import os
# Distinct `predicate` values found in the schema file (one JSON object per line).
all_50_schemas = set()
with open(r'D:\\学习·\\自然语言处理\\数据集\\DuIE_2_0\\2019_data\\all_50_schemas', mode='r', encoding='utf-8') as f:
    # tqdm wraps the line iterator to show a progress bar while reading.
    all_50_schemas.update(json.loads(row)['predicate'] for row in tqdm(f))
print(all_50_schemas)
50it [00:00, ?it/s]
{'出品公司', '作者', '官方语言', '毕业院校', '注册资本', '目', '导演', '面积', '海拔', '改编自', '人口数量', '作词', '丈夫', '妻子', '出生日期', '祖籍', '气候', '成
立日期', '首都', '专业代码', '嘉宾', '创始人', '出生地', '字', '所属专辑', '邮政编码', '作曲', '母亲', '号', '编剧', '简称', '国籍', '所在城市', '歌手', '制片人',
'连载网站', '修业年限', '董事长', '朝代', '出版社', '占地面积', '民族', '主角', '主持人', '主演', '身高', '父亲', '总部地点', '上映时间'}
读函数
path = r'D:\\学习·\\自然语言处理\\数据集\\DuIE_2_0\\2019_data\\train_data.json'
def read_json(src_filename):
    """Load a UTF-8 file whose whole content is a single JSON document.

    :param src_filename: source file path
    :return: the decoded Python object
    """
    with open(src_filename, encoding='utf-8') as stream:
        parsed = json.load(stream)
    return parsed
# NOTE: for train_data.json this call raises JSONDecodeError ("Extra data"),
# as the traceback that follows shows: the decoder rejects anything left over
# after the first JSON document.
datastore = read_json(path)
print(datastore[1:3])
---------------------------------------------------------------------------
JSONDecodeError Traceback (most recent call last)
<ipython-input-3-340a0261b343> in <module>
5 return json.load(f)
6
----> 7 datastore = read_json(path)
8
9 print(datastore[1:3])
<ipython-input-3-340a0261b343> in read_json(src_filename)
3 def read_json(src_filename):
4 with open(src_filename,encoding='utf-8' ) as f:
----> 5 return json.load(f)
6
7 datastore = read_json(path)
D:\\study_Software\\Anaconda3\\lib\\json\\__init__.py in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
--> 296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
297
298
D:\\study_Software\\Anaconda3\\lib\\json\\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
346 parse_int is None and parse_float is None and
347 parse_constant is None and object_pairs_hook is None and not kw):
--> 348 return _default_decoder.decode(s)
349 if cls is None:
350 cls = JSONDecoder
D:\\study_Software\\Anaconda3\\lib\\json\\decoder.py in decode(self, s, _w)
338 end = _w(s, end).end()
339 if end != len(s):
--> 340 raise JSONDecodeError("Extra data", s, end)
341 return obj
342
JSONDecodeError: Extra data: line 2 column 1 (char 395)
数据量很大,文件里有多行,每一行都是一个独立的 JSON 对象(JSON Lines 格式);而 json.load() 只能解析单个 JSON 文档,解析完第一行的对象后遇到多余内容,就会出现类似标题的报错:
raise JSONDecodeError("Extra data", s, end)
Extra data: line 2 column 1 (char 395)
可以逐行读取,然后再处理成列表
def read_jsonline(src_filename, encoding='utf-8', *, default=None):
    """Read a JSON Lines file into a list of objects.

    :param src_filename: source file path
    :param encoding: file encoding
    :param default: value to return if the file does not exist. Leave it as
        None to disable the fallback; a missing file then raises from open().
    :return: object list, one object per non-blank line
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    if default is not None and not os.path.exists(src_filename):
        return default
    # The context manager closes the file even when a line fails to parse.
    with open(src_filename, encoding=encoding) as file:
        # Whitespace-only lines (e.g. a trailing empty line) hold no record.
        return [json.loads(line) for line in file if line.strip()]
# Reading line by line yields a list; print the record at index 1.
datastore = read_jsonline(path)
print(datastore[1])
{'postag': [{'word': '《', 'pos': 'w'}, {'word': '中国风水十讲', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '是', 'pos': 'v'}, {'word': '2007年', 'pos': 't'}, {'word': '华夏出版社', 'pos': 'nt'}, {'word': '出版', 'pos': 'v'}, {'word': '的', 'pos': 'u'}, {'word': '图书', 'pos': 'n'}, {'word': ',', 'pos': 'w'}, {'word': '作者', 'pos': 'n'}, {'word': '是', 'pos': 'v'}, {'word': '杨文衡', 'pos': 'nr'}], 'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [{'predicate': '出版社', 'object_type': '出版社', 'subject_type': '书籍', 'object': '华夏出版社', 'subject': '中国风水十讲'}, {'predicate': '作者', 'object_type': '人物', 'subject_type': '图书作品', 'object': '杨文衡', 'subject': '中国风水十讲'}]}
train_data = []
chars = {}  # character -> number of occurrences over all `text` fields
with open(r'D:\\学习·\\自然语言处理\\数据集\\DuIE_2_0\\2019_data\\train_data.json', 'r', encoding='utf-8') as f:
    for line in f:
        datastore = json.loads(line)
        # Keep only the sentence and its (subject, predicate, object) triples.
        train_data.append(dict(
            text=datastore['text'],
            spo_list=[
                (spo['subject'], spo['predicate'], spo['object'])
                for spo in datastore['spo_list']
            ],
        ))
        # Tally every character of the sentence.
        for c in datastore['text']:
            chars.setdefault(c, 0)
            chars[c] += 1
print(train_data[1:3])
[{'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [('中国风水十讲', '出版社', '华夏出版社'), ('中国风水十讲', '作者', '杨文衡')]}, {'text': '《空城未央》是夙言以信创作的网络小说,发表于17K小说网', 'spo_list': [('空城未央', '作者', '夙言以信'), ('空城未央', '连载网站', '17K小说网')]}]
写函数
def write_jsonline(dest_filename, items, encoding='utf-8'):
    """Write items to a file in JSON Lines format, one object per line.

    :param dest_filename: destination file path
    :param items: iterable of JSON-serialisable items, saved line by line
    :param encoding: file encoding
    :return: None
    :raises TypeError: if items is a str or is not iterable
    """
    # Imported locally so the function does not rely on a file-level import.
    from collections.abc import Iterable

    if isinstance(items, str):
        raise TypeError("json object list can't be str")
    # Only a hint: the file is still written under the given name.
    if not str(dest_filename).endswith('.jsonl'):
        print("json line filename doesn't end with .jsonl")
    if not isinstance(items, Iterable):
        raise TypeError('items must be iterable')
    # The context manager closes the file even if an item fails to serialise.
    with open(dest_filename, 'w', encoding=encoding) as file:
        for item in items:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            file.write(json.dumps(item, ensure_ascii=False) + '\n')
参考
https://blog.csdn.net/xyz1584172808/article/details/82117220
https://wuwt.me/2017/08/21/pre-trained-embedding-keras/
https://www.eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/
https://m.imooc.com/article/295512
https://radimrehurek.com/gensim/models/word2vec.html
https://blog.csdn.net/lilong117194/article/details/82849054
以上是关于json格式的文本处理的主要内容,如果未能解决你的问题,请参考以下文章
Alamofire 文件上传出现错误“JSON 文本未以数组或对象开头,并且允许未设置片段的选项”