Processing JSON-formatted text

Posted by 炫云云

JSON (JavaScript Object Notation) is a lightweight data-interchange format. I first came across it while testing server-side APIs: many servers now return their results as JSON, largely because it is easy to parse and to generate. JSON data is, at its core, a formatted string.

Encoding

json.dumps() encodes a Python object into a JSON string.

  • dumps works with strings

  • dump works with file streams
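
A minimal sketch contrasting the two; the in-memory StringIO target is only for illustration, any writable file object works the same way:

import io
import json

obj = {"word": "出版", "pos": "v"}           # a small piece of the data used below
s = json.dumps(obj, ensure_ascii=False)      # dumps returns a str
buf = io.StringIO()
json.dump(obj, buf, ensure_ascii=False)      # dump writes to a file-like object
assert s == buf.getvalue()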

import json

data =  {"postag": [{"word": "内容", "pos": "n"}, {"word": "简介", "pos": "n"},
                    {"word": "《", "pos": "w"}, {"word": "宜兴紫砂图典", "pos": "nw"},
                    {"word": "》", "pos": "w"}, {"word": "由", "pos": "p"}, 
                    {"word": "故宫出版社", "pos": "nt"}, {"word": "出版", "pos": "v"}], 
         "text": "内容简介《宜兴紫砂图典》由故宫出版社出版", 
         "spo_list": [{"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", 
                       "object": "故宫出版社", "subject": "宜兴紫砂图典"}]}


data2 = json.dumps(data)
print(data2)
{"postag": [{"word": "\\u5185\\u5bb9", "pos": "n"}, {"word": "\\u7b80\\u4ecb", "pos": "n"}, 
{"word": "\\u300a", "pos": "w"}, {"word": "\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178", "pos": "nw"}, 
{"word": "\\u300b", "pos": "w"}, {"word": "\\u7531", "pos": "p"},
{"word": "\\u6545\\u5bab\\u51fa\\u7248\\u793e", "pos": "nt"}, {"word": "\\u51fa\\u7248", "pos": "v"}], 
"text": "\\u5185\\u5bb9\\u7b80\\u4ecb\\u300a\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178\\u300b
\\u7531\\u6545\\u5bab\\u51fa\\u7248\\u793e\\u51fa\\u7248", "spo_list": 
[{"predicate": "\\u51fa\\u7248\\u793e", "object_type": "\\u51fa\\u7248\\u793e", 
"subject_type": "\\u4e66\\u7c4d", "object": "\\u6545\\u5bab\\u51fa\\u7248\\u793e",
"subject": "\\u5b9c\\u5174\\u7d2b\\u7802\\u56fe\\u5178"}]}

sort_keys=True tells the encoder to sort the keys alphabetically (a to z) in the output: if the Python object is a dict, its keys are emitted in sorted order.

indent pretty-prints the output: each nesting level of the data is indented, which makes the result much easier to read.

separators is a pair (item_separator, key_separator): the first string goes between items, the second between each key and its value. Passing (',', ':') removes the spaces that normally follow ',' and ':', producing the most compact output.
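
The code below also passes ensure_ascii=False, which keeps Chinese characters readable instead of escaping them to \uXXXX as in the first example. A tiny illustration with a toy dict (not from the original post):

small = {"b": "出版", "a": 1}
print(json.dumps(small))                                              # {"b": "\u51fa\u7248", "a": 1}
print(json.dumps(small, ensure_ascii=False))                          # {"b": "出版", "a": 1}
print(json.dumps(small, sort_keys=True, ensure_ascii=False))          # {"a": 1, "b": "出版"}
print(json.dumps(small, separators=(',', ':'), ensure_ascii=False))   # {"b":"出版","a":1}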


## Sort keys alphabetically
# indent=2 indents each nesting level by 2 spaces
data3 = json.dumps(data, sort_keys=True, indent=2, ensure_ascii=False)
print(data3)
{
  "postag": [
    {
      "pos": "n",
      "word": "内容"
    },
    {
      "pos": "n",
      "word": "简介"
    },
    {
      "pos": "w",
      "word": "《"
    },
    {
      "pos": "nw",
      "word": "宜兴紫砂图典"
    },
    {
      "pos": "w",
      "word": "》"
    },
    {
      "pos": "p",
      "word": "由"
    },
    {
      "pos": "nt",
      "word": "故宫出版社"
    },
    {
      "pos": "v",
      "word": "出版"
    }
  ],
  "spo_list": [
    {
      "object": "故宫出版社",
      "object_type": "出版社",
      "predicate": "出版社",
      "subject": "宜兴紫砂图典",
      "subject_type": "书籍"
    }
  ],
  "text": "内容简介《宜兴紫砂图典》由故宫出版社出版"
}
data4 = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
print(data4)
{"postag":[{"word":"内容","pos":"n"},{"word":"简介","pos":"n"},{"word":"《","pos":"w"},{"word":"宜兴紫砂图典","pos":"nw"},{"word":"》","pos":"w"},
{"word":"由","pos":"p"},{"word":"故宫出版社","pos":"nt"},{"word":"出版","pos":"v"}],"text":"内容简介《宜兴紫砂图典》由故宫出版社出版","spo_list":
[{"predicate":"出版社","object_type":"出版社","subject_type":"书籍","object":"故宫出版社","subject":"宜兴紫砂图典"}]}

Decoding

json.loads() decodes a JSON string into a Python object.

  • loads works with strings

  • load works with file streams
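
A quick sketch (toy JSON string, not from the post) of how JSON values map to Python types when decoded:

parsed = json.loads('{"ok": true, "count": 3, "tags": ["a", "b"], "note": null}')
print(parsed)   # {'ok': True, 'count': 3, 'tags': ['a', 'b'], 'note': None}
# JSON object -> dict, array -> list, true/false -> True/False, null -> None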

print(json.loads(data3))
{'postag': [{'pos': 'n', 'word': '内容'}, {'pos': 'n', 'word': '简介'}, {'pos': 'w', 'word': '《'}, {'pos': 'nw', 'word': '宜兴紫砂图典'}, {'pos': 'w', 'word': '》'}, 
{'pos': 'p', 'word': '由'}, {'pos': 'nt', 'word': '故宫出版社'}, {'pos': 'v', 'word': '出版'}], 'spo_list': [{'object': '故宫出版社', 'object_type': '出版社', 
'predicate': '出版社', 'subject': '宜兴紫砂图典', 'subject_type': '书籍'}], 'text': '内容简介《宜兴紫砂图典》由故宫出版社出版'}

Reading and writing files

# Write JSON data to a file
with open('data.json', 'w') as f:
    json.dump(data, f)
# Read the JSON data back
with open('data.json', 'r') as f:
    data = json.load(f)

tqdm

tqdm's tqdm() is the basic way to add a progress bar: simply wrap the iterator (or range()) of a for loop in tqdm(). Here is a simple example:

import json
from tqdm import tqdm
import os

all_50_schemas = set()

with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\all_50_schemas', 'r', encoding='utf-8') as f:
    for l in tqdm(f):                      # tqdm wraps the file iterator and reports progress per line
        a = json.loads(l)
        all_50_schemas.add(a['predicate'])

print(all_50_schemas)
50it [00:00, ?it/s]
{'出品公司', '作者', '官方语言', '毕业院校', '注册资本', '目', '导演', '面积', '海拔', '改编自', '人口数量', '作词', '丈夫', '妻子', '出生日期', '祖籍', '气候', '成立日期', '首都', '专业代码', '嘉宾', '创始人', '出生地', '字', '所属专辑', '邮政编码', '作曲', '母亲', '号', '编剧', '简称', '国籍', '所在城市', '歌手', '制片人', '连载网站', '修业年限', '董事长', '朝代', '出版社', '占地面积', '民族', '主角', '主持人', '主演', '身高', '父亲', '总部地点', '上映时间'}
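
A file object has no known length, so tqdm only shows a running count and rate ("?it/s" above). If the number of lines is known, passing total= gives a percentage bar; a small sketch reusing the loop above (the count 50 comes from the output):

all_50_schemas = set()
with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\all_50_schemas', 'r', encoding='utf-8') as f:
    for l in tqdm(f, total=50, desc='schemas'):   # total= lets tqdm render a percentage bar
        all_50_schemas.add(json.loads(l)['predicate'])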

Read function

path = r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json'

def read_json(src_filename):
    with open(src_filename,encoding='utf-8' ) as f:
        return json.load(f)

datastore =  read_json(path)

print(datastore[1:3])
---------------------------------------------------------------------------

JSONDecodeError                           Traceback (most recent call last)

<ipython-input-3-340a0261b343> in <module>
      5         return json.load(f)
      6 
----> 7 datastore =  read_json(path)
      8 
      9 print(datastore[1:3])


<ipython-input-3-340a0261b343> in read_json(src_filename)
      3 def read_json(src_filename):
      4     with open(src_filename,encoding='utf-8' ) as f:
----> 5         return json.load(f)
      6 
      7 datastore =  read_json(path)


D:\study_Software\Anaconda3\lib\json\__init__.py in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
--> 296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
    297 
    298 


D:\study_Software\Anaconda3\lib\json\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    346             parse_int is None and parse_float is None and
    347             parse_constant is None and object_pairs_hook is None and not kw):
--> 348         return _default_decoder.decode(s)
    349     if cls is None:
    350         cls = JSONDecoder


D:\study_Software\Anaconda3\lib\json\decoder.py in decode(self, s, _w)
    338         end = _w(s, end).end()
    339         if end != len(s):
--> 340             raise JSONDecodeError("Extra data", s, end)
    341         return obj
    342 


JSONDecodeError: Extra data: line 2 column 1 (char 395)

With a large data file that contains many lines (one JSON object per line), json.load() fails with an error like the one above:

raise JSONDecodeError("Extra data", s, end)

Extra data: line 2 column 1 (char 395)

The fix is to read the file line by line and collect the parsed objects into a list:


def read_jsonline(src_filename, encoding='utf-8', *, default=None):
    """
    Read a JSON Lines file.
    :param src_filename: source file path
    :param encoding: file encoding
    :param default: value to return if the file does not exist; set to None to disable this behaviour
    :return: list of objects, one object per line
    """
    if default is not None and not os.path.exists(src_filename):
        return default
    items = []
    with open(src_filename, encoding=encoding) as file:
        for line in file:
            items.append(json.loads(line))
    return items


datastore = read_jsonline(path)
print(datastore[1])
{'postag': [{'word': '《', 'pos': 'w'}, {'word': '中国风水十讲', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '是', 'pos': 'v'}, {'word': '2007年', 'pos': 't'}, {'word': '华夏出版社', 'pos': 'nt'}, {'word': '出版', 'pos': 'v'}, {'word': '的', 'pos': 'u'}, {'word': '图书', 'pos': 'n'}, {'word': ',', 'pos': 'w'}, {'word': '作者', 'pos': 'n'}, {'word': '是', 'pos': 'v'}, {'word': '杨文衡', 'pos': 'nr'}], 'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [{'predicate': '出版社', 'object_type': '出版社', 'subject_type': '书籍', 'object': '华夏出版社', 'subject': '中国风水十讲'}, {'predicate': '作者', 'object_type': '人物', 'subject_type': '图书作品', 'object': '杨文衡', 'subject': '中国风水十讲'}]}
train_data = []
chars = {}
with open(r'D:\学习·\自然语言处理\数据集\DuIE_2_0\2019_data\train_data.json', 'r', encoding='utf-8') as f:
    for line in f:
        datastore = json.loads(line)
        train_data.append(
            {
                'text': datastore['text'],
                'spo_list': [(i['subject'], i['predicate'], i['object']) for i in datastore['spo_list']]
            }
        )
        # Count character frequencies across all texts
        for c in datastore['text']:
            chars[c] = chars.get(c, 0) + 1

print(train_data[1:3])
[{'text': '《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡', 'spo_list': [('中国风水十讲', '出版社', '华夏出版社'), ('中国风水十讲', '作者', '杨文衡')]}, {'text': '《空城未央》是夙言以信创作的网络小说,发表于17K小说网', 'spo_list': [('空城未央', '作者', '夙言以信'), ('空城未央', '连载网站', '17K小说网')]}]
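The chars dict built above is a character frequency table; it is not used further here, but a quick, purely illustrative way to inspect it:

top10 = sorted(chars.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top10)   # the 10 most frequent characters and their counts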

Write function

from collections.abc import Iterable

def write_jsonline(dest_filename, items, encoding='utf-8'):
    """Write items to a file in JSON Lines format.
    :param dest_filename: destination file path
    :param items: items to be saved line by line
    :param encoding: file encoding
    :return:
    """
    if isinstance(items, str):
        raise TypeError("json object list can't be str")

    if not dest_filename.endswith('.jsonl'):
        print("json line filename doesn't end with .jsonl")

    if not isinstance(items, Iterable):
        raise TypeError('items must be iterable')

    with open(dest_filename, 'w', encoding=encoding) as file:
        for item in items:
            file.write(json.dumps(item, ensure_ascii=False) + '\n')
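
A quick round trip with the two helpers above (the file name train_data.jsonl is just an example):

write_jsonline('train_data.jsonl', train_data)   # one JSON object per line
reloaded = read_jsonline('train_data.jsonl')
# JSON has no tuple type, so the (subject, predicate, object) tuples come back as lists
assert reloaded[1]['text'] == train_data[1]['text']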

