如何在 JavaScript 中解析大型 JSON 流中的项目?
Posted
技术标签:
【中文标题】如何在 JavaScript 中解析大型 JSON 流中的项目?【英文标题】:How to parse items from a large JSON stream in JavaScript? 【发布时间】:2021-02-11 06:49:57 【问题描述】:所以我下载了 Wikidata JSON 转储,它大约 90GB,太大而无法加载到内存中。它由一个简单的 JSON 结构组成,如下所示:
[
item,
item,
item,
...
]
每个“项目”看起来像这样:
{
  "type": "item",
  "id": "Q23",
  "labels": {
    "<lang>": obj
  },
  "descriptions": {
    "<lang>": {
      "language": "<lang>",
      "value": "<string>"
    },
  },
  "aliases": {
    "<key>": [
      obj,
      obj,
    ],
  },
  "claims": {
    "<keyID>": [
      {
        "mainsnak": {
          "snaktype": "value",
          "property": "<keyID>",
          "datavalue": {
            "value": {
              "entity-type": "<type>",
              "numeric-id": <num>,
              "id": "<id>"
            },
            "type": "wikibase-entityid"
          },
          "datatype": "wikibase-item"
        },
        "type": "statement",
        "id": "<anotherId>",
        "rank": "preferred",
        "references": [
          {
            "hash": "<hash>",
            "snaks": {
              "<keyIDX>": [
                {
                  "snaktype": "value",
                  "property": "P854",
                  "datavalue": obj,
                  "datatype": "url"
                }
              ]
            },
            "snaks-order": [
              "<propID>"
            ]
          }
        ]
      }
    ]
  },
  "sitelinks": {
    "<lang>wiki": {
      "site": "<lang>wiki",
      "title": "<string>",
      "badges": []
    }
  }
}
JSON 流的配置如下:
const fs = require('fs')
const zlib = require('zlib')
const parser = require('stream-json')
// Read the gzipped dump from disk, inflate it, tokenise the JSON text and
// forward every parser token to buildItem.
let stream = fs.createReadStream('./wikidata/latest-all.json.gz')
const inflatedDump = stream.pipe(zlib.createGunzip())
const dumpTokens = inflatedDump.pipe(parser())
dumpTokens.on('data', buildItem)
/**
 * Receives one token from the JSON token stream.
 *
 * This is a skeleton: each token name it lists is recognised, but no action is
 * taken for any of them yet. Token names that are not listed are ignored.
 *
 * @param {{name: string, value?: *}} data - A single token; only `name` is read.
 * @returns {undefined}
 */
function buildItem(data) {
  switch (data.name) {
    case `startArray`:
      break
    case `startObject`:
      break
    case `startKey`:
      break
    case `stringChunk`:
      break
    case `endKey`:
      break
    case `keyValue`:
      break
    case `startString`:
      break
    case `endString`:
      break
    case `stringValue`:
      break
    case `endObject`:
      break
    case `endArray`:
      break
  }
}
请注意 buildItem 函数,它是关键所在:日志显示,JSON 流会发出如下这样的对象:
{ name: 'startArray' }
{ name: 'startObject' }
{ name: 'startKey' }
{ name: 'startString' }
{ name: 'stringValue', value: 'type' }
{ name: 'endString' }
...
如何把这些令牌重新解析成上面那样的 item 对象?要把这种线性的令牌流还原成一棵树,实在很难想清楚。
JSON 流的输出示例是 here,如果有帮助,您可以使用它来测试解析器。
【问题讨论】:
【参考方案1】:如果我理解正确,你想要这样的东西。我使用了一个 ObjectBuilder
类,它结合了所有方法来构建一个 JSON 对象。
它使用 parentStack 来跟踪所有尚未结束的对象和数组。遇到 startObject/startArray 时,会把一个新的 JSON 对象/数组压入栈;该对象/数组结束时,再把它从栈中弹出。最后一个从栈中弹出的对象就是完整的 item 对象,可以对它做进一步处理(在下面的示例中,我只是把它打印出来)。
当前正在构造的当前对象或数组总是在栈顶。
我只能使用您提供的样本的一个子集,因为原样本中 startObject 和 endObject 的数量不匹配,会导致 JSON 无效。我把这个子集放在了代码下方。
希望这就是您想要的:)
(注意:我把 buildItem() 函数包在 runSample() 函数里,只是为了能把示例 JSON 放在底部,让代码在这个在线编辑器里看起来更整洁。您完全可以把 buildItem() 函数移到外面。)
/**
 * Rebuilds a single JSON value from a flat sequence of parser tokens
 * (`startObject`, `keyValue`, `stringValue`, `endObject`, ...).
 *
 * Every container that has been opened but not yet closed sits on
 * `parentStack`, innermost last. When the outermost container closes, it is
 * stored in `finalObject` and `hasFinished()` starts returning true.
 */
class ObjectBuilder {
  constructor() {
    // The completed root value; stays undefined until the root is closed.
    this.finalObject = undefined;
    // Containers (objects/arrays) that are still open, innermost last.
    this.parentStack = [];
    // Key announced by the most recent `keyValue` token that is still waiting
    // for its value.
    this.currentKey = undefined;
  }

  /** @returns {boolean} True once the root value has been completed. */
  hasFinished() {
    return this.finalObject !== undefined;
  }

  /** @returns {*} The completed root value, or undefined if not finished. */
  getFinalObject() {
    return this.finalObject;
  }

  /** @returns {Object|Array|undefined} The innermost open container, if any. */
  currentObject() {
    return this.parentStack[this.parentStack.length - 1];
  }

  /**
   * Stores a scalar in the innermost open container: appended when that
   * container is an array, assigned under `currentKey` when it is an object.
   * With no open container the scalar itself is the root value.
   *
   * @param {*} val - The value to store.
   */
  addValue(val) {
    const container = this.currentObject();
    if (container === undefined) {
      this.finalObject = val;
    } else if (Array.isArray(container)) {
      container.push(val);
    } else {
      container[this.currentKey] = val;
    }
    this.currentKey = undefined;
  }

  /**
   * Attaches a freshly created object/array to the innermost open container
   * (if there is one) and makes it the new innermost container.
   *
   * @param {Object|Array} container - The empty container to open.
   */
  openContainer(container) {
    const parent = this.currentObject();
    if (parent === undefined) {
      // Root container: there is nothing to attach it to.
    } else if (Array.isArray(parent)) {
      parent.push(container);
    } else {
      parent[this.currentKey] = container;
    }
    this.parentStack.push(container);
    this.currentKey = undefined;
  }

  /**
   * Closes the innermost open container. If it was the root, it becomes the
   * final value.
   */
  closeContainer() {
    const closed = this.parentStack.pop();
    if (this.parentStack.length === 0) {
      this.finalObject = closed;
    }
    this.currentKey = undefined;
  }

  /**
   * Consumes one token and updates the value under construction.
   *
   * @param {{name: string, value?: *}} data - Token to process. `value` is
   *   read for `keyValue`, `stringValue` and `numberValue` tokens.
   */
  processData(data) {
    switch (data.name) {
      case 'keyValue':
        // Create the property right away so it exists even before its value
        // arrives, then remember the key for the value token that follows.
        this.currentObject()[data.value] = undefined;
        this.currentKey = data.value;
        break;
      case 'numberValue':
        // The number is converted with Number(); a string such as "42" is accepted.
        this.addValue(Number(data.value));
        break;
      case 'stringValue':
        this.addValue(data.value);
        break;
      case 'trueValue':
        this.addValue(true);
        break;
      case 'falseValue':
        this.addValue(false);
        break;
      case 'nullValue':
        this.addValue(null);
        break;
      case 'startObject':
        this.openContainer({});
        break;
      case 'startArray':
        this.openContainer([]);
        break;
      case 'endObject':
      case 'endArray':
        this.closeContainer();
        break;
      default:
        // startKey/endKey/startString/endString/stringChunk and any other
        // token carry nothing extra here: they are always accompanied by a
        // matching [something]Value token.
        break;
    }
  }
}
/**
 * Replays an in-memory list of parser tokens as if they came from the stream,
 * building one item at a time with ObjectBuilder and printing each finished
 * item as pretty-printed JSON.
 *
 * The `startArray`/`endArray` pair that wraps the list of items is recognised
 * by token name instead of by position, so the number of tokens does not need
 * to be known in advance and an input without the wrapping array does not
 * lose its first token.
 *
 * @param {Array<{name: string, value?: *}>} streamData - Tokens in stream order.
 */
function runSample(streamData) {
  // Builder for the item currently being assembled; undefined between items.
  let currentlyProcessing = undefined;
  // Becomes true once the array that wraps all items has been opened.
  let insideItemList = false;

  function buildItem(data) {
    if (currentlyProcessing === undefined) {
      if (data.name === "startArray" && !insideItemList) {
        insideItemList = true;
        return; // opening of the list of items, not part of any item
      }
      if (data.name === "endArray") {
        return; // stream ended
      }
      currentlyProcessing = new ObjectBuilder();
    }

    currentlyProcessing.processData(data);

    if (currentlyProcessing.hasFinished()) {
      // Finished building the item; do something with it
      const niceOutput = JSON.stringify(currentlyProcessing.getFinalObject(), null, 4);
      console.log(niceOutput);
      currentlyProcessing = undefined;
    }
  }

  // simulate reading stream
  for (const token of streamData) {
    buildItem(token);
  }
}
// Sample token sequence for two items wrapped in one array, in the order the
// parser emitted them. The second item additionally carries a "nestedArray"
// property that exercises arrays nested inside arrays.
const streamData = [
  {"name": "startArray"},

  // ---- first item ----
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "type"}, {"name": "endKey"}, {"name": "keyValue", "value": "type"},
  {"name": "startString"}, {"name": "stringChunk", "value": "item"}, {"name": "endString"}, {"name": "stringValue", "value": "item"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "id"}, {"name": "endKey"}, {"name": "keyValue", "value": "id"},
  {"name": "startString"}, {"name": "stringChunk", "value": "Q31"}, {"name": "endString"}, {"name": "stringValue", "value": "Q31"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "labels"}, {"name": "endKey"}, {"name": "keyValue", "value": "labels"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "el"}, {"name": "endKey"}, {"name": "keyValue", "value": "el"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "el"}, {"name": "endString"}, {"name": "stringValue", "value": "el"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"},
  {"name": "stringChunk", "value": "Β"}, {"name": "stringChunk", "value": "έ"}, {"name": "stringChunk", "value": "λ"},
  {"name": "stringChunk", "value": "γ"}, {"name": "stringChunk", "value": "ι"}, {"name": "stringChunk", "value": "ο"},
  {"name": "endString"}, {"name": "stringValue", "value": "Βέλγιο"},
  {"name": "endObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "ay"}, {"name": "endKey"}, {"name": "keyValue", "value": "ay"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "ay"}, {"name": "endString"}, {"name": "stringValue", "value": "ay"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"}, {"name": "stringChunk", "value": "Bilkiya"}, {"name": "endString"}, {"name": "stringValue", "value": "Bilkiya"},
  {"name": "endObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "pnb"}, {"name": "endKey"}, {"name": "keyValue", "value": "pnb"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "pnb"}, {"name": "endString"}, {"name": "stringValue", "value": "pnb"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"},
  {"name": "stringChunk", "value": "ب"}, {"name": "stringChunk", "value": "ی"}, {"name": "stringChunk", "value": "ل"},
  {"name": "stringChunk", "value": "ج"}, {"name": "stringChunk", "value": "ی"}, {"name": "stringChunk", "value": "م"},
  {"name": "endString"}, {"name": "stringValue", "value": "بیلجیم"},
  {"name": "endObject"},
  {"name": "endObject"},
  {"name": "endObject"},

  // ---- second item ----
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "type"}, {"name": "endKey"}, {"name": "keyValue", "value": "type"},
  {"name": "startString"}, {"name": "stringChunk", "value": "item"}, {"name": "endString"}, {"name": "stringValue", "value": "item"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "id"}, {"name": "endKey"}, {"name": "keyValue", "value": "id"},
  {"name": "startString"}, {"name": "stringChunk", "value": "Q31"}, {"name": "endString"}, {"name": "stringValue", "value": "Q31"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "labels"}, {"name": "endKey"}, {"name": "keyValue", "value": "labels"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "el"}, {"name": "endKey"}, {"name": "keyValue", "value": "el"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "el"}, {"name": "endString"}, {"name": "stringValue", "value": "el"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"},
  {"name": "stringChunk", "value": "Β"}, {"name": "stringChunk", "value": "έ"}, {"name": "stringChunk", "value": "λ"},
  {"name": "stringChunk", "value": "γ"}, {"name": "stringChunk", "value": "ι"}, {"name": "stringChunk", "value": "ο"},
  {"name": "endString"}, {"name": "stringValue", "value": "Βέλγιο"},
  {"name": "endObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "ay"}, {"name": "endKey"}, {"name": "keyValue", "value": "ay"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "ay"}, {"name": "endString"}, {"name": "stringValue", "value": "ay"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"}, {"name": "stringChunk", "value": "Bilkiya"}, {"name": "endString"}, {"name": "stringValue", "value": "Bilkiya"},
  {"name": "endObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "pnb"}, {"name": "endKey"}, {"name": "keyValue", "value": "pnb"},
  {"name": "startObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "language"}, {"name": "endKey"}, {"name": "keyValue", "value": "language"},
  {"name": "startString"}, {"name": "stringChunk", "value": "pnb"}, {"name": "endString"}, {"name": "stringValue", "value": "pnb"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "value"}, {"name": "endKey"}, {"name": "keyValue", "value": "value"},
  {"name": "startString"},
  {"name": "stringChunk", "value": "ب"}, {"name": "stringChunk", "value": "ی"}, {"name": "stringChunk", "value": "ل"},
  {"name": "stringChunk", "value": "ج"}, {"name": "stringChunk", "value": "ی"}, {"name": "stringChunk", "value": "م"},
  {"name": "endString"}, {"name": "stringValue", "value": "بیلجیم"},
  {"name": "endObject"},
  {"name": "startKey"}, {"name": "stringChunk", "value": "nestedArray"}, {"name": "endKey"}, {"name": "keyValue", "value": "nestedArray"},
  {"name": "startArray"},
  {"name": "stringValue", "value": "a"},
  {"name": "stringValue", "value": "b"},
  {"name": "startArray"},
  {"name": "stringValue", "value": "c"},
  {"name": "startObject"},
  {"name": "keyValue", "value": "another object"},
  {"name": "stringValue", "value": "d"},
  {"name": "endObject"},
  {"name": "stringValue", "value": "e"},
  {"name": "endArray"},
  {"name": "stringValue", "value": "b"},
  {"name": "endArray"},
  {"name": "endObject"},
  {"name": "endObject"},

  {"name": "endArray"}
];
runSample(streamData);
【讨论】:
哇,这看起来太棒了!这有点过头了,所以给我一些时间来深入了解一下。 当然,如果您希望我更深入地描述它,请告诉我。它并不像看起来那么复杂:) 如果你不知道流的大小,如何让它工作,所以它不会在最后一个失败?现在它正在使用“在开始和结束项目数组时跳过第一个和最后一个块”。您如何删除该要求,以便我们可以使其在未知长度的流上工作(即我不知道这个 gzip 压缩的 JSON 文件中有多少“项目”对象哈哈)? 我还注意到这个this.isArray = true
似乎只允许一层数组,我不确定。如果您的对象的属性值是数组和/或深度嵌套的数组,会发生什么?它适用吗?如果不是,如何修改它以使用它?
抱歉,我忘了嵌套数组。我会尽快解决的。【参考方案2】:
使用内置函数(StreamArray)
stream-json
已经具有将流转换为对象的内置函数(在这种情况下,您正在寻找StreamArray)。您可能希望使用内置函数,因为它们的编码考虑了性能。
要使用它,它看起来像:
const fs = require('fs')
const zlib = require('zlib')
const parser = require('stream-json')
const streamArray = require('stream-json/streamers/StreamArray')
// Read the gzipped dump, inflate it, tokenise the JSON, and let StreamArray
// assemble each element of the top-level array into a complete value that is
// handed to processData.
let stream = fs.createReadStream('./wikidata/latest-all.json.gz')
stream
  .pipe(zlib.createGunzip())
  .pipe(parser())
  // `streamArray` is bound to the whole module export (the StreamArray class),
  // which cannot be called without `new`; its static streamArray() factory
  // creates the stream instead.
  .pipe(streamArray.streamArray())
  .on('data', d => processData(d.value))
/**
 * Handles one fully assembled item; this sample simply prints it.
 *
 * @param {*} data - The item to print.
 */
function processData(data) {
  console.log(data)
}
我建议您查看https://github.com/uhop/stream-json/wiki 的 wiki 以了解更多信息,因为它具有额外的功能,尤其是过滤或转换,这可能对您有用,尤其是在关注速度的情况下。
【讨论】:
酷 :) 一个问题,我怎样才能在特定点恢复?我可以问另一个问题,在这里,解释我刚刚遇到的问题哈哈***.com/questions/64625354/…。以上是关于如何在 JavaScript 中解析大型 JSON 流中的项目?的主要内容,如果未能解决你的问题,请参考以下文章
如何在 javascript 中使用表单数据上传大型 JSON 对象?
在 Nodejs 中解析大型 JSON 文件并独立处理每个对象