Flatten and expand json in a faster way
Posted: 2021-04-12 01:05:39

Context
I have a json as input, and I want to explode the lists and expand the dicts nested inside the original json. I do this in a recursive explode/expand routine until there are no nested lists/dicts left. Here is my code:
import json
import pandas as pd

def one_step_parser(df):
    # loop over all columns and explode lists
    for col, _ in df.items():
        # fill nan values with empty dicts
        df[col].fillna({i: {} for i in df.index}, inplace=True)
        try:
            df = df.explode(col)
        except Exception:
            pass
    # loop over all columns and expand dicts
    for col, _ in df.items():
        # fill nan values with empty dicts
        df[col] = df[col].fillna({i: {} for i in df.index})
        try:
            # this expands nested dicts in df[col]
            series = df[col].apply(pd.Series)
            # rename the new columns with a "parent.child" prefix
            names = {}
            for output_col in list(series.columns):
                names[output_col] = str(col) + "." + str(output_col)
            series = series.rename(columns=names)
            # concat the expanded result to the dataframe
            df = pd.concat([df, series], axis=1)
            df = df.drop(columns=[col])
        except Exception:
            pass
    return df

def multi_step_parser(df_bf):
    # first parse
    df_af = one_step_parser(df_bf)
    # keep parsing until the number of columns stops changing
    if len(list(df_bf.columns)) != len(list(df_af.columns)):
        df_af = multi_step_parser(df_af)
    return df_af

def parser(json_file):
    with open(json_file, 'r') as json_f:
        my_json = json.load(json_f)
    # 1st json normalization to a data frame
    df = pd.json_normalize(my_json)
    return multi_step_parser(df)
Example input data:
"agents": [
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
"id": 101819,
"name": "O Laptops"
],
"id": 2198802,
"ip": "ip1",
"name": "x1x1x1x1",
"platform": "WINDOWS",
"plugin_feed_id": "201810182051",
"status": "on",
"uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
,
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
"id": 101839,
"name": "windy"
,
"id": 102037,
"name": "W6"
,
"id": 102049,
"name": "MS8"
],
"id": 2097601,
"ip": "ip2s",
"name": "x2xx2x2x2",
"platform": "WINDOWS",
"plugin_feed_id": "201810181351",
"status": "on",
"uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
,
"core_version": "7.1.0",
"distro": "win-x86-64",
"id": 2234103,
"ip": "x6x6x6x6x",
"last_connect": 1537384290,
"linked_on": 1537384247,
"name": "x7x7x7x",
"platform": "WINDOWS",
"status": "off",
"uuid": "0696ee38-402a-4866-b753-2816482dfce6"
],
"pagination":
"limit": 5000,
"offset": 0,
"sort": [
"name": "john",
"order": "asc"
,
"name": "mark",
"order": "dfg"
],
"total": 14416
Note that this is just an example; my inputs can be very different, so nothing in my processing is tied to the content.
Result
pagination.limit pagination.offset pagination.total unique_index agents.core_build ... agents.linked_on pagination.sort.name pagination.sort.order agents.groups.id agents.groups.name
0 5000 0 14416 0 17 ... john asc 101819 O Laptops
0 5000 0 14416 0 17 ... mark dfg 101819 O Laptops
0 5000 0 14416 0 17 ... john asc 101839 windy
0 5000 0 14416 0 17 ... john asc 102037 W6
0 5000 0 14416 0 17 ... john asc 102049 MS8
0 5000 0 14416 0 17 ... mark dfg 101839 windy
0 5000 0 14416 0 17 ... mark dfg 102037 W6
0 5000 0 14416 0 17 ... mark dfg 102049 MS8
0 5000 0 14416 0 ... 1.5373... john asc
0 5000 0 14416 0 ... 1.5373... mark dfg
Problem
As you can see, what I do gets the job done. However, when using it on a big json, the execution time grows exponentially, because every iteration creates more rows and columns, plus a concat inside the loop. What I want is to make it more efficient (especially the second loop) using apply or numpy vectorization.
Attempt
I managed to get rid of the for loop over columns for the fillna part:

df = df.apply(lambda x: x.fillna({i: {} for i in df.index}))
But I could not find a way to do the same for this call:

series = df[col].apply(pd.Series)
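For what it's worth, the usual faster idiom builds the expanded frame from the column's values in a single constructor call instead of a per-row apply. A minimal sketch with a made-up column name, assuming every cell already holds a dict:

import pandas as pd

# Sketch: expand a dict column in one DataFrame constructor call instead
# of df[col].apply(pd.Series). "meta" is a hypothetical column name;
# NaNs are assumed to have been filled with {} beforehand.
df = pd.DataFrame({"meta": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]})

expanded = pd.DataFrame(df["meta"].tolist(), index=df.index)
expanded.columns = ["meta." + str(c) for c in expanded.columns]
df = pd.concat([df.drop(columns=["meta"]), expanded], axis=1)
print(list(df.columns))  # ['meta.a', 'meta.b']

This is the same idiom the answer below builds on.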
Answer 1:

Define a function expand_dict_col that expands a column of dicts, then use explode together with expand_dict_col to reach the goal.
import pandas as pd

def expand_dict_col(df, target_col):
    """
    e.g. pagination -> {
        'limit': 5000,
        'offset': 0,
        'sort': [{'name': 'john', 'order': 'asc'}, {'name': 'mark', 'order': 'dfg'}],
        'total': 14416
    }
    becomes the columns:
    - pagination.limit
    - pagination.offset
    - pagination.sort
    - pagination.total
    """
    # fill na values with empty dicts
    cond = df[target_col].isnull()
    df.loc[cond, target_col] = df.loc[cond, target_col].fillna('{}').map(eval)
    # build the expanded frame in a single constructor call
    dfn = pd.DataFrame(df[target_col].tolist())
    dfn.columns = target_col + '.' + dfn.columns.astype(str)
    # assign positionally (.values): after explode, df may carry
    # duplicate index labels, so label-based alignment would misbehave
    for col in dfn.columns:
        df[col] = dfn[col].values
Execution:
# data is defined in the "Data" section below
df1 = pd.json_normalize(data, record_path='agents',
                        record_prefix='agents.', meta='pagination')
df2 = df1.explode('agents.groups')
expand_dict_col(df2, 'agents.groups')
expand_dict_col(df2, 'pagination')
df3 = df2.explode('pagination.sort')
expand_dict_col(df3, 'pagination.sort')
df4 = df3.drop(['pagination.sort', 'pagination', 'agents.groups'], axis=1)
Output of df4.iloc[0]:
0
agents.core_build 17
agents.core_version 7.1.1
agents.distro win-x86-64
agents.id 2198802
agents.ip ip1
agents.name x1x1x1x1
agents.platform WINDOWS
agents.plugin_feed_id 201810182051
agents.status on
agents.uuid ca8b941a-80cd-4c1c-8044-760e69781eb7
agents.last_connect NaN
agents.linked_on NaN
agents.groups.id 101819
agents.groups.name O Laptops
pagination.limit 5000
pagination.offset 0
pagination.total 14416
pagination.sort.name john
pagination.sort.order asc
Using a while loop to process the df automatically:
df = pd.json_normalize(data, record_path='agents',
                       record_prefix='agents.', meta='pagination')
while True:
    # record the type of the first non-null value in each column
    col_type = dict()
    for col in df.columns:
        non_null = df.loc[df[col].notnull(), col]
        if non_null.empty:
            continue
        col_type[col] = type(non_null.iloc[0])
    obj_col_type = pd.Series(col_type)
    list_cols = obj_col_type[obj_col_type == list].index.tolist()
    dict_cols = obj_col_type[obj_col_type == dict].index.tolist()
    # stop once no column holds lists or dicts any more
    if len(list_cols) + len(dict_cols) == 0:
        break
    for col in list_cols:
        df = df.explode(col)
    for col in dict_cols:
        expand_dict_col(df, col)
        del df[col]
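To check that the tolist() route really is the faster one the question asks for, here is a rough micro-benchmark sketch on synthetic data (my own example, not from the answer; timings will vary by machine):

import time
import pandas as pd

# 100k rows of small dicts, purely illustrative
s = pd.Series([{"a": i, "b": 2 * i} for i in range(100_000)])

t0 = time.perf_counter()
slow = s.apply(pd.Series)        # constructs one pd.Series per row
t1 = time.perf_counter()
fast = pd.DataFrame(s.tolist())  # a single DataFrame constructor call
t2 = time.perf_counter()

assert slow.equals(fast)
print(f"apply(pd.Series): {t1 - t0:.2f}s, DataFrame(tolist()): {t2 - t1:.2f}s")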
Data:
data = {
    "agents": [
        {
            "core_build": "17",
            "core_version": "7.1.1",
            "distro": "win-x86-64",
            "groups": [
                {"id": 101819, "name": "O Laptops"}
            ],
            "id": 2198802,
            "ip": "ip1",
            "name": "x1x1x1x1",
            "platform": "WINDOWS",
            "plugin_feed_id": "201810182051",
            "status": "on",
            "uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
        },
        {
            "core_build": "17",
            "core_version": "7.1.1",
            "distro": "win-x86-64",
            "groups": [
                {"id": 101839, "name": "windy"},
                {"id": 102037, "name": "W6"},
                {"id": 102049, "name": "MS8"}
            ],
            "id": 2097601,
            "ip": "ip2s",
            "name": "x2xx2x2x2",
            "platform": "WINDOWS",
            "plugin_feed_id": "201810181351",
            "status": "on",
            "uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
        },
        {
            "core_version": "7.1.0",
            "distro": "win-x86-64",
            "id": 2234103,
            "ip": "x6x6x6x6x",
            "last_connect": 1537384290,
            "linked_on": 1537384247,
            "name": "x7x7x7x",
            "platform": "WINDOWS",
            "status": "off",
            "uuid": "0696ee38-402a-4866-b753-2816482dfce6"
        }
    ],
    "pagination": {
        "limit": 5000,
        "offset": 0,
        "sort": [
            {"name": "john", "order": "asc"},
            {"name": "mark", "order": "dfg"}
        ],
        "total": 14416
    }
}
Comments:
The solution you propose is tied to the data example I gave. I am looking for a generic solution, like the function I wrote, but much faster.

It can be a generic solution; use the while loop in the answer. Just read the json into a data frame first.

When I run your generic solution on the example, two columns disappear: agents.last_connect and agents.linked_on. I cannot find where this happens.

Oh, your example data didn't cover that case. You can use pd.DataFrame(df[target_col].tolist()) to handle dict columns, but be careful with na values and the row index.

I'm not sure I understand. The two columns are there, and they are not dict columns.
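To illustrate the row-index caveat mentioned above (my own example, not from the thread): after explode the index carries duplicate labels, so assigning expanded columns back by label would misalign, which is why expand_dict_col assigns .values positionally:

import pandas as pd

df = pd.DataFrame({"g": [[{"id": 1}, {"id": 2}], [{"id": 3}]]}).explode("g")
print(df.index.tolist())              # [0, 0, 1]: duplicate labels

dfn = pd.DataFrame(df["g"].tolist())  # fresh RangeIndex 0, 1, 2
# label-based assignment (df["g.id"] = dfn["id"]) would align on the
# duplicated index and yield [1, 1, 2]; positional assignment avoids that
df["g.id"] = dfn["id"].values
print(df["g.id"].tolist())            # [1, 2, 3]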