# python xml2json
# Posted
# tags:
# Foreword: this article was compiled by the editors of cha138.com. It mainly introduces python xml2json and is intended as a useful reference.
# use beautiful soup for XML parsing
from bs4 import BeautifulSoup
# use json for writing json file and json formatting
import json
# use sys for displaying python version information
import sys
# Report the version of the interpreter running this script.
print(sys.version_info)
## 2. Input and output locations
### 2.1 path of the XML document to read and of the JSON file to write
xml_file = "./xml.xml"
json_file = "./json.dat"
## 3 supporting functions
### 3.1 build_tag: define a build tag function which will extract specific data for saving to json file
def _first_child_string(tag, name):
    """Return the ``.string`` of the first child of the first *name* element.

    Returns an empty string when *tag* has no *name* descendant, or when that
    element has no children, instead of raising.
    """
    element = tag.find(name)
    if element is None:
        return ''
    for child in element:
        # only the first child is used
        return child.string
    return ''


def build_tag(tag):
    """Build a JSON-serialisable dictionary describing one XML element.

    Args:
        tag: A BeautifulSoup ``Tag`` (or any object offering compatible
            ``find`` and ``find_all`` methods).

    Returns:
        A dict with, in insertion order:
          * ``'Description'``: text of the first ``Description`` element,
            or ``''`` if missing or empty.
          * ``'RelatedTerms'``: a list with one
            ``{'Relationship': ..., 'Title': ...}`` dict per ``Relationship``
            descendant of *tag*.
          * ``'Title'``: text of the first ``Title`` element, or ``''`` if
            missing or empty.
    """
    title_string = _first_child_string(tag, 'Title')
    description_string = _first_child_string(tag, 'Description')

    # Only Title elements whose grandparent is <RelatedTerms> belong to a
    # related term; getattr guards against a missing grandparent.
    related_titles = [
        title.string
        for title in tag.find_all('Title')
        if getattr(title.parent.parent, 'name', None) == 'RelatedTerms'
    ]
    relationships = [rel.string for rel in tag.find_all('Relationship')]

    # NOTE(review): relationships and titles are paired by position in the
    # document. A related term without a Title that is not the last one would
    # shift the following titles -- confirm the input always provides both.
    related_terms = []
    for index, relationship_string in enumerate(relationships):
        if index < len(related_titles):
            rt_title_string = related_titles[index]
        else:
            # no Title left for this Relationship: use a blank title
            rt_title_string = ''
        related_terms.append(
            {'Relationship': relationship_string, 'Title': rt_title_string}
        )

    # key order is kept as Description, RelatedTerms, Title
    return {
        'Description': description_string,
        'RelatedTerms': related_terms,
        'Title': title_string,
    }
###
### 3.2 traverseXML: iterate through all XML tags in the document
# for each tag build a dictionary element with the features to include
def traverseXML(tag):
    """Convert every direct child element of *tag* into a dictionary.

    Args:
        tag: A BeautifulSoup ``Tag`` (or any object with a compatible
            ``find_all`` method) whose direct children are to be converted.

    Returns:
        A list with one ``build_tag`` result per direct child element, in
        the order ``find_all`` returns them; empty when there are no
        child elements.
    """
    # recursive=False restricts the search to direct children; deeper
    # levels are read by build_tag itself.
    return [build_tag(child) for child in tag.find_all(recursive=False)]
###
## 4. Main
### 4.1 open XML file into soup object
try:
    # the context manager closes the input file once it has been parsed
    with open(xml_file, encoding='UTF-8') as xml_input:
        soup = BeautifulSoup(xml_input, 'lxml-xml')
except Exception as error:
    print('[ERROR]: opening file :' + xml_file)
    print(error)
    # without a parsed document there is nothing to convert; stop here
    # rather than failing later on the undefined name `soup`
    sys.exit(1)
### 4.2 optionally for debugging purposes display "pretty" format of XML document
# display the XML file structure
print(soup.prettify())
### 4.3 extract tags from XML document and build a dictionary and list of tags for JSON formatting
# tagDict contains the XML document
tagDict = {}
# tagList contains one dict per XML element under the root
tagList = []
# start with the root node in the XML document
for child in soup.find_all(recursive=False):
    # process from root element
    if child.name:
        tagList = traverseXML(child)
# create a dictionary with 'thesaurus' as the key
tagDict = {'thesaurus': tagList}
### 4.4 write tagDict to json file
# format the json output (indent=0: one item per line, no leading spaces)
try:
    with open(json_file, 'w', encoding='utf-8') as output:
        json.dump(tagDict, output, sort_keys=False, indent=0)
except Exception as error:
    print('[ERROR]: writing to file :' + json_file)
    # report the cause, as the read handler does
    print(error)
# The above is the main content about python xml2json. If it did not solve your problem, please refer to the following articles.