Python解析XML数据
Posted SpikeKing
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python解析XML数据相关的知识,希望对你有一定的参考价值。
XML数据类似于多层嵌套的字典,其中的信息分为元素(Element)和属性(Attribute)两类。
- Elements,元素:类似于meta和image等元素,调用方法getElementsByTagName();
- Attribute,属性:类似于id、name、width、height等属性,调用方法getAttribute();
- 从documentElement开始逐层解析
数据:
<?xml version="1.0" encoding="utf-8"?>
<annotations>
<version>1.1</version>
<meta>
</meta>
<image id="0" name="sample_1.jpg" width="3250" height="2130">
</image>
</annotations>
应用文档,参考:
- 读取文档的meta和image元素;
- 遍历各个图像image;
- 读取图像的name属性,以及box、points子元素;
- box是框、points是点,根据落在框内的点,沿水平方向将框拆分为多个子框;
- 写入文件。
待处理数据:XML解析数据.zip
源码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2021. All rights reserved.
Created by C. L. Wang on 24.6.21
"""
import cv2
import os
import json
import collections
from xml.dom.minidom import parse
import xml.dom.minidom
from myutils.project_utils import write_list_to_file, unfold_nested_list
from myutils.cv_utils import check_point_in_box
from root_dir import DATA_DIR
class SampleLabeledParser(object):
    """
    Parse a labeled-sample XML file and write per-image box lists to a text file.

    The XML is expected to contain ``<image name="...">`` elements whose
    children are ``<box xtl ytl xbr ybr>`` and ``<points points="...">``
    elements. Each box is split along the x axis at the points that fall
    inside it.
    NOTE(review): this layout looks like a CVAT "for images" export; confirm
    against the tool that produced the file.
    """

    def __init__(self):
        # All inputs and outputs live under DATA_DIR/tmps.
        self.image_dir = os.path.join(DATA_DIR, 'tmps')
        self.label_path = os.path.join(DATA_DIR, 'tmps', 'annotations.xml')
        self.out_labeled = os.path.join(DATA_DIR, 'tmps', 'out_labeled.txt')

    @staticmethod
    def _parse_points(pnt_str):
        """
        Convert a ``points`` attribute string into a list of integer points.

        Accepts a single ``"x,y"`` pair as well as several pairs joined by
        ``";"`` (``"x1,y1;x2,y2"``). Coordinates are parsed as floats and
        truncated to int. Empty segments are skipped, so an empty string
        yields an empty list.

        :param pnt_str: raw value of the ``points`` attribute
        :return: list of points, each a list of ints (``[x, y]`` for
            well-formed input)
        """
        pnt_list = []
        for pair in pnt_str.split(";"):
            pair = pair.strip()
            if not pair:
                continue
            pnt_list.append([int(float(v)) for v in pair.split(",")])
        return pnt_list

    @staticmethod
    def split_boxes(pnt_list, box):
        """
        Split a box into vertical strips at the x coordinates of the points.

        :param pnt_list: points; only element 0 (x) of each point is used
        :param box: ``[x_min, y_min, x_max, y_max]``
        :return: ``[box]`` when there are no points, otherwise
            ``len(pnt_list) + 1`` boxes ordered left to right, all sharing
            the original y range
        """
        if not pnt_list:
            return [box]
        x_min, y_min, x_max, y_max = box
        sub_boxes = []
        x_s = x_min
        for x in sorted(pnt[0] for pnt in pnt_list):
            sub_boxes.append([x_s, y_min, x, y_max])
            x_s = x  # the next strip starts where this one ended
        sub_boxes.append([x_s, y_min, x_max, y_max])
        return sub_boxes

    @staticmethod
    def parse_pnt_and_box(box_pnt_dict, box_list, img_bgr=None):
        """
        Build the final box list from the boxes and their split points.

        :param box_pnt_dict: mapping of box index -> list of points inside it
        :param box_list: all boxes of the image
        :param img_bgr: not used by the computation; kept so existing callers
            keep working
        :return: split boxes of every box that has points, followed by the
            untouched boxes that have none
        """
        sub_boxes_list = [
            SampleLabeledParser.split_boxes(pnt_list, box_list[idx])
            for idx, pnt_list in box_pnt_dict.items()
        ]
        # Turn the two-level list into a single-level list.
        sub_boxes_list = unfold_nested_list(sub_boxes_list)

        # Boxes without any point (original note: crossed-out text regions
        # need separate treatment) are appended whole, after the split ones.
        for x_idx, box in enumerate(box_list):
            if x_idx not in box_pnt_dict:
                sub_boxes_list.append(box)
        return sub_boxes_list

    def process_annotations(self):
        """
        Parse ``self.label_path`` and write one JSON line per image to
        ``self.out_labeled``.

        Each line has the keys ``image_name`` and ``char_boxes``. An image
        file that cannot be read only produces a warning: the image is used
        for logging its shape, not for computing the boxes.
        """
        DOMTree = xml.dom.minidom.parse(self.label_path)
        collection = DOMTree.documentElement
        # Fetched for inspection only; the output does not depend on it.
        meta = collection.getElementsByTagName("meta")
        image_data = collection.getElementsByTagName("image")
        print('[Info] 样本数: {}'.format(len(image_data)))

        anno_list = []  # one JSON string per image
        for image in image_data:
            image_name = image.getAttribute("name")
            print('[Info] image: {}'.format(image_name))
            image_path = os.path.join(self.image_dir, image_name)
            img_bgr = cv2.imread(image_path)
            if img_bgr is None:
                # cv2.imread signals a missing/unreadable file with None
                # instead of raising.
                print('[Warning] image not readable: {}'.format(image_path))
            else:
                print('[Info] img_bgr: {}'.format(img_bgr.shape))

            box_data = image.getElementsByTagName("box")
            points_data = image.getElementsByTagName("points")
            print('[Info] box_data: {}'.format(len(box_data)))
            print('[Info] points_data: {}'.format(len(points_data)))

            box_list = []
            for box in box_data:
                corners = [
                    float(box.getAttribute("xtl")),
                    float(box.getAttribute("ytl")),
                    float(box.getAttribute("xbr")),
                    float(box.getAttribute("ybr")),
                ]
                box_list.append([int(x) for x in corners])
            print('[Info] 框数量: {}'.format(len(box_list)))

            # Attach every point to the first box that contains it.
            box_pnt_dict = collections.defaultdict(list)
            for points in points_data:
                for pnt in self._parse_points(points.getAttribute("points")):
                    for idx, box in enumerate(box_list):
                        if check_point_in_box(pnt, box):
                            box_pnt_dict[idx].append(pnt)
                            break
                    else:
                        # The point lies in none of the boxes.
                        print('[Info] error pnt: {}'.format(pnt))

            sub_boxes_list = self.parse_pnt_and_box(box_pnt_dict, box_list, img_bgr=img_bgr)
            print('[Info] 全部框数: {}'.format(len(sub_boxes_list)))

            img_anno_dict = {
                "image_name": image_name,
                "char_boxes": sub_boxes_list
            }
            anno_list.append(json.dumps(img_anno_dict))

        print('[Info] 标签数量: {}'.format(len(anno_list)))
        write_list_to_file(self.out_labeled, anno_list)
        print('[Info] 标签文本写入完成: {}'.format(self.out_labeled))
def main():
    """Create the parser with its default paths and convert the annotations."""
    parser = SampleLabeledParser()
    parser.process_annotations()


if __name__ == '__main__':
    main()
以上是关于Python解析XML数据的主要内容,如果未能解决你的问题,请参考以下文章
Android 逆向使用 Python 解析 ELF 文件 ( Capstone 反汇编 ELF 文件中的机器码数据 | 创建反汇编解析器实例对象 | 设置汇编解析器显示细节 )(代码片段