Python处理xml文件_文件格式怎么转换

Python处理xml文件_文件格式怎么转换由于项目组中原来的文件使用的XML格式作为配置,扩展性很好,但是编辑与阅读不是很直观,特别一些规则的二维表,所以为了方便阅读与编辑,花了一些时间写了一个Python脚本,以实现将XML文件转为Excel文件。这里支持XML文件转为一个Sheet或者多个Sheet:如果第二层所有标签都相同则会转为一个Sheet,所有第二层的标签都会作为行数据如果第二层的标签有多种,则会把第二层的不同标签作为…

大家好,又见面了,我是你们的朋友全栈君。如果您正在找激活码,请点击查看最新教程,关注关注公众号 “全栈程序员社区” 获取激活教程,可能之前旧版本教程已经失效.最新Idea2022.1教程亲测有效,一键激活。

Jetbrains全系列IDE使用 1年只要46元 售后保障 童叟无欺

由于项目组中原来的文件使用的XML格式作为配置,扩展性很好,但是阅读起来不是很直观,编辑起来也是不很方便,特别一些规则的二维表,个人觉得使用EXCEL文件会方便很多。所以为了方便阅读与编辑这些规则的二维表,花了一些时间写了一个Python脚本,以实现将XML文件转为Excel文件。
这里支持XML文件转为一个Sheet或者多个Sheet:

  • 如果第二层所有标签都相同则会转为一个Sheet,所有第二层的标签都会作为行数据
  • 如果第二层的标签有多种,则会把第二层的不同标签作为不同的Sheet,第三层的数据作为行数据

其它情况未作测试。

注意:脚本会自动根据所有同层标签计算所有字段以及所有子标签及其重复数,并作为列的扩展,如果XML的嵌套过深或者太复杂,可能会导致扩展后的列数超过EXCEL所支持的最大列数(16384列),导致失败。

附上源码以共享,当然,如果有更好的Idea进行改进欢迎留言。

#
# XML文件转换成Excel文件
#
# 版本:1.0
#
# 作者:Witton Bell
# E_Mail:witton@163.com
#
#
# 功能描述:
#
# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
#
# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
#
# 仅支持XML的第一层只有一个标签,
# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
#
# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
#
#
import openpyxl
from openpyxl.styles import Alignment
import xml.dom.minidom
from xml.dom.minidom import Document
from openpyxl.styles import Border, Side
import chardet
import os


class XMLNode:
    def __init__(self):
        self.name = ""
        self.properties = []
        self.child = []
        self.layer = 0
        self.index = 0
        self.parent = None
        self.node_info = None


class XMLNodeInfo:
    def __init__(self, node):
        self.map_properties = { 
   }
        self.map_child = { 
   }
        self.max_index = 0
        self.layer = node.layer
        self.name = node.name

        self.register(node)

    def register(self, node):
        for k in node.properties:
            if self.map_properties.get(k[0]) is None:
                self.map_properties[k[0]] = self.map_properties.__len__()

        for ch in node.child:
            v = self.map_child.get(ch.name)
            if v is None:
                self.map_child[ch.name] = ch.node_info

        if node.index > self.max_index:
            self.max_index = node.index


class XMLReader:
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    def __init__(self, file_path, is_auto_convert2utf8, is_convert_to_original_file):
        self.__root = []
        self.__map_node_info = { 
   }
        if is_auto_convert2utf8:
            is_tmp_file, tmp_file_path = self.__convert_to_utf8(file_path)
            fd = xml.dom.minidom.parse(tmp_file_path)
            if is_tmp_file:
                if is_convert_to_original_file:
                    os.remove(file_path)
                    os.rename(tmp_file_path, file_path)
                else:
                    os.remove(tmp_file_path)
        else:
            fd = xml.dom.minidom.parse(file_path)
        index = 0
        for child in fd.childNodes:
            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            self.__read_node(child, self.__root, index, None)
            index += 1

    def get_root_node(self):
        return self.__root

    @staticmethod
    def __convert_to_utf8(file_path):
        fd = open(file_path, "rb")
        fd.seek(0, 2)
        size = fd.tell()
        if size > 1024 * 1024:
            size = 1024 * 1024
        fd.seek(0, 0)
        text = fd.read(size)
        ret = chardet.detect(text)
        if ret['encoding'].lower().find("utf-8") != -1:
            return False, file_path

        tmp_file = file_path + ".tmp"
        file = open(tmp_file, "w", encoding="utf-8", newline="\n")
        fd.seek(0, 0)
        line = fd.readline()
        while line.__len__() > 0:
            file.write(line.decode("gb18030"))
            line = fd.readline()
        file.close()
        fd.close()

        return True, tmp_file

    @staticmethod
    def __get_attrib(node, rc):
        if node._attrs is None:
            return
        for key in node._attrs:
            v = node._attrs[key]._value
            rc.append([key, v])

    def __read_node(self, node, root, index, parent, layer=1):
        xml_node = XMLNode()
        xml_node.name = node.nodeName
        xml_node.layer = layer
        xml_node.index = index
        xml_node.parent = parent
        self.__get_attrib(node, xml_node.properties)
        i = 0
        for child in node.childNodes:
            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
            i += 1
        root.append(xml_node)
        self.__register(xml_node)

    def __register(self, node):
        key = node.name + str(node.layer)
        nd = self.__map_node_info.get(key)
        if nd is None:
            nd = XMLNodeInfo(node)
            node.node_info = nd
            self.__map_node_info[key] = nd
        else:
            nd.register(node)
            node.node_info = nd


class XMLWriter:
    def __init__(self, xml_file_path, xml_node_list):
        doc = Document()
        for node in xml_node_list:
            ele = self.__write_node(node, doc)
            doc.appendChild(ele)

        f = open(xml_file_path, 'w', encoding='utf-8')
        doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')

    def __write_node(self, node, doc):
        ele = doc.createElement(node.name)
        for prop in node.properties:
            ele.setAttribute(prop[0], prop[1])

        for child in node.childs:
            ret = self.__write_node(child, doc)
            ele.appendChild(ret)

        return ele


class XmlToXls:
    # read_from_xml_file_path:XML源文件完整路径
    # save_to_xls_file_path:保存转换后的Excel文件完整路径
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    # is_merge_head:如果表头有多行,是否合并层属关系的表头
    # is_border_cell:是否添加单元格的边框
    # is_alignment_center:单元格是否居中对齐
    def __init__(self, read_from_xml_file_path, save_to_xls_file_path,
                 is_auto_convert2utf8=True, is_convert_to_original_file=False,
                 is_merge_head=False, is_border_cell=False, is_alignment_center=True):
        try:
            self.__is_merge_head = is_merge_head
            if is_alignment_center:
                self.__alignment = Alignment(horizontal='center', vertical='center')
            else:
                self.__alignment = None
            if is_border_cell:
                side = Side(border_style='thin', color='000000')
                self.__border = Border(side, side, side, side, side)
            else:
                self.__border = None
            wb = openpyxl.Workbook()
            wb.encoding = 'utf-8'
            for sh in wb.worksheets:
                wb.remove_sheet(sh)

            reader = XMLReader(read_from_xml_file_path,
                               is_auto_convert2utf8,
                               is_convert_to_original_file)
            self.__write(reader.get_root_node(), wb)
            wb.save(save_to_xls_file_path)
            wb.close()
        except Exception as e:
            print(e)

    def __write(self, xml_node, wb):
        self.__map_field = { 
   }
        for node in xml_node:
            if node.node_info.map_child.__len__() == 1:
                self.__start_layer = 1
                self.__write_sheet(node, wb)
            else:
                self.__start_layer = 2
                for child in node.child:
                    self.__write_sheet(child, wb)

    def __write_sheet(self, node, wb):
        sh = wb.create_sheet(node.name)
        self.__write_head(node.node_info, sh)
        self.__write_data(node, sh)

    def __write_head(self, node, sh):
        for key in node.map_child:
            col = 1
            child = node.map_child[key]
            for k in child.map_properties:
                c = child.map_properties[k]
                self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
            col += child.map_properties.__len__()
            self.__write_head_ext(child, sh, col)
            break

    def __write_head_ext(self, node, sh, col):
        for key in node.map_child:
            child = node.map_child[key]
            num = child.map_properties.__len__()
            for i in range(0, child.max_index + 1):
                if col > 16384:
                    raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")

                old_col = col
                self.__write_cell(sh, child.layer - self.__start_layer - 1, col, child.name)
                for k in child.map_properties:
                    c = child.map_properties[k]
                    self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
                col += num
                col = self.__write_head_ext(child, sh, col)
                if self.__is_merge_head:
                    merge_row = child.layer - self.__start_layer - 1
                    sh.merge_cells(start_row=merge_row, end_row=merge_row,
                                   start_column=old_col, end_column=col - 1)
        return col

    def __write_data(self, node, sh):
        row = sh.max_row + 1
        sh.freeze_panes = sh.cell(row, 1)
        for child in node.child:
            col = 1
            self.__write_data_ext(child, sh, row, col)
            row += 1

    def __write_data_ext(self, node, sh, row, col):
        m = node.node_info.map_properties
        max_prop_num = m.__len__()
        for prop in node.properties:
            c = m[prop[0]]
            self.__write_cell(sh, row, c + col, prop[1])
        col += max_prop_num

        map_child = { 
   }
        for child in node.child:
            if map_child.get(child.name) is None:
                map_child[child.name] = 1
            else:
                map_child[child.name] += 1
            col = self.__write_data_ext(child, sh, row, col)

        for key in node.node_info.map_child:
            child = node.node_info.map_child[key]
            all_count = child.max_index + 1
            count = map_child.get(key)
            if count is not None:
                all_count -= count

            for i in range(0, all_count):
                col += child.map_properties.__len__()

        return col

    def __write_cell(self, sh, row, col, value):
        if value.isdigit():
            cell = sh.cell(row, col, int(value))
        else:
            cell = sh.cell(row, col, value)
        cell.alignment = self.__alignment
        cell.border = self.__border

由于使用的XML解析器不支持值中有大于(>),小于(<)这些特殊符号,所以如果值中有这些符号的XML文件会解析失败,报错:

not well-formed (invalid token)

比如下面的XML文件就会报上面的错:

<?xml version='1.0' encoding='UTF-8'?>
<test>
<testData value="<测试数据>"/>
</test>

也不支持没有根节点的XML文件
比如:

<?xml version='1.0' encoding='UTF-8'?>
<A Value="A1">
    <AA value="a"/>
</A>
<B Value="B1">
    <BB value="b"/>
</B>
<C Value="C1">
    <CC value="c"/>
</C>

会报错:

junk after document element

C++使用的tinyxml是可以正常解析大于小于等特殊符号的,网上有一个pytinyxml2开源项目,让python可以使用tinyxml进行解析。

安装pytinyxml2之前需要先安装swig,Windows下可以下载:swigwin-4.0.1.zip,这是已经编译好的版本,解压出来后添加路径到PATH环境变量即可。
非Windows可以下载swig-4.0.1.tar.gz进行编译安装,也可以直接:

yum install swig

安装好swig后,在pytinyxml2源码目录中执行:

python setup.py install

不能使用

pip install pytinyxml2

进行安装,我遇到有报错:

running bdist_wheel
  running build
  running build_py
  creating build
  creating build/lib.linux-x86_64-3.5
  copying pytinyxml2.py -> build/lib.linux-x86_64-3.5
  running build_ext
  building '_pytinyxml2' extension
  swigging pytinyxml2.i to pytinyxml2_wrap.cpp
  swig -python -c++ -o pytinyxml2_wrap.cpp pytinyxml2.i
  pytinyxml2.i:5: Error: Unable to find 'tinyxml2.h'
  error: command 'swig' failed with exit status 1

查看pytinyxml2的包可以看到,里面缺少tinyxml2.h

项目中原来的程序只能读取XML格式的文档,我们为了方便编辑与查阅,所以把XML转为Excel,编辑完成后,还需要把Excel转为原来的XML,所以实现了XLS转XML,并对之前的代码作了部分修改,附上源码:

# -*- coding: UTF-8 -*-
#
# XML文件与Excel文件互转
#
# 版本:1.1
#
# 作者:Witton Bell
# E_Mail:witton@163.com
#
#
# 功能描述:
#
# XmlToXls:
#
# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
#
# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
#
# 使用XMLReader仅支持XML的只有一个根标签(标准XML格式),使用TinyXMLReader支持有多个根标签
# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
#
# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
#
# XlsToXml:
# 同样XMLWriter仅支持XML只有一个根标签(标准XML格式),使用TinyXMLWriter支持有多个根标签
# Excel文件需要有层次分明的表头,并且需要冻结表头
#
import openpyxl
from openpyxl.styles import Alignment
from openpyxl.styles import Border, Side
from openpyxl.comments import Comment
import openpyxl.utils as xls_util
import chardet
import os
import xml.dom.minidom
import copy

class XMLNode:
    def __init__(self):
        self.name = ""
        self.properties = []
        self.child = []
        self.layer = 0
        self.index = 0
        self.parent = None
        self.node_info = None


class XMLNodeInfo:
    def __init__(self, node):
        self.map_properties = { 
   }
        self.map_child = { 
   }
        self.max_index = 0
        self.layer = node.layer
        self.name = node.name
        self.parent = node.parent

        self.register(node)

    def register(self, node):
        for k in node.properties:
            if self.map_properties.get(k[0]) is None:
                self.map_properties[k[0]] = self.map_properties.__len__()

        for ch in node.child:
            v = self.map_child.get(ch.name)
            if v is None:
                self.map_child[ch.name] = ch.node_info

        if node.index > self.max_index:
            self.max_index = node.index


class _XMLReaderBase:
    def __init__(self):
        self._root = []
        self._map_node_info = { 
   }

    def __calc_node_key(self, node):
        key = ""
        if node.parent is not None:
            key = self.__calc_node_key(node.parent)
        return "%s_%s" % (key, node.name)

    def _register(self, node):
        key = self.__calc_node_key(node)
        nd = self._map_node_info.get(key)
        if nd is None:
            nd = XMLNodeInfo(node)
            node.node_info = nd
            self._map_node_info[key] = nd
        else:
            nd.register(node)
            node.node_info = nd

    @staticmethod
    def _convert_to_utf8(file_path):
        fd = open(file_path, "rb")
        fd.seek(0, 2)
        size = fd.tell()
        if size > 1024 * 1024:
            size = 1024 * 1024
        fd.seek(0, 0)
        text = fd.read(size)
        ret = chardet.detect(text)
        if ret['encoding'].lower().find("utf-8") != -1:
            return False, file_path

        tmp_file = file_path + ".tmp"
        file = open(tmp_file, "w", encoding="utf-8", newline="\n")
        fd.seek(0, 0)
        line = fd.readline()
        while line.__len__() > 0:
            file.write(line.decode("gb18030"))
            line = fd.readline()
        file.close()
        fd.close()

        return True, tmp_file


# 该类使用xml不能解析值中带特殊符号的文件,也不支持没有统一根节点的XML文件,建议使用TinyXMLReader
class XMLReader(_XMLReaderBase):
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
        _XMLReaderBase.__init__(self)
        if is_auto_convert2utf8:
            is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
            fd = xml.dom.minidom.parse(tmp_file_path)
            if is_tmp_file:
                if is_convert_to_original_file:
                    os.remove(file_path)
                    os.rename(tmp_file_path, file_path)
                else:
                    os.remove(tmp_file_path)
        else:
            fd = xml.dom.minidom.parse(file_path)
        index = 0
        for child in fd.childNodes:
            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            self.__read_node(child, self._root, index, None)
            index += 1

        return self._root

    @staticmethod
    def __get_attrib(node, rc):
        if node._attrs is None:
            return
        for key in node._attrs:
            v = node._attrs[key]._value
            rc.append([key, v])

    def __read_node(self, node, root, index, parent, layer=1):
        xml_node = XMLNode()
        xml_node.name = node.nodeName
        xml_node.layer = layer
        xml_node.index = index
        xml_node.parent = parent
        self.__get_attrib(node, xml_node.properties)
        i = 0
        for child in node.childNodes:
            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
            i += 1
        root.append(xml_node)
        self._register(xml_node)


# 该类需要安装pytinyxml2,参见:https://blog.csdn.net/witton/article/details/100302498
class TinyXMLReader(_XMLReaderBase):
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
        _XMLReaderBase.__init__(self)
        import pytinyxml2
        doc = pytinyxml2.XMLDocument()
        if is_auto_convert2utf8:
            is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
            doc.LoadFile(tmp_file_path)
            if is_tmp_file:
                if is_convert_to_original_file:
                    os.remove(file_path)
                    os.rename(tmp_file_path, file_path)
                else:
                    os.remove(tmp_file_path)
        else:
            doc.LoadFile(file_path)
        node = doc.RootElement()
        index = 0
        while node is not None:
            self.__read_node(node, self._root, index, None)
            node = node.NextSiblingElement()
            index += 1

        return self._root

    @staticmethod
    def __get_attrib(node, rc):
        attrib = node.FirstAttribute()
        while attrib is not None:
            key = attrib.Name()
            v = attrib.Value()
            rc.append([key, v])
            attrib = attrib.Next()

    def __read_node(self, node, root, index, parent, layer=1):
        xml_node = XMLNode()
        xml_node.name = node.Value()
        xml_node.layer = layer
        xml_node.index = index
        xml_node.parent = parent
        self.__get_attrib(node, xml_node.properties)
        i = 0
        child = node.FirstChildElement()
        while child is not None:
            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
            child = child.NextSiblingElement()
            i += 1
        root.append(xml_node)
        self._register(xml_node)


class XMLWriter:
    def __init__(self, xml_file_path, xml_node_list):
        doc = xml.dom.minidom.Document()
        for node in xml_node_list:
            ele = self.__write_node(node, doc)
            doc.appendChild(ele)

        f = open(xml_file_path, 'w', encoding='utf-8')
        doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')
        f.close()

    def __write_node(self, node, doc):
        ele = doc.createElement(node.name)
        for prop in node.properties:
            ele.setAttribute(prop[0], prop[1])

        for child in node.child:
            ret = self.__write_node(child, doc)
            ele.appendChild(ret)

        return ele


class TinyXMLWriter:
    # is_convert_spec_character: 是否转换特殊字符
    def __init__(self, xml_file_path, xml_node_list, is_convert_spec_character=True):
        import pytinyxml2
        doc = pytinyxml2.XMLDocument(is_convert_spec_character)
        decl = doc.NewDeclaration()
        doc.LinkEndChild(decl)
        for node in xml_node_list:
            ele = self.__write_node(node, doc)
            doc.LinkEndChild(ele)

        doc.SaveFile(xml_file_path)

    def __write_node(self, node, doc):
        ele = doc.NewElement(node.name)
        for prop in node.properties:
            ele.SetAttribute(prop[0], prop[1])

        for child in node.child:
            ret = self.__write_node(child, doc)
            ele.LinkEndChild(ret)

        return ele


class XmlToXls:
    def __init__(self):
        self.__is_merge_head = False
        self.__alignment_center = None
        self.__border = None

    # read_from_xml_file_path:XML源文件完整路径
    # save_to_xls_file_path:保存转换后的Excel文件完整路径
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    # is_merge_head:如果表头有多行,是否合并层属关系的表头
    # is_border_cell:是否添加单元格的边框
    # is_use_tiny_xml:是否使用tinyXML
    def convert(self, read_from_xml_file_path, save_to_xls_file_path,
                is_auto_convert2utf8=True, is_convert_to_original_file=False,
                is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
        try:
            self.__is_merge_head = is_merge_head
            self.__alignment_center = Alignment(horizontal='center', vertical='center')
            if is_border_cell:
                side = Side(border_style='thin', color='000000')
                self.__border = Border(side, side, side, side, side)
            else:
                self.__border = None
            wb = openpyxl.Workbook()
            wb.encoding = 'utf-8'
            for sh in wb.worksheets:
                wb.remove_sheet(sh)

            if is_use_tiny_xml:
                reader = TinyXMLReader()
            else:
                reader = XMLReader()
            root = reader.read(read_from_xml_file_path,
                               is_auto_convert2utf8,
                               is_convert_to_original_file)
            self.__write(root, wb)
            wb.save(save_to_xls_file_path)
            wb.close()
        except Exception as e:
            print(e)

    # src_path_dir:XML源目录完整路径
    # dst_path_dir:保存转换后的Excel文件完整目录路径,如果为None或者为空,则直接转换在源文件应对目录下
    # is_auto_convert2utf8:是否自动转换为UTF8编码
    # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
    # is_merge_head:如果表头有多行,是否合并层属关系的表头
    # is_border_cell:是否添加单元格的边框
    # is_use_tiny_xml:是否使用tinyXML
    def convert_dirs(self, src_path_dir, dst_path_dir=None,
                     is_auto_convert2utf8=True, is_convert_to_original_file=False,
                     is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
        if dst_path_dir is not None and dst_path_dir != "":
            if not os.path.exists(dst_path_dir):
                os.mkdir(dst_path_dir)

        for root, dirs, files in os.walk(src_path_dir):
            for name in files:
                basename, ext = os.path.splitext(name)
                if ext != ".xml":
                    continue

                src = os.path.join(root, name)
                target = basename + ".xlsx"
                print("处理%s" % src)
                if dst_path_dir is None or dst_path_dir == "":
                    dst = os.path.join(root, target)
                else:
                    dst = os.path.join(dst_path_dir, target)
                self.convert(src, dst, is_auto_convert2utf8, is_convert_to_original_file,
                             is_merge_head, is_border_cell, is_use_tiny_xml)

    def __write(self, xml_node, wb):
        self.__map_field = { 
   }
        for node in xml_node:
            if node.node_info.map_child.__len__() == 1:
                self.__is_multi_sheet = False
                self.__write_sheet(node, wb)
            else:
                self.__is_multi_sheet = True
                for child in node.child:
                    self.__write_sheet(child, wb)

    def __write_sheet(self, node, wb):
        sh = wb.create_sheet(node.name)
        self.__write_head(node, sh)
        self.__write_data(node, sh)

    def __write_head(self, node, sh):
        node_info = node.node_info
        if self.__is_multi_sheet:
            self.__write_head_cell(sh, node_info.parent.layer, 1, node_info.parent.name)
            self.__write_head_cell(sh, node_info.layer, 1, node_info.name)
            comment_str = ""
            for prop in node.properties:
                comment_str += '%s="%s"\n' % (prop[0], prop[1])
            if comment_str != "":
                sh.cell(node_info.layer, 1).comment = Comment(comment_str, "", width=300)
        elif not self.__is_multi_sheet:
            self.__write_head_cell(sh, node_info.layer, 1, node_info.name)

        child_name = None
        for key in node_info.map_child:
            col = 1
            child = node_info.map_child[key]
            child_name = child.name
            for k in child.map_properties:
                c = child.map_properties[k]
                self.__write_head_cell(sh, child.layer, c + col, k)
            col += child.map_properties.__len__()
            self.__write_head_ext(child, sh, col)
            break

        if self.__is_multi_sheet:
            row = 3
        else:
            row = 2

        if child_name is not None:
            sh.insert_rows(row)
            self.__write_head_cell(sh, row, 1, child_name)

    def __write_head_ext(self, node, sh, col):
        for key in node.map_child:
            child = node.map_child[key]
            num = child.map_properties.__len__()
            for i in range(0, child.max_index + 1):
                if col > 16384:
                    raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")

                old_col = col
                self.__write_head_cell(sh, child.layer - 1, col, child.name)
                for k in child.map_properties:
                    c = child.map_properties[k]
                    self.__write_head_cell(sh, child.layer, c + col, k)
                col += num
                col = self.__write_head_ext(child, sh, col)
                if self.__is_merge_head:
                    merge_row = child.layer - 1
                    sh.merge_cells(start_row=merge_row, end_row=merge_row,
                                   start_column=old_col, end_column=col - 1)
        return col

    def __write_data(self, node, sh):
        row = sh.max_row + 1
        sh.freeze_panes = sh.cell(row, 1)
        for child in node.child:
            col = 1
            self.__write_data_ext(child, sh, row, col)
            row += 1

    def __write_data_ext(self, node, sh, row, col):
        m = node.node_info.map_properties
        max_prop_num = m.__len__()
        for prop in node.properties:
            c = m[prop[0]]
            self.__write_cell(sh, row, c + col, prop[1])
        col += max_prop_num

        map_child = { 
   }
        for child in node.child:
            if map_child.get(child.name) is None:
                map_child[child.name] = 1
            else:
                map_child[child.name] += 1
            col = self.__write_data_ext(child, sh, row, col)

        for key in node.node_info.map_child:
            child = node.node_info.map_child[key]
            all_count = child.max_index + 1
            count = map_child.get(key)
            if count is not None:
                all_count -= count

            for i in range(0, all_count):
                col += child.map_properties.__len__()

        return col

    def __write_head_cell(self, sh, row, col, value):
        cell = sh.cell(row, col, value)
        cell.border = self.__border
        cell.alignment = self.__alignment_center

    def __write_cell(self, sh, row, col, value):
        if value.isdigit():
            cell = sh.cell(row, col, int(value))
        else:
            cell = sh.cell(row, col, value)
        cell.border = self.__border


class XlsToXml:
    def __init__(self):
        pass

    @staticmethod
    def __add_prop(map_field, col, value):
        if map_field.__len__() == 0:
            return

        if map_field.get(col) is None:
            # 找本节点
            c = col - 1
            while c >= 1:
                if map_field.get(c) is not None:
                    node = map_field[c][0]
                    break
                c -= 1
        else:
            node = map_field[col][0]
        node.properties.append([value, col])

    @staticmethod
    def __add_node(map_field, row, col, value):
        node = XMLNode()
        node.name = value

        if map_field.get(col) is not None:
            node.parent = map_field[col][0]
        else:
            # 找父节点
            c = col - 1
            while c >= 1:
                if map_field.get(c) is not None:
                    if row > map_field[c][1]:
                        node.parent = map_field[c][0]
                        break
                c -= 1

        if node.parent is not None:
            node.parent.child.append(node)
            node.layer = node.parent.layer + 1
        else:
            node.layer = 1
        map_field[col] = [node, row]
        return node

    def __read_xls(self, file_path):
        wb = openpyxl.load_workbook(file_path)
        root_data = XMLNode()
        is_multi_sheet = wb.worksheets.__len__() > 1
        for sh in wb.worksheets:
            max_row = sh.max_row + 1
            max_col = sh.max_column + 1
            if sh.freeze_panes is None:
                raise Exception("文件[%s]表单[%s]的无冻结窗口,无法确定表头,转为XML失败" %
                                (os.path.basename(file_path), sh.title))

            head_row, head_col = xls_util.coordinate_to_tuple(sh.freeze_panes)
            if head_col != 1:
                raise Exception("文件[%s]表单[%s]的冻结窗口列不为1,无法转为XML" % (os.path.basename(file_path), sh.title))

            root = None

            map_field = { 
   }
            for col in range(1, max_col):
                for row in range(1, head_row):
                    cell = sh.cell(row, col)
                    value = cell.value
                    comment = cell.comment

                    if value is None:
                        continue

                    next_row = row + 1
                    if next_row >= head_row:
                        self.__add_prop(map_field, col, value)
                        continue

                    next_row_value = sh.cell(next_row, col).value
                    if next_row_value is None:
                        self.__add_prop(map_field, col, value)
                        continue

                    node = self.__add_node(map_field, row, col, value)
                    if root is None:
                        root = node

                    if comment is None:
                        continue

                    comment = comment.text
                    lines = comment.splitlines()
                    for line in lines:
                        props = line.split('=')
                        kv = []
                        for prop in props:
                            prop = prop.replace('"', '')
                            kv.append(prop)
                            if kv.__len__() == 2:
                                node.properties.append(kv)
                                kv = []

            root_data.name = root.name
            root_data.layer = root.layer

            if is_multi_sheet and root.child.__len__() > 0:
                child_list = copy.deepcopy(root.child[0].child)
                root.child[0].child = []
                root_data.child.append(root.child[0])
                root_data_child = root.child[0].child
            else:
                child_list = copy.deepcopy(root.child)
                root_data_child = root_data.child

            for row in range(head_row, max_row):
                clone = copy.deepcopy(child_list)
                for child in clone:
                    self.__read_node_data(child, sh, row)
                    root_data_child.append(child)

        return root_data

    def __read_node_data(self, node, sh, row):
        prop_list = []
        for prop in node.properties:
            col = prop[1]
            value = sh.cell(row, col).value
            if value is not None:
                prop_list.append([prop[0], value])

        child_list = []
        for child in node.child:
            self.__read_node_data(child, sh, row)
            if child.properties.__len__() > 0 or child.child.__len__() > 0:
                child_list.append(child)
                copy.copy(child)

        node.properties = prop_list
        node.child = child_list

    def convert(self, src_file_path, dst_file_path, is_use_tiny_xml=True, is_convert_spec_character=False):
        root = self.__read_xls(src_file_path)
        if is_use_tiny_xml:
            TinyXMLWriter(dst_file_path, [root], is_convert_spec_character)
        else:
            XMLWriter(dst_file_path, [root])

    def convert_dirs(self, src_path_dir, dst_path_dir=None, is_use_tiny_xml=True, is_convert_spec_character=False):
        for root, dirs, files in os.walk(src_path_dir):
            for name in files:
                basename, ext = os.path.splitext(name)
                if ext != ".xls" and ext != ".xlsx":
                    continue

                src = os.path.join(root, name)
                target = basename + "1" + ".xml"
                print("处理%s" % src)
                if dst_path_dir is None or dst_path_dir == "":
                    dst = os.path.join(root, target)
                else:
                    dst = os.path.join(dst_path_dir, target)

                try:
                    self.convert(src, dst, is_use_tiny_xml, is_convert_spec_character)
                except Exception as e:
                    print(e)

有人在问如何使用,给一个XML转Excel的示例,假如有一个test.xml文件:

<?xml version="1.0" encoding="UTF-8"?>
<Test>
	<Data Type="1001" Desc = "这是描述">
		<Condition Value="10" Name="名字1">
			<AwardItem ItemId = "5" SubId = "10" Count = "10" />
		</Condition>
		<Condition Value="20" Name="名字2">
			<AwardItem ItemId = "5" SubId = "100" Count = "50" />
		</Condition>
		<Condition Value="30" Name="名字3">
			<AwardItem ItemId = "5" SubId = "1000" Count = "100" />
		</Condition>
	</Data>
	<Data Type="1002" Desc = "这是描述">
		<Condition Value="100" Name="名字10">
			<AwardItem ItemId = "5" SubId = "10" Count = "10" />
		</Condition>
		<Condition Value="200" Name="名字20">
			<AwardItem ItemId = "5" SubId = "100" Count = "50" />
		</Condition>
		<Condition Value="300" Name="名字30">
			<AwardItem ItemId = "5" SubId = "1000" Count = "100" />
		</Condition>
		<Condition Value="400" Name="名字40">
			<AwardItem ItemId = "5" SubId = "5000" Count = "200" />
		</Condition>
	</Data>
</Test>

我们写一个XML转XLSX的调用程序test.py

import sys
import xml2xls
import traceback

def main():
    if sys.argv.__len__() < 3:
        print("格式错误,格式:<命令> <XML源文件> <XLSX目标文件>")
        return
        
    c = xml2xls.XmlToXls()
    try:
        c.convert(sys.argv[1], sys.argv[2])
        print("处理文件%s成功" % sys.argv[1])
    except Exception as e:
        print("处理文件%s失败, 异常:%s" % (sys.argv[1], e))
        print(traceback.format_exc())

if __name__ == '__main__':
    main()

在命令行执行:

python test.py test.xml test.xlsx

就会生成一个test.xlsx的Excel文件,使用Excel打开,结果如图:
在这里插入图片描述
从前面的XML可以看到,第一行只有3组Condition,第二行有4组Condition,所以最后按最大值4组来展示,第一行没第4组Condition,则为空。
同时,转换后的Excel表头是锁定状态,并且表头是有层次结构的,对应XML的层次。同样,如果要将一个Excel表转换成XML,也需要有锁定状态的且有层次结构的表头。

祝好!

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请联系我们举报,一经查实,本站将立刻删除。

发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/171942.html原文链接:https://javaforall.net

(0)
全栈程序员-站长的头像全栈程序员-站长


相关推荐

  • MySQL 字符集 注意事项

    MySQL 字符集 注意事项utf8 unicode ci 与 utf8 general ci 区别 utf8 unicode ci 和 utf8 general ci 对中英文来说没有实质的差别 utf8 general ci 校对速度快 但准确度稍差 utf8 unicode ci 准确度高 但校对速度稍慢 若数据库中有德语 法语或者俄语需求 需使用 utf8 unicode ci 其他情况用 utf8 general ci 即可 如果你想使用 gb2312 编码 那么建议你使用 latin1 作为数据表的默认字符集 这样就能直

    2025年12月12日
    2
  • 网络文件共享服务—NFS服务

    网络文件共享服务—NFS服务

    2022年4月3日
    31
  • WERTC-在Ubuntu中搭建ICE服务器

    WERTC-在Ubuntu中搭建ICE服务器1.ICE服务器介绍最近一段时间在做关于webrtc语音视频通话,网上免费的公开的STUN服务器可以用于测试,使用起来响应时间非常的慢,用起来不爽。作为一个有情怀的技术人员的我,就需要自己搭建了coturn服务器,现在分享给大家。前提先介绍一下ICE服务器,ICE服务器包含了STUN和TURN两部分,实际上我们就需要TURN服务器就可以进行P2P穿墙。coturn服务器是在turnserver上增加了部分高级特性(感谢项目作者,为我们提供了很好的环境)。coturn支持cp,udp,

    2022年4月30日
    45
  • python recvfrom函数详解_recvfrom函数详解

    python recvfrom函数详解_recvfrom函数详解intret;srtuctsockaddr_infrom;ret=revcfrom(sock,recvbuf,BUFSIZErecvfrom函数用于从(已连接)套接口上接收数据,并捕获数据发送源的地址。本函数用于从(已连接)套接口上接收数据,并捕获数据发送源的地址。对于SOCK_STREAM类型的套接口,最多可接收缓冲区大小个数据。udp的recvfrom函数,能接收指定ip和端口发…

    2022年7月23日
    8
  • move_uploaded_file

    move_uploaded_file

    2021年9月19日
    41
  • 计算流体力学基础与网格概述(与书同行)——ANSYS ICEM CFD网格划分从入门到精通——丁源「建议收藏」

    计算流体力学基础与网格概述(与书同行)——ANSYS ICEM CFD网格划分从入门到精通——丁源「建议收藏」一、计算流体力学基础:1、 建立物理模型,将其抽象为数学、力学模型后,要分析几何体的空间影响区域;2、 建立整个几个形体与其空间影响区域(计算区域的CAD模型),将整个计算区域进行空间网格划分。3、 加入求解所需要的初始条件;4、 选择适当的算法,设置具体的控制求解过程和精度的一些条件,对所研究的问题进行分析,保存数据文件结果;5、 选择合适的后处理器(postprocessor)读取计算结果文件,分析并且显示出来。数值模拟方法:1、 有限差分法;2、 有限元法;3、 有限体积法;子域法

    2022年5月26日
    50

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

关注全栈程序员社区公众号