关于带命名空间的中文XML的解析

python与大数据分析

发布于 2022-03-11 05:55:41

76200

代码可运行

文章被收录于专栏：python与大数据分析

运行总次数：0

代码可运行

本来感觉XML文件解析是一件很简单的事情，结果折腾了一两周没什么进展，直到昨天才陆陆续续找到了一些解决方案，现在把踩坑过程和解决过程一并叙说一遍。

起因是对项目中XML的处理过程非常不满意，想着怎么把XML文件中的关键数据项找到并解析到关系数据库中。这些XML是SOA系统生成的，里面自然包含不少中文内容。

以前没做过完整的XML解析，按照最简单的例子对这个XML文件解析，发现总是报各种错误，却不知道问题在哪里，百度是最好的老师，发现有两点问题，一个是XML对字符集的识别相当不友好，另外一个是命名空间问题。

决定从最简单的XML文件（全英文、无命名空间的XML）开始，先解决命名空间问题（全英文、带命名空间的XML），再解决字符集问题（含中文、带命名空间的XML）。

一路跟过来，不断挣扎，不断踩坑，不断解决，不断前行，最终有一定进展。

english.xml

<data>
    <customer name="wbq">
        <firstname>wang</firstname>
        <midname>bao</midname>
        <lastname>qiang</lastname>
    </customer>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>

namespacexml.xml

<?xml version="1.0"?>
<actors xmlns:fictional="http://characters.example.com"
        xmlns="http://people.example.com">
    <actor>
        <name>John Cleese</name>
        <fictional:character>Lancelot</fictional:character>
        <fictional:character>Archie Leach</fictional:character>
    </actor>
    <actor>
        <name>Eric Idle</name>
        <fictional:character>Sir Robin</fictional:character>
        <fictional:character>Gunther</fictional:character>
        <fictional:character>Commander Clement</fictional:character>
    </actor>
</actors>

chinesenamespace.xml

<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Header>
    <username>mk</username>
    <password/>
  </soap:Header>
  <soap:Body>
    <PublishCustomerTroubleTicketRequest xmlns="http://soa.csg.cn">
      <CustomerTroubleTicket>
        <influenceRange>singleFamily</influenceRange>
        <contactAddress>海南省</contactAddress>
        <faultLocation>海南省</faultLocation>
        <mRID>07000010000030535173</mRID>
        <servicekind>01</servicekind>
        <importance>ordinary</importance>
        <powerType>cityNetwork</powerType>
        <content>该户于11月04日出现表不亮灯</content>
        <areaKind>cityCenter</areaKind>
        <ErpPersonRoles>
          <category>01</category>
          <ErpPerson>
            <name>先生</name>
          </ErpPerson>
        </ErpPersonRoles>
      </CustomerTroubleTicket>
    </PublishCustomerTroubleTicketRequest>
  </soap:Body>
</soap:Envelope>

代码实现：

import xml.etree.ElementTree as ET
import codecs
import re
from lxml import etree
#XML文件为英文可通过ElementTree直接解析
#XML文件以utf-8格式存储，可通过ElementTree直接解析
#XML文件以gb2312格式，解析会报错
#全英文无命名空间的XML
#全英文带命名空间的XML
#含中文带命名空间的XML

def replaceXMLEncoding(xmlfilename):
    """Convert a GB2312-declared XML file to UTF-8 in place.

    The file is rewritten only when its XML declaration names GB2312
    (matched case-insensitively). In that case the declaration is changed
    to ``encoding="UTF-8"`` and the whole document is transcoded to UTF-8.
    Files that declare any other encoding, or none, are left untouched, so
    calling this function repeatedly on the same file is safe.

    Args:
        xmlfilename: Path of the XML file to convert.

    Returns:
        None. This is a best-effort helper: if the file cannot be read,
        written or decoded, it returns without raising.
    """
    # Anchor on the XML declaration at the very start of the file so that
    # the text "GB2312" appearing inside element content is never altered.
    declaration = re.compile(
        rb'\A(\s*<\?xml[^>]*?\bencoding\s*=\s*["\'])gb2312(["\'])',
        re.IGNORECASE,
    )
    try:
        # Work on raw bytes: the result must not depend on the locale's
        # default text encoding.
        with open(xmlfilename, 'rb') as f:
            raw = f.read()
        if declaration.match(raw) is None:
            # Not declared as GB2312 (e.g. already UTF-8): nothing to do.
            return
        patched = declaration.sub(rb'\g<1>UTF-8\g<2>', raw, count=1)
        # gb18030 is a superset of GB2312/GBK and, unlike 'mbcs', is
        # available on every platform.
        utf8_bytes = patched.decode('gb18030').encode('utf-8')
        # Write only after the conversion succeeded, so a decoding failure
        # never leaves a half-converted file behind.
        with open(xmlfilename, 'wb') as f:
            f.write(utf8_bytes)
    except (OSError, UnicodeError):
        # Keep the original best-effort contract: callers expect a silent
        # return when the file is missing, unreadable or not valid GB text.
        return

xmlfilename = "english.xml"
# Excerpt of english.xml (plain ASCII, no namespaces):
# <data>
#     <customer name="wbq">
#         <firstname>wang</firstname>
#         <midname>bao</midname>
#         <lastname>qiang</lastname>
#     </customer>
#     <country name="Liechtenstein">
#         <rank>1</rank>
#         <year>2008</year>
#         <gdppc>141100</gdppc>
#         <neighbor name="Austria" direction="E"/>
#         <neighbor name="Switzerland" direction="W"/>
#     </country>
tree = ET.parse(xmlfilename)
root = tree.getroot()
# Print the ElementTree wrapper, the root element, then the root's tag and
# attributes. Sample output (object addresses vary between runs):
#   <xml.etree.ElementTree.ElementTree object at 0x0000018DF8C8DB38>
#   <Element 'data' at 0x0000018DF8B1E458>
#   data
#   {}
for shown in (tree, root, root.tag, root.attrib):
    print(shown)

# Print the tag and attributes of every direct child of the root.
# Sample output:
#   customer {'name': 'wbq'}
#   country {'name': 'Liechtenstein'}
#   country {'name': 'Singapore'}
#   country {'name': 'Panama'}
for element in root:
    print(element.tag, element.attrib)

# Print the tag and attributes of every <country> element under the root.
# Sample output:
#   country {'name': 'Liechtenstein'}
#   country {'name': 'Singapore'}
#   country {'name': 'Panama'}
for element in root.iter('country'):
    print(element.tag, element.attrib)

# For each <country> child of the root, print its "name" attribute followed
# by the text of its <rank> child.
# Sample output:
#   Liechtenstein 1
#   Singapore 4
#   Panama 68
for country in root.findall('country'):
    rank_element = country.find('rank')
    print(country.get('name'), rank_element.text)

xmlfilename = "namespacexml.xml"
# Excerpt of namespacexml.xml (ASCII only; default namespace plus a
# "fictional" prefix):
# <actors xmlns:fictional="http://characters.example.com"
#         xmlns="http://people.example.com">
#     <actor>
#         <name>John Cleese</name>
#         <fictional:character>Lancelot</fictional:character>
#         <fictional:character>Archie Leach</fictional:character>
tree = ET.parse(xmlfilename)
root = tree.getroot()
print(root)

# Variant 1: spell out the namespace URI in every tag as "{uri}localname".
# For each <actor>, print the text of its <name> child element, then the text
# of each of its <character> children.
for person in root.findall('{http://people.example.com}actor'):
    print(person.find('{http://people.example.com}name').text)
    for role in person.findall('{http://characters.example.com}character'):
        print(' |-->', role.text)

# Variant 2: map short prefixes to the namespace URIs once and use
# "prefix:localname" in the queries. The prefixes are local to this mapping
# and need not match the ones used in the document.
ns = {
    'real_person': 'http://people.example.com',
    'role': 'http://characters.example.com',
}
for person in root.findall('real_person:actor', ns):
    print(person.find('real_person:name', ns).text)
    for role in person.findall('role:character', ns):
        print(' |-->', role.text)

# Parse an XML file that contains Chinese text and uses namespaces.
# Excerpt of chinesenamespace.xml:
# <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
#   <soap:Header><username>mk</username></soap:Header>
#   <soap:Body>
#     <PublishCustomerTroubleTicketRequest xmlns="http://soa.csg.cn">
xmlfilename = "chinesenamespace.xml"
# Prefix -> namespace URI mapping used by the queries below.
ns = {
    'topns': 'http://schemas.xmlsoap.org/soap/envelope/',
    'childns': 'http://soa.csg.cn',
}
# Convert the file's encoding in place before handing it to the parser.
replaceXMLEncoding(xmlfilename)
tree = ET.parse(xmlfilename)
root = tree.getroot()
print(root)
# e.g. <Element '{http://schemas.xmlsoap.org/soap/envelope/}Envelope' at 0x000002168F7DC9F8>

# Descend one level at a time with find(), qualifying each tag with its
# namespace prefix.
body = root.find('topns:Body', namespaces=ns)
print(body)
# e.g. <Element '{http://schemas.xmlsoap.org/soap/envelope/}Body' at 0x000002168F7DC9A8>

PublishCustomerTroubleTicketRequest = body.find(
    'childns:PublishCustomerTroubleTicketRequest', namespaces=ns
)
print(PublishCustomerTroubleTicketRequest)
# e.g. <Element '{http://soa.csg.cn}PublishCustomerTroubleTicketRequest' at 0x000002168F7DC908>

CustomerTroubleTicket = PublishCustomerTroubleTicketRequest.find(
    'childns:CustomerTroubleTicket', namespaces=ns
)
print(CustomerTroubleTicket)
# e.g. <Element '{http://soa.csg.cn}CustomerTroubleTicket' at 0x000002168F7DC8B8>

mRID = CustomerTroubleTicket.find('childns:mRID', namespaces=ns)
print(mRID)
# e.g. <Element '{http://soa.csg.cn}mRID' at 0x000002168F7BA188>
print(mRID.text)
# 07000010000030535173

# Parse the same file with lxml.etree and a single XPath query.
xmlfilename = "chinesenamespace.xml"
# NOTE(review): this script already ran the same conversion on this file
# before the ElementTree parse; the call is repeated here so this section
# behaves exactly as before.
replaceXMLEncoding(xmlfilename)
xml = etree.parse(xmlfilename)
root = xml.getroot()
print(root)
# e.g. <Element {http://schemas.xmlsoap.org/soap/envelope/}Envelope at 0x1b067b124c8>

# Walk the element hierarchy in one XPath expression instead of chained
# find() calls. `ns` is the prefix -> URI mapping defined earlier in the
# script; xpath() returns a list of matches and the first one is taken.
mrid_path = 'topns:Body/childns:PublishCustomerTroubleTicketRequest/childns:CustomerTroubleTicket/childns:mRID/text()'
matches = root.xpath(mrid_path, namespaces=ns)
mRID = matches[0]
print(mRID)
# 07000010000030535173