本来感觉XML文件解析是一件很简单的事情,结果折腾了一两周没什么进展,直到昨天才陆陆续续找到了一些解决方案,现在把踩坑过程和解决过程一并叙说一遍。
起因是对项目中XML的处理过程非常不满意,想着怎么把XML文件中的关键数据项找出来并解析到关系数据库中。这些XML是SOA系统生成的,里面自然包含不少中文内容。
以前没做过完整的XML解析,按照最简单的例子对这个XML文件解析,发现总是报各种错误,却不知道问题在哪里,百度是最好的老师,发现有两点问题,一个是XML对字符集的识别相当不友好,另外一个是命名空间问题。
决定从最简单的XML文件(全英文、无命名空间)开始,先解决命名空间问题(全英文、带命名空间的XML),再解决含中文且带命名空间的XML。
一路跟过来,不断挣扎,不断踩坑,不断解决,不断前行,最终有一定进展。
english.xml
<data>
<customer name="wbq">
<firstname>wang</firstname>
<midname>bao</midname>
<lastname>qiang</lastname>
</customer>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
namespacexml.xml
<?xml version="1.0"?>
<actors xmlns:fictional="http://characters.example.com"
xmlns="http://people.example.com">
<actor>
<name>John Cleese</name>
<fictional:character>Lancelot</fictional:character>
<fictional:character>Archie Leach</fictional:character>
</actor>
<actor>
<name>Eric Idle</name>
<fictional:character>Sir Robin</fictional:character>
<fictional:character>Gunther</fictional:character>
<fictional:character>Commander Clement</fictional:character>
</actor>
</actors>
chinesenamespace.xml
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<soap:Header>
<username>mk</username>
<password/>
</soap:Header>
<soap:Body>
<PublishCustomerTroubleTicketRequest xmlns="http://soa.csg.cn">
<CustomerTroubleTicket>
<influenceRange>singleFamily</influenceRange>
<contactAddress>海南省</contactAddress>
<faultLocation>海南省</faultLocation>
<mRID>07000010000030535173</mRID>
<servicekind>01</servicekind>
<importance>ordinary</importance>
<powerType>cityNetwork</powerType>
<content>该户于11月04日出现表不亮灯</content>
<areaKind>cityCenter</areaKind>
<ErpPersonRoles>
<category>01</category>
<ErpPerson>
<name>先生</name>
</ErpPerson>
</ErpPersonRoles>
</CustomerTroubleTicket>
</PublishCustomerTroubleTicketRequest>
</soap:Body>
</soap:Envelope>
代码实现:
import xml.etree.ElementTree as ET
import codecs
import re
from lxml import etree
#XML文件为英文可通过ElementTree直接解析
#XML文件以utf-8格式存储,可通过ElementTree直接解析
#XML文件以gb2312格式,解析会报错
#全英文无命名空间的XML
#全英文带命名空间的XML
#含中文带命名空间的XML
def replaceXMLEncoding(xmlfilename):
    """Convert an XML file declared as GB2312 to UTF-8, in place.

    The file is read as raw bytes.  If its XML declaration names GB2312
    (case-insensitively) as the encoding, the declaration is rewritten to
    say UTF-8 and the whole document is transcoded to UTF-8.  Only the
    declaration is touched; the text "GB2312" elsewhere in the document is
    preserved.

    Files that do not declare GB2312 (for example files that were already
    converted) are left untouched, so calling this function repeatedly on
    the same file is safe.

    The conversion is best-effort: if the file cannot be read, decoded or
    written, the function returns without raising.

    Args:
        xmlfilename: Path of the XML file to convert.

    Returns:
        None.
    """
    # Matches the start of the file up to the declared encoding name so that
    # only the declaration (never the document body) is rewritten.
    declared_gb2312 = re.compile(
        rb'^(\s*<\?xml\b[^>]*?\bencoding\s*=\s*["\'])GB2312(["\'])',
        re.IGNORECASE,
    )
    try:
        with open(xmlfilename, 'rb') as f:
            raw = f.read()

        match = declared_gb2312.match(raw)
        if match is None:
            # Not declared as GB2312: nothing to convert.
            return

        relabelled = match.group(1) + b'UTF-8' + match.group(2) + raw[match.end():]
        # gb18030 is a superset of GB2312/GBK, so it also copes with files
        # labelled GB2312 that contain characters outside strict GB2312.
        converted = relabelled.decode('gb18030').encode('utf-8')

        with open(xmlfilename, 'wb') as f:
            f.write(converted)
    except (OSError, UnicodeError):
        # Keep the original best-effort contract: never raise to the caller.
        return
# --- Case 1: plain-English XML without namespaces ---------------------------
xmlfilename = "english.xml"
# Excerpt of english.xml:
# <data>
# <customer name="wbq">
# <firstname>wang</firstname>
# <midname>bao</midname>
# <lastname>qiang</lastname>
# </customer>
# <country name="Liechtenstein">
# <rank>1</rank>
# <year>2008</year>
# <gdppc>141100</gdppc>
# <neighbor name="Austria" direction="E"/>
# <neighbor name="Switzerland" direction="W"/>
# </country>
tree = ET.parse(xmlfilename)
root = tree.getroot()
print(tree)
# <xml.etree.ElementTree.ElementTree object at 0x0000018DF8C8DB38>
print(root)
# <Element 'data' at 0x0000018DF8B1E458>
print(root.tag)
# data
print(root.attrib)
# {}

# Print the tag and attributes of every direct child of the root element.
for child in root:
    print(child.tag, child.attrib)
# customer {'name': 'wbq'}
# country {'name': 'Liechtenstein'}
# country {'name': 'Singapore'}
# country {'name': 'Panama'}

# Print the tag and attributes of every <country> element under the root.
for country in root.iter('country'):
    print(country.tag, country.attrib)
# country {'name': 'Liechtenstein'}
# country {'name': 'Singapore'}
# country {'name': 'Panama'}

# For each <country> child of the root, print its name attribute together
# with the text of its <rank> child.
for country in root.findall('country'):
    rank = country.find('rank').text
    name = country.get('name')
    print(name, rank)
# Liechtenstein 1
# Singapore 4
# Panama 68
# --- Case 2: plain-English XML with namespaces ------------------------------
xmlfilename = "namespacexml.xml"
# Excerpt of namespacexml.xml:
# <actors xmlns:fictional="http://characters.example.com"
# xmlns="http://people.example.com">
# <actor>
# <name>John Cleese</name>
# <fictional:character>Lancelot</fictional:character>
# <fictional:character>Archie Leach</fictional:character>
tree = ET.parse(xmlfilename)
root = tree.getroot()
print(root)

# Approach 1: spell out the namespace URI in every tag as '{uri}tag'.
for actor in root.findall('{http://people.example.com}actor'):
    # The actor's <name> child lives in the default (people) namespace.
    name = actor.find('{http://people.example.com}name')
    print(name.text)
    # The <character> children live in the "fictional" namespace.
    for char in actor.findall('{http://characters.example.com}character'):
        print(' |-->', char.text)

# Approach 2: define a prefix -> URI dictionary and use 'prefix:tag' paths.
# The prefixes are local aliases; they need not match those in the XML file.
ns = {'real_person': 'http://people.example.com',
      'role': 'http://characters.example.com'}
for actor in root.findall('real_person:actor', ns):
    name = actor.find('real_person:name', ns)
    print(name.text)
    for char in actor.findall('role:character', ns):
        print(' |-->', char.text)
# --- Case 3: XML with Chinese content and namespaces (ElementTree) ----------
# A prefix -> URI dictionary is defined below to keep the lookups short.
# Excerpt of chinesenamespace.xml:
# <soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
# <soap:Header><username>mk</username></soap:Header>
# <soap:Body>
# <PublishCustomerTroubleTicketRequest xmlns="http://soa.csg.cn">
xmlfilename="chinesenamespace.xml"
# Rewrite the file in place before parsing (see replaceXMLEncoding).
replaceXMLEncoding(xmlfilename)
tree=ET.parse(xmlfilename)
# 'topns' aliases the SOAP envelope namespace; 'childns' aliases the default
# namespace declared on PublishCustomerTroubleTicketRequest.
ns = {'topns': 'http://schemas.xmlsoap.org/soap/envelope/',
'childns': 'http://soa.csg.cn'}
root=tree.getroot()
print(root)
# <Element '{http://schemas.xmlsoap.org/soap/envelope/}Envelope' at 0x000002168F7DC9F8>
# Walk down one level at a time with find(), qualifying each tag by namespace.
body = root.find('topns:Body', ns)
print(body)
# <Element '{http://schemas.xmlsoap.org/soap/envelope/}Body' at 0x000002168F7DC9A8>
PublishCustomerTroubleTicketRequest = body.find('childns:PublishCustomerTroubleTicketRequest', ns)
print(PublishCustomerTroubleTicketRequest)
# <Element '{http://soa.csg.cn}PublishCustomerTroubleTicketRequest' at 0x000002168F7DC908>
CustomerTroubleTicket=PublishCustomerTroubleTicketRequest.find('childns:CustomerTroubleTicket',ns)
print(CustomerTroubleTicket)
# <Element '{http://soa.csg.cn}CustomerTroubleTicket' at 0x000002168F7DC8B8>
mRID = CustomerTroubleTicket.find('childns:mRID',ns)
print(mRID)
# <Element '{http://soa.csg.cn}mRID' at 0x000002168F7BA188>
print(mRID.text)
# 07000010000030535173
# --- Case 3 again, using lxml.etree and XPath --------------------------------
xmlfilename="chinesenamespace.xml"
# Rewrite the file in place before parsing (see replaceXMLEncoding).
replaceXMLEncoding(xmlfilename)
xml = etree.parse(xmlfilename)
root = xml.getroot()
print(root)
# <Element {http://schemas.xmlsoap.org/soap/envelope/}Envelope at 0x1b067b124c8>
# Reach the target through the element hierarchy with a single XPath
# expression. `ns` is the prefix -> URI dictionary defined in the ElementTree
# section above; [0] takes the first matching text node.
mRID=root.xpath('topns:Body/childns:PublishCustomerTroubleTicketRequest/childns:CustomerTroubleTicket/childns:mRID/text()',namespaces=ns)[0]
print(mRID)
# 07000010000030535173
结论:
XML文件为英文可通过ElementTree直接解析
XML文件以utf-8格式存储,可通过ElementTree直接解析
XML文件以gb2312格式,解析会报错,需要将文件更改为utf-8格式编码
命名空间问题,可以通过find加命名空间的方式,也可以通过定义别名的方式
逐层查找是可以的,也可以通过xpath方式进行解析
本文分享自 python与大数据分析 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有