典型XML文档示例:
<school>
<student id="1001">
<name>张三</name>
<score math="90" english="85"/>
</student>
</school>
from xml.dom.minidom import parse
# Parse the whole file into an in-memory DOM tree.
doc = parse("data.xml")
students = doc.getElementsByTagName("student")
# Print the "id" attribute of every <student> element, in document order.
for s in students:
    print(s.getAttribute("id"))
import xml.sax


class StudentHandler(xml.sax.ContentHandler):
    """SAX content handler that prints the ``id`` of each ``<student>`` element."""

    def startElement(self, name, attrs):
        """Handle an opening tag reported by the SAX parser.

        Args:
            name: Tag name of the element being opened.
            attrs: Attribute mapping of that element.

        Raises:
            KeyError: If a ``<student>`` element has no ``id`` attribute.
        """
        if name == "student":
            print("ID:", attrs["id"])
# Build the handler first, attach it to a fresh SAX parser, then stream the
# document through it; the handler's callbacks fire as elements are read.
handler = StudentHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse("data.xml")
import xml.etree.ElementTree as ET
tree = ET.parse('data.xml')
root = tree.getroot()
# findall() with a bare tag name matches direct children of root only.
for student in root.findall('student'):
    # NOTE: attrib[...] raises KeyError for a <student> without an "id".
    print(student.attrib['id'])
| 方法 | 1MB文件 | 10MB文件 | 内存占用 |
| --- | --- | --- | --- |
| DOM | 120 | 1350 | 高 |
| SAX | 85 | 820 | 低 |
| ElementTree | 65 | 700 | 中 |
try:
    ET.parse('broken.xml')
except ET.ParseError as e:
    # e.position is a (line, column) tuple locating the error in the input.
    print(f"解析错误:{e.position}: {e.msg}")
# Map the "ns" prefix used in the paths below to the document's namespace URI.
namespaces = {'ns': 'http://school.edu/schema'}
for student in root.findall('ns:student', namespaces):
    # NOTE: find() returns None when there is no matching child, in which
    # case the .text access raises AttributeError.
    print(student.find('ns:name', namespaces).text)
# Find students whose math score is greater than 90.
# ElementTree's XPath subset has no comparison operators and no attribute
# steps inside predicates (the original expression raises
# "SyntaxError: invalid predicate"), so the numeric filter runs in Python.
high_scores = [
    student
    for student in root.findall(".//student")
    if any(
        float(score.get("math")) > 90
        for score in student.findall("score[@math]")
    )
]
def load_config(config_file):
    """Read ``<setting key="...">value</setting>`` entries into a dict.

    Args:
        config_file: Path or file object accepted by ``ET.parse``.

    Returns:
        Dict mapping each setting's ``key`` attribute to its text. Only
        ``<setting>`` elements that are direct children of the root are read.

    Raises:
        ET.ParseError: If the input is not well-formed XML.
    """
    tree = ET.parse(config_file)
    # A repeated key keeps the value of its last occurrence.
    return {item.get('key'): item.text for item in tree.findall('setting')}
@app.route('/api/xml', methods=['POST'])
def handle_xml():
    """Parse the XML body of a POST to ``/api/xml`` and return an XML reply.

    Returns:
        The serialized response document, or a ``(message, 400)`` pair when
        the body is malformed or uses a construct that defusedxml forbids.
        NOTE(review): the tuple return assumes a Flask-style app — confirm.
    """
    # The request body is untrusted input, so it is parsed with defusedxml
    # instead of the stdlib parser.
    from defusedxml.ElementTree import ParseError, fromstring

    try:
        root = fromstring(request.data)
    except (ParseError, ValueError) as exc:
        # defusedxml's own "forbidden construct" errors derive from ValueError.
        return f"Invalid XML: {exc}", 400
    # Processing logic goes here...
    # NOTE(review): response_xml is not defined in this snippet; presumably
    # the elided processing step builds it from root — confirm.
    return ET.tostring(response_xml)
# The stdlib ET.XMLParser accepts only ``target`` and ``encoding``; passing
# forbid_dtd / forbid_entities to it raises TypeError. Those switches belong
# to defusedxml's parser subclass, so that class is used here.
from defusedxml.ElementTree import DefusedXMLParser

parser = DefusedXMLParser(
    target=ET.TreeBuilder(),
    forbid_dtd=True,
    forbid_entities=True,
)
safe_tree = ET.parse('input.xml', parser=parser)
from defusedxml.ElementTree import parse
# Parse untrusted input with the parse() imported from defusedxml on the
# preceding line rather than with ET.parse().
safe_tree = parse('untrusted.xml')
# With no ``events`` argument, iterparse() reports each element once it has
# been read completely ("end" events).
for _event, elem in ET.iterparse('large.xml'):
    if elem.tag == 'student':
        process_student(elem)
        # Release memory promptly. Clearing only here, after the <student>
        # has been processed, keeps its child elements intact for
        # process_student().
        elem.clear()
from concurrent.futures import ThreadPoolExecutor
def parse_chunk(xml_chunk):
    """Parse one self-contained XML fragment and return its root element.

    Args:
        xml_chunk: XML text or bytes holding a single well-formed document.

    Returns:
        The root ``Element`` of the parsed fragment.

    Raises:
        ET.ParseError: If the fragment is not well-formed XML.
    """
    return ET.fromstring(xml_chunk)
# The executor is shut down before the file is closed, the same exit order
# as two nested ``with`` statements.
with open('huge.xml') as f, ThreadPoolExecutor() as executor:
    # NOTE(review): chunk_file is not defined in this snippet; presumably it
    # yields self-contained XML fragments read from f — confirm.
    # executor.map() returns an iterator over the parsed root elements, in
    # the order the chunks were submitted.
    results = executor.map(parse_chunk, chunk_file(f))