
在现代办公和文档处理中,Word文档已经成为最常用的文件格式之一。这些文档不仅包含文本内容,还经常嵌入各种图片、图表和其他媒体元素。在许多场景下,我们需要从Word文档中提取这些图片,例如进行内容分析、创建图像数据库、或者在其他应用程序中重用这些图像。同样,将图片按照特定顺序加载到Word文档中也是一个常见需求。本文将深入探讨如何使用Python实现Word文档中图片的自动提取与加载功能,从理论基础到实际应用,提供全面的技术指南。
在深入技术实现之前,我们需要了解Word文档中图片的存储方式和基本特性。
现代Word文档(.docx格式)实际上是一个ZIP压缩包,包含多个XML文件和资源文件。当我们在Word文档中插入图片时,图片会被存储在文档包的word/media/目录下,并在文档的XML结构中通过引用的方式链接。
Word文档中的图片主要有以下几种存储形式:
Word文档支持多种图片格式,常见的包括:
每个图片在Word文档中还包含多种属性:
在Word文档的XML结构中,图片通过以下方式与文档内容关联:
了解这些基础知识对于我们实现图片提取和加载功能至关重要,因为我们需要正确解析文档结构,找到图片文件,并理解它们在文档中的位置和顺序。
在开始实现Word文档图片处理功能之前,我们需要准备适当的开发环境和工具。
首先,我们需要安装Python环境。推荐使用Python 3.6或更高版本,因为它提供了更好的Unicode支持和更多现代特性。
# 检查Python版本
python --version
# 创建虚拟环境(可选但推荐)
python -m venv word_image_env
source word_image_env/bin/activate # Linux/Mac
word_image_env\Scripts\activate # Windows
我们将使用几个关键的Python库来处理Word文档和图片:
pip install python-docx # 处理.docx文件
pip install Pillow # 图像处理
pip install lxml # XML处理(python-docx的依赖,但可能需要单独安装)
pip install tqdm # 进度条显示(可选,用于批量处理)
其中,python-docx是我们的核心库,用于读取和操作Word文档。但它在图片提取方面有一些限制,因此我们还需要直接处理文档的ZIP结构和XML内容。
安装完成后,我们可以简单测试环境是否正确配置:
import docx
import PIL
import lxml
import zipfile
import os
print(f"python-docx version: {docx.__version__}")
print(f"Pillow version: {PIL.__version__}")
print(f"lxml version: {lxml.__version__}")
print(f"zipfile module available: {zipfile.__name__}")
print(f"os module available: {os.__name__}")
如果所有库都能正确导入并显示版本信息,说明我们的环境已经准备就绪。
为了使我们的代码组织良好且易于维护,我们可以按照以下结构设计项目:
word_image_processor/
│
├── word_image_extractor.py # 图片提取核心功能
├── word_image_loader.py # 图片加载核心功能
├── utils/
│ ├── __init__.py
│ ├── docx_utils.py # Word文档处理工具函数
│ ├── image_utils.py # 图像处理工具函数
│ └── metadata_utils.py # 元数据处理工具函数
│
├── examples/
│ ├── extract_images.py # 图片提取示例
│ └── load_images.py # 图片加载示例
│
└── tests/
├── __init__.py
├── test_extractor.py # 提取功能测试
└── test_loader.py # 加载功能测试
这种结构将核心功能、工具函数和示例代码分开,使项目更加清晰和可维护。
要实现图片的提取和加载,我们首先需要深入理解Word文档的内部结构,特别是与图片相关的部分。
如前所述,.docx文件实际上是一个ZIP压缩包,包含多个XML文件和资源文件。这种格式被称为Office Open XML (OOXML),是一种国际标准。
我们可以通过以下方式查看.docx文件的内部结构:
import zipfile
def explore_docx_structure(docx_path):
    """Print the archive layout of a .docx file.

    Lists every member of the ZIP container in archive order, then lists
    again only the members stored under ``word/media/`` with their count.

    Args:
        docx_path: Path of the .docx file to inspect.

    Returns:
        None. Everything is written to stdout.
    """
    media_prefix = 'word/media/'

    with zipfile.ZipFile(docx_path) as archive:
        member_names = archive.namelist()

    # Complete listing first.
    print("文档内部文件列表:")
    for member_name in member_names:
        print(f" - {member_name}")

    # Then the subset that lives in the media folder.
    media_names = [name for name in member_names if name.startswith(media_prefix)]
    print(f"\n找到 {len(media_names)} 个媒体文件:")
    for media_name in media_names:
        print(f" - {media_name}")
# 使用示例
explore_docx_structure("example.docx")
在Word文档中,图片与文档内容的关联主要通过以下文件实现:
我们需要解析这些文件来理解图片在文档中的位置和顺序。
import xml.etree.ElementTree as ET
from zipfile import ZipFile
def analyze_document_images(docx_path):
    """Print how many drawings and image relationships a .docx contains.

    Reads ``word/document.xml`` to count ``w:drawing`` elements, then reads
    ``word/_rels/document.xml.rels`` and prints the Id and Target of every
    relationship whose Type is the OOXML image relationship type.

    Args:
        docx_path: Path of the .docx file to analyse.

    Returns:
        None. Everything is written to stdout.
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
    }
    image_rel_query = (
        ".//*[@Type='http://schemas.openxmlformats.org/officeDocument/2006/relationships/image']"
    )

    with ZipFile(docx_path) as archive:
        # Drawings referenced from the main document part.
        document_tree = ET.fromstring(archive.read('word/document.xml'))
        drawings = document_tree.findall('.//w:drawing', ns)
        print(f"找到 {len(drawings)} 个图形元素")

        # Image relationships declared for the main document part.
        relationships_tree = ET.fromstring(archive.read('word/_rels/document.xml.rels'))
        image_relationships = relationships_tree.findall(image_rel_query)
        print(f"找到 {len(image_relationships)} 个图片关系")

        for relationship in image_relationships:
            identifier = relationship.get('Id')
            destination = relationship.get('Target')
            print(f"关系ID: {identifier}, 目标文件: {destination}")
# 使用示例
analyze_document_images("example.docx")
在Word文档中,图片的顺序可以通过以下几种方式确定:
对于大多数情况,文档流顺序是最可靠的,因为它反映了图片在文档中的自然排列。但在复杂文档中,我们可能需要结合多种方法来确定准确的顺序。
def get_images_in_order(docx_path):
    """Return the embedded-image references of a .docx in document order.

    Walks the ``w:drawing`` elements of ``word/document.xml`` in the order
    they occur, takes the first ``a:blip`` inside each one, and maps its
    ``r:embed`` relationship id to a target through
    ``word/_rels/document.xml.rels``. Drawings without a blip, or whose id
    is not declared in the relationship part, are skipped.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        A list of dicts with the keys ``rel_id``, ``target`` and
        ``filename`` (the last ``/``-separated segment of the target).
    """
    drawing_query = './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing'
    blip_query = './/{http://schemas.openxmlformats.org/drawingml/2006/main}blip'
    embed_attribute = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    with ZipFile(docx_path) as archive:
        body_tree = ET.fromstring(archive.read('word/document.xml'))
        relationships_tree = ET.fromstring(archive.read('word/_rels/document.xml.rels'))

    # Relationship id -> target, for every declared relationship.
    targets_by_id = {}
    for relationship in relationships_tree.findall("*"):
        targets_by_id[relationship.get('Id')] = relationship.get('Target')

    ordered_refs = []
    for drawing in body_tree.findall(drawing_query):
        blip = drawing.find(blip_query)
        if blip is None:
            continue
        relationship_id = blip.get(embed_attribute)
        if relationship_id not in targets_by_id:
            continue
        destination = targets_by_id[relationship_id]
        ordered_refs.append({
            'rel_id': relationship_id,
            'target': destination,
            'filename': destination.rsplit('/', 1)[-1]
        })
    return ordered_refs
# 使用示例
images_in_order = get_images_in_order("example.docx")
for i, img in enumerate(images_in_order):
    print(f"图片 {i+1}: {img['filename']} (关系ID: {img['rel_id']})")
通过这种方式,我们可以确定图片在文档中的准确顺序,为后续的提取和处理奠定基础。
在了解了Word文档的结构后,我们可以开始实现图片提取的核心功能。
最直接的图片提取方法是从Word文档的ZIP结构中提取media文件夹中的所有图片:
import os
import zipfile
from pathlib import Path
def extract_all_images(docx_path, output_dir):
    """Copy every file stored under ``word/media/`` out of a .docx archive.

    Files are written flat into ``output_dir`` under their base name, in
    archive order (which is not necessarily the order in which the images
    appear in the document).

    Args:
        docx_path: Path of the .docx file.
        output_dir: Directory to write the media files to; created if
            missing.

    Returns:
        List of the written file paths.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    with zipfile.ZipFile(docx_path) as docx_zip:
        for member in docx_zip.infolist():
            if not member.filename.startswith('word/media/'):
                continue
            # Some archives carry an explicit "word/media/" directory entry.
            # It has no base name, and opening it as a file would fail.
            if member.is_dir():
                continue
            filename = os.path.basename(member.filename)
            if not filename:
                continue

            output_path = os.path.join(output_dir, filename)
            with docx_zip.open(member) as source, open(output_path, 'wb') as destination:
                destination.write(source.read())

            extracted_images.append(output_path)
            print(f"已提取: {filename}")

    return extracted_images
# 使用示例
images = extract_all_images("example.docx", "extracted_images")
print(f"共提取了 {len(images)} 个图片")
这种方法简单直接,但它有一个主要缺点:无法保证提取的图片与文档中的顺序一致。
为了按照文档中的顺序提取图片,我们需要结合前面分析的文档结构:
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
def extract_images_in_order(docx_path, output_dir):
    """Extract the embedded images of a .docx in document order.

    Each ``w:drawing`` in ``word/document.xml`` is visited in order; the
    ``r:embed`` id of its first ``a:blip`` is resolved through
    ``word/_rels/document.xml.rels`` and the referenced archive member is
    written to ``output_dir`` as ``image_NNN<ext>``.

    Args:
        docx_path: Path of the .docx file.
        output_dir: Directory to write the images to; created if missing.

    Returns:
        List of dicts (one per written image) with the keys
        ``original_path``, ``original_filename``, ``new_path``,
        ``new_filename``, ``rel_id`` and ``order``.
    """
    import posixpath  # local import: only used to resolve relationship targets

    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
    }
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    with zipfile.ZipFile(docx_path) as docx_zip:
        # Build the member set once instead of calling namelist() per image.
        member_names = set(docx_zip.namelist())

        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
        rel_map = {rel.get('Id'): rel.get('Target')
                   for rel in rels_root.findall("*")}

        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        image_count = 0
        for drawing in doc_root.findall('.//w:drawing', namespaces):
            blip = drawing.find('.//a:blip', namespaces)
            if blip is None:
                continue

            rel_id = blip.get(embed_attr)
            rel_target = rel_map.get(rel_id)
            if not rel_target:
                continue

            # Targets are relative to the "word/" folder unless they start
            # with "/", in which case they are relative to the package root.
            if rel_target.startswith('/'):
                image_path = posixpath.normpath(rel_target).lstrip('/')
            else:
                image_path = posixpath.normpath(posixpath.join('word', rel_target))
            if image_path not in member_names:
                continue

            image_count += 1
            original_filename = os.path.basename(rel_target)
            file_ext = os.path.splitext(original_filename)[1]
            new_filename = f"image_{image_count:03d}{file_ext}"
            output_path = os.path.join(output_dir, new_filename)

            with docx_zip.open(image_path) as source, open(output_path, 'wb') as destination:
                destination.write(source.read())

            extracted_images.append({
                'original_path': image_path,
                'original_filename': original_filename,
                'new_path': output_path,
                'new_filename': new_filename,
                'rel_id': rel_id,
                'order': image_count
            })
            print(f"已提取图片 {image_count}: {new_filename} (原文件: {original_filename})")

    return extracted_images
# 使用示例
images = extract_images_in_order("example.docx", "extracted_images")
print(f"按顺序提取了 {len(images)} 个图片")
这个实现确保了图片按照它们在文档中出现的顺序被提取,并使用序号化的文件名保存,便于后续处理。
在实际应用中,我们可能会遇到一些特殊情况,如:
我们需要扩展我们的代码来处理这些情况:
def extract_images_advanced(docx_path, output_dir):
    """Extract embedded images in document order, handling duplicates and links.

    For every ``w:drawing`` the ``r:embed`` relationship of its first
    ``a:blip`` is resolved. A target that was already handled is skipped.
    Internal targets are written as ``image_NNN<ext>``; external targets
    are recorded in a small ``external_link_NNN.txt`` file instead.
    ``r:link`` pictures and VML ``v:imagedata`` pictures are detected but
    not exported yet (placeholders only).

    Args:
        docx_path: Path of the .docx file.
        output_dir: Directory to write results to; created if missing.

    Returns:
        List of dicts with the keys ``original_path``, ``new_path``,
        ``new_filename``, ``rel_id``, ``order`` and ``type`` (``'embedded'``
        or ``'external_link'``).
    """
    import posixpath  # local import: only used to resolve relationship targets

    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []
    processed_targets = set()  # targets already handled, to avoid duplicates

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'v': 'urn:schemas-microsoft-com:vml'
    }
    rel_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    embed_attr = f'{{{rel_ns}}}embed'
    link_attr = f'{{{rel_ns}}}link'
    vml_id_attr = f'{{{rel_ns}}}id'

    with zipfile.ZipFile(docx_path) as docx_zip:
        member_names = set(docx_zip.namelist())

        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
        rel_map = {}
        for rel in rels_root.findall("*"):
            target = rel.get('Target') or ''
            rel_map[rel.get('Id')] = {
                'target': target,
                'type': rel.get('Type'),
                # TargetMode is the authoritative marker; the URL-prefix
                # check is kept as a fallback for files that omit it.
                'is_external': (rel.get('TargetMode') == 'External'
                                or target.startswith(('http', 'file:')))
            }

        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        image_count = 0

        # Regular pictures (w:drawing).
        for drawing in doc_root.findall('.//w:drawing', namespaces):
            blip = drawing.find('.//a:blip', namespaces)
            if blip is None:
                continue

            embed_rel_id = blip.get(embed_attr)
            if embed_rel_id and embed_rel_id in rel_map:
                rel_info = rel_map[embed_rel_id]
                target = rel_info['target']

                # Skip targets that were already handled.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if not rel_info['is_external']:
                    # Relative to "word/" unless the target starts with "/".
                    if target.startswith('/'):
                        image_path = posixpath.normpath(target).lstrip('/')
                    else:
                        image_path = posixpath.normpath(posixpath.join('word', target))

                    if image_path in member_names:
                        image_count += 1
                        file_ext = os.path.splitext(target)[1]
                        new_filename = f"image_{image_count:03d}{file_ext}"
                        output_path = os.path.join(output_dir, new_filename)

                        with docx_zip.open(image_path) as source, \
                                open(output_path, 'wb') as target_file:
                            target_file.write(source.read())

                        extracted_images.append({
                            'original_path': image_path,
                            'new_path': output_path,
                            'new_filename': new_filename,
                            'rel_id': embed_rel_id,
                            'order': image_count,
                            'type': 'embedded'
                        })
                else:
                    # External picture: record the link instead of data.
                    image_count += 1
                    link_info = f"external_link_{image_count:03d}.txt"
                    link_path = os.path.join(output_dir, link_info)

                    # Explicit encoding so non-ASCII targets do not depend
                    # on the platform default.
                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"External image link: {target}\n")

                    extracted_images.append({
                        'original_path': target,
                        'new_path': link_path,
                        'new_filename': link_info,
                        'rel_id': embed_rel_id,
                        'order': image_count,
                        'type': 'external_link'
                    })

            link_rel_id = blip.get(link_attr)
            if link_rel_id and link_rel_id in rel_map:
                # TODO: linked (r:link) pictures are detected but not exported.
                pass

        # VML pictures (v:imagedata), typically found in compatibility mode.
        for img_data in doc_root.findall('.//v:imagedata', namespaces):
            rel_id = img_data.get(vml_id_attr)
            if rel_id and rel_id in rel_map:
                # TODO: VML pictures are detected but not exported.
                pass

        # Pictures inside embedded objects would require parsing further
        # relationship parts and are not handled here.

    return extracted_images
# (Article text) This enhanced implementation handles more special cases and
# avoids extracting the same image twice.
现在,我们将前面的技术整合成一个完整的、可用的图片提取类。这个类将提供更多功能和更好的错误处理。
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
from datetime import datetime
import json
from PIL import Image
import io
class WordImageExtractor:
    """Locate and extract the images referenced by a .docx main document.

    On construction the relationship part
    (``word/_rels/document.xml.rels``) and ``word/document.xml`` are parsed
    once. The result is kept in ``rel_map`` (relationship id -> info dict)
    and ``image_info`` (one dict per image reference, numbered from 1:
    all ``w:drawing`` pictures first, then all VML ``v:imagedata``
    pictures).
    """

    def __init__(self, docx_path):
        """Validate the path and parse the document structure.

        Args:
            docx_path: Path (str or path-like) of the .docx file.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the name does not end in ``.docx``, the file is
                not a ZIP archive, it has no ``word/document.xml``, or one
                of the XML parts cannot be parsed.
        """
        docx_path = os.fspath(docx_path)  # accept pathlib.Path as well as str
        self.docx_path = docx_path
        self.namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
            'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
            'v': 'urn:schemas-microsoft-com:vml',
            'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
            'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
        }

        if not os.path.exists(docx_path):
            raise FileNotFoundError(f"找不到Word文档: {docx_path}")

        if not docx_path.lower().endswith('.docx'):
            raise ValueError(f"不支持的文件格式: {docx_path}. 仅支持.docx格式")

        self.rel_map = {}
        self.image_info = []

        self._parse_document_structure()

    @staticmethod
    def _part_name(target):
        """Turn a relationship Target into an archive member name.

        Targets are relative to the ``word/`` folder unless they start with
        ``/``, in which case they are relative to the package root.
        """
        import posixpath  # local import: only needed here

        if target.startswith('/'):
            return posixpath.normpath(target).lstrip('/')
        return posixpath.normpath(posixpath.join('word', target))

    def _parse_document_structure(self):
        """Fill ``rel_map`` and ``image_info`` from the archive.

        Raises:
            ValueError: For a non-ZIP file, a missing ``word/document.xml``
                or malformed XML.
        """
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                member_names = set(docx_zip.namelist())

                if 'word/document.xml' not in member_names:
                    raise ValueError(f"无效的Word文档: {self.docx_path}")

                # The relationship part is optional (no images -> maybe absent).
                if 'word/_rels/document.xml.rels' in member_names:
                    rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
                    for rel in rels_root.findall("*"):
                        target = rel.get('Target') or ''
                        self.rel_map[rel.get('Id')] = {
                            'target': target,
                            'type': rel.get('Type'),
                            # TargetMode is the authoritative marker; the
                            # prefix check is a fallback for files without it.
                            'is_external': (rel.get('TargetMode') == 'External'
                                            or target.startswith(('http', 'file:')))
                        }

                doc_root = ET.fromstring(docx_zip.read('word/document.xml'))
                self._find_image_references(doc_root)

        except zipfile.BadZipFile as err:
            raise ValueError(f"文件不是有效的ZIP格式: {self.docx_path}") from err
        except ET.ParseError as e:
            raise ValueError(f"XML解析错误: {e}") from e

    def _find_image_references(self, doc_root):
        """Append one ``image_info`` entry per resolvable image reference.

        Args:
            doc_root: Parsed root element of ``word/document.xml``.
        """
        rel_ns = self.namespaces['r']
        image_order = 0

        # Regular pictures (w:drawing).
        for drawing in doc_root.findall('.//w:drawing', self.namespaces):
            blip = drawing.find('.//a:blip', self.namespaces)
            if blip is None:
                continue
            embed_rel_id = blip.get(f'{{{rel_ns}}}embed')
            if not embed_rel_id or embed_rel_id not in self.rel_map:
                continue

            image_order += 1
            rel_info = self.rel_map[embed_rel_id]

            # Display size as stored in the document (string values, EMU).
            width = height = None
            extent = drawing.find('.//wp:extent', self.namespaces)
            if extent is not None:
                width = extent.get('cx')
                height = extent.get('cy')

            # Alternative text from the drawing's docPr element.
            alt_text = ""
            doc_pr = drawing.find('.//wp:docPr', self.namespaces)
            if doc_pr is not None:
                alt_text = doc_pr.get('descr', '')

            self.image_info.append({
                'order': image_order,
                'rel_id': embed_rel_id,
                'target': rel_info['target'],
                'type': 'embedded' if not rel_info['is_external'] else 'external',
                'width_emu': width,
                'height_emu': height,
                'alt_text': alt_text,
                'element_type': 'drawing'
            })

        # VML pictures (v:imagedata), used in compatibility mode.
        for img_data in doc_root.findall('.//v:imagedata', self.namespaces):
            rel_id = img_data.get(f'{{{rel_ns}}}id')
            if not rel_id or rel_id not in self.rel_map:
                continue

            image_order += 1
            rel_info = self.rel_map[rel_id]
            self.image_info.append({
                'order': image_order,
                'rel_id': rel_id,
                'target': rel_info['target'],
                'type': 'embedded' if not rel_info['is_external'] else 'external',
                'width_emu': None,
                'height_emu': None,
                'alt_text': img_data.get('title', ''),
                'element_type': 'vml'
            })

    def get_image_count(self):
        """Return the number of image references found in the document."""
        return len(self.image_info)

    def get_image_info(self):
        """Return a copy of the image reference list.

        Each entry is copied too, so callers cannot modify the extractor's
        internal state through the returned dicts.
        """
        return [dict(entry) for entry in self.image_info]

    def extract_images(self, output_dir, preserve_names=False, include_metadata=True):
        """Extract every distinct image target to ``output_dir``.

        Args:
            output_dir: Destination directory; created if missing.
            preserve_names: Keep the original base name instead of
                ``image_NNN<ext>`` (NNN is the reference's order).
            include_metadata: Also write ``extraction_metadata.json``.

        Returns:
            List of result dicts, one per written image or recorded
            external link. A target referenced several times is handled
            only once (first occurrence).
        """
        os.makedirs(output_dir, exist_ok=True)

        extracted_images = []
        processed_targets = set()

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            member_names = set(docx_zip.namelist())

            for img_info in self.image_info:
                target = img_info['target']

                # Skip targets that were already handled.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if img_info['type'] == 'embedded':
                    image_path = self._part_name(target)
                    if image_path not in member_names:
                        continue

                    if preserve_names:
                        filename = os.path.basename(target)
                    else:
                        file_ext = os.path.splitext(target)[1]
                        filename = f"image_{img_info['order']:03d}{file_ext}"

                    output_path = os.path.join(output_dir, filename)

                    image_data = docx_zip.read(image_path)
                    with open(output_path, 'wb') as target_file:
                        target_file.write(image_data)

                    # Pixel size is best-effort: formats the imaging library
                    # cannot read (or a missing library) leave it as None.
                    actual_width = actual_height = None
                    try:
                        with Image.open(io.BytesIO(image_data)) as pil_img:
                            actual_width, actual_height = pil_img.size
                    except Exception:
                        pass

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': image_path,
                        'output_path': output_path,
                        'filename': filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'embedded',
                        'width_emu': img_info['width_emu'],
                        'height_emu': img_info['height_emu'],
                        'actual_width': actual_width,
                        'actual_height': actual_height,
                        'alt_text': img_info['alt_text'],
                        'element_type': img_info['element_type'],
                        'file_size': len(image_data)
                    })

                    print(f"已提取图片 {img_info['order']}: {filename}")

                elif img_info['type'] == 'external':
                    # External picture: write a small text record instead.
                    link_filename = f"external_link_{img_info['order']:03d}.txt"
                    link_path = os.path.join(output_dir, link_filename)

                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"外部图片链接: {target}\n")
                        f.write(f"替代文本: {img_info['alt_text']}\n")
                        f.write(f"关系ID: {img_info['rel_id']}\n")

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': target,
                        'output_path': link_path,
                        'filename': link_filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'external',
                        'alt_text': img_info['alt_text']
                    })

                    print(f"已记录外部链接 {img_info['order']}: {target}")

        if include_metadata:
            metadata_path = os.path.join(output_dir, 'extraction_metadata.json')
            metadata = {
                'source_document': os.path.basename(self.docx_path),
                'extraction_time': datetime.now().isoformat(),
                'total_images': len(extracted_images),
                'embedded_images': sum(1 for img in extracted_images if img['type'] == 'embedded'),
                'external_links': sum(1 for img in extracted_images if img['type'] == 'external'),
                'images': extracted_images
            }

            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            print(f"已生成元数据文件: {metadata_path}")

        return extracted_images

    def extract_single_image(self, image_order, output_path):
        """Extract one embedded image by its order number.

        Args:
            image_order: 1-based order of the image reference.
            output_path: File path to write the image to.

        Returns:
            Dict with ``order``, ``output_path``, ``file_size`` and
            ``success``.

        Raises:
            ValueError: If no reference has that order, or it is external.
            FileNotFoundError: If the referenced member is not in the archive.
        """
        target_image = next(
            (info for info in self.image_info if info['order'] == image_order),
            None,
        )
        if not target_image:
            raise ValueError(f"找不到序号为 {image_order} 的图片")

        if target_image['type'] != 'embedded':
            raise ValueError(f"图片 {image_order} 是外部链接,无法提取")

        # A bare file name has no directory part; os.makedirs('') would raise.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            image_path = self._part_name(target_image['target'])
            try:
                image_data = docx_zip.read(image_path)
            except KeyError:
                raise FileNotFoundError(f"在文档中找不到图片文件: {image_path}") from None

        with open(output_path, 'wb') as target_file:
            target_file.write(image_data)

        print(f"已提取图片 {image_order} 到: {output_path}")

        return {
            'order': image_order,
            'output_path': output_path,
            'file_size': len(image_data),
            'success': True
        }
# 使用示例
def main():
    """Demonstrate the extractor on ``example.docx``.

    Prints the image count and one line per image reference, extracts all
    images into ``extracted_images``, and, if anything was extracted,
    writes image 1 to ``single_image/first_image.jpg``. Any exception is
    reported on stdout instead of being propagated.
    """
    source_document = "example.docx"
    output_folder = "extracted_images"

    try:
        extractor = WordImageExtractor(source_document)

        total = extractor.get_image_count()
        print(f"文档中共有 {total} 个图片")

        # One summary line per image reference.
        for entry in extractor.get_image_info():
            print(f"图片 {entry['order']}: {entry['target']} ({entry['type']})")

        results = extractor.extract_images(output_folder, preserve_names=False)
        print(f"\n提取完成,共处理 {len(results)} 个图片")

        # Single-image extraction is only shown when something was extracted.
        if not results:
            return
        extractor.extract_single_image(1, "single_image/first_image.jpg")

    except Exception as e:
        print(f"错误: {e}")
if __name__ == "__main__":
    main()
在提取图片的过程中,保存完整的元数据信息对于后续的处理和分析非常重要。我们需要记录图片的各种属性,包括尺寸、格式、在文档中的位置等信息。
首先,我们设计一个完整的元数据结构来存储图片信息:
import json
from datetime import datetime
from PIL import Image
from PIL.ExifTags import TAGS
import hashlib
class ImageMetadataProcessor:
    """Build, store and summarise per-image metadata records.

    ``metadata_schema`` is the template of the overall metadata document:
    ``extraction_info``, ``document_info`` (counters) and ``images`` (list
    of records as produced by ``process_image_metadata``).
    """

    def __init__(self):
        self.metadata_schema = {
            'extraction_info': {
                'timestamp': None,
                'source_document': None,
                'extractor_version': '1.0.0'
            },
            'document_info': {
                'total_images': 0,
                'embedded_images': 0,
                'external_links': 0,
                'document_size': 0
            },
            'images': []
        }

    def process_image_metadata(self, image_data, image_info, output_path):
        """Build the metadata record of one extracted image.

        Args:
            image_data: Raw bytes of the image file.
            image_info: Dict describing the reference inside the document
                (``order``, ``rel_id``, ``target``, ``alt_text``,
                ``element_type``, ``width_emu``, ``height_emu``).
            output_path: Path the image was written to.

        Returns:
            Dict with the sections ``basic_info``, ``document_context``,
            ``dimensions``, ``image_properties`` and ``exif_data``. If the
            image cannot be analysed, the image-derived fields keep their
            defaults and ``processing_error`` holds the error text.
        """
        metadata = {
            'basic_info': {
                'order': image_info.get('order'),
                'filename': os.path.basename(output_path),
                'file_path': output_path,
                'file_size': len(image_data),
                # MD5 serves as a content fingerprint here.
                'file_hash': hashlib.md5(image_data).hexdigest()
            },
            'document_context': {
                'rel_id': image_info.get('rel_id'),
                'original_target': image_info.get('target'),
                'alt_text': image_info.get('alt_text', ''),
                'element_type': image_info.get('element_type')
            },
            'dimensions': {
                'document_width_emu': image_info.get('width_emu'),
                'document_height_emu': image_info.get('height_emu'),
                'actual_width': None,
                'actual_height': None,
                'aspect_ratio': None
            },
            'image_properties': {
                'format': None,
                'mode': None,
                'has_transparency': False,
                'color_count': None,
                'dpi': None
            },
            'exif_data': {}
        }

        # Image analysis is best-effort: any failure is recorded in the
        # result instead of aborting the extraction.
        try:
            with Image.open(io.BytesIO(image_data)) as pil_img:
                width, height = pil_img.size
                dimensions = metadata['dimensions']
                dimensions['actual_width'] = width
                dimensions['actual_height'] = height
                dimensions['aspect_ratio'] = round(width / height, 3) if height > 0 else None

                properties = metadata['image_properties']
                properties['format'] = pil_img.format
                properties['mode'] = pil_img.mode
                properties['has_transparency'] = (
                    pil_img.mode in ('RGBA', 'LA') or 'transparency' in pil_img.info
                )

                if hasattr(pil_img, 'info') and 'dpi' in pil_img.info:
                    properties['dpi'] = pil_img.info['dpi']

                # Colour count is only computed for palette images.
                if pil_img.mode == 'P':
                    properties['color_count'] = len(pil_img.getcolors() or [])

                # Read EXIF once; _getexif is not present on every image type.
                exif_reader = getattr(pil_img, '_getexif', None)
                exif_data = exif_reader() if exif_reader else None
                if exif_data:
                    for tag_id, value in exif_data.items():
                        metadata['exif_data'][TAGS.get(tag_id, tag_id)] = str(value)

        except Exception as e:
            metadata['processing_error'] = str(e)

        return metadata

    def save_metadata(self, metadata, output_path):
        """Write ``metadata`` to ``output_path`` as UTF-8 JSON.

        Values JSON cannot represent natively are stored via ``str()``.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2, default=str)

    def load_metadata(self, metadata_path):
        """Load and return the JSON metadata stored at ``metadata_path``."""
        with open(metadata_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _is_external(record):
        """Return True if an image record describes an external link.

        The type marker may sit at the top level or under ``basic_info``.
        """
        kind = record.get('type') or record.get('basic_info', {}).get('type')
        return kind in ('external', 'external_link')

    def generate_summary_report(self, metadata):
        """Render a plain-text summary of a metadata document.

        Args:
            metadata: Dict shaped like ``metadata_schema``.

        Returns:
            The report text: counters, format distribution, total file
            size and, when at least one image has known pixel dimensions,
            the size range.
        """
        report = []
        report.append("=== 图片提取摘要报告 ===\n")

        doc_info = metadata.get('document_info', {})
        report.append("文档信息:")
        report.append(f" - 总图片数: {doc_info.get('total_images', 0)}")
        report.append(f" - 嵌入图片: {doc_info.get('embedded_images', 0)}")
        report.append(f" - 外部链接: {doc_info.get('external_links', 0)}")
        report.append("")

        # Records written by process_image_metadata carry no 'type' key,
        # so every record that is not marked external counts as embedded.
        embedded = [img for img in metadata.get('images', [])
                    if not self._is_external(img)]

        formats = {}
        total_size = 0
        for img in embedded:
            # The key may be present with a None value (unreadable image).
            fmt = img.get('image_properties', {}).get('format') or 'Unknown'
            formats[fmt] = formats.get(fmt, 0) + 1
            total_size += img.get('basic_info', {}).get('file_size') or 0

        report.append("格式分布:")
        for fmt, count in sorted(formats.items()):
            report.append(f" - {fmt}: {count} 个")

        report.append(f"\n总文件大小: {total_size / 1024:.1f} KB")

        # Only known, positive dimensions take part in the size range.
        widths = []
        heights = []
        for img in embedded:
            dims = img.get('dimensions', {})
            width = dims.get('actual_width') or 0
            height = dims.get('actual_height') or 0
            if width > 0:
                widths.append(width)
            if height > 0:
                heights.append(height)

        if widths and heights:
            report.append("\n尺寸范围:")
            report.append(f" - 最大: {max(widths)} x {max(heights)}")
            report.append(f" - 最小: {min(widths)} x {min(heights)}")

        return "\n".join(report)
# (Article text) Next, the metadata handling is integrated into the main
# extractor:
class EnhancedWordImageExtractor(WordImageExtractor):
    """Extractor variant that writes a full metadata document and a report."""

    def __init__(self, docx_path):
        super().__init__(docx_path)
        self.metadata_processor = ImageMetadataProcessor()

    def extract_images_with_metadata(self, output_dir, preserve_names=False):
        """Extract all images and write metadata plus a summary report.

        Besides the image files (and ``external_link_NNN.txt`` records for
        external pictures) this writes ``complete_metadata.json`` and
        ``extraction_report.txt`` into ``output_dir``.

        Args:
            output_dir: Destination directory; created if missing.
            preserve_names: Keep the original base name instead of
                ``image_NNN<ext>``.

        Returns:
            The metadata dict that was written to ``complete_metadata.json``.
        """
        import copy  # local import: only needed to clone the schema template

        os.makedirs(output_dir, exist_ok=True)

        # Deep copy: the template contains nested dicts, and a shallow
        # copy would let the counters below accumulate across calls.
        metadata = copy.deepcopy(self.metadata_processor.metadata_schema)
        metadata['extraction_info']['timestamp'] = datetime.now().isoformat()
        metadata['extraction_info']['source_document'] = os.path.basename(self.docx_path)
        metadata['document_info']['document_size'] = os.path.getsize(self.docx_path)

        extracted_images = []
        processed_targets = set()

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            # Build the member set once instead of calling namelist() per image.
            member_names = set(docx_zip.namelist())

            for img_info in self.image_info:
                target = img_info['target']

                # A target referenced several times is handled only once.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if img_info['type'] == 'embedded':
                    image_path = f"word/{target}"
                    if image_path not in member_names:
                        continue

                    if preserve_names:
                        filename = os.path.basename(target)
                    else:
                        file_ext = os.path.splitext(target)[1]
                        filename = f"image_{img_info['order']:03d}{file_ext}"

                    output_path = os.path.join(output_dir, filename)

                    image_data = docx_zip.read(image_path)
                    with open(output_path, 'wb') as target_file:
                        target_file.write(image_data)

                    img_metadata = self.metadata_processor.process_image_metadata(
                        image_data, img_info, output_path
                    )
                    extracted_images.append(img_metadata)
                    metadata['document_info']['embedded_images'] += 1

                    print(f"已提取图片 {img_info['order']}: {filename}")

                elif img_info['type'] == 'external':
                    # External picture: write a small text record instead.
                    link_filename = f"external_link_{img_info['order']:03d}.txt"
                    link_path = os.path.join(output_dir, link_filename)

                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"外部图片链接: {target}\n")
                        f.write(f"替代文本: {img_info['alt_text']}\n")
                        f.write(f"关系ID: {img_info['rel_id']}\n")

                    link_metadata = {
                        'basic_info': {
                            'order': img_info['order'],
                            'filename': link_filename,
                            'file_path': link_path,
                            'type': 'external_link'
                        },
                        'document_context': {
                            'rel_id': img_info['rel_id'],
                            'original_target': target,
                            'alt_text': img_info['alt_text']
                        }
                    }
                    extracted_images.append(link_metadata)
                    metadata['document_info']['external_links'] += 1

        metadata['document_info']['total_images'] = len(extracted_images)
        metadata['images'] = extracted_images

        metadata_path = os.path.join(output_dir, 'complete_metadata.json')
        self.metadata_processor.save_metadata(metadata, metadata_path)

        report = self.metadata_processor.generate_summary_report(metadata)
        report_path = os.path.join(output_dir, 'extraction_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"\n已生成完整元数据: {metadata_path}")
        print(f"已生成摘要报告: {report_path}")

        return metadata
# (Article text) Besides extracting images from Word documents, we often need
# to insert images into a Word document in a specific order. This is very
# useful for batch processing, template generation and similar scenarios.
使用python-docx库,我们可以实现基本的图片插入功能:
from docx import Document
from docx.shared import Inches, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os
from pathlib import Path
class WordImageLoader:
    """Build a Word document by appending images (and optional captions)."""

    def __init__(self, template_path=None):
        """Open a template document or start a new one.

        Args:
            template_path: Path of a template document. If it is None or
                does not exist, a new empty document is created.
        """
        if template_path and os.path.exists(template_path):
            self.document = Document(template_path)
            print(f"已加载模板文档: {template_path}")
        else:
            self.document = Document()
            print("已创建新的Word文档")

    def add_image(self, image_path, width=None, height=None, caption=None, alignment='left'):
        """Append one image in its own paragraph.

        Args:
            image_path: Path of the image file.
            width: Width in inches (optional).
            height: Height in inches (optional).
            caption: Caption text placed in a following paragraph (optional).
            alignment: ``'left'``, ``'center'`` or ``'right'``; any other
                value falls back to left.

        Returns:
            The object returned by ``run.add_picture``.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"找不到图片文件: {image_path}")

        alignment_map = {
            'left': WD_ALIGN_PARAGRAPH.LEFT,
            'center': WD_ALIGN_PARAGRAPH.CENTER,
            'right': WD_ALIGN_PARAGRAPH.RIGHT
        }
        paragraph_alignment = alignment_map.get(alignment, WD_ALIGN_PARAGRAPH.LEFT)

        paragraph = self.document.add_paragraph()
        paragraph.alignment = paragraph_alignment
        picture_run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()

        # Pass only the dimensions that were given.
        size_kwargs = {}
        if width:
            size_kwargs['width'] = Inches(width)
        if height:
            size_kwargs['height'] = Inches(height)
        picture = picture_run.add_picture(image_path, **size_kwargs)

        if caption:
            caption_paragraph = self.document.add_paragraph(caption)
            caption_paragraph.alignment = paragraph_alignment
            for caption_run in caption_paragraph.runs:
                caption_run.font.size = Inches(0.1)  # 0.1 inch == 7.2 pt
                caption_run.italic = True

        print(f"已添加图片: {os.path.basename(image_path)}")
        return picture

    def add_images_from_folder(self, folder_path, pattern="*", max_width=6, spacing=True):
        """Append every matching image of a folder, sorted by path.

        Args:
            folder_path: Folder containing the images.
            pattern: Glob pattern for file names (e.g. ``"*.jpg"``).
            max_width: Width in inches applied to each image.
            spacing: Add an empty paragraph between images.

        Returns:
            Number of images added.

        Raises:
            FileNotFoundError: If the folder does not exist.
        """
        folder = Path(folder_path)
        if not folder.exists():
            raise FileNotFoundError(f"找不到文件夹: {folder_path}")

        image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'}
        image_files = [f for f in sorted(folder.glob(pattern))
                       if f.suffix.lower() in image_extensions]

        if not image_files:
            print(f"在文件夹 {folder_path} 中没有找到匹配的图片文件")
            return 0

        added_count = 0
        for image_file in image_files:
            try:
                # NOTE: the width is set to max_width for every image,
                # including images that are narrower than that.
                self.add_image(str(image_file), width=max_width, alignment='center')
                added_count += 1

                if spacing and added_count < len(image_files):
                    self.document.add_paragraph()

            except Exception as e:
                # One bad file must not abort the whole batch.
                print(f"添加图片 {image_file.name} 时出错: {e}")

        print(f"成功添加 {added_count} 个图片")
        return added_count

    @staticmethod
    def _is_external(record):
        """Return True if a metadata record describes an external link.

        The type marker may sit at the top level or under ``basic_info``.
        """
        kind = record.get('type') or record.get('basic_info', {}).get('type')
        return kind in ('external', 'external_link')

    def add_images_with_metadata(self, metadata_file):
        """Append the images listed in a metadata JSON file, by order.

        Records describing external links and records whose ``file_path``
        is missing on disk are skipped. Very wide images (aspect ratio
        above 2) are shown 6 inches wide, very tall ones (below 0.5)
        3 inches wide, all others 4 inches wide.

        Args:
            metadata_file: Path of the metadata JSON file.

        Returns:
            Number of images added.
        """
        with open(metadata_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        records = [img for img in metadata.get('images', [])
                   if not self._is_external(img)]
        # "or 0": an order stored as null must not break the sort.
        records.sort(key=lambda rec: rec.get('basic_info', {}).get('order') or 0)

        added_count = 0
        for img_data in records:
            try:
                basic_info = img_data.get('basic_info', {})
                document_context = img_data.get('document_context', {})
                dimensions = img_data.get('dimensions', {})

                image_path = basic_info.get('file_path')
                if not image_path or not os.path.exists(image_path):
                    continue

                # Dimensions are stored as null when the image could not
                # be analysed; treat that as "unknown" (0).
                actual_width = dimensions.get('actual_width') or 0
                actual_height = dimensions.get('actual_height') or 0

                display_width = 4  # default: 4 inches
                if actual_width > 0 and actual_height > 0:
                    aspect_ratio = actual_width / actual_height
                    if aspect_ratio > 2:
                        display_width = 6
                    elif aspect_ratio < 0.5:
                        display_width = 3

                caption = document_context.get('alt_text', '')
                self.add_image(image_path, width=display_width, caption=caption, alignment='center')
                added_count += 1

                self.document.add_paragraph()

            except Exception as e:
                # One bad record must not abort the whole batch.
                print(f"添加图片时出错: {e}")

        print(f"根据元数据成功添加 {added_count} 个图片")
        return added_count

    def save_document(self, output_path):
        """Save the document, creating the parent directory if needed.

        Args:
            output_path: Destination file path.
        """
        # A bare file name has no directory part; os.makedirs('') would raise.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.document.save(output_path)
        print(f"文档已保存到: {output_path}")

    def add_title_page(self, title, subtitle=None, author=None):
        """Append a centred title page followed by a page break.

        Args:
            title: Main title (bold).
            subtitle: Optional subtitle.
            author: Optional author name.
        """
        title_paragraph = self.document.add_paragraph(title)
        title_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        for title_run in title_paragraph.runs:
            title_run.font.size = Inches(0.3)  # 0.3 inch == 21.6 pt
            title_run.bold = True

        if subtitle:
            subtitle_paragraph = self.document.add_paragraph(subtitle)
            subtitle_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            for subtitle_run in subtitle_paragraph.runs:
                subtitle_run.font.size = Inches(0.2)  # 0.2 inch == 14.4 pt

        if author:
            self.document.add_paragraph()  # blank line before the author
            author_paragraph = self.document.add_paragraph(f"作者: {author}")
            author_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        self.document.add_page_break()
# 使用示例
def create_image_document_example():
    """Demonstrate building a document from previously extracted images.

    Creates a new document with a title page, appends every
    ``image_*.jpg`` from ``extracted_images`` at 5 inches wide and saves
    the result as ``generated_document.docx``. Any exception is reported
    on stdout instead of being propagated.
    """
    title_page = {
        'title': "图片文档示例",
        'subtitle': "使用Python自动生成",
        'author': "图片处理系统",
    }

    try:
        loader = WordImageLoader()

        # Title page first, then the images, then save.
        loader.add_title_page(**title_page)
        loader.add_images_from_folder("extracted_images", pattern="image_*.jpg", max_width=5)
        loader.save_document("generated_document.docx")

    except Exception as e:
        print(f"创建文档时出错: {e}")
if __name__ == "__main__":
    create_image_document_example()
在处理大量Word文档或大型文档时,性能优化变得非常重要。我们需要考虑内存使用、处理速度和错误处理等方面。
import concurrent.futures
import threading
from tqdm import tqdm
import logging
from pathlib import Path
class BatchImageProcessor:
    """Extract images from many Word documents using a thread pool.

    Per-document outcomes accumulate in ``self.results`` (successes) and
    ``self.errors`` (failures). Worker threads append to both lists, so every
    append is done while holding ``self.lock``.
    """

    def __init__(self, max_workers=4, progress_bar=True):
        """Initialise the batch processor.

        Args:
            max_workers: Maximum number of concurrent worker threads.
            progress_bar: Whether to display a tqdm progress bar.
        """
        self.max_workers = max_workers
        self.progress_bar = progress_bar
        self.results = []
        self.errors = []
        self.lock = threading.Lock()
        # Log to both a file and the console. basicConfig() does nothing if
        # the root logger already has handlers, so repeated construction does
        # not stack handlers.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('batch_processing.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def process_single_document(self, docx_path, output_base_dir):
        """Extract the images of one document into its own sub-directory.

        Args:
            docx_path: Path of the Word document.
            output_base_dir: Base output directory; images go to
                ``<output_base_dir>/<document stem>``.

        Returns:
            A result dict with ``success`` True and image counts, or an error
            dict with ``success`` False and the error text. Exceptions are
            recorded, never raised.
        """
        try:
            doc_name = Path(docx_path).stem
            output_dir = os.path.join(output_base_dir, doc_name)
            extractor = EnhancedWordImageExtractor(docx_path)
            metadata = extractor.extract_images_with_metadata(output_dir)
            result = {
                'document': docx_path,
                'output_dir': output_dir,
                'success': True,
                'image_count': metadata['document_info']['total_images'],
                'embedded_count': metadata['document_info']['embedded_images'],
                'external_count': metadata['document_info']['external_links']
            }
            with self.lock:
                self.results.append(result)
            self.logger.info(f"成功处理文档: {docx_path}")
            return result
        except Exception as e:
            error_info = {
                'document': docx_path,
                'error': str(e),
                'success': False
            }
            with self.lock:
                self.errors.append(error_info)
            self.logger.error(f"处理文档 {docx_path} 时出错: {e}")
            return error_info

    def process_documents(self, docx_files, output_base_dir):
        """Process several documents concurrently and write a JSON summary.

        Args:
            docx_files: List of Word document paths.
            output_base_dir: Base output directory (created if missing).

        Returns:
            The summary dict produced by ``generate_processing_summary``; it
            is also saved as ``batch_processing_summary.json`` in
            ``output_base_dir``.
        """
        os.makedirs(output_base_dir, exist_ok=True)
        self.logger.info(f"开始批量处理 {len(docx_files)} 个文档")
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_doc = {
                executor.submit(self.process_single_document, doc, output_base_dir): doc
                for doc in docx_files
            }
            completed = concurrent.futures.as_completed(future_to_doc)
            if self.progress_bar:
                # Wrap the lazy iterator (with an explicit total) so the bar
                # advances as each document finishes, instead of first
                # draining every future into a list.
                completed = tqdm(completed, total=len(future_to_doc), desc="处理文档")
            for future in completed:
                try:
                    future.result()
                except Exception as e:
                    doc = future_to_doc[future]
                    self.logger.error(f"处理文档 {doc} 时发生异常: {e}")
        summary = self.generate_processing_summary()
        summary_path = os.path.join(output_base_dir, 'batch_processing_summary.json')
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
        self.logger.info(f"批量处理完成,摘要已保存到: {summary_path}")
        return summary

    def generate_processing_summary(self):
        """Build a summary dict from the accumulated results and errors.

        Returns:
            A dict with ``processing_info``, ``image_statistics``,
            ``successful_documents`` and ``failed_documents``.
            ``success_rate`` is a percentage and is 0.0 when nothing has been
            processed.
        """
        successful_results = [r for r in self.results if r.get('success')]
        total_documents = len(self.results) + len(self.errors)
        # Guard the empty batch: dividing by a zero document count would
        # raise ZeroDivisionError.
        if total_documents:
            success_rate = len(successful_results) / total_documents * 100
        else:
            success_rate = 0.0
        summary = {
            'processing_info': {
                'timestamp': datetime.now().isoformat(),
                'total_documents': total_documents,
                'successful_documents': len(successful_results),
                'failed_documents': len(self.errors),
                'success_rate': success_rate
            },
            'image_statistics': {
                'total_images_extracted': sum(r.get('image_count', 0) for r in successful_results),
                'embedded_images': sum(r.get('embedded_count', 0) for r in successful_results),
                'external_links': sum(r.get('external_count', 0) for r in successful_results)
            },
            'successful_documents': successful_results,
            'failed_documents': self.errors
        }
        return summary

    def find_docx_files(self, directory, recursive=True):
        """Find the ``.docx`` files under a directory.

        Args:
            directory: Directory to search.
            recursive: Whether to descend into sub-directories.

        Returns:
            A list of path strings; files whose name starts with ``~$`` are
            excluded.

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        directory = Path(directory)
        if not directory.exists():
            raise FileNotFoundError(f"目录不存在: {directory}")
        matches = directory.rglob("*.docx") if recursive else directory.glob("*.docx")
        # Names starting with "~$" are treated as temporary files and skipped.
        return [str(f) for f in matches if not f.name.startswith('~$')]
# 性能优化工具
class PerformanceOptimizer:
    """Helpers for trimming and measuring resource usage."""

    @staticmethod
    def optimize_memory_usage():
        """Run a garbage-collection pass immediately."""
        import gc

        gc.collect()

    @staticmethod
    def get_memory_usage():
        """Return the current process' ``rss`` and ``vms`` sizes in MB.

        Relies on the third-party ``psutil`` package, imported on call.
        """
        import psutil

        usage = psutil.Process().memory_info()
        # Bytes -> MB.
        return {field: getattr(usage, field) / 1024 / 1024 for field in ('rss', 'vms')}

    @staticmethod
    def monitor_performance(func):
        """Decorator printing the wall time and RSS change of each call."""
        import functools
        import time

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            started_at = time.time()
            memory_before = PerformanceOptimizer.get_memory_usage()
            outcome = func(*args, **kwargs)
            finished_at = time.time()
            memory_after = PerformanceOptimizer.get_memory_usage()
            elapsed = finished_at - started_at
            rss_change = memory_after['rss'] - memory_before['rss']
            print(f"函数 {func.__name__} 执行完成:")
            print(f" - 执行时间: {elapsed:.2f} 秒")
            print(f" - 内存变化: {rss_change:.2f} MB")
            return outcome

        return wrapper
# 批量处理示例
def batch_processing_example():
    """Run a demo batch over ``./documents`` and print the outcome.

    Any exception is reported on stdout instead of being propagated.
    """
    try:
        processor = BatchImageProcessor(max_workers=4, progress_bar=True)
        docx_files = processor.find_docx_files("./documents", recursive=True)
        print(f"找到 {len(docx_files)} 个Word文档")
        summary = processor.process_documents(docx_files, "./batch_output")
        info = summary['processing_info']
        image_stats = summary['image_statistics']
        print(f"\n批量处理完成:")
        print(f" - 成功处理: {info['successful_documents']} 个文档")
        print(f" - 处理失败: {info['failed_documents']} 个文档")
        print(f" - 成功率: {info['success_rate']:.1f}%")
        print(f" - 总提取图片: {image_stats['total_images_extracted']} 个")
    except Exception as exc:
        print(f"批量处理时出错: {exc}")
if __name__ == "__main__":
    batch_processing_example()

在实际工作中,Word文档图片提取和加载功能有很多应用场景。以下是一些典型的实用案例。
class DocumentImageArchiver:
"""文档图片归档器"""
def __init__(self, archive_base_dir):
    """Create the archive root directory if needed and remember it.

    Args:
        archive_base_dir: Directory under which every per-document archive
            is stored.
    """
    os.makedirs(archive_base_dir, exist_ok=True)
    self.archive_base_dir = archive_base_dir
def archive_document_images(self, docx_path, preserve_structure=True):
    """Archive one document: its images, a copy of the file and an info JSON.

    Args:
        docx_path: Path of the Word document.
        preserve_structure: When true, images and metadata go into
            ``images/`` and ``metadata/`` sub-directories of the archive;
            otherwise everything is placed directly in the archive directory.

    Returns:
        The archive-info dict that is also written to ``archive_info.json``.
    """
    doc_name = Path(docx_path).stem
    archive_dir = os.path.join(self.archive_base_dir, doc_name)

    if not preserve_structure:
        images_dir = metadata_dir = archive_dir
        os.makedirs(archive_dir, exist_ok=True)
    else:
        images_dir = os.path.join(archive_dir, "images")
        metadata_dir = os.path.join(archive_dir, "metadata")
        for needed_dir in (images_dir, metadata_dir):
            os.makedirs(needed_dir, exist_ok=True)

    # Extract the images together with their metadata.
    metadata = EnhancedWordImageExtractor(docx_path).extract_images_with_metadata(images_dir)

    # Keep a copy of the source document next to the extracted content.
    original_doc_path = os.path.join(archive_dir, f"{doc_name}_original.docx")
    shutil.copy2(docx_path, original_doc_path)

    archive_info = {
        'archive_date': datetime.now().isoformat(),
        'original_document': docx_path,
        'archived_document': original_doc_path,
        'images_directory': images_dir,
        'metadata_directory': metadata_dir,
        'total_images': metadata['document_info']['total_images'],
        # Measured before archive_info.json itself is written.
        'archive_size': self._calculate_directory_size(archive_dir)
    }

    info_file = os.path.join(metadata_dir, 'archive_info.json')
    with open(info_file, 'w', encoding='utf-8') as handle:
        json.dump(archive_info, handle, ensure_ascii=False, indent=2)
    return archive_info
def _calculate_directory_size(self, directory):
    """Return the total size in bytes of all files under ``directory``."""
    return sum(
        os.path.getsize(os.path.join(folder, file_name))
        for folder, _subdirs, file_names in os.walk(directory)
        for file_name in file_names
    )
def create_archive_index(self):
    """Scan the archive root and write ``archive_index.json``.

    Only sub-directories containing ``metadata/archive_info.json`` are
    indexed.

    Returns:
        The index dict (creation date, totals and one entry per archive).
    """
    index = {
        'creation_date': datetime.now().isoformat(),
        'total_archives': 0,
        'total_images': 0,
        'total_size': 0,
        'archives': []
    }

    for entry_name in os.listdir(self.archive_base_dir):
        entry_path = os.path.join(self.archive_base_dir, entry_name)
        if not os.path.isdir(entry_path):
            continue
        info_file = os.path.join(entry_path, 'metadata', 'archive_info.json')
        if not os.path.exists(info_file):
            continue
        with open(info_file, 'r', encoding='utf-8') as handle:
            info = json.load(handle)
        image_count = info.get('total_images', 0)
        size_bytes = info.get('archive_size', 0)
        index['archives'].append({
            'name': entry_name,
            'archive_date': info.get('archive_date'),
            'total_images': image_count,
            'archive_size': size_bytes
        })
        index['total_images'] += image_count
        index['total_size'] += size_bytes

    index['total_archives'] = len(index['archives'])

    index_file = os.path.join(self.archive_base_dir, 'archive_index.json')
    with open(index_file, 'w', encoding='utf-8') as handle:
        json.dump(index, handle, ensure_ascii=False, indent=2)
    return index


class ImageQualityAnalyzer:
    """Image quality analyzer."""
def __init__(self):
    """Set the default thresholds used to flag quality problems."""
    self.quality_thresholds = dict(
        min_width=100,
        min_height=100,
        max_file_size=5 * 1024 * 1024,  # 5 MB
        min_dpi=72,
        preferred_formats=['JPEG', 'PNG'],
    )
def analyze_extracted_images(self, metadata_file):
    """Check every embedded image in a metadata file against the thresholds.

    Args:
        metadata_file: Path of the metadata JSON file.

    Returns:
        A report dict with per-image ``quality_issues``, counters in
        ``statistics`` and the generated ``recommendations``.
    """
    with open(metadata_file, 'r', encoding='utf-8') as handle:
        metadata = json.load(handle)

    limits = self.quality_thresholds
    stats = {
        'low_resolution': 0,
        'large_file_size': 0,
        'unsupported_format': 0,
        'missing_alt_text': 0
    }
    analysis_report = {
        'analysis_date': datetime.now().isoformat(),
        'total_images': 0,
        'quality_issues': [],
        'recommendations': [],
        'statistics': stats
    }

    # Entries typed 'external_link' are not analysed.
    embedded = [
        entry for entry in metadata.get('images', [])
        if entry.get('type') != 'external_link'
    ]
    analysis_report['total_images'] = len(embedded)

    for entry in embedded:
        basic_info = entry.get('basic_info', {})
        dimensions = entry.get('dimensions', {})
        image_props = entry.get('image_properties', {})
        doc_context = entry.get('document_context', {})

        width = dimensions.get('actual_width', 0)
        height = dimensions.get('actual_height', 0)
        file_size = basic_info.get('file_size', 0)
        format_type = image_props.get('format')
        alt_text = doc_context.get('alt_text', '').strip()

        # (failed?, issue label, statistics counter), in reporting order.
        checks = (
            (width < limits['min_width'] or height < limits['min_height'],
             '低分辨率', 'low_resolution'),
            (file_size > limits['max_file_size'], '文件过大', 'large_file_size'),
            (format_type not in limits['preferred_formats'],
             '格式不推荐', 'unsupported_format'),
            (not alt_text, '缺少替代文本', 'missing_alt_text'),
        )
        found = []
        for failed, label, counter in checks:
            if failed:
                found.append(label)
                stats[counter] += 1

        if found:
            analysis_report['quality_issues'].append({
                'filename': basic_info.get('filename'),
                'order': basic_info.get('order'),
                'issues': found,
                'dimensions': f"{width}x{height}",
                'file_size_kb': round(file_size / 1024, 1),
                'format': format_type
            })

    self._generate_recommendations(analysis_report)
    return analysis_report
def _generate_recommendations(self, analysis_report):
    """Fill ``analysis_report['recommendations']`` from its statistics.

    One message is produced for every counter in
    ``analysis_report['statistics']`` that is greater than zero.
    """
    stats = analysis_report['statistics']
    # (statistics key, message template), in output order.
    templates = (
        ('low_resolution', "发现 {} 个低分辨率图片,建议使用更高分辨率的原图"),
        ('large_file_size', "发现 {} 个大文件,建议进行压缩优化"),
        ('unsupported_format', "发现 {} 个非推荐格式图片,建议转换为JPEG或PNG格式"),
        ('missing_alt_text', "发现 {} 个图片缺少替代文本,建议添加描述性文本以提高可访问性"),
    )
    analysis_report['recommendations'] = [
        template.format(stats[key]) for key, template in templates if stats[key] > 0
    ]
def optimize_images(self, metadata_file, output_dir, quality=85, max_width=1920):
    """Re-encode the embedded images listed in a metadata file as JPEG.

    Args:
        metadata_file: Path of the metadata JSON file.
        output_dir: Directory for the optimised images and the report.
        quality: JPEG quality (1-100).
        max_width: Images wider than this are scaled down proportionally.

    Returns:
        A dict with the settings, one result entry per optimised image and
        the before/after size totals. ``total_size_reduction_percent`` is
        present only when at least one image was optimised. The same dict is
        written to ``optimization_report.json`` in ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    optimization_results = {
        'optimization_date': datetime.now().isoformat(),
        'settings': {
            'quality': quality,
            'max_width': max_width
        },
        'results': [],
        'total_size_before': 0,
        'total_size_after': 0
    }
    images = metadata.get('images', [])
    embedded_images = [img for img in images if img.get('type') != 'external_link']
    for img in embedded_images:
        basic_info = img.get('basic_info', {})
        original_path = basic_info.get('file_path')
        if not original_path or not os.path.exists(original_path):
            continue
        # Fall back to the file's own name when the metadata has none, so a
        # missing 'filename' key does not make splitext() fail.
        filename = basic_info.get('filename') or os.path.basename(original_path)
        try:
            original_size = os.path.getsize(original_path)
            with Image.open(original_path) as pil_img:
                # Scale down, keeping the aspect ratio.
                if pil_img.width > max_width:
                    ratio = max_width / pil_img.width
                    new_height = int(pil_img.height * ratio)
                    pil_img = pil_img.resize((max_width, new_height), Image.Resampling.LANCZOS)
                # JPEG has no alpha channel: paste onto a white background.
                if pil_img.mode in ('RGBA', 'P'):
                    rgb_img = Image.new('RGB', pil_img.size, (255, 255, 255))
                    rgb_img.paste(pil_img, mask=pil_img.split()[-1] if pil_img.mode == 'RGBA' else None)
                    pil_img = rgb_img
                name, _ext = os.path.splitext(filename)
                optimized_filename = f"{name}_optimized.jpg"
                optimized_path = os.path.join(output_dir, optimized_filename)
                pil_img.save(optimized_path, 'JPEG', quality=quality, optimize=True)
                optimized_dimensions = f"{pil_img.width}x{pil_img.height}"
            optimized_size = os.path.getsize(optimized_path)
            # Add to both totals only once the image has really been written;
            # counting the "before" size of images that later fail would
            # overstate the overall reduction.
            optimization_results['total_size_before'] += original_size
            optimization_results['total_size_after'] += optimized_size
            if original_size > 0:
                reduction = round((1 - optimized_size / original_size) * 100, 1)
            else:
                reduction = 0.0
            dimensions = img.get('dimensions', {})
            optimization_results['results'].append({
                'original_filename': filename,
                'optimized_filename': optimized_filename,
                'original_size_kb': round(original_size / 1024, 1),
                'optimized_size_kb': round(optimized_size / 1024, 1),
                'size_reduction_percent': reduction,
                'original_dimensions': f"{dimensions.get('actual_width', 0)}x{dimensions.get('actual_height', 0)}",
                'optimized_dimensions': optimized_dimensions
            })
        except Exception as e:
            # Best effort: report the failing image and continue with the rest.
            print(f"优化图片 {basic_info.get('filename')} 时出错: {e}")
    if optimization_results['total_size_before'] > 0:
        total_reduction = (1 - optimization_results['total_size_after'] / optimization_results['total_size_before']) * 100
        optimization_results['total_size_reduction_percent'] = round(total_reduction, 1)
    report_path = os.path.join(output_dir, 'optimization_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(optimization_results, f, ensure_ascii=False, indent=2)
    return optimization_results


class ImageContentAnalyzer:
    """Image content analyzer."""
def __init__(self):
    """Define the keyword lists used to assign images to categories."""
    self.image_categories = dict(
        chart=['图表', '表格', '统计', '数据'],
        diagram=['流程图', '示意图', '架构图', '结构图'],
        photo=['照片', '图片', '实物', '现场'],
        screenshot=['截图', '界面', '屏幕', '软件'],
        logo=['标志', '徽标', '品牌', 'logo'],
        other=['其他'],
    )
def analyze_image_content(self, metadata_file):
    """Classify every embedded image listed in a metadata file.

    Args:
        metadata_file: Path of the metadata JSON file.

    Returns:
        A dict with the images grouped per category (``categorization``),
        per-category counts (``statistics``) and the images that matched no
        category (``uncategorized``).
    """
    with open(metadata_file, 'r', encoding='utf-8') as handle:
        metadata = json.load(handle)

    category_names = list(self.image_categories.keys())
    analysis_result = {
        'analysis_date': datetime.now().isoformat(),
        'categorization': {name: [] for name in category_names},
        'statistics': {name: 0 for name in category_names},
        'uncategorized': []
    }

    for entry in metadata.get('images', []):
        # Entries typed 'external_link' are not classified.
        if entry.get('type') == 'external_link':
            continue
        basic_info = entry.get('basic_info', {})
        doc_context = entry.get('document_context', {})
        dimensions = entry.get('dimensions', {})

        # Text features, lower-cased for matching.
        alt_text = doc_context.get('alt_text', '').lower()
        filename = basic_info.get('filename', '').lower()

        # Size features; a non-positive height gives a neutral ratio of 1.
        width = dimensions.get('actual_width', 0)
        height = dimensions.get('actual_height', 0)
        if height > 0:
            aspect_ratio = width / height
        else:
            aspect_ratio = 1

        category = self._classify_image(alt_text, filename, aspect_ratio, width, height)
        image_info = {
            'filename': basic_info.get('filename'),
            'order': basic_info.get('order'),
            'alt_text': doc_context.get('alt_text', ''),
            'dimensions': f"{width}x{height}",
            'aspect_ratio': round(aspect_ratio, 2),
            'classification_reason': self._get_classification_reason(alt_text, filename, aspect_ratio)
        }
        if not category:
            analysis_result['uncategorized'].append(image_info)
            continue
        analysis_result['categorization'][category].append(image_info)
        analysis_result['statistics'][category] += 1

    return analysis_result
def _classify_image(self, alt_text, filename, aspect_ratio, width, height):
    """Pick a category from keywords first, then from size heuristics.

    Returns:
        A category name, or ``None`` when no rule matches.
    """
    text_content = f"{alt_text} {filename}"

    # 1) Keyword match, in category order; 'other' never matches by keyword.
    for category, keywords in self.image_categories.items():
        if category != 'other' and any(word in text_content for word in keywords):
            return category

    # 2) Very wide or very tall images.
    if aspect_ratio > 2 or aspect_ratio < 0.5:
        looks_like_chart = '图' in text_content or 'chart' in text_content
        return 'chart' if looks_like_chart else 'diagram'

    # 3) Large images.
    if width > 800 and height > 600:
        is_screenshot = 'screenshot' in filename or '截图' in alt_text
        return 'screenshot' if is_screenshot else 'photo'

    return None
def _get_classification_reason(self, alt_text, filename, aspect_ratio):
    """Describe which classification hints apply, joined by ``"; "``."""
    combined = f"{alt_text} {filename}"
    reasons = []

    if any(word in combined for word in ('图表', '表格', 'chart')):
        reasons.append("包含图表关键词")

    if aspect_ratio > 2:
        reasons.append("宽高比过大,可能是图表或流程图")
    elif aspect_ratio < 0.5:
        reasons.append("宽高比过小,可能是竖向图表")

    if 'screenshot' in filename or '截图' in alt_text:
        reasons.append("文件名或替代文本包含截图信息")

    if not reasons:
        return "基于默认规则分类"
    return "; ".join(reasons)
def generate_content_report(self, analysis_result, output_path):
    """Write a plain-text report of a content analysis.

    Args:
        analysis_result: Dict with ``statistics``, ``categorization`` and
            ``uncategorized`` entries.
        output_path: File to write (UTF-8).

    Returns:
        ``output_path``.
    """
    lines = []
    emit = lines.append
    stats = analysis_result['statistics']
    uncategorized = analysis_result['uncategorized']

    emit("=== 图片内容分析报告 ===\n")

    # Overall numbers.
    total_images = sum(stats.values())
    emit(f"总图片数量: {total_images}")
    emit(f"未分类图片: {len(uncategorized)}")
    emit("")

    # Share of each non-empty category.
    emit("类别分布:")
    for category, count in stats.items():
        if count <= 0:
            continue
        percentage = (count / total_images) * 100 if total_images > 0 else 0
        emit(f" - {category}: {count} 个 ({percentage:.1f}%)")
    emit("")

    # Per-category details.
    for category, members in analysis_result['categorization'].items():
        if not members:
            continue
        emit(f"\n{category.upper()} 类别图片:")
        for member in members:
            emit(f" - {member['filename']} ({member['dimensions']})")
            if member['alt_text']:
                emit(f" 替代文本: {member['alt_text']}")
            emit(f" 分类原因: {member['classification_reason']}")

    # Images that matched no category.
    if uncategorized:
        emit("\n未分类图片:")
        for member in uncategorized:
            emit(f" - {member['filename']} ({member['dimensions']})")
            if member['alt_text']:
                emit(f" 替代文本: {member['alt_text']}")

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write("\n".join(lines))
    return output_path


# NOTE: the paragraph below is article text that the source had fused onto
# this method's final line; it is preserved verbatim as a comment.
# 在实际使用过程中,可能会遇到各种问题。以下是一些常见问题及其解决方案。
class DocumentValidator:
"""文档验证器"""
@staticmethod
def validate_docx_file(docx_path):
    """Check whether a file looks like a usable ``.docx`` document.

    Args:
        docx_path: Path of the Word document (``str`` or ``os.PathLike``).

    Returns:
        A dict with ``is_valid`` (bool), ``errors`` and ``warnings`` (lists
        of messages) and ``file_info``. Problems are reported through the
        dict; the function itself does not raise.
    """
    validation_result = {
        'is_valid': False,
        'errors': [],
        'warnings': [],
        'file_info': {}
    }
    errors = validation_result['errors']
    warnings = validation_result['warnings']
    try:
        # Accept pathlib.Path as well as str; the extension check below
        # needs string methods.
        docx_path = os.fspath(docx_path)
        if not os.path.exists(docx_path):
            errors.append(f"文件不存在: {docx_path}")
            return validation_result
        if not docx_path.lower().endswith('.docx'):
            errors.append("文件扩展名不是.docx")
            return validation_result
        file_stat = os.stat(docx_path)
        validation_result['file_info'] = {
            'size': file_stat.st_size,
            'modified_time': datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        }
        if file_stat.st_size == 0:
            errors.append("文件大小为0")
            return validation_result
        # A .docx file is a ZIP package: check its mandatory members.
        try:
            with zipfile.ZipFile(docx_path, 'r') as docx_zip:
                member_names = set(docx_zip.namelist())
                required_files = (
                    '[Content_Types].xml',
                    '_rels/.rels',
                    'word/document.xml'
                )
                for required_file in required_files:
                    if required_file not in member_names:
                        errors.append(f"缺少必要文件: {required_file}")
                # Parse document.xml only when it is present; its absence is
                # already reported above, and reading a missing member would
                # add a second, misleading "exception" error.
                if 'word/document.xml' in member_names:
                    try:
                        ET.fromstring(docx_zip.read('word/document.xml'))
                    except ET.ParseError as e:
                        errors.append(f"document.xml解析错误: {e}")
                # A broken relationship file is only a warning.
                if 'word/_rels/document.xml.rels' in member_names:
                    try:
                        ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
                    except ET.ParseError as e:
                        warnings.append(f"关系文件解析警告: {e}")
        except zipfile.BadZipFile:
            errors.append("文件不是有效的ZIP格式")
            return validation_result
        # Opening with python-docx is an extra check; failure is a warning.
        try:
            doc = Document(docx_path)
            validation_result['file_info']['paragraphs'] = len(doc.paragraphs)
        except Exception as e:
            warnings.append(f"python-docx打开警告: {e}")
        if not errors:
            validation_result['is_valid'] = True
    except Exception as e:
        errors.append(f"验证过程中出现异常: {e}")
    return validation_result
@staticmethod
def repair_docx_file(docx_path, output_path):
    """Try to rebuild a damaged Word document from its salvageable parts.

    A document that already validates is simply copied. Otherwise a new
    package is written from the basic structure files, a re-serialised
    ``word/document.xml`` (when it parses) and the ``word/media/`` files.

    Args:
        docx_path: Path of the original document.
        output_path: Path of the repaired document.

    Returns:
        A dict with ``success``, ``actions_taken`` and ``remaining_issues``.
    """
    repair_result = {
        'success': False,
        'actions_taken': [],
        'remaining_issues': []
    }
    actions = repair_result['actions_taken']
    issues = repair_result['remaining_issues']
    try:
        # Nothing to repair: copy the file as is.
        if DocumentValidator.validate_docx_file(docx_path)['is_valid']:
            shutil.copy2(docx_path, output_path)
            repair_result['success'] = True
            actions.append("文档无需修复,已复制到目标位置")
            return repair_result

        # NOTE(review): only the parts listed below are carried over;
        # word/_rels/document.xml.rels is not among them - confirm that this
        # is intended.
        with zipfile.ZipFile(docx_path, 'r') as source_zip, \
                zipfile.ZipFile(output_path, 'w') as target_zip:
            members = source_zip.namelist()

            # Basic package structure files.
            for file_name in ('[Content_Types].xml', '_rels/.rels'):
                if file_name in members:
                    target_zip.writestr(file_name, source_zip.read(file_name))
                    actions.append(f"已复制: {file_name}")

            # Main document part: parse it and write it back re-serialised.
            if 'word/document.xml' in members:
                try:
                    root = ET.fromstring(source_zip.read('word/document.xml'))
                    target_zip.writestr('word/document.xml', ET.tostring(root, encoding='utf-8'))
                    actions.append("已修复document.xml")
                except Exception as exc:
                    issues.append(f"无法修复document.xml: {exc}")

            # Media files.
            for file_name in members:
                if not file_name.startswith('word/media/'):
                    continue
                try:
                    target_zip.writestr(file_name, source_zip.read(file_name))
                    actions.append(f"已复制媒体文件: {file_name}")
                except Exception as exc:
                    issues.append(f"无法复制媒体文件 {file_name}: {exc}")

        # Success means the rebuilt package passes validation.
        repair_result['success'] = DocumentValidator.validate_docx_file(output_path)['is_valid']
    except Exception as exc:
        issues.append(f"修复过程中出现异常: {exc}")
    return repair_result


class MemoryEfficientExtractor:
    """Memory-efficient image extractor."""
def __init__(self, docx_path, chunk_size=1024*1024):  # 1MB chunks
    """Store the document path and the copy chunk size.

    Args:
        docx_path: Path of the Word document.
        chunk_size: Number of bytes read per chunk when streaming.
    """
    self.docx_path, self.chunk_size = docx_path, chunk_size
def extract_large_images_streaming(self, output_dir):
    """Stream every ``word/media/`` file out of the document.

    Each member is copied in chunks of ``self.chunk_size`` bytes, so a large
    image is never held in memory as a whole.

    Args:
        output_dir: Output directory (created if missing).

    Returns:
        List of the extracted file paths.
    """
    import gc
    import shutil

    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []
    with zipfile.ZipFile(self.docx_path, 'r') as docx_zip:
        for media_file in docx_zip.namelist():
            if not media_file.startswith('word/media/'):
                continue
            filename = os.path.basename(media_file)
            if not filename:
                # Directory entry such as "word/media/": nothing to copy,
                # and opening "<output_dir>/" for writing would fail.
                continue
            output_path = os.path.join(output_dir, filename)
            with docx_zip.open(media_file) as source, open(output_path, 'wb') as target:
                shutil.copyfileobj(source, target, self.chunk_size)
            extracted_images.append(output_path)
    # One collection after the loop; a full collection after every file
    # only slows the extraction down.
    gc.collect()
    return extracted_images
def get_image_info_lightweight(self):
    """List the document's images from ZIP metadata, without reading them.

    Only the relationship XML is parsed; sizes come from the archive's
    directory entries.

    Returns:
        A list of dicts (``rel_id``, ``target``, ``filename``,
        ``compressed_size``, ``file_size``, ``compress_type``), one per
        image relationship whose target exists in the archive.
    """
    rels_member = 'word/_rels/document.xml.rels'
    collected = []
    with zipfile.ZipFile(self.docx_path, 'r') as docx_zip:
        members = set(docx_zip.namelist())
        if rels_member not in members:
            return collected
        rels_root = ET.fromstring(docx_zip.read(rels_member))
        for relationship in rels_root.findall("*"):
            if 'image' not in relationship.get('Type', ''):
                continue
            target = relationship.get('Target')
            member_name = f"word/{target}"
            if member_name not in members:
                continue
            # getinfo() reads the directory entry only, not the image data.
            entry = docx_zip.getinfo(member_name)
            collected.append({
                'rel_id': relationship.get('Id'),
                'target': target,
                'filename': os.path.basename(target),
                'compressed_size': entry.compress_size,
                'file_size': entry.file_size,
                'compress_type': entry.compress_type
            })
    return collected


class FastImageExtractor:
    """Fast image extractor."""
def __init__(self, docx_path):
    """Store the document path and start with an empty reference cache.

    Args:
        docx_path: Path of the Word document.
    """
    self._cache = {}
    self.docx_path = docx_path
def extract_images_fast(self, output_dir, skip_duplicates=True):
    """Extract the referenced images, optionally skipping duplicates.

    Args:
        output_dir: Output directory (created if missing).
        skip_duplicates: When true, an image whose content was already
            extracted is skipped.

    Returns:
        A list of dicts with ``filename``, ``original_target`` and
        ``order``. ``order`` is the reference's 1-based position, so skipped
        references leave gaps in the numbering.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Cache the parsed references per (path, modification time) so repeated
    # calls on an unchanged file do not parse the relationships again.
    cache_key = f"{self.docx_path}_{os.path.getmtime(self.docx_path)}"
    image_refs = self._cache.get(cache_key)
    if image_refs is None:
        image_refs = self._parse_image_references_fast()
        self._cache[cache_key] = image_refs
    extracted_images = []
    seen_fingerprints = set()
    with zipfile.ZipFile(self.docx_path, 'r') as docx_zip:
        for i, ref in enumerate(image_refs, 1):
            image_path = f"word/{ref['target']}"
            try:
                member = docx_zip.getinfo(image_path)
            except KeyError:
                # Target is not stored in the package.
                continue
            if skip_duplicates:
                # Use the CRC-32 and size recorded in the ZIP directory.
                # They describe the WHOLE file and cost no read, whereas a
                # hash of only the first 1 KB treats different images that
                # share a leading block as duplicates and drops them.
                fingerprint = (member.CRC, member.file_size)
                if fingerprint in seen_fingerprints:
                    continue
                seen_fingerprints.add(fingerprint)
            file_ext = os.path.splitext(ref['target'])[1]
            filename = f"image_{i:03d}{file_ext}"
            output_path = os.path.join(output_dir, filename)
            with docx_zip.open(member) as source, open(output_path, 'wb') as target:
                shutil.copyfileobj(source, target)
            extracted_images.append({
                'filename': filename,
                'original_target': ref['target'],
                'order': i
            })
    return extracted_images
def _parse_image_references_fast(self):
    """Return the image relationships of the document, in file order.

    The relationship part is scanned with regular expressions instead of an
    XML parser. Attributes are read by name, so they may appear in any order
    inside a ``<Relationship>`` tag.

    Returns:
        A list of ``{'target': ..., 'rel_id': ...}`` dicts; empty when the
        document has no ``word/_rels/document.xml.rels`` part.
    """
    import re

    with zipfile.ZipFile(self.docx_path, 'r') as docx_zip:
        try:
            rels_text = docx_zip.read('word/_rels/document.xml.rels').decode('utf-8')
        except KeyError:
            # No relationship part, hence no image references.
            return []
    references = []
    # "\b" keeps the enclosing <Relationships> element from matching.
    for tag in re.findall(r'<Relationship\b[^>]*>', rels_text):
        attributes = {
            name: value
            for name, _quote, value in re.findall(r'([\w:.-]+)\s*=\s*(["\'])(.*?)\2', tag)
        }
        if not attributes.get('Type', '').endswith('image'):
            continue
        if 'Target' in attributes and 'Id' in attributes:
            references.append({'target': attributes['Target'], 'rel_id': attributes['Id']})
    return references


class ImageWatermarkProcessor:
    """Image watermark processor."""
def __init__(self):
    """Set the default watermark appearance."""
    self.watermark_settings = dict(
        opacity=0.3,
        position='bottom-right',
        margin=20,
        font_size=24,
    )
def add_watermark_to_extracted_images(self, image_dir, watermark_text, output_dir=None):
    """Watermark every .jpg/.jpeg/.png/.bmp file of a directory.

    Args:
        image_dir: Directory containing the images.
        watermark_text: Text to draw.
        output_dir: Output directory; ``None`` overwrites the originals.

    Returns:
        List of output paths of the images that were processed. Images that
        fail are reported on stdout and left out.
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
    destination = image_dir if output_dir is None else output_dir

    supported = ('.jpg', '.jpeg', '.png', '.bmp')
    finished = []
    for filename in os.listdir(image_dir):
        if not filename.lower().endswith(supported):
            continue
        source_path = os.path.join(image_dir, filename)
        result_path = os.path.join(destination, filename)
        try:
            self._add_text_watermark(source_path, result_path, watermark_text)
        except Exception as exc:
            print(f"处理图片 {filename} 时出错: {exc}")
        else:
            finished.append(result_path)
    return finished
def _add_text_watermark(self, input_path, output_path, text):
    """Draw a semi-transparent white text watermark and save the result.

    Font size, opacity, margin and corner come from
    ``self.watermark_settings``; any position other than 'bottom-right',
    'bottom-left' or 'top-right' is treated as top-left.

    Args:
        input_path: Source image path.
        output_path: Destination path; the format follows its extension.
        text: Watermark text.
    """
    from PIL import Image, ImageDraw, ImageFont

    settings = self.watermark_settings
    with Image.open(input_path) as img:
        # Fully transparent overlay of the same size to draw the text on.
        watermark = Image.new('RGBA', img.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(watermark)
        try:
            font = ImageFont.truetype("arial.ttf", settings['font_size'])
        except (OSError, ImportError):
            # Font file not found/unreadable, or FreeType support missing.
            # A bare "except:" here would also swallow KeyboardInterrupt
            # and SystemExit.
            font = ImageFont.load_default()
        bbox = draw.textbbox((0, 0), text, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]
        margin = settings['margin']
        right_x = img.width - text_width - margin
        bottom_y = img.height - text_height - margin
        corners = {
            'bottom-right': (right_x, bottom_y),
            'bottom-left': (margin, bottom_y),
            'top-right': (right_x, margin),
        }
        x, y = corners.get(settings['position'], (margin, margin))
        alpha = int(255 * settings['opacity'])
        draw.text((x, y), text, font=font, fill=(255, 255, 255, alpha))
        # alpha_composite() needs both layers in RGBA.
        if img.mode != 'RGBA':
            img = img.convert('RGBA')
        watermarked = Image.alpha_composite(img, watermark)
        # JPEG cannot store an alpha channel.
        if output_path.lower().endswith(('.jpg', '.jpeg')):
            watermarked = watermarked.convert('RGB')
        watermarked.save(output_path)


class ImageConverter:
    """Image format converter."""
def __init__(self):
    """Set the default conversion settings."""
    self.conversion_settings = dict(
        jpeg_quality=85,
        png_optimize=True,
        webp_quality=80,
        max_dimension=None,
    )
def batch_convert_images(self, input_dir, output_dir, target_format='JPEG', settings=None):
    """Convert every supported image of a directory to another format.

    Args:
        input_dir: Input directory.
        output_dir: Output directory (created if missing).
        target_format: Target format ('JPEG', 'PNG', 'WEBP' or another
            format name, which is then also used as the file extension).
        settings: Optional overrides merged into
            ``self.conversion_settings``.

    Returns:
        A dict with ``converted_files``, ``failed_files``, the size totals
        and, when anything was converted, ``compression_ratio`` (percent).
    """
    if settings:
        self.conversion_settings.update(settings)
    os.makedirs(output_dir, exist_ok=True)

    # File extension used for each of the well-known target formats.
    known_extensions = {'JPEG': 'jpg', 'PNG': 'png', 'WEBP': 'webp'}
    extension = known_extensions.get(target_format.upper(), target_format.lower())
    convertible = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')

    conversion_results = {
        'converted_files': [],
        'failed_files': [],
        'total_size_before': 0,
        'total_size_after': 0
    }
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(convertible):
            continue
        stem = os.path.splitext(filename)[0]
        source_path = os.path.join(input_dir, filename)
        result_path = os.path.join(output_dir, f"{stem}.{extension}")
        try:
            outcome = self._convert_single_image(source_path, result_path, target_format)
            conversion_results['converted_files'].append(outcome)
            conversion_results['total_size_before'] += outcome['size_before']
            conversion_results['total_size_after'] += outcome['size_after']
        except Exception as exc:
            conversion_results['failed_files'].append({
                'filename': filename,
                'error': str(exc)
            })

    # Overall size reduction, in percent.
    size_before = conversion_results['total_size_before']
    if size_before > 0:
        saved = (1 - conversion_results['total_size_after'] / size_before) * 100
        conversion_results['compression_ratio'] = round(saved, 2)
    return conversion_results
def _convert_single_image(self, input_path, output_path, target_format):
    """Convert one image file and report the size change.

    Args:
        input_path: Source image path.
        output_path: Destination image path.
        target_format: Target format name (case-insensitive).

    Returns:
        A dict with ``input_file``, ``output_file``, ``size_before``,
        ``size_after`` and ``compression_ratio`` (percent, 0 for an empty
        source file).
    """
    size_before = os.path.getsize(input_path)
    settings = self.conversion_settings
    fmt = target_format.upper()
    with Image.open(input_path) as img:
        # Optional downscale so neither side exceeds max_dimension.
        max_dim = settings['max_dimension']
        if max_dim and (img.width > max_dim or img.height > max_dim):
            img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
        if fmt == 'JPEG':
            if img.mode in ('RGBA', 'LA', 'P'):
                # JPEG has no alpha channel: flatten onto a white
                # background. 'LA' (greyscale + alpha) is handled like
                # 'RGBA'; left as-is it makes save() fail.
                rgba = img.convert('RGBA')
                flattened = Image.new('RGB', rgba.size, (255, 255, 255))
                flattened.paste(rgba, mask=rgba.split()[-1])
                img = flattened
            elif img.mode not in ('1', 'L', 'RGB', 'CMYK', 'YCbCr'):
                # Any other mode the JPEG writer cannot store.
                img = img.convert('RGB')
            img.save(output_path, 'JPEG',
                     quality=settings['jpeg_quality'],
                     optimize=True)
        elif fmt == 'PNG':
            img.save(output_path, 'PNG',
                     optimize=settings['png_optimize'])
        elif fmt == 'WEBP':
            img.save(output_path, 'WEBP',
                     quality=settings['webp_quality'],
                     optimize=True)
        else:
            img.save(output_path, fmt)
    size_after = os.path.getsize(output_path)
    return {
        'input_file': os.path.basename(input_path),
        'output_file': os.path.basename(output_path),
        'size_before': size_before,
        'size_after': size_after,
        'compression_ratio': round((1 - size_after / size_before) * 100, 2) if size_before > 0 else 0
    }


# NOTE: the paragraph below is article text that the source had fused onto
# this method's final line; it is preserved verbatim as a comment.
# 通过本文的详细介绍,我们已经构建了一个完整的Word文档图片处理系统,包括:
通过本文提供的完整解决方案,您可以根据具体需求选择合适的功能模块,构建适合自己的Word文档图片处理系统。这套系统不仅能够满足当前的处理需求,还具有良好的扩展性,可以随着需求的变化而不断完善和优化。
希望这个全面的技术指南能够帮助您在Word文档图片处理方面取得更好的效果,提高工作效率,并为进一步的技术创新奠定基础。
在现代办公和文档处理中,Word文档已经成为最常用的文件格式之一。这些文档不仅包含文本内容,还经常嵌入各种图片、图表和其他媒体元素。在许多场景下,我们需要从Word文档中提取这些图片,例如进行内容分析、创建图像数据库、或者在其他应用程序中重用这些图像。同样,将图片按照特定顺序加载到Word文档中也是一个常见需求。本文将深入探讨如何使用Python实现Word文档中图片的自动提取与加载功能,从理论基础到实际应用,提供全面的技术指南。
在深入技术实现之前,我们需要了解Word文档中图片的存储方式和基本特性。
现代Word文档(.docx格式)实际上是一个ZIP压缩包,包含多个XML文件和资源文件。当我们在Word文档中插入图片时,图片会被存储在文档包的word/media/目录下,并在文档的XML结构中通过引用的方式链接。
Word文档中的图片主要有以下几种存储形式:
Word文档支持多种图片格式,常见的包括:
每个图片在Word文档中还包含多种属性:
在Word文档的XML结构中,图片通过以下方式与文档内容关联:
了解这些基础知识对于我们实现图片提取和加载功能至关重要,因为我们需要正确解析文档结构,找到图片文件,并理解它们在文档中的位置和顺序。
在开始实现Word文档图片处理功能之前,我们需要准备适当的开发环境和工具。
首先,我们需要安装Python环境。推荐使用Python 3.6或更高版本,因为它提供了更好的Unicode支持和更多现代特性。
# 检查Python版本
python --version
# 创建虚拟环境(可选但推荐)
python -m venv word_image_env
source word_image_env/bin/activate # Linux/Mac
word_image_env\Scripts\activate # Windows

我们将使用几个关键的Python库来处理Word文档和图片:
pip install python-docx # 处理.docx文件
pip install Pillow # 图像处理
pip install lxml # XML处理(python-docx的依赖,但可能需要单独安装)
pip install tqdm # 进度条显示(可选,用于批量处理)

其中,python-docx是我们的核心库,用于读取和操作Word文档。但它在图片提取方面有一些限制,因此我们还需要直接处理文档的ZIP结构和XML内容。
安装完成后,我们可以简单测试环境是否正确配置:
import docx
import PIL
import lxml
import zipfile
import os
print(f"python-docx version: {docx.__version__}")
print(f"Pillow version: {PIL.__version__}")
print(f"lxml version: {lxml.__version__}")
print(f"zipfile module available: {zipfile.__name__}")
print(f"os module available: {os.__name__}")

如果所有库都能正确导入并显示版本信息,说明我们的环境已经准备就绪。
为了使我们的代码组织良好且易于维护,我们可以按照以下结构设计项目:
word_image_processor/
│
├── word_image_extractor.py # 图片提取核心功能
├── word_image_loader.py # 图片加载核心功能
├── utils/
│ ├── __init__.py
│ ├── docx_utils.py # Word文档处理工具函数
│ ├── image_utils.py # 图像处理工具函数
│ └── metadata_utils.py # 元数据处理工具函数
│
├── examples/
│ ├── extract_images.py # 图片提取示例
│ └── load_images.py # 图片加载示例
│
└── tests/
├── __init__.py
├── test_extractor.py # 提取功能测试
└── test_loader.py # 加载功能测试

这种结构将核心功能、工具函数和示例代码分开,使项目更加清晰和可维护。
要实现图片的提取和加载,我们首先需要深入理解Word文档的内部结构,特别是与图片相关的部分。
如前所述,.docx文件实际上是一个ZIP压缩包,包含多个XML文件和资源文件。这种格式被称为Office Open XML (OOXML),是一种国际标准。
我们可以通过以下方式查看.docx文件的内部结构:
import zipfile
def explore_docx_structure(docx_path):
    """Print the internal layout of a .docx archive and list its media files.

    A .docx file is a ZIP package; embedded pictures are stored under
    ``word/media/``.

    Args:
        docx_path: Path to the Word document.
    """
    with zipfile.ZipFile(docx_path) as docx_zip:
        file_list = docx_zip.namelist()

    print("文档内部文件列表:")
    for name in file_list:
        print(f" - {name}")

    # Some archivers store the folder itself as an entry ("word/media/");
    # it is not a media file, so leave it out of the count.
    media_files = [
        name for name in file_list
        if name.startswith('word/media/') and not name.endswith('/')
    ]
    print(f"\n找到 {len(media_files)} 个媒体文件:")
    for media in media_files:
        print(f" - {media}")
# 使用示例
explore_docx_structure("example.docx")

在Word文档中,图片与文档内容的关联主要通过以下文件实现:
我们需要解析这些文件来理解图片在文档中的位置和顺序。
import xml.etree.ElementTree as ET
from zipfile import ZipFile
def analyze_document_images(docx_path):
    """Print the drawing elements and image relationships found in a .docx.

    Reads ``word/document.xml`` to count ``w:drawing`` elements and
    ``word/_rels/document.xml.rels`` to list the relationships whose type is
    the OOXML image relationship.

    Args:
        docx_path: Path to the Word document.
    """
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    }

    # Parse both parts while the archive is open, then release it.
    with ZipFile(docx_path) as docx_zip:
        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))
        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))

    drawing_elements = doc_root.findall('.//w:drawing', namespaces)
    print(f"找到 {len(drawing_elements)} 个图形元素")

    image_rels = rels_root.findall(
        ".//*[@Type='http://schemas.openxmlformats.org/officeDocument/2006/relationships/image']"
    )
    print(f"找到 {len(image_rels)} 个图片关系")

    for rel in image_rels:
        rel_id = rel.get('Id')
        target = rel.get('Target')
        print(f"关系ID: {rel_id}, 目标文件: {target}")
# 使用示例
analyze_document_images("example.docx")

在Word文档中,图片的顺序可以通过以下几种方式确定:
对于大多数情况,文档流顺序是最可靠的,因为它反映了图片在文档中的自然排列。但在复杂文档中,我们可能需要结合多种方法来确定准确的顺序。
def get_images_in_order(docx_path):
    """Return the embedded picture references of a .docx in document order.

    Each ``w:drawing`` element in ``word/document.xml`` is visited in the
    order it appears; the ``r:embed`` id of its first ``a:blip`` is resolved
    through ``word/_rels/document.xml.rels``.

    Args:
        docx_path: Path to the Word document.

    Returns:
        A list of dicts with the keys ``rel_id``, ``target`` and
        ``filename``, one per resolvable picture reference.
    """
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    }
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    with ZipFile(docx_path) as docx_zip:
        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))
        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))

    # Relationship id -> target part name.
    rel_map = {rel.get('Id'): rel.get('Target') for rel in rels_root.findall("*")}

    image_refs = []
    for drawing in doc_root.findall('.//w:drawing', namespaces):
        blip = drawing.find('.//a:blip', namespaces)
        if blip is None:
            continue
        rel_id = blip.get(embed_attr)
        target = rel_map.get(rel_id)
        if target is None:
            # Unknown id, or a relationship without a Target: nothing to report.
            continue
        image_refs.append({
            'rel_id': rel_id,
            'target': target,
            'filename': target.split('/')[-1],
        })
    return image_refs
# 使用示例
images_in_order = get_images_in_order("example.docx")
for i, img in enumerate(images_in_order):
print(f"图片 {i+1}: {img['filename']} (关系ID: {img['rel_id']})")

通过这种方式,我们可以确定图片在文档中的准确顺序,为后续的提取和处理奠定基础。
在了解了Word文档的结构后,我们可以开始实现图片提取的核心功能。
最直接的图片提取方法是从Word文档的ZIP结构中提取media文件夹中的所有图片:
import os
import zipfile
from pathlib import Path
def extract_all_images(docx_path, output_dir):
    """Copy every file stored under ``word/media/`` out of a .docx archive.

    Files are written in archive order, which is not necessarily the order
    in which the pictures appear in the document.

    Args:
        docx_path: Path to the Word document.
        output_dir: Directory that receives the pictures (created if needed).

    Returns:
        List of paths of the files that were written.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    with zipfile.ZipFile(docx_path) as docx_zip:
        for media_file in docx_zip.namelist():
            if not media_file.startswith('word/media/'):
                continue
            filename = os.path.basename(media_file)
            if not filename:
                # The folder itself can be stored as an entry ("word/media/");
                # it has no basename and must not be opened as a file.
                continue

            output_path = os.path.join(output_dir, filename)
            with docx_zip.open(media_file) as source, open(output_path, 'wb') as sink:
                sink.write(source.read())

            extracted_images.append(output_path)
            print(f"已提取: {filename}")

    return extracted_images
# 使用示例
images = extract_all_images("example.docx", "extracted_images")
print(f"共提取了 {len(images)} 个图片")

这种方法简单直接,但它有一个主要缺点:无法保证提取的图片与文档中的顺序一致。
为了按照文档中的顺序提取图片,我们需要结合前面分析的文档结构:
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
def extract_images_in_order(docx_path, output_dir):
    """Extract the embedded pictures of a .docx in document order.

    Pictures are written as ``image_001.<ext>``, ``image_002.<ext>``, ...
    following the order of the ``w:drawing`` elements in the document body.
    A picture referenced several times is written once per reference.

    Args:
        docx_path: Path to the Word document.
        output_dir: Directory that receives the pictures (created if needed).

    Returns:
        List of dicts (``original_path``, ``original_filename``, ``new_path``,
        ``new_filename``, ``rel_id``, ``order``), one per extracted picture.
    """
    import posixpath  # ZIP member names always use forward slashes

    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    }
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    with zipfile.ZipFile(docx_path) as docx_zip:
        # Build the member set once instead of calling namelist() per image.
        archive_names = set(docx_zip.namelist())

        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
        # Relationship id -> target; externally linked targets are not in the
        # archive and therefore cannot be extracted.
        rel_map = {
            rel.get('Id'): rel.get('Target')
            for rel in rels_root.findall("*")
            if rel.get('TargetMode') != 'External'
        }

        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        for drawing in doc_root.findall('.//w:drawing', namespaces):
            blip = drawing.find('.//a:blip', namespaces)
            if blip is None:
                continue
            rel_id = blip.get(embed_attr)
            target = rel_map.get(rel_id)
            if not target:
                continue

            # Targets are relative to word/ unless they start with "/",
            # in which case they are relative to the package root.
            if target.startswith('/'):
                image_path = target.lstrip('/')
            else:
                image_path = posixpath.normpath(posixpath.join('word', target))
            if image_path not in archive_names:
                continue

            image_count = len(extracted_images) + 1
            original_filename = os.path.basename(target)
            file_ext = os.path.splitext(original_filename)[1]
            new_filename = f"image_{image_count:03d}{file_ext}"
            output_path = os.path.join(output_dir, new_filename)

            with docx_zip.open(image_path) as source, open(output_path, 'wb') as sink:
                sink.write(source.read())

            extracted_images.append({
                'original_path': image_path,
                'original_filename': original_filename,
                'new_path': output_path,
                'new_filename': new_filename,
                'rel_id': rel_id,
                'order': image_count,
            })
            print(f"已提取图片 {image_count}: {new_filename} (原文件: {original_filename})")

    return extracted_images
# 使用示例
images = extract_images_in_order("example.docx", "extracted_images")
print(f"按顺序提取了 {len(images)} 个图片")

这个实现确保了图片按照它们在文档中出现的顺序被提取,并使用序号化的文件名保存,便于后续处理。
在实际应用中,我们可能会遇到一些特殊情况,如:
我们需要扩展我们的代码来处理这些情况:
def extract_images_advanced(docx_path, output_dir):
"""增强版图片提取,处理特殊情况"""
# 基本设置与前面相同
os.makedirs(output_dir, exist_ok=True)
extracted_images = []
processed_targets = set() # 跟踪已处理的图片,避免重复
namespaces = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
'v': 'urn:schemas-microsoft-com:vml'
}
with zipfile.ZipFile(docx_path) as docx_zip:
# 解析关系文件
rels_xml = docx_zip.read('word/_rels/document.xml.rels')
rels_root = ET.fromstring(rels_xml)
# 创建关系映射
rel_map = {}
for rel in rels_root.findall("*"):
rel_id = rel.get('Id')
target = rel.get('Target')
rel_type = rel.get('Type')
rel_map[rel_id] = {
'target': target,
'type': rel_type,
'is_external': target.startswith('http') or target.startswith('file:')
}
# 解析document.xml
doc_xml = docx_zip.read('word/document.xml')
doc_root = ET.fromstring(doc_xml)
# 图片计数器
image_count = 0
# 处理常规图片 (w:drawing)
for drawing in doc_root.findall('.//w:drawing', namespaces):
blip = drawing.find('.//a:blip', namespaces)
if blip is not None:
# 处理嵌入图片
embed_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
if embed_rel_id and embed_rel_id in rel_map:
rel_info = rel_map[embed_rel_id]
target = rel_info['target']
# 跳过已处理的图片
if target in processed_targets:
continue
processed_targets.add(target)
# 处理内部图片
if not rel_info['is_external']:
image_path = f"word/{target}"
if image_path in docx_zip.namelist():
image_count += 1
file_ext = os.path.splitext(target)[1]
new_filename = f"image_{image_count:03d}{file_ext}"
output_path = os.path.join(output_dir, new_filename)
with docx_zip.open(image_path) as source, open(output_path, 'wb') as target_file:
target_file.write(source.read())
extracted_images.append({
'original_path': image_path,
'new_path': output_path,
'new_filename': new_filename,
'rel_id': embed_rel_id,
'order': image_count,
'type': 'embedded'
})
# 处理外部链接图片
else:
image_count += 1
link_info = f"external_link_{image_count:03d}.txt"
link_path = os.path.join(output_dir, link_info)
with open(link_path, 'w') as f:
f.write(f"External image link: {target}\n")
extracted_images.append({
'original_path': target,
'new_path': link_path,
'new_filename': link_info,
'rel_id': embed_rel_id,
'order': image_count,
'type': 'external_link'
})
# 处理链接图片
link_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}link')
if link_rel_id and link_rel_id in rel_map:
# 类似处理链接图片...
pass
# 处理VML图片 (v:imagedata) - 通常用于兼容性模式
for img_data in doc_root.findall('.//v:imagedata', namespaces):
rel_id = img_data.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id')
if rel_id and rel_id in rel_map:
# 处理VML图片...
pass
# 处理嵌入对象中的图片
# 这需要更复杂的处理,可能需要解析其他关系文件
return extracted_images这个增强版的实现能够处理更多特殊情况,并避免重复提取相同的图片。
现在,我们将前面的技术整合成一个完整的、可用的图片提取类。这个类将提供更多功能和更好的错误处理。
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
from datetime import datetime
import json
from PIL import Image
import io
class WordImageExtractor:
    """Extract the pictures referenced by a .docx file.

    On construction the document's relationship part and body are parsed and
    every picture reference (DrawingML ``w:drawing`` and VML
    ``v:imagedata``) is recorded, numbered from 1 in the order the elements
    appear in the document body.
    """

    # Namespace prefix of the r:embed / r:id attributes.
    _REL_ATTR_NS = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'

    def __init__(self, docx_path):
        """Validate the path and parse the document structure.

        Args:
            docx_path: Path to the Word document.

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The file is not a .docx, not a ZIP archive, lacks
                ``word/document.xml`` or contains malformed XML.
        """
        self.docx_path = docx_path
        self.namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
            'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
            'v': 'urn:schemas-microsoft-com:vml',
            'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
            'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
        }

        if not os.path.exists(docx_path):
            raise FileNotFoundError(f"找不到Word文档: {docx_path}")

        # os.fspath lets pathlib.Path objects through as well as str.
        if not os.fspath(docx_path).lower().endswith('.docx'):
            raise ValueError(f"不支持的文件格式: {docx_path}. 仅支持.docx格式")

        self.rel_map = {}
        self.image_info = []

        self._parse_document_structure()

    def _parse_document_structure(self):
        """Build ``rel_map`` from the relationship part and scan the body."""
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                archive_names = set(docx_zip.namelist())

                if 'word/document.xml' not in archive_names:
                    raise ValueError(f"无效的Word文档: {self.docx_path}")

                # A document without any relationship part simply has no images.
                if 'word/_rels/document.xml.rels' in archive_names:
                    rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
                    for rel in rels_root.findall("*"):
                        target = rel.get('Target') or ''
                        self.rel_map[rel.get('Id')] = {
                            'target': target,
                            'type': rel.get('Type'),
                            # TargetMode="External" is authoritative; the
                            # prefix test is a fallback.
                            'is_external': (rel.get('TargetMode') == 'External'
                                            or target.startswith(('http', 'file:'))),
                        }

                doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

            self._find_image_references(doc_root)

        except zipfile.BadZipFile as err:
            raise ValueError(f"文件不是有效的ZIP格式: {self.docx_path}") from err
        except ET.ParseError as e:
            raise ValueError(f"XML解析错误: {e}") from e

    def _find_image_references(self, doc_root):
        """Record every picture reference in true document order.

        DrawingML and VML elements are visited in a single pass so that the
        ``order`` field reflects where each picture sits in the body, even
        when both kinds are mixed.
        """
        drawing_tag = f"{{{self.namespaces['w']}}}drawing"
        imagedata_tag = f"{{{self.namespaces['v']}}}imagedata"

        for element in doc_root.iter():
            next_order = len(self.image_info) + 1
            if element.tag == drawing_tag:
                info = self._drawing_info(element, next_order)
            elif element.tag == imagedata_tag:
                info = self._vml_info(element, next_order)
            else:
                continue
            if info is not None:
                self.image_info.append(info)

    def _drawing_info(self, drawing, order):
        """Describe a ``w:drawing`` element, or return None if unresolved."""
        blip = drawing.find('.//a:blip', self.namespaces)
        if blip is None:
            return None
        rel_id = blip.get(f'{self._REL_ATTR_NS}embed')
        rel_info = self.rel_map.get(rel_id) if rel_id else None
        if rel_info is None:
            return None

        # wp:extent carries the displayed size in EMU.
        extent = drawing.find('.//wp:extent', self.namespaces)
        # wp:docPr@descr carries the alternative text.
        doc_pr = drawing.find('.//wp:docPr', self.namespaces)

        return {
            'order': order,
            'rel_id': rel_id,
            'target': rel_info['target'],
            'type': 'embedded' if not rel_info['is_external'] else 'external',
            'width_emu': extent.get('cx') if extent is not None else None,
            'height_emu': extent.get('cy') if extent is not None else None,
            'alt_text': doc_pr.get('descr', '') if doc_pr is not None else "",
            'element_type': 'drawing'
        }

    def _vml_info(self, img_data, order):
        """Describe a ``v:imagedata`` element, or return None if unresolved."""
        rel_id = img_data.get(f'{self._REL_ATTR_NS}id')
        rel_info = self.rel_map.get(rel_id) if rel_id else None
        if rel_info is None:
            return None
        return {
            'order': order,
            'rel_id': rel_id,
            'target': rel_info['target'],
            'type': 'embedded' if not rel_info['is_external'] else 'external',
            'width_emu': None,
            'height_emu': None,
            'alt_text': img_data.get('title', ''),
            'element_type': 'vml'
        }

    @staticmethod
    def _archive_path(target):
        """Map a relationship target to its ZIP member name."""
        # "/..." targets are relative to the package root, others to word/.
        return target.lstrip('/') if target.startswith('/') else f"word/{target}"

    @staticmethod
    def _pixel_size(image_data):
        """Return (width, height) in pixels, or (None, None) if unreadable."""
        try:
            with Image.open(io.BytesIO(image_data)) as pil_img:
                return pil_img.size
        except Exception:
            # Best effort only: formats the imaging library cannot read
            # (e.g. EMF/WMF) must not abort the extraction.
            return None, None

    def get_image_count(self):
        """Return the number of picture references found in the document."""
        return len(self.image_info)

    def get_image_info(self):
        """Return a copy of the recorded picture references.

        The dicts are copied too, so callers cannot alter the extractor's
        own records by accident.
        """
        return [dict(info) for info in self.image_info]

    def extract_images(self, output_dir, preserve_names=False, include_metadata=True):
        """Extract all pictures, writing each distinct target once.

        Args:
            output_dir: Output directory (created if needed).
            preserve_names: Keep the original media file names instead of
                ``image_NNN.<ext>``.
            include_metadata: Also write ``extraction_metadata.json``.

        Returns:
            List of dicts describing each output file.
        """
        os.makedirs(output_dir, exist_ok=True)

        extracted_images = []
        processed_targets = set()

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            archive_names = set(docx_zip.namelist())

            for img_info in self.image_info:
                target = img_info['target']

                # The same media part may be referenced several times.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if img_info['type'] == 'embedded':
                    image_path = self._archive_path(target)
                    if image_path not in archive_names:
                        continue

                    if preserve_names:
                        filename = os.path.basename(target)
                    else:
                        file_ext = os.path.splitext(target)[1]
                        filename = f"image_{img_info['order']:03d}{file_ext}"
                    output_path = os.path.join(output_dir, filename)

                    image_data = docx_zip.read(image_path)
                    with open(output_path, 'wb') as target_file:
                        target_file.write(image_data)

                    actual_width, actual_height = self._pixel_size(image_data)

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': image_path,
                        'output_path': output_path,
                        'filename': filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'embedded',
                        'width_emu': img_info['width_emu'],
                        'height_emu': img_info['height_emu'],
                        'actual_width': actual_width,
                        'actual_height': actual_height,
                        'alt_text': img_info['alt_text'],
                        'element_type': img_info['element_type'],
                        'file_size': len(image_data)
                    })
                    print(f"已提取图片 {img_info['order']}: {filename}")

                elif img_info['type'] == 'external':
                    # The data is not in the package; record the link instead.
                    link_filename = f"external_link_{img_info['order']:03d}.txt"
                    link_path = os.path.join(output_dir, link_filename)
                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"外部图片链接: {target}\n")
                        f.write(f"替代文本: {img_info['alt_text']}\n")
                        f.write(f"关系ID: {img_info['rel_id']}\n")

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': target,
                        'output_path': link_path,
                        'filename': link_filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'external',
                        'alt_text': img_info['alt_text']
                    })
                    print(f"已记录外部链接 {img_info['order']}: {target}")

        if include_metadata:
            metadata_path = os.path.join(output_dir, 'extraction_metadata.json')
            metadata = {
                'source_document': os.path.basename(self.docx_path),
                'extraction_time': datetime.now().isoformat(),
                'total_images': len(extracted_images),
                'embedded_images': sum(1 for img in extracted_images if img['type'] == 'embedded'),
                'external_links': sum(1 for img in extracted_images if img['type'] == 'external'),
                'images': extracted_images
            }
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print(f"已生成元数据文件: {metadata_path}")

        return extracted_images

    def extract_single_image(self, image_order, output_path):
        """Extract one picture by its sequence number.

        Args:
            image_order: 1-based sequence number of the picture.
            output_path: Destination file path; may be a bare file name.

        Returns:
            Dict with ``order``, ``output_path``, ``file_size`` and
            ``success``.

        Raises:
            ValueError: No such picture, or it is an external link.
            FileNotFoundError: The picture is missing from the archive.
        """
        target_image = next(
            (info for info in self.image_info if info['order'] == image_order),
            None,
        )
        if target_image is None:
            raise ValueError(f"找不到序号为 {image_order} 的图片")
        if target_image['type'] != 'embedded':
            raise ValueError(f"图片 {image_order} 是外部链接,无法提取")

        image_path = self._archive_path(target_image['target'])
        with zipfile.ZipFile(self.docx_path) as docx_zip:
            try:
                image_data = docx_zip.read(image_path)
            except KeyError:
                raise FileNotFoundError(f"在文档中找不到图片文件: {image_path}") from None

        # os.makedirs('') raises, so only create a directory when the path
        # actually names one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_path, 'wb') as target_file:
            target_file.write(image_data)

        print(f"已提取图片 {image_order} 到: {output_path}")
        return {
            'order': image_order,
            'output_path': output_path,
            'file_size': len(image_data),
            'success': True
        }
# 使用示例
def main():
"""主函数示例"""
try:
# 创建提取器实例
extractor = WordImageExtractor("example.docx")
# 显示图片信息
print(f"文档中共有 {extractor.get_image_count()} 个图片")
# 获取图片详细信息
for img_info in extractor.get_image_info():
print(f"图片 {img_info['order']}: {img_info['target']} ({img_info['type']})")
# 提取所有图片
results = extractor.extract_images("extracted_images", preserve_names=False)
print(f"\n提取完成,共处理 {len(results)} 个图片")
# 提取单个图片示例
if results:
extractor.extract_single_image(1, "single_image/first_image.jpg")
except Exception as e:
print(f"错误: {e}")
if __name__ == "__main__":
main()在提取图片的过程中,保存完整的元数据信息对于后续的处理和分析非常重要。我们需要记录图片的各种属性,包括尺寸、格式、在文档中的位置等信息。
首先,我们设计一个完整的元数据结构来存储图片信息:
import json
from datetime import datetime
from PIL import Image
from PIL.ExifTags import TAGS
import hashlib
class ImageMetadataProcessor:
"""图片元数据处理器"""
def __init__(self):
self.metadata_schema = {
'extraction_info': {
'timestamp': None,
'source_document': None,
'extractor_version': '1.0.0'
},
'document_info': {
'total_images': 0,
'embedded_images': 0,
'external_links': 0,
'document_size': 0
},
'images': []
}
def process_image_metadata(self, image_data, image_info, output_path):
"""
处理单个图片的元数据
Args:
image_data: 图片二进制数据
image_info: 从文档中提取的图片信息
output_path: 输出文件路径
Returns:
完整的图片元数据
"""
metadata = {
'basic_info': {
'order': image_info.get('order'),
'filename': os.path.basename(output_path),
'file_path': output_path,
'file_size': len(image_data),
'file_hash': hashlib.md5(image_data).hexdigest()
},
'document_context': {
'rel_id': image_info.get('rel_id'),
'original_target': image_info.get('target'),
'alt_text': image_info.get('alt_text', ''),
'element_type': image_info.get('element_type')
},
'dimensions': {
'document_width_emu': image_info.get('width_emu'),
'document_height_emu': image_info.get('height_emu'),
'actual_width': None,
'actual_height': None,
'aspect_ratio': None
},
'image_properties': {
'format': None,
'mode': None,
'has_transparency': False,
'color_count': None,
'dpi': None
},
'exif_data': {}
}
# 使用PIL分析图片属性
try:
with Image.open(io.BytesIO(image_data)) as pil_img:
# 基本尺寸信息
width, height = pil_img.size
metadata['dimensions']['actual_width'] = width
metadata['dimensions']['actual_height'] = height
metadata['dimensions']['aspect_ratio'] = round(width / height, 3) if height > 0 else None
# 图片格式信息
metadata['image_properties']['format'] = pil_img.format
metadata['image_properties']['mode'] = pil_img.mode
metadata['image_properties']['has_transparency'] = pil_img.mode in ('RGBA', 'LA') or 'transparency' in pil_img.info
# DPI信息
if hasattr(pil_img, 'info') and 'dpi' in pil_img.info:
metadata['image_properties']['dpi'] = pil_img.info['dpi']
# 颜色数量(对于调色板模式)
if pil_img.mode == 'P':
metadata['image_properties']['color_count'] = len(pil_img.getcolors() or [])
# EXIF数据
if hasattr(pil_img, '_getexif') and pil_img._getexif():
exif_data = pil_img._getexif()
for tag_id, value in exif_data.items():
tag = TAGS.get(tag_id, tag_id)
metadata['exif_data'][tag] = str(value)
except Exception as e:
metadata['processing_error'] = str(e)
return metadata
def save_metadata(self, metadata, output_path):
"""保存元数据到JSON文件"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2, default=str)
def load_metadata(self, metadata_path):
"""从JSON文件加载元数据"""
with open(metadata_path, 'r', encoding='utf-8') as f:
return json.load(f)
def generate_summary_report(self, metadata):
"""生成元数据摘要报告"""
report = []
report.append("=== 图片提取摘要报告 ===\n")
# 基本统计
doc_info = metadata.get('document_info', {})
report.append(f"文档信息:")
report.append(f" - 总图片数: {doc_info.get('total_images', 0)}")
report.append(f" - 嵌入图片: {doc_info.get('embedded_images', 0)}")
report.append(f" - 外部链接: {doc_info.get('external_links', 0)}")
report.append("")
# 图片格式统计
images = metadata.get('images', [])
formats = {}
total_size = 0
for img in images:
if img.get('type') == 'embedded':
fmt = img.get('image_properties', {}).get('format', 'Unknown')
formats[fmt] = formats.get(fmt, 0) + 1
total_size += img.get('basic_info', {}).get('file_size', 0)
report.append("格式分布:")
for fmt, count in sorted(formats.items()):
report.append(f" - {fmt}: {count} 个")
report.append(f"\n总文件大小: {total_size / 1024:.1f} KB")
# 尺寸统计
sizes = [(img.get('dimensions', {}).get('actual_width', 0),
img.get('dimensions', {}).get('actual_height', 0))
for img in images if img.get('type') == 'embedded']
if sizes:
max_width = max(s[0] for s in sizes)
max_height = max(s[1] for s in sizes)
min_width = min(s[0] for s in sizes if s[0] > 0)
min_height = min(s[1] for s in sizes if s[1] > 0)
report.append(f"\n尺寸范围:")
report.append(f" - 最大: {max_width} x {max_height}")
report.append(f" - 最小: {min_width} x {min_height}")
return "\n".join(report)现在我们将元数据处理集成到主要的提取器中:
class EnhancedWordImageExtractor(WordImageExtractor):
"""增强版Word图片提取器,包含完整的元数据处理"""
def __init__(self, docx_path):
super().__init__(docx_path)
self.metadata_processor = ImageMetadataProcessor()
def extract_images_with_metadata(self, output_dir, preserve_names=False):
"""
提取图片并生成完整的元数据
Args:
output_dir: 输出目录
preserve_names: 是否保留原始文件名
Returns:
包含完整元数据的提取结果
"""
os.makedirs(output_dir, exist_ok=True)
# 初始化元数据结构
metadata = self.metadata_processor.metadata_schema.copy()
metadata['extraction_info']['timestamp'] = datetime.now().isoformat()
metadata['extraction_info']['source_document'] = os.path.basename(self.docx_path)
# 获取文档大小
metadata['document_info']['document_size'] = os.path.getsize(self.docx_path)
extracted_images = []
processed_targets = set()
with zipfile.ZipFile(self.docx_path) as docx_zip:
for img_info in self.image_info:
target = img_info['target']
if target in processed_targets:
continue
processed_targets.add(target)
if img_info['type'] == 'embedded':
image_path = f"word/{target}"
if image_path in docx_zip.namelist():
# 确定文件名
if preserve_names:
filename = os.path.basename(target)
else:
file_ext = os.path.splitext(target)[1]
filename = f"image_{img_info['order']:03d}{file_ext}"
output_path = os.path.join(output_dir, filename)
# 提取图片数据
with docx_zip.open(image_path) as source:
image_data = source.read()
# 保存图片文件
with open(output_path, 'wb') as target_file:
target_file.write(image_data)
# 处理元数据
img_metadata = self.metadata_processor.process_image_metadata(
image_data, img_info, output_path
)
extracted_images.append(img_metadata)
metadata['document_info']['embedded_images'] += 1
print(f"已提取图片 {img_info['order']}: {filename}")
elif img_info['type'] == 'external':
# 处理外部链接
link_filename = f"external_link_{img_info['order']:03d}.txt"
link_path = os.path.join(output_dir, link_filename)
with open(link_path, 'w', encoding='utf-8') as f:
f.write(f"外部图片链接: {target}\n")
f.write(f"替代文本: {img_info['alt_text']}\n")
f.write(f"关系ID: {img_info['rel_id']}\n")
# 外部链接的元数据
link_metadata = {
'basic_info': {
'order': img_info['order'],
'filename': link_filename,
'file_path': link_path,
'type': 'external_link'
},
'document_context': {
'rel_id': img_info['rel_id'],
'original_target': target,
'alt_text': img_info['alt_text']
}
}
extracted_images.append(link_metadata)
metadata['document_info']['external_links'] += 1
# 完善元数据
metadata['document_info']['total_images'] = len(extracted_images)
metadata['images'] = extracted_images
# 保存元数据文件
metadata_path = os.path.join(output_dir, 'complete_metadata.json')
self.metadata_processor.save_metadata(metadata, metadata_path)
# 生成摘要报告
report = self.metadata_processor.generate_summary_report(metadata)
report_path = os.path.join(output_dir, 'extraction_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
f.write(report)
print(f"\n已生成完整元数据: {metadata_path}")
print(f"已生成摘要报告: {report_path}")
return metadata除了从Word文档中提取图片,我们还经常需要将图片按照特定顺序插入到Word文档中。这在批量处理、模板生成等场景中非常有用。
使用python-docx库,我们可以实现基本的图片插入功能:
from docx import Document
from docx.shared import Inches, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os
from pathlib import Path
class WordImageLoader:
    """Build a Word document by inserting pictures with python-docx."""

    # File suffixes accepted by add_images_from_folder.
    _IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'}

    def __init__(self, template_path=None):
        """Open *template_path* if it exists, otherwise start a new document.

        Args:
            template_path: Optional path of a template document.
        """
        if template_path and os.path.exists(template_path):
            self.document = Document(template_path)
            print(f"已加载模板文档: {template_path}")
        else:
            self.document = Document()
            print("已创建新的Word文档")

    def add_image(self, image_path, width=None, height=None, caption=None, alignment='left'):
        """Append one picture, optionally followed by a caption paragraph.

        Args:
            image_path: Path of the picture file.
            width: Picture width in inches; omitted means native/proportional.
            height: Picture height in inches; omitted means native/proportional.
            caption: Optional caption text placed below the picture.
            alignment: ``'left'``, ``'center'`` or ``'right'``; anything else
                falls back to left.

        Returns:
            The picture object returned by ``run.add_picture``.

        Raises:
            FileNotFoundError: *image_path* does not exist.
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"找不到图片文件: {image_path}")

        alignment_map = {
            'left': WD_ALIGN_PARAGRAPH.LEFT,
            'center': WD_ALIGN_PARAGRAPH.CENTER,
            'right': WD_ALIGN_PARAGRAPH.RIGHT
        }
        paragraph_alignment = alignment_map.get(alignment, WD_ALIGN_PARAGRAPH.LEFT)

        paragraph = self.document.add_paragraph()
        paragraph.alignment = paragraph_alignment
        picture_run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()

        # Pass only the dimensions that were given; the other one is then
        # derived by python-docx.
        size_kwargs = {}
        if width:
            size_kwargs['width'] = Inches(width)
        if height:
            size_kwargs['height'] = Inches(height)
        picture = picture_run.add_picture(image_path, **size_kwargs)

        if caption:
            caption_paragraph = self.document.add_paragraph(caption)
            caption_paragraph.alignment = paragraph_alignment
            for caption_run in caption_paragraph.runs:
                # Length values are unit-agnostic: 0.1 inch is 7.2 pt.
                caption_run.font.size = Inches(0.1)
                caption_run.italic = True

        print(f"已添加图片: {os.path.basename(image_path)}")
        return picture

    def add_images_from_folder(self, folder_path, pattern="*", max_width=6, spacing=True):
        """Append every matching picture of a folder, sorted by path.

        Args:
            folder_path: Folder containing the pictures.
            pattern: Glob pattern (e.g. ``"*.jpg"``, ``"image_*.png"``).
            max_width: Width in inches applied to every picture.
            spacing: Insert an empty paragraph between pictures.

        Returns:
            Number of pictures added.

        Raises:
            FileNotFoundError: *folder_path* does not exist.
        """
        folder = Path(folder_path)
        if not folder.exists():
            raise FileNotFoundError(f"找不到文件夹: {folder_path}")

        image_files = [
            candidate for candidate in sorted(folder.glob(pattern))
            if candidate.suffix.lower() in self._IMAGE_EXTENSIONS
        ]

        if not image_files:
            print(f"在文件夹 {folder_path} 中没有找到匹配的图片文件")
            return 0

        added_count = 0
        for image_file in image_files:
            try:
                self.add_image(str(image_file), width=max_width, alignment='center')
                added_count += 1

                if spacing and added_count < len(image_files):
                    self.document.add_paragraph()

            except Exception as e:
                # One unreadable picture must not abort the whole batch.
                print(f"添加图片 {image_file.name} 时出错: {e}")

        print(f"成功添加 {added_count} 个图片")
        return added_count

    def add_images_with_metadata(self, metadata_file):
        """Append the pictures listed in a metadata JSON file, by ``order``.

        External-link entries are skipped; their ``type`` marker may sit at
        the top level or under ``basic_info``. The display width is 4 inches,
        6 for very wide and 3 for very tall pictures.

        Args:
            metadata_file: Path of the metadata JSON file.

        Returns:
            Number of pictures added.
        """
        with open(metadata_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        embedded_images = [
            img for img in metadata.get('images', [])
            if img.get('type') != 'external_link'
            and img.get('basic_info', {}).get('type') != 'external_link'
        ]
        embedded_images.sort(key=lambda entry: entry.get('basic_info', {}).get('order') or 0)

        added_count = 0
        for img_data in embedded_images:
            try:
                basic_info = img_data.get('basic_info', {})
                document_context = img_data.get('document_context', {})
                dimensions = img_data.get('dimensions', {})

                image_path = basic_info.get('file_path')
                if not image_path or not os.path.exists(image_path):
                    continue

                # Dimensions are None when the picture could not be analysed.
                actual_width = dimensions.get('actual_width') or 0
                actual_height = dimensions.get('actual_height') or 0

                display_width = 4
                if actual_width > 0 and actual_height > 0:
                    aspect_ratio = actual_width / actual_height
                    if aspect_ratio > 2:
                        display_width = 6
                    elif aspect_ratio < 0.5:
                        display_width = 3

                caption = document_context.get('alt_text', '')
                self.add_image(image_path, width=display_width, caption=caption, alignment='center')
                added_count += 1

                self.document.add_paragraph()

            except Exception as e:
                # One bad entry must not abort the whole batch.
                print(f"添加图片时出错: {e}")

        print(f"根据元数据成功添加 {added_count} 个图片")
        return added_count

    def save_document(self, output_path):
        """Save the document, creating the parent directory if there is one.

        Args:
            output_path: Destination file path; may be a bare file name.
        """
        # os.makedirs('') raises, so skip it for paths without a directory.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        self.document.save(output_path)
        print(f"文档已保存到: {output_path}")

    def add_title_page(self, title, subtitle=None, author=None):
        """Append a centred title page followed by a page break.

        Args:
            title: Main title.
            subtitle: Optional subtitle.
            author: Optional author name.
        """
        title_paragraph = self.document.add_paragraph(title)
        title_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        for title_run in title_paragraph.runs:
            # Length values are unit-agnostic: 0.3 inch is 21.6 pt.
            title_run.font.size = Inches(0.3)
            title_run.bold = True

        if subtitle:
            subtitle_paragraph = self.document.add_paragraph(subtitle)
            subtitle_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            for subtitle_run in subtitle_paragraph.runs:
                subtitle_run.font.size = Inches(0.2)

        if author:
            self.document.add_paragraph()  # blank line before the author
            author_paragraph = self.document.add_paragraph(f"作者: {author}")
            author_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        self.document.add_page_break()
# 使用示例
def create_image_document_example():
"""创建包含图片的文档示例"""
try:
# 创建加载器
loader = WordImageLoader()
# 添加标题页
loader.add_title_page(
title="图片文档示例",
subtitle="使用Python自动生成",
author="图片处理系统"
)
# 从文件夹批量添加图片
loader.add_images_from_folder("extracted_images", pattern="image_*.jpg", max_width=5)
# 保存文档
loader.save_document("generated_document.docx")
except Exception as e:
print(f"创建文档时出错: {e}")
if __name__ == "__main__":
create_image_document_example()在处理大量Word文档或大型文档时,性能优化变得非常重要。我们需要考虑内存使用、处理速度和错误处理等方面。
import concurrent.futures
import threading
from tqdm import tqdm
import logging
from pathlib import Path
class BatchImageProcessor:
"""批量图片处理器"""
def __init__(self, max_workers=4, progress_bar=True):
    """Set up the batch processor.

    Args:
        max_workers: Maximum number of worker threads.
        progress_bar: Whether to show a progress bar while processing.
    """
    self.max_workers = max_workers
    self.progress_bar = progress_bar
    self.results = []
    self.errors = []
    # Guards results/errors, which worker threads append to.
    self.lock = threading.Lock()

    # basicConfig does nothing once the root logger has handlers, so only
    # build the handlers when they will be used. Creating the FileHandler
    # unconditionally would open batch_processing.log on every
    # instantiation and leave the handle dangling.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('batch_processing.log'),
                logging.StreamHandler()
            ]
        )
    self.logger = logging.getLogger(__name__)
def process_single_document(self, docx_path, output_base_dir):
    """Extract the pictures of one document into its own sub-directory.

    The sub-directory is named after the document's file stem. Failures are
    recorded in ``self.errors`` and returned rather than raised.

    Args:
        docx_path: Path of the Word document.
        output_base_dir: Base directory for all outputs.

    Returns:
        A result dict with ``success`` True and the image counts, or an
        error dict with ``success`` False and the ``error`` text.
    """
    try:
        doc_name = Path(docx_path).stem
        output_dir = os.path.join(output_base_dir, doc_name)

        extractor = EnhancedWordImageExtractor(docx_path)
        metadata = extractor.extract_images_with_metadata(output_dir)
        document_info = metadata['document_info']

        result = {
            'document': docx_path,
            'output_dir': output_dir,
            'success': True,
            'image_count': document_info['total_images'],
            'embedded_count': document_info['embedded_images'],
            'external_count': document_info['external_links']
        }

        # results is shared between worker threads.
        with self.lock:
            self.results.append(result)

        self.logger.info(f"成功处理文档: {docx_path}")
        return result

    except Exception as e:
        # A single bad document must not stop the batch; record and move on.
        error_info = {
            'document': docx_path,
            'error': str(e),
            'success': False
        }

        with self.lock:
            self.errors.append(error_info)

        self.logger.error(f"处理文档 {docx_path} 时出错: {e}")
        return error_info
def process_documents(self, docx_files, output_base_dir):
"""
批量处理多个文档
Args:
docx_files: Word文档文件路径列表
output_base_dir: 输出基础目录
Returns:
处理结果摘要
"""
os.makedirs(output_base_dir, exist_ok=True)
self.logger.info(f"开始批量处理 {len(docx_files)} 个文档")
# 使用线程池并行处理
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# 提交所有任务
future_to_doc = {
executor.submit(self.process_single_document, doc, output_base_dir): doc
for doc in docx_files
}
# 处理结果(带进度条)
if self.progress_bar:
futures = list(concurrent.futures.as_completed(future_to_doc))
for future in tqdm(futures, desc="处理文档"):
try:
result = future.result()
except Exception as e:
doc = future_to_doc[future]
self.logger.error(f"处理文档 {doc} 时发生异常: {e}")
else:
for future in concurrent.futures.as_completed(future_to_doc):
try:
result = future.result()
except Exception as e:
doc = future_to_doc[future]
self.logger.error(f"处理文档 {doc} 时发生异常: {e}")
# 生成处理摘要
summary = self.generate_processing_summary()
# 保存摘要报告
summary_path = os.path.join(output_base_dir, 'batch_processing_summary.json')
with open(summary_path, 'w', encoding='utf-8') as f:
json.dump(summary, f, ensure_ascii=False, indent=2)
self.logger.info(f"批量处理完成,摘要已保存到: {summary_path}")
return summary
def generate_processing_summary(self):
"""生成处理摘要"""
successful_results = [r for r in self.results if r.get('success')]
total_images = sum(r.get('image_count', 0) for r in successful_results)
total_embedded = sum(r.get('embedded_count', 0) for r in successful_results)
total_external = sum(r.get('external_count', 0) for r in successful_results)
summary = {
'processing_info': {
'timestamp': datetime.now().isoformat(),
'total_documents': len(self.results) + len(self.errors),
'successful_documents': len(successful_results),
'failed_documents': len(self.errors),
'success_rate': len(successful_results) / (len(self.results) + len(self.errors)) * 100
},
'image_statistics': {
'total_images_extracted': total_images,
'embedded_images': total_embedded,
'external_links': total_external
},
'successful_documents': successful_results,
'failed_documents': self.errors
}
return summary
def find_docx_files(self, directory, recursive=True):
"""
查找目录中的所有Word文档
Args:
directory: 搜索目录
recursive: 是否递归搜索子目录
Returns:
Word文档文件路径列表
"""
directory = Path(directory)
if not directory.exists():
raise FileNotFoundError(f"目录不存在: {directory}")
if recursive:
docx_files = list(directory.rglob("*.docx"))
else:
docx_files = list(directory.glob("*.docx"))
# 过滤掉临时文件
docx_files = [f for f in docx_files if not f.name.startswith('~$')]
return [str(f) for f in docx_files]
# 性能优化工具
class PerformanceOptimizer:
"""性能优化工具"""
@staticmethod
def optimize_memory_usage():
"""优化内存使用"""
import gc
gc.collect() # 强制垃圾回收
@staticmethod
def get_memory_usage():
"""获取当前内存使用情况"""
import psutil
process = psutil.Process()
memory_info = process.memory_info()
return {
'rss': memory_info.rss / 1024 / 1024, # MB
'vms': memory_info.vms / 1024 / 1024 # MB
}
@staticmethod
def monitor_performance(func):
"""性能监控装饰器"""
import time
import functools
@functools.wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
start_memory = PerformanceOptimizer.get_memory_usage()
result = func(*args, **kwargs)
end_time = time
# Python实现Word文档中图片的自动提取与加载:从理论到实践
在现代办公和文档处理中,Word文档已经成为最常用的文件格式之一。这些文档不仅包含文本内容,还经常嵌入各种图片、图表和其他媒体元素。在许多场景下,我们需要从Word文档中提取这些图片,例如进行内容分析、创建图像数据库、或者在其他应用程序中重用这些图像。同样,将图片按照特定顺序加载到Word文档中也是一个常见需求。本文将深入探讨如何使用Python实现Word文档中图片的自动提取与加载功能,从理论基础到实际应用,提供全面的技术指南。
## 目录
1. [Word文档中的图片:基础知识](#word文档中的图片基础知识)
2. [技术准备与环境搭建](#技术准备与环境搭建)
3. [Word文档结构解析](#word文档结构解析)
4. [图片提取核心技术](#图片提取核心技术)
5. [按序提取图片的实现](#按序提取图片的实现)
6. [图片保存与元数据处理](#图片保存与元数据处理)
7. [图片加载与插入技术](#图片加载与插入技术)
8. [批量处理与性能优化](#批量处理与性能优化)
9. [实用案例与应用场景](#实用案例与应用场景)
10. [常见问题与解决方案](#常见问题与解决方案)
11. [扩展功能与高级应用](#扩展功能与高级应用)
12. [总结与展望](#总结与展望)
## Word文档中的图片:基础知识
在深入技术实现之前,我们需要了解Word文档中图片的存储方式和基本特性。
### 图片在Word文档中的存储方式
现代Word文档(.docx格式)实际上是一个ZIP压缩包,包含多个XML文件和资源文件。当我们在Word文档中插入图片时,图片会被存储在文档包的`word/media/`目录下,并在文档的XML结构中通过引用的方式链接。
Word文档中的图片主要有以下几种存储形式:
1. **嵌入式图片**:直接存储在文档包中,最常见的形式
2. **链接式图片**:仅存储图片的引用路径,实际图片存储在外部
3. **嵌入式与链接式混合**:存储缩略图在文档中,原图通过外部链接引用
### 图片格式与属性
Word文档支持多种图片格式,常见的包括:
- **位图格式**:JPEG、PNG、BMP、GIF等
- **矢量格式**:EMF、WMF等
- **其他格式**:TIFF、SVG(较新版本支持)等
每个图片在Word文档中还包含多种属性:
- **尺寸信息**:宽度、高度(原始像素和显示尺寸)
- **位置信息**:在文档中的位置、与文本的排列方式
- **格式设置**:边框、效果、裁剪信息等
- **替代文本**:为图片设置的描述性文本
- **ID与名称**:系统分配的唯一标识符
### 图片与文档结构的关系
在Word文档的XML结构中,图片通过以下方式与文档内容关联:
1. **文档内容XML**(document.xml)包含图片的引用和位置信息
2. **关系文件**(document.xml.rels)定义了内容与媒体文件的关联关系
3. **媒体文件夹**(media)存储实际的图片文件
了解这些基础知识对于我们实现图片提取和加载功能至关重要,因为我们需要正确解析文档结构,找到图片文件,并理解它们在文档中的位置和顺序。
## 技术准备与环境搭建
在开始实现Word文档图片处理功能之前,我们需要准备适当的开发环境和工具。
### Python环境准备
首先,我们需要安装Python环境。推荐使用Python 3.6或更高版本,因为它提供了更好的Unicode支持和更多现代特性。
```bash
# 检查Python版本
python --version
# 创建虚拟环境(可选但推荐)
python -m venv word_image_env
source word_image_env/bin/activate # Linux/Mac
word_image_env\Scripts\activate # Windows
```

我们将使用几个关键的Python库来处理Word文档和图片:
pip install python-docx # 处理.docx文件
pip install Pillow # 图像处理
pip install lxml # XML处理(python-docx的依赖,但可能需要单独安装)
pip install tqdm # 进度条显示(可选,用于批量处理)

其中,python-docx是我们的核心库,用于读取和操作Word文档。但它在图片提取方面有一些限制,因此我们还需要直接处理文档的ZIP结构和XML内容。
安装完成后,我们可以简单测试环境是否正确配置:
import docx
import PIL
import lxml
import zipfile
import os
print(f"python-docx version: {docx.__version__}")
print(f"Pillow version: {PIL.__version__}")
print(f"lxml version: {lxml.__version__}")
print(f"zipfile module available: {zipfile.__name__}")
print(f"os module available: {os.__name__}")

如果所有库都能正确导入并显示版本信息,说明我们的环境已经准备就绪。
为了使我们的代码组织良好且易于维护,我们可以按照以下结构设计项目:
word_image_processor/
│
├── word_image_extractor.py # 图片提取核心功能
├── word_image_loader.py # 图片加载核心功能
├── utils/
│ ├── __init__.py
│ ├── docx_utils.py # Word文档处理工具函数
│ ├── image_utils.py # 图像处理工具函数
│ └── metadata_utils.py # 元数据处理工具函数
│
├── examples/
│ ├── extract_images.py # 图片提取示例
│ └── load_images.py # 图片加载示例
│
└── tests/
├── __init__.py
├── test_extractor.py # 提取功能测试
└── test_loader.py # 加载功能测试

这种结构将核心功能、工具函数和示例代码分开,使项目更加清晰和可维护。
要实现图片的提取和加载,我们首先需要深入理解Word文档的内部结构,特别是与图片相关的部分。
如前所述,.docx文件实际上是一个ZIP压缩包,包含多个XML文件和资源文件。这种格式被称为Office Open XML (OOXML),是一种国际标准。
我们可以通过以下方式查看.docx文件的内部结构:
import zipfile
def explore_docx_structure(docx_path):
    """Print every member of a .docx package, then its media files.

    Args:
        docx_path: Path to the .docx file, which is opened as a ZIP archive.

    Returns:
        None. The listing is written to standard output.
    """
    with zipfile.ZipFile(docx_path) as docx_zip:
        file_list = docx_zip.namelist()

    print("文档内部文件列表:")
    for file in file_list:
        print(f" - {file}")

    # Names ending in '/' are ZIP directory entries, not files, so they
    # must not be counted as media.
    media_files = [
        name for name in file_list
        if name.startswith('word/media/') and not name.endswith('/')
    ]
    print(f"\n找到 {len(media_files)} 个媒体文件:")
    for media in media_files:
        print(f" - {media}")
# 使用示例
explore_docx_structure("example.docx")

在Word文档中,图片与文档内容的关联主要通过以下文件实现:
我们需要解析这些文件来理解图片在文档中的位置和顺序。
import xml.etree.ElementTree as ET
from zipfile import ZipFile
def analyze_document_images(docx_path):
    """Print how many drawings and image relationships a document has.

    Counts the ``w:drawing`` elements in ``word/document.xml`` and the
    relationships of type "image" in ``word/_rels/document.xml.rels``,
    then prints the Id and Target of each image relationship.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        None. The report is written to standard output.
    """
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }
    rels_part = 'word/_rels/document.xml.rels'
    image_rel_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

    with ZipFile(docx_path) as docx_zip:
        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))
        drawing_elements = doc_root.findall('.//w:drawing', namespaces)
        print(f"找到 {len(drawing_elements)} 个图形元素")

        # A package without a relationships part simply has no image
        # relationships; report zero instead of failing on the read.
        image_rels = []
        if rels_part in docx_zip.namelist():
            rels_root = ET.fromstring(docx_zip.read(rels_part))
            image_rels = rels_root.findall(f".//*[@Type='{image_rel_type}']")
        print(f"找到 {len(image_rels)} 个图片关系")

        for rel in image_rels:
            rel_id = rel.get('Id')
            target = rel.get('Target')
            print(f"关系ID: {rel_id}, 目标文件: {target}")
# 使用示例
analyze_document_images("example.docx")

在Word文档中,图片的顺序可以通过以下几种方式确定:
对于大多数情况,文档流顺序是最可靠的,因为它反映了图片在文档中的自然排列。但在复杂文档中,我们可能需要结合多种方法来确定准确的顺序。
def get_images_in_order(docx_path):
    """List the embedded images of a document in document-flow order.

    Walks the ``w:drawing`` elements of ``word/document.xml`` in the order
    they appear, and resolves each ``a:blip`` ``r:embed`` id through the
    relationships part.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        A list of dicts with keys ``rel_id``, ``target`` (the relationship
        Target) and ``filename`` (the last path segment of the target).
        Drawings whose id is not in the relationships part are skipped; a
        package without a relationships part yields an empty list.
    """
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'
    }
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    rels_part = 'word/_rels/document.xml.rels'

    with ZipFile(docx_path) as docx_zip:
        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        # Relationship id -> target; empty when the package has no
        # relationships part, so nothing can be resolved.
        rel_map = {}
        if rels_part in docx_zip.namelist():
            rels_root = ET.fromstring(docx_zip.read(rels_part))
            rel_map = {rel.get('Id'): rel.get('Target')
                       for rel in rels_root.findall("*")}

    image_refs = []
    for drawing in doc_root.findall('.//w:drawing', namespaces):
        blip = drawing.find('.//a:blip', namespaces)
        if blip is None:
            continue
        rel_id = blip.get(embed_attr)
        if rel_id not in rel_map:
            continue
        target = rel_map[rel_id]
        image_refs.append({
            'rel_id': rel_id,
            'target': target,
            'filename': target.split('/')[-1]
        })
    return image_refs
# 使用示例
images_in_order = get_images_in_order("example.docx")
for i, img in enumerate(images_in_order):
print(f"图片 {i+1}: {img['filename']} (关系ID: {img['rel_id']})")

通过这种方式,我们可以确定图片在文档中的准确顺序,为后续的提取和处理奠定基础。
在了解了Word文档的结构后,我们可以开始实现图片提取的核心功能。
最直接的图片提取方法是从Word文档的ZIP结构中提取media文件夹中的所有图片:
import os
import zipfile
from pathlib import Path
def extract_all_images(docx_path, output_dir):
    """Copy every file under ``word/media/`` of a document to a directory.

    Files are written under their original base name, in the order the
    archive lists them; that order is not the order in which the images
    appear in the document.

    Args:
        docx_path: Path to the .docx file.
        output_dir: Directory to write the images to (created if missing).

    Returns:
        List of the written file paths.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    with zipfile.ZipFile(docx_path) as docx_zip:
        for media_file in docx_zip.namelist():
            if not media_file.startswith('word/media/'):
                continue

            filename = os.path.basename(media_file)
            # A directory entry ("word/media/") has an empty base name;
            # writing it would try to open output_dir itself as a file.
            if not filename:
                continue

            output_path = os.path.join(output_dir, filename)
            with docx_zip.open(media_file) as source, open(output_path, 'wb') as out_file:
                out_file.write(source.read())

            extracted_images.append(output_path)
            print(f"已提取: {filename}")

    return extracted_images
# 使用示例
images = extract_all_images("example.docx", "extracted_images")
print(f"共提取了 {len(images)} 个图片")

这种方法简单直接,但它有一个主要缺点:无法保证提取的图片与文档中的顺序一致。
为了按照文档中的顺序提取图片,我们需要结合前面分析的文档结构:
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
def extract_images_in_order(docx_path, output_dir):
    """Extract a document's embedded images in document order.

    Each ``w:drawing`` in ``word/document.xml`` is visited in order; its
    ``a:blip`` ``r:embed`` id is resolved through the relationships part
    and the referenced file is written as ``image_NNN<ext>``.

    Args:
        docx_path: Path to the .docx file.
        output_dir: Directory to write the images to (created if missing).

    Returns:
        List of dicts, one per written image, with keys ``original_path``,
        ``original_filename``, ``new_path``, ``new_filename``, ``rel_id``
        and ``order`` (1-based).
    """
    import posixpath  # ZIP member names always use '/' separators

    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
    }
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    with zipfile.ZipFile(docx_path) as docx_zip:
        # Built once; membership is tested for every image below.
        members = set(docx_zip.namelist())

        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
        rel_map = {rel.get('Id'): rel.get('Target')
                   for rel in rels_root.findall("*")}

        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        image_count = 0
        for drawing in doc_root.findall('.//w:drawing', namespaces):
            blip = drawing.find('.//a:blip', namespaces)
            if blip is None:
                continue

            rel_id = blip.get(embed_attr)
            if rel_id not in rel_map:
                continue
            rel_target = rel_map[rel_id]
            if not rel_target:
                continue

            # Targets are relative to the "word/" folder unless they start
            # with '/', in which case they are relative to the package
            # root. join() lets an absolute target override the base and
            # normpath() collapses any "../" segments.
            image_path = posixpath.normpath(
                posixpath.join('word', rel_target)).lstrip('/')
            if image_path not in members:
                continue

            image_count += 1
            original_filename = os.path.basename(rel_target)
            file_ext = os.path.splitext(original_filename)[1]
            new_filename = f"image_{image_count:03d}{file_ext}"
            output_path = os.path.join(output_dir, new_filename)

            with docx_zip.open(image_path) as source, open(output_path, 'wb') as out_file:
                out_file.write(source.read())

            extracted_images.append({
                'original_path': image_path,
                'original_filename': original_filename,
                'new_path': output_path,
                'new_filename': new_filename,
                'rel_id': rel_id,
                'order': image_count
            })
            print(f"已提取图片 {image_count}: {new_filename} (原文件: {original_filename})")

    return extracted_images
# 使用示例
images = extract_images_in_order("example.docx", "extracted_images")
print(f"按顺序提取了 {len(images)} 个图片")

这个实现确保了图片按照它们在文档中出现的顺序被提取,并使用序号化的文件名保存,便于后续处理。
在实际应用中,我们可能会遇到一些特殊情况,如:
我们需要扩展我们的代码来处理这些情况:
def extract_images_advanced(docx_path, output_dir):
    """Extract images in document order, handling duplicates and links.

    Compared with a plain in-order extraction this variant:
      * writes each relationship target only once, even if several
        drawings reference it;
      * records externally stored images as a small text file holding the
        link instead of trying to read them from the archive.

    Linked (``r:link``) blips and VML ``v:imagedata`` elements are located
    but not processed yet.

    Args:
        docx_path: Path to the .docx file.
        output_dir: Directory to write output to (created if missing).

    Returns:
        List of dicts with keys ``original_path``, ``new_path``,
        ``new_filename``, ``rel_id``, ``order`` and ``type`` (either
        ``'embedded'`` or ``'external_link'``).
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted_images = []
    processed_targets = set()  # targets already handled, to avoid duplicates

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'v': 'urn:schemas-microsoft-com:vml'
    }
    rel_ns = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'

    with zipfile.ZipFile(docx_path) as docx_zip:
        # Built once; membership is tested for every embedded image.
        members = set(docx_zip.namelist())

        rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
        rel_map = {}
        for rel in rels_root.findall("*"):
            target = rel.get('Target', '')
            rel_map[rel.get('Id')] = {
                'target': target,
                'type': rel.get('Type'),
                # TargetMode="External" is the explicit marker for a target
                # outside the package; the URL-prefix test is kept as a
                # fallback for relationships that omit it.
                'is_external': (rel.get('TargetMode') == 'External'
                                or target.startswith(('http', 'file:')))
            }

        doc_root = ET.fromstring(docx_zip.read('word/document.xml'))

        image_count = 0

        # Regular pictures (w:drawing).
        for drawing in doc_root.findall('.//w:drawing', namespaces):
            blip = drawing.find('.//a:blip', namespaces)
            if blip is None:
                continue

            embed_rel_id = blip.get(f'{rel_ns}embed')
            if embed_rel_id and embed_rel_id in rel_map:
                rel_info = rel_map[embed_rel_id]
                target = rel_info['target']

                # Skip targets that were already handled.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if not rel_info['is_external']:
                    image_path = f"word/{target}"
                    if image_path in members:
                        image_count += 1
                        file_ext = os.path.splitext(target)[1]
                        new_filename = f"image_{image_count:03d}{file_ext}"
                        output_path = os.path.join(output_dir, new_filename)

                        with docx_zip.open(image_path) as source, open(output_path, 'wb') as target_file:
                            target_file.write(source.read())

                        extracted_images.append({
                            'original_path': image_path,
                            'new_path': output_path,
                            'new_filename': new_filename,
                            'rel_id': embed_rel_id,
                            'order': image_count,
                            'type': 'embedded'
                        })
                else:
                    image_count += 1
                    link_info = f"external_link_{image_count:03d}.txt"
                    link_path = os.path.join(output_dir, link_info)

                    # Explicit encoding: link targets may contain non-ASCII
                    # characters the platform default codec cannot encode.
                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"External image link: {target}\n")

                    extracted_images.append({
                        'original_path': target,
                        'new_path': link_path,
                        'new_filename': link_info,
                        'rel_id': embed_rel_id,
                        'order': image_count,
                        'type': 'external_link'
                    })

            # Linked pictures (r:link): located but not processed yet.
            link_rel_id = blip.get(f'{rel_ns}link')
            if link_rel_id and link_rel_id in rel_map:
                pass

        # VML pictures (v:imagedata), typically found in compatibility-mode
        # documents: located but not processed yet.
        for img_data in doc_root.findall('.//v:imagedata', namespaces):
            rel_id = img_data.get(f'{rel_ns}id')
            if rel_id and rel_id in rel_map:
                pass

        # Images inside embedded objects are not handled; that would need
        # the relationship parts of those objects to be parsed as well.

    return extracted_images


# This enhanced implementation handles more special cases and avoids
# extracting the same image twice.
现在,我们将前面的技术整合成一个完整的、可用的图片提取类。这个类将提供更多功能和更好的错误处理。
import os
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
from datetime import datetime
import json
from PIL import Image
import io
class WordImageExtractor:
    """Extract the images of a .docx file in document order.

    On construction the relationships part and ``word/document.xml`` are
    parsed once. Every image reference found is stored in
    ``self.image_info`` as a dict and numbered from 1: first all
    ``w:drawing`` pictures in document order, then all VML
    ``v:imagedata`` pictures.
    """

    def __init__(self, docx_path):
        """Validate the path and parse the document structure.

        Args:
            docx_path: Path of the Word document (str or path-like).

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The name does not end in ``.docx``, the file is not
                a ZIP archive, it has no ``word/document.xml``, or its XML
                cannot be parsed.
        """
        self.docx_path = docx_path
        self.namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
            'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
            'v': 'urn:schemas-microsoft-com:vml',
            'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
            'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape'
        }

        if not os.path.exists(docx_path):
            raise FileNotFoundError(f"找不到Word文档: {docx_path}")

        # str() so that pathlib.Path arguments work as well as plain strings.
        if not str(docx_path).lower().endswith('.docx'):
            raise ValueError(f"不支持的文件格式: {docx_path}. 仅支持.docx格式")

        self.rel_map = {}      # relationship id -> {'target', 'type', 'is_external'}
        self.image_info = []   # one dict per image reference, in order

        self._parse_document_structure()

    @staticmethod
    def _part_path(target):
        """Return the ZIP member name a relationship Target refers to.

        Targets are relative to the ``word/`` folder unless they start with
        ``/``, in which case they are relative to the package root.
        """
        import posixpath  # ZIP member names always use '/' separators
        # join() lets an absolute target override the base; normpath()
        # collapses any "../" segments.
        return posixpath.normpath(posixpath.join('word', target)).lstrip('/')

    def _parse_document_structure(self):
        """Fill ``self.rel_map`` and ``self.image_info`` from the package.

        Raises:
            ValueError: The file is not a valid ZIP / Word package or its
                XML cannot be parsed.
        """
        try:
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                members = docx_zip.namelist()

                if 'word/document.xml' not in members:
                    raise ValueError(f"无效的Word文档: {self.docx_path}")

                # The relationships part is optional; without it no image
                # reference can be resolved and image_info stays empty.
                if 'word/_rels/document.xml.rels' in members:
                    rels_root = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))
                    for rel in rels_root.findall("*"):
                        target = rel.get('Target', '')
                        self.rel_map[rel.get('Id')] = {
                            'target': target,
                            'type': rel.get('Type'),
                            # TargetMode="External" is the explicit marker;
                            # the URL-prefix test is kept as a fallback.
                            'is_external': (rel.get('TargetMode') == 'External'
                                            or target.startswith(('http', 'file:')))
                        }

                doc_root = ET.fromstring(docx_zip.read('word/document.xml'))
                self._find_image_references(doc_root)

        except zipfile.BadZipFile as e:
            raise ValueError(f"文件不是有效的ZIP格式: {self.docx_path}") from e
        except ET.ParseError as e:
            raise ValueError(f"XML解析错误: {e}") from e

    def _find_image_references(self, doc_root):
        """Append one entry to ``self.image_info`` per resolvable image.

        Args:
            doc_root: Parsed root element of ``word/document.xml``.
        """
        rel_ns = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'
        image_order = 0

        # Regular pictures (w:drawing).
        for drawing in doc_root.findall('.//w:drawing', self.namespaces):
            blip = drawing.find('.//a:blip', self.namespaces)
            if blip is None:
                continue

            embed_rel_id = blip.get(f'{rel_ns}embed')
            if not embed_rel_id or embed_rel_id not in self.rel_map:
                continue

            image_order += 1
            rel_info = self.rel_map[embed_rel_id]

            # Display size, kept as the raw attribute strings (EMU units).
            width = height = None
            extent = drawing.find('.//wp:extent', self.namespaces)
            if extent is not None:
                width = extent.get('cx')
                height = extent.get('cy')

            # Alternative text from the drawing's docPr element.
            alt_text = ""
            doc_pr = drawing.find('.//wp:docPr', self.namespaces)
            if doc_pr is not None:
                alt_text = doc_pr.get('descr', '')

            self.image_info.append({
                'order': image_order,
                'rel_id': embed_rel_id,
                'target': rel_info['target'],
                'type': 'embedded' if not rel_info['is_external'] else 'external',
                'width_emu': width,
                'height_emu': height,
                'alt_text': alt_text,
                'element_type': 'drawing'
            })

        # VML pictures (v:imagedata), used by compatibility-mode documents.
        for img_data in doc_root.findall('.//v:imagedata', self.namespaces):
            rel_id = img_data.get(f'{rel_ns}id')
            if not rel_id or rel_id not in self.rel_map:
                continue

            image_order += 1
            rel_info = self.rel_map[rel_id]
            self.image_info.append({
                'order': image_order,
                'rel_id': rel_id,
                'target': rel_info['target'],
                'type': 'embedded' if not rel_info['is_external'] else 'external',
                'width_emu': None,
                'height_emu': None,
                'alt_text': img_data.get('title', ''),
                'element_type': 'vml'
            })

    def get_image_count(self):
        """Return the number of image references found in the document."""
        return len(self.image_info)

    def get_image_info(self):
        """Return a shallow copy of the list of image-reference dicts."""
        return self.image_info.copy()

    def extract_images(self, output_dir, preserve_names=False, include_metadata=True):
        """Extract every image, writing each distinct target once.

        Embedded images are copied out of the archive; external images are
        recorded as a small text file describing the link.

        Args:
            output_dir: Output directory (created if missing).
            preserve_names: Keep the original file names instead of
                ``image_NNN<ext>``.
            include_metadata: Also write ``extraction_metadata.json``.

        Returns:
            List of result dicts, one per written file.
        """
        os.makedirs(output_dir, exist_ok=True)

        extracted_images = []
        processed_targets = set()

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            # Built once; membership is tested for every embedded image.
            members = set(docx_zip.namelist())

            for img_info in self.image_info:
                target = img_info['target']

                # Several references to the same target are written once.
                if target in processed_targets:
                    continue
                processed_targets.add(target)

                if img_info['type'] == 'embedded':
                    image_path = self._part_path(target)
                    if image_path not in members:
                        continue

                    if preserve_names:
                        filename = os.path.basename(target)
                    else:
                        file_ext = os.path.splitext(target)[1]
                        filename = f"image_{img_info['order']:03d}{file_ext}"
                    output_path = os.path.join(output_dir, filename)

                    with docx_zip.open(image_path) as source:
                        image_data = source.read()
                    with open(output_path, 'wb') as target_file:
                        target_file.write(image_data)

                    # Pixel size is best-effort: if the image cannot be
                    # opened the size fields simply stay None.
                    actual_width = actual_height = None
                    try:
                        with Image.open(io.BytesIO(image_data)) as pil_img:
                            actual_width, actual_height = pil_img.size
                    except Exception:
                        pass

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': image_path,
                        'output_path': output_path,
                        'filename': filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'embedded',
                        'width_emu': img_info['width_emu'],
                        'height_emu': img_info['height_emu'],
                        'actual_width': actual_width,
                        'actual_height': actual_height,
                        'alt_text': img_info['alt_text'],
                        'element_type': img_info['element_type'],
                        'file_size': len(image_data)
                    })
                    print(f"已提取图片 {img_info['order']}: {filename}")

                elif img_info['type'] == 'external':
                    link_filename = f"external_link_{img_info['order']:03d}.txt"
                    link_path = os.path.join(output_dir, link_filename)

                    with open(link_path, 'w', encoding='utf-8') as f:
                        f.write(f"外部图片链接: {target}\n")
                        f.write(f"替代文本: {img_info['alt_text']}\n")
                        f.write(f"关系ID: {img_info['rel_id']}\n")

                    extracted_images.append({
                        'order': img_info['order'],
                        'original_path': target,
                        'output_path': link_path,
                        'filename': link_filename,
                        'rel_id': img_info['rel_id'],
                        'type': 'external',
                        'alt_text': img_info['alt_text']
                    })
                    print(f"已记录外部链接 {img_info['order']}: {target}")

        if include_metadata:
            metadata_path = os.path.join(output_dir, 'extraction_metadata.json')
            metadata = {
                'source_document': os.path.basename(self.docx_path),
                'extraction_time': datetime.now().isoformat(),
                'total_images': len(extracted_images),
                'embedded_images': len([img for img in extracted_images if img['type'] == 'embedded']),
                'external_links': len([img for img in extracted_images if img['type'] == 'external']),
                'images': extracted_images
            }
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            print(f"已生成元数据文件: {metadata_path}")

        return extracted_images

    def extract_single_image(self, image_order, output_path):
        """Extract one embedded image to a given file path.

        Args:
            image_order: 1-based order number of the image.
            output_path: File path to write the image to.

        Returns:
            Dict with ``order``, ``output_path``, ``file_size`` and
            ``success``.

        Raises:
            ValueError: No image has that order number, or the image is an
                external link.
            FileNotFoundError: The referenced file is not in the archive.
        """
        target_image = next(
            (info for info in self.image_info if info['order'] == image_order),
            None,
        )
        if not target_image:
            raise ValueError(f"找不到序号为 {image_order} 的图片")

        if target_image['type'] != 'embedded':
            raise ValueError(f"图片 {image_order} 是外部链接,无法提取")

        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create the directory when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with zipfile.ZipFile(self.docx_path) as docx_zip:
            image_path = self._part_path(target_image['target'])
            if image_path not in docx_zip.namelist():
                raise FileNotFoundError(f"在文档中找不到图片文件: {image_path}")

            with docx_zip.open(image_path) as source:
                image_data = source.read()

        with open(output_path, 'wb') as target_file:
            target_file.write(image_data)

        print(f"已提取图片 {image_order} 到: {output_path}")
        return {
            'order': image_order,
            'output_path': output_path,
            'file_size': len(image_data),
            'success': True
        }
# 使用示例
def main():
    """Demonstrate the extractor on ``example.docx``.

    Prints the image count and one line per image reference, extracts all
    images into ``extracted_images`` and, if anything was extracted, also
    writes image 1 to ``single_image/first_image.jpg``. Any exception is
    reported on standard output instead of being propagated.
    """
    try:
        extractor = WordImageExtractor("example.docx")

        total = extractor.get_image_count()
        print(f"文档中共有 {total} 个图片")

        for entry in extractor.get_image_info():
            order, target, kind = entry['order'], entry['target'], entry['type']
            print(f"图片 {order}: {target} ({kind})")

        results = extractor.extract_images("extracted_images", preserve_names=False)
        processed = len(results)
        print(f"\n提取完成,共处理 {processed} 个图片")

        # Single-image demo only makes sense when something was extracted.
        if not results:
            return
        extractor.extract_single_image(1, "single_image/first_image.jpg")
    except Exception as e:
        print(f"错误: {e}")
if __name__ == "__main__":
main()