简单说,这个项目就是利用 Tesseract-OCR 教电脑学会“读图识字”。
我们生活中会遇到很多有文字的图片,比如:
你给电脑一张这样的图片,它能自动把里面的文字“抠”出来,变成可以编辑、复制、搜索的电子文本。
这个过程就像教一个不认识字的小孩读书一样,分几个步骤:
1. 搭建环境(准备教室和课本)
2. 预处理图片(把书弄平整、擦干净)
3. 识别文字(老师开始读书)
4. 处理特殊内容(读表格和批量读)
5. 做成好用的小工具(做一个友好的界面)
# Create and activate an isolated virtual environment
python -m venv ocr-env
source ocr-env/bin/activate # Linux/Mac
# ocr-env\Scripts\activate # Windows
# Install the Python packages imported by the modules below.
# NOTE: pytesseract only wraps the Tesseract executable; the engine itself
# must be installed separately for the verification step to succeed.
pip install pytesseract
pip install opencv-python
pip install Pillow
pip install numpy
pip install pandas
# streamlit is imported by ocr_app.py, so it has to be installed as well
pip install streamlit
# Verify the installation (prints the detected Tesseract version)
python -c "import pytesseract; print(pytesseract.get_tesseract_version())"
创建图像预处理模块 preprocess.py:
import cv2
import numpy as np
from PIL import Image, ImageEnhance
def preprocess_image(image_path, output_path=None):
    """Clean up an image for OCR and return the binarized result.

    Steps, in order: grayscale conversion, 5x5 Gaussian blur, Otsu
    thresholding, a morphological close and a 3x3 median blur.

    Args:
        image_path: Path of the image to load with OpenCV.
        output_path: Optional destination; when truthy the processed image
            is also written there.

    Returns:
        The processed single-channel image produced by the OpenCV calls.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        OSError: If ``output_path`` is given but the image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque cvtColor error below.
        raise FileNotFoundError(f"无法读取图像: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Smooth before thresholding so isolated noise does not skew Otsu
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    _, thresh = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    # NOTE(review): a 1x1 kernel should make this close a no-op; it is kept
    # unchanged so the output stays identical to the original pipeline.
    kernel = np.ones((1, 1), np.uint8)
    processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
    processed = cv2.medianBlur(processed, 3)

    # cv2.imwrite returns False on failure; do not let that pass silently
    if output_path and not cv2.imwrite(output_path, processed):
        raise OSError(f"无法写入图像: {output_path}")
    return processed
def enhance_image(image_path, output_path=None):
    """Return a copy of the image with contrast and sharpness both doubled.

    Args:
        image_path: Path of the image to open with Pillow.
        output_path: Optional destination; when truthy the enhanced image is
            saved there as well.

    Returns:
        The enhanced Pillow image.
    """
    with Image.open(image_path) as source:
        # Contrast first, then sharpen the contrast-boosted result
        boosted = ImageEnhance.Contrast(source).enhance(2.0)
        result = ImageEnhance.Sharpness(boosted).enhance(2.0)
    if output_path:
        result.save(output_path)
    return result
# Batch preprocessing
def batch_preprocess(input_dir, output_dir):
    """Preprocess every supported image in ``input_dir``.

    Each result is written to ``output_dir`` as ``preprocessed_<name>``.
    A failure on one file is reported and does not stop the batch.

    Args:
        input_dir: Directory scanned (non-recursively) for images.
        output_dir: Directory for the results; created, including missing
            parent directories, when it does not exist.
    """
    import os
    from pathlib import Path

    # parents=True so a nested output path does not fail on a missing parent
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

    # sorted() gives a reproducible processing order across platforms
    for file in sorted(os.listdir(input_dir)):
        if not file.lower().endswith(image_suffixes):
            continue
        input_path = os.path.join(input_dir, file)
        output_path = os.path.join(output_dir, f"preprocessed_{file}")
        try:
            preprocess_image(input_path, output_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"处理失败 {file}: {e}")
        else:
            print(f"处理成功: {file}")
创建配置管理模块 ocr_config.py:
import pytesseract
from PIL import Image
class TesseractConfig:
    """Named Tesseract option presets plus thin OCR helper functions."""

    # Preset name -> option string forwarded through pytesseract's `config`
    PRESETS = {
        'default': '--oem 3 --psm 6',
        'single_word': '--oem 3 --psm 8',
        'single_line': '--oem 3 --psm 7',
        'sparse_text': '--oem 3 --psm 11',
        'dense_text': '--oem 3 --psm 6',
        'document': '--oem 1 --psm 3',
        'table': '--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789.',
    }

    @staticmethod
    def extract_text(image_path, config='default', lang='eng+chi_sim'):
        """Run OCR on an image using one of the named presets.

        Args:
            image_path: Path of the image to open with Pillow.
            config: Key into ``PRESETS``; an unknown key falls back to an
                empty option string.
            lang: Tesseract language string (``+``-joined codes).

        Returns:
            The recognized text with surrounding whitespace stripped, or
            ``None`` if anything fails (the error is printed).
        """
        custom_config = TesseractConfig.PRESETS.get(config, '')
        try:
            # Context manager closes the file handle even if OCR raises
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(
                    image,
                    lang=lang,
                    config=custom_config,
                )
            return text.strip()
        except Exception as e:
            print(f"OCR提取失败: {e}")
            return None

    @staticmethod
    def get_detailed_data(image_path, lang='eng'):
        """Return pytesseract's ``image_to_data`` output as a dict.

        Args:
            image_path: Path of the image to open with Pillow.
            lang: Tesseract language string.

        Returns:
            The dict produced with ``Output.DICT``, or ``None`` if anything
            fails (the error is printed).
        """
        try:
            with Image.open(image_path) as image:
                return pytesseract.image_to_data(
                    image,
                    lang=lang,
                    output_type=pytesseract.Output.DICT,
                )
        except Exception as e:
            print(f"获取详细数据失败: {e}")
            return None
# Usage example
def advanced_ocr_example():
    """OCR ``document.png`` with several presets and collect the outputs.

    Returns:
        A dict mapping each preset name to whatever
        ``TesseractConfig.extract_text`` returned for it.
    """
    image_path = 'document.png'
    preset_names = ['default', 'document', 'sparse_text']
    # Same image, different presets, so the outputs can be compared
    return {
        preset: TesseractConfig.extract_text(image_path, preset, 'eng+chi_sim')
        for preset in preset_names
    }
创建文档处理模块 document_processor.py:
import pytesseract
from PIL import Image
import cv2
import numpy as np
import re
from typing import Dict, List
class DocumentProcessor:
    """Word-level OCR with confidence scores for single files or folders."""

    # Extensions picked up by batch_process_documents (case-insensitive);
    # '.bmp' is included to match the other batch helpers in this project.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp')

    def __init__(self, languages: str = 'eng+chi_sim'):
        """Store the Tesseract language string used for every OCR call."""
        self.languages = languages

    def extract_text_with_confidence(self, image_path: str, min_confidence: int = 60) -> Dict:
        """OCR an image and report each word with its confidence.

        Args:
            image_path: Path of the image to load with OpenCV.
            min_confidence: Words at or above this confidence are also
                collected into ``confident_text``.

        Returns:
            A dict with keys ``text``, ``confident_text`` (each word followed
            by one space), ``words`` (dicts with ``text``, ``confidence`` and
            ``position`` as ``(left, top, width, height)``) and
            ``average_confidence``. An empty dict is returned if anything
            fails (the error is printed).
        """
        try:
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread returns None instead of raising on failure
                raise FileNotFoundError(f"无法读取图像: {image_path}")
            rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            data = pytesseract.image_to_data(
                rgb,
                lang=self.languages,
                output_type=pytesseract.Output.DICT,
            )

            words = []
            confident_words = []
            boxes = zip(data['left'], data['top'], data['width'], data['height'])
            for raw_text, raw_conf, box in zip(data['text'], data['conf'], boxes):
                text = str(raw_text).strip()
                # Confidence may arrive as an int or as a numeric string such
                # as '-1' or '96.5' depending on the installed versions;
                # int() alone rejects the float-like form.
                confidence = int(float(raw_conf))
                # Skip empty cells and entries without a real confidence (-1)
                if not text or confidence <= -1:
                    continue
                words.append({
                    'text': text,
                    'confidence': confidence,
                    'position': box,
                })
                if confidence >= min_confidence:
                    confident_words.append(text)

            confidences = [word['confidence'] for word in words]
            return {
                # Each word keeps its trailing space, as callers received before
                'text': ''.join(f"{word['text']} " for word in words),
                'confident_text': ''.join(f"{text} " for text in confident_words),
                'words': words,
                'average_confidence': (
                    sum(confidences) / len(confidences) if confidences else 0
                ),
            }
        except Exception as e:
            print(f"文档处理错误: {e}")
            return {}

    def batch_process_documents(self, input_dir: str, output_file: str):
        """OCR every supported image in a directory and dump results as JSON.

        Args:
            input_dir: Directory scanned (non-recursively) for images.
            output_file: Path of the UTF-8 JSON file to write.

        Returns:
            A dict mapping each file name to ``{'timestamp', 'result'}``.
        """
        import os
        import json
        from datetime import datetime

        results = {}
        # sorted() keeps the output order reproducible across platforms
        for filename in sorted(os.listdir(input_dir)):
            if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
                continue
            filepath = os.path.join(input_dir, filename)
            print(f"处理文件: {filename}")
            result = self.extract_text_with_confidence(filepath)
            results[filename] = {
                'timestamp': datetime.now().isoformat(),
                'result': result,
            }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        return results
# Usage example. Guarded so that importing this module (ocr_app.py does
# `from document_processor import DocumentProcessor`) does not trigger OCR.
if __name__ == "__main__":
    processor = DocumentProcessor(languages='eng+chi_sim')
    result = processor.extract_text_with_confidence('business_card.jpg')
    # extract_text_with_confidence returns {} on failure; indexing it
    # unconditionally would raise KeyError.
    if result:
        print(f"平均置信度: {result['average_confidence']:.2f}%")
        print(f"高置信度文本: {result['confident_text']}")
创建表格提取模块 table_extractor.py:
import cv2
import pytesseract
import numpy as np
from PIL import Image
import pandas as pd
class TableExtractor:
    """Extract row-grouped text from table images and build DataFrames."""

    def __init__(self):
        """Create the long, thin kernels used to pick out table lines."""
        self.horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
        self.vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))

    def detect_table_structure(self, image_path):
        """Isolate the horizontal and vertical line pixels of an image.

        Args:
            image_path: Path of the image to load with OpenCV.

        Returns:
            A tuple ``(table_structure, horizontal, vertical)`` where the
            first item is an equal-weight blend of the other two.

        Raises:
            FileNotFoundError: If OpenCV cannot read ``image_path``.
        """
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising on failure
            raise FileNotFoundError(f"无法读取图像: {image_path}")
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Inverted Otsu threshold: ink becomes non-zero for the morphology
        thresh = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )[1]
        horizontal = cv2.morphologyEx(
            thresh, cv2.MORPH_OPEN, self.horizontal_kernel, iterations=2
        )
        vertical = cv2.morphologyEx(
            thresh, cv2.MORPH_OPEN, self.vertical_kernel, iterations=2
        )
        table_structure = cv2.addWeighted(horizontal, 0.5, vertical, 0.5, 0.0)
        return table_structure, horizontal, vertical

    def extract_table_data(self, image_path, lang='eng', min_confidence=30, row_gap=20):
        """OCR an image and group the recognized words into rows.

        A new row starts whenever a word's ``top`` differs from the previous
        accepted word's ``top`` by more than ``row_gap`` pixels.

        Args:
            image_path: Path of the image to open with Pillow.
            lang: Tesseract language string.
            min_confidence: Words must score strictly above this to be kept.
            row_gap: Vertical distance that separates two rows.

        Returns:
            A list of rows; each row is a list of dicts with ``text``,
            ``confidence`` and ``position`` as ``(left, top)``. An empty list
            is returned if anything fails (the error is printed).
        """
        try:
            # The original also called detect_table_structure() here and
            # discarded the result; that unused work has been dropped.
            with Image.open(image_path) as image:
                data = pytesseract.image_to_data(
                    image,
                    lang=lang,
                    config='--psm 6 -c preserve_interword_spaces=1',
                    output_type=pytesseract.Output.DICT,
                )

            table_data = []
            current_row = []
            prev_top = -1
            columns = zip(data['text'], data['conf'], data['left'], data['top'])
            for raw_text, raw_conf, left, top in columns:
                text = str(raw_text).strip()
                # Confidence may be an int or a numeric string such as
                # '96.5'; int() alone rejects the float-like form.
                confidence = int(float(raw_conf))
                if not text or confidence <= min_confidence:
                    continue
                if prev_top != -1 and abs(top - prev_top) > row_gap:
                    if current_row:
                        table_data.append(current_row)
                    current_row = []
                current_row.append({
                    'text': text,
                    'confidence': confidence,
                    'position': (left, top),
                })
                prev_top = top

            if current_row:
                table_data.append(current_row)
            return table_data
        except Exception as e:
            print(f"表格提取错误: {e}")
            return []

    def table_to_dataframe(self, image_path, lang='eng'):
        """Convert the extracted rows into a pandas DataFrame.

        The first row supplies the column names; shorter rows are padded
        with empty strings up to the widest row.

        Args:
            image_path: Path of the table image.
            lang: Tesseract language string.

        Returns:
            A DataFrame of the remaining rows, or an empty DataFrame when
            nothing was extracted.
        """
        table_data = self.extract_table_data(image_path, lang)
        if not table_data:
            return pd.DataFrame()

        max_cols = max(len(row) for row in table_data)
        data_matrix = [
            [cell['text'] for cell in row] + [''] * (max_cols - len(row))
            for row in table_data
        ]
        header, *body = data_matrix
        return pd.DataFrame(body, columns=header)
# Usage example. Guarded so that importing this module (ocr_app.py does
# `from table_extractor import TableExtractor`) does not trigger OCR.
if __name__ == "__main__":
    extractor = TableExtractor()
    df = extractor.table_to_dataframe('financial_table.png', lang='eng')
    print("提取的表格数据:")
    print(df.head())
创建并行处理模块 parallel_processor.py:
import concurrent.futures
import pytesseract
from PIL import Image
import os
from typing import List, Dict
import time
class ParallelOCRProcessor:
    """Run OCR over many images concurrently with a thread pool."""

    def __init__(self, max_workers: int = 4):
        """Store the thread-pool size used by ``process_batch``."""
        self.max_workers = max_workers

    def process_single_image(self, image_path: str, lang: str = 'eng') -> Dict:
        """OCR one image and time the call.

        Args:
            image_path: Path of the image to open with Pillow.
            lang: Tesseract language string.

        Returns:
            On success a dict with ``file``, ``text``, ``processing_time``
            (seconds) and ``success=True``; on any exception a dict with
            ``file``, ``error`` and ``success=False``.
        """
        # perf_counter is monotonic, so system clock adjustments cannot
        # distort the measured duration
        start_time = time.perf_counter()
        try:
            # Context manager closes the file handle even if OCR raises
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(
                    image,
                    lang=lang,
                    config='--oem 3 --psm 6',
                ).strip()
        except Exception as e:
            return {
                'file': image_path,
                'error': str(e),
                'success': False,
            }
        return {
            'file': image_path,
            'text': text,
            'processing_time': time.perf_counter() - start_time,
            'success': True,
        }

    def process_batch(self, image_paths: List[str], lang: str = 'eng') -> List[Dict]:
        """OCR several images concurrently.

        Args:
            image_paths: Paths of the images to process.
            lang: Tesseract language string.

        Returns:
            One result dict per path, in completion order (not input order).
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_single_image, path, lang)
                for path in image_paths
            ]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results.append(result)
                # Failed items carry no 'processing_time'; they print 0.00s
                print(f"已完成: {result['file']} - {result.get('processing_time', 0):.2f}s")
        return results

    def process_directory(self, directory_path: str, lang: str = 'eng') -> List[Dict]:
        """OCR every supported image found directly in a directory.

        Args:
            directory_path: Directory scanned (non-recursively) for images.
            lang: Tesseract language string.

        Returns:
            The list produced by ``process_batch`` for the images found.
        """
        image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.webp')
        # sorted() makes the submission order reproducible across platforms
        image_paths = [
            os.path.join(directory_path, filename)
            for filename in sorted(os.listdir(directory_path))
            if filename.lower().endswith(image_extensions)
        ]
        print(f"找到 {len(image_paths)} 个图像文件")
        return self.process_batch(image_paths, lang)
# Usage example
def benchmark_performance(directory: str = 'documents/',
                          lang: str = 'eng+chi_sim',
                          worker_counts=(1, 2, 4, 8)):
    """Time directory OCR at several thread-pool sizes and print a summary.

    Args:
        directory: Directory of images handed to ``process_directory``.
        lang: Tesseract language string.
        worker_counts: Thread counts to try, in order.
    """
    for workers in worker_counts:
        # A fresh processor per run avoids mutating shared state between runs
        processor = ParallelOCRProcessor(max_workers=workers)
        # perf_counter is monotonic, unlike wall-clock time.time()
        start_time = time.perf_counter()
        results = processor.process_directory(directory, lang=lang)
        total_time = time.perf_counter() - start_time
        success_count = sum(1 for r in results if r['success'])
        print(f"线程数: {workers}, 总时间: {total_time:.2f}s, "
              f"成功: {success_count}/{len(results)}")
# Run the benchmark only when executed as a script, so that importing
# this module does not start OCR over 'documents/'.
if __name__ == "__main__":
    benchmark_performance()
创建错误处理模块 error_handler.py:
import logging
from datetime import datetime
import json
import traceback
from typing import Optional
class OCRLogger:
    """Write one JSON log line per OCR operation to a file and the console."""

    def __init__(self, log_file: str = 'ocr_operations.log'):
        """Remember the log file path and configure logging."""
        self.log_file = log_file
        self.setup_logging()

    def setup_logging(self):
        """Configure the root logger with a file handler and a stream handler.

        NOTE: ``logging.basicConfig`` leaves an already-configured root
        logger untouched, so only the first configuration in a process
        takes effect.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(self.log_file, encoding='utf-8'),
                logging.StreamHandler(),
            ],
        )

    def log_operation(self,
                      operation: str,
                      status: str,
                      details: Optional[dict] = None,
                      exception: Optional[Exception] = None):
        """Log one operation as a JSON line and return the entry.

        Args:
            operation: Name of the operation being recorded.
            status: Free-form status string (e.g. 'success' / 'error').
            details: Optional extra data; values that JSON cannot encode are
                written as their ``repr``.
            exception: Optional exception to record with its traceback.

        Returns:
            The log entry dict (holding the original, unconverted values).
        """
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'operation': operation,
            'status': status,
            'details': details or {},
        }
        if exception is not None:
            log_entry['error'] = {
                'type': type(exception).__name__,
                'message': str(exception),
                # Format the exception that was passed in. format_exc() only
                # sees the exception currently being handled and yields
                # "NoneType: None" when called outside an except block.
                'traceback': ''.join(traceback.format_exception(
                    type(exception), exception, exception.__traceback__
                )),
            }
        level = logging.ERROR if exception is not None else logging.INFO
        # default=repr keeps logging from raising TypeError on arbitrary
        # objects (images, instances, ...) inside `details`
        logging.log(level, json.dumps(log_entry, ensure_ascii=False, default=repr))
        return log_entry
# Decorator for automatic error handling
def ocr_error_handler(logger: "OCRLogger"):
    """Build a decorator that logs every call of the wrapped function.

    A successful call is logged with status 'success'. A call that raises is
    logged with status 'error' together with the exception, which is then
    re-raised unchanged.

    Args:
        logger: Object providing ``log_operation(operation, status, details,
            exception)``.

    Returns:
        A decorator preserving the wrapped function's name and docstring.
    """
    import functools

    def decorator(func):
        # wraps() keeps __name__/__doc__ so decorated functions stay
        # recognizable in logs and tooling
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            call_details = {'args': args, 'kwargs': kwargs}
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                logger.log_operation(
                    operation=func.__name__,
                    status='error',
                    details=call_details,
                    exception=e,
                )
                raise
            # Logged outside the try so a failure in the logger is not
            # misreported as a failure of the wrapped function
            logger.log_operation(
                operation=func.__name__,
                status='success',
                details=call_details,
            )
            return result
        return wrapper
    return decorator
# Usage example
logger = OCRLogger()


@ocr_error_handler(logger)
def safe_ocr_extraction(image_path: str, lang: str = 'eng') -> str:
    """OCR an image; the decorator logs the outcome and re-raises errors.

    Args:
        image_path: Path of the image to open with Pillow.
        lang: Tesseract language string.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # Imported locally because this module's import block does not bring
    # in pytesseract or PIL; without these the call fails with NameError.
    import pytesseract
    from PIL import Image

    # Context manager closes the file handle even if OCR raises
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(
            image,
            lang=lang,
            config='--oem 3 --psm 6',
        )
创建完整的应用 ocr_app.py:
import streamlit as st
import pytesseract
from PIL import Image
import numpy as np
import tempfile
import os
from document_processor import DocumentProcessor
from table_extractor import TableExtractor
class OCRWebApp:
    """Streamlit front end: upload an image, run OCR, show and download text."""

    def __init__(self):
        """Configure the page as soon as the app object is created."""
        self.setup_page()

    def setup_page(self):
        """Set the page configuration and render the title and intro text."""
        st.set_page_config(
            page_title="OCR文本识别系统",
            page_icon="🔍",
            layout="wide"
        )
        st.title("📄 OCR文本识别系统")
        st.markdown("上传图片文件进行文字识别")

    def process_uploaded_file(self, uploaded_file, languages):
        """Run OCR on an uploaded file via a temporary file on disk.

        Args:
            uploaded_file: Object exposing ``getvalue()`` that returns the
                file's bytes.
            languages: Tesseract language string.

        Returns:
            A tuple ``(result, tmp_path)``. The temporary file has already
            been deleted when this returns, so ``tmp_path`` is informational
            only.
        """
        # delete=False so the file can be reopened by path after it is
        # closed; it is removed explicitly in the finally block below
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_path = tmp_file.name
        try:
            processor = DocumentProcessor(languages=languages)
            result = processor.extract_text_with_confidence(tmp_path)
            return result, tmp_path
        finally:
            os.unlink(tmp_path)

    def run(self):
        """Render the sidebar, the uploader and the recognition results."""
        st.sidebar.header("配置选项")
        languages = st.sidebar.multiselect(
            "选择语言",
            ['eng', 'chi_sim', 'chi_tra', 'jpn', 'kor'],
            default=['eng', 'chi_sim']
        )
        if not languages:
            # Deselecting everything would pass lang='' to Tesseract;
            # fall back to English and tell the user.
            st.sidebar.warning("未选择语言,默认使用 eng")
            languages = ['eng']
        lang_str = '+'.join(languages)

        uploaded_file = st.file_uploader(
            "选择图片文件",
            type=['png', 'jpg', 'jpeg', 'tiff', 'bmp']
        )
        if uploaded_file is None:
            return

        # Preview the uploaded image
        image = Image.open(uploaded_file)
        st.image(image, caption="上传的图片", use_column_width=True)

        if not st.button("开始识别"):
            return

        with st.spinner("正在处理中..."):
            result, _ = self.process_uploaded_file(uploaded_file, lang_str)

        # An empty dict (failure) and an empty 'text' both count as no result
        if not (result and result['text']):
            st.error("识别失败或未检测到文本")
            return

        st.subheader("识别结果")
        st.text_area("提取的文本", result['text'], height=200)

        # Summary metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("平均置信度", f"{result['average_confidence']:.1f}%")
        with col2:
            st.metric("识别词汇数", len(result['words']))
        with col3:
            st.metric("高置信度文本",
                      len(result['confident_text'].split()))

        st.download_button(
            label="下载文本结果",
            data=result['text'],
            file_name="ocr_result.txt",
            mime="text/plain"
        )
# Run the app
if __name__ == "__main__":
    # Constructing OCRWebApp calls setup_page(); run() then renders the
    # sidebar, the uploader and the results.
    app = OCRWebApp()
    app.run()
这个技术在生活中用处非常大!
通过这个Tesseract-OCR实战项目,我获得了以下宝贵经验:
这个项目不仅提升了我的OCR技术水平,更培养了解决实际问题的工程化思维。Tesseract作为一个成熟的开源工具,在结合实际业务需求进行定制化开发后,能够发挥出强大的文本识别能力。我们不仅让电脑学会了“识字”,更重要的是,我们让它变得高效、准确、好用,最终能帮助我们解决现实世界中的各种问题,把人们从繁琐的打字工作中解放出来。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。