在众多业务场景中,如文档管理、数据提取等,经常需要对 PDF 文件进行精细处理。传统方式下,将 PDF 文件拆分为单独页面、为每个页面进行有意义的重命名,以及提取关键信息并导出为表格,通常都需要人工手动操作,这不仅效率低下,还容易出错。随着业务数据量的增长,这种人工处理方式已无法满足需求。因此,我们需要一个自动化的解决方案来高效完成这些任务。本方案基于 WPF(Windows Presentation Foundation)构建用户界面,方便用户操作,同时借助腾讯云提供的文字识别(OCR)等云服务能力,实现 PDF 文件的拆分、内容识别、重命名以及信息导出表格等功能。
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;

/// <summary>
/// Splits a PDF into one file per page, written to <paramref name="outputFolder"/>
/// as <c>page_1.pdf</c>, <c>page_2.pdf</c>, ... (1-based page numbers).
/// </summary>
/// <param name="inputPdfPath">Path of the PDF file to split.</param>
/// <param name="outputFolder">Folder that receives the page files; created when it does not exist.</param>
public void SplitPdf(string inputPdfPath, string outputFolder)
{
    // CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
    Directory.CreateDirectory(outputFolder);

    PdfReader reader = new PdfReader(inputPdfPath);
    try
    {
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            string pagePath = Path.Combine(outputFolder, $"page_{i}.pdf");

            // The using block releases the file handle even when copying the page throws.
            using (FileStream output = new FileStream(pagePath, FileMode.Create))
            {
                Document document = new Document(reader.GetPageSizeWithRotation(i));
                PdfCopy copy = new PdfCopy(document, output);
                document.Open();
                copy.AddPage(copy.GetImportedPage(reader, i));

                // Closing the document finishes writing the page file.
                document.Close();
            }
        }
    }
    finally
    {
        // Always release the source PDF, including on failure part-way through.
        reader.Close();
    }
}
using System;
using System.IO;
using System.Threading.Tasks;
using TencentCloud.Common;
using TencentCloud.Common.Profile;
using TencentCloud.Ocr.V20181119;
using TencentCloud.Ocr.V20181119.Models;

/// <summary>
/// Sends a single-page PDF to the Tencent Cloud GeneralBasicOCR API and returns the recognized text.
/// </summary>
/// <remarks>
/// Credentials are read from the <c>TENCENTCLOUD_SECRET_ID</c> and <c>TENCENTCLOUD_SECRET_KEY</c>
/// environment variables; the literal placeholders are used only when a variable is not set.
/// </remarks>
/// <param name="pdfPagePath">Path of the PDF file to recognize.</param>
/// <returns>
/// All detected text lines joined with <c>\n</c>, or an empty string when the response
/// contains no detections.
/// </returns>
public async Task<string> RecognizeTextFromPdfPage(string pdfPagePath)
{
    byte[] fileBytes = File.ReadAllBytes(pdfPagePath);

    // Keep real secrets out of source code; fall back to the placeholders for compatibility.
    string secretId = Environment.GetEnvironmentVariable("TENCENTCLOUD_SECRET_ID") ?? "YOUR_SECRET_ID";
    string secretKey = Environment.GetEnvironmentVariable("TENCENTCLOUD_SECRET_KEY") ?? "YOUR_SECRET_KEY";
    Credential cred = new Credential { SecretId = secretId, SecretKey = secretKey };

    HttpProfile httpProfile = new HttpProfile();
    httpProfile.Endpoint = "ocr.tencentcloudapi.com";
    ClientProfile clientProfile = new ClientProfile();
    clientProfile.HttpProfile = httpProfile;
    OcrClient client = new OcrClient(cred, "ap-guangzhou", clientProfile);

    GeneralBasicOCRRequest req = new GeneralBasicOCRRequest();
    req.ImageBase64 = Convert.ToBase64String(fileBytes);
    // The payload is a PDF, not an image, so PDF recognition has to be switched on explicitly.
    // NOTE(review): the page number is left at the API default, presumably page 1 — confirm.
    req.IsPdf = true;

    GeneralBasicOCRResponse resp = await client.GeneralBasicOCR(req);

    TextDetection[] detections = resp.TextDetections;
    if (detections == null || detections.Length == 0)
    {
        return string.Empty;
    }

    // Return every detected line, not just the first one, so later key-info extraction
    // can see the whole page.
    return string.Join("\n", Array.ConvertAll(detections, d => d.DetectedText));
}
/// <summary>
/// Renames a page file, inside its current folder, to a name derived from its recognized text.
/// </summary>
/// <remarks>
/// When the target name is already taken, a numeric suffix (<c>_1</c>, <c>_2</c>, ...) is appended
/// so that several pages yielding the same key information do not make the move fail.
/// </remarks>
/// <param name="pdfPagePath">Path of the page file to rename.</param>
/// <param name="recognizedText">Text recognized on that page.</param>
public void RenameFileBasedOnText(string pdfPagePath, string recognizedText)
{
    string folderPath = Path.GetDirectoryName(pdfPagePath) ?? string.Empty;
    string baseName = ExtractKeyInfo(recognizedText);
    string newFilePath = Path.Combine(folderPath, $"{baseName}.pdf");

    // File.Move throws when the destination exists, so pick the first free name.
    int suffix = 1;
    while (File.Exists(newFilePath))
    {
        newFilePath = Path.Combine(folderPath, $"{baseName}_{suffix}.pdf");
        suffix++;
    }

    File.Move(pdfPagePath, newFilePath);
}

/// <summary>
/// Builds a file base name of the form <c>{date}_{customer}</c> from the recognized text.
/// </summary>
/// <param name="text">Recognized page text; may be null or empty.</param>
/// <returns>
/// The combined date and customer name when both are found, otherwise <c>default_name</c>.
/// </returns>
private string ExtractKeyInfo(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return "default_name";
    }

    // Example extraction logic: the date is expected as YYYY-MM-DD and the customer name
    // directly after the "客户名称" label. Both the half-width and the full-width colon are accepted.
    string datePattern = @"\d{4}-\d{2}-\d{2}";
    string clientNamePattern = @"客户名称[::](\w+)";

    Match dateMatch = Regex.Match(text, datePattern);
    Match clientNameMatch = Regex.Match(text, clientNamePattern);
    if (dateMatch.Success && clientNameMatch.Success)
    {
        return $"{dateMatch.Value}_{clientNameMatch.Groups[1].Value}";
    }

    return "default_name";
}
/// <summary>
/// Holds the data exported for one PDF page: its file name and the text recognized on it.
/// </summary>
public class PdfPageInfo
{
    /// <summary>File name of the page PDF.</summary>
    public string FileName
    {
        get;
        set;
    }

    /// <summary>Text recognized on the page.</summary>
    public string RecognizedText
    {
        get;
        set;
    }
}
using ClosedXML.Excel;
using System.Collections.Generic;

/// <summary>
/// Writes the page information to an Excel workbook with a single worksheet
/// named "PDF Page Information": a header row followed by one row per page.
/// </summary>
/// <param name="pageInfos">Page entries to export, written in list order.</param>
/// <param name="outputExcelPath">Path the workbook is saved to.</param>
public void ExportToExcel(List<PdfPageInfo> pageInfos, string outputExcelPath)
{
    using (XLWorkbook workbook = new XLWorkbook())
    {
        IXLWorksheet sheet = workbook.AddWorksheet("PDF Page Information");

        // Header row.
        sheet.Cell(1, 1).Value = "File Name";
        sheet.Cell(1, 2).Value = "Recognized Text";

        // Data rows start directly below the header.
        int row = 2;
        foreach (PdfPageInfo info in pageInfos)
        {
            sheet.Cell(row, 1).Value = info.FileName;
            sheet.Cell(row, 2).Value = info.RecognizedText;
            row++;
        }

        workbook.SaveAs(outputExcelPath);
    }
}
<Window x:Class="PdfProcessingApp.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        Title="PDF Processing" Height="350" Width="525">
    <!-- Main window: three action buttons on the left, a read-only log box on the right. -->
    <Grid>
        <Button Content="Select PDF File" HorizontalAlignment="Left" Margin="10,10,0,0" VerticalAlignment="Top" Width="120" Click="SelectPdfFile_Click"/>
        <Button Content="Select Output Folder" HorizontalAlignment="Left" Margin="10,40,0,0" VerticalAlignment="Top" Width="120" Click="SelectOutputFolder_Click"/>
        <Button Content="Start Processing" HorizontalAlignment="Left" Margin="10,70,0,0" VerticalAlignment="Top" Width="120" Click="StartProcessing_Click"/>
        <!-- Status messages are appended to this box; wrapping and a vertical scroll bar keep
             long paths and a growing log readable inside the fixed-size box. -->
        <TextBox x:Name="ResultTextBox" HorizontalAlignment="Left" Margin="140,10,0,0" VerticalAlignment="Top" Width="375" Height="230" IsReadOnly="True" TextWrapping="Wrap" VerticalScrollBarVisibility="Auto"/>
    </Grid>
</Window>
using System;
using System.Collections.Generic;
using System.IO;
using System.Windows;
using System.Windows.Forms;

namespace PdfProcessingApp
{
    /// <summary>
    /// Code-behind of the main window: lets the user pick a PDF and an output folder, then
    /// splits the PDF, recognizes each page, renames the page files and exports an Excel summary.
    /// </summary>
    public partial class MainWindow : Window
    {
        private string pdfFilePath;
        private string outputFolderPath;

        // Guards against a second "Start Processing" click while a run is still awaiting OCR calls.
        private bool isProcessing;

        public MainWindow()
        {
            InitializeComponent();
        }

        /// <summary>Asks the user for the source PDF and remembers its path.</summary>
        private void SelectPdfFile_Click(object sender, RoutedEventArgs e)
        {
            using (OpenFileDialog openFileDialog = new OpenFileDialog())
            {
                openFileDialog.Filter = "PDF Files|*.pdf";

                // Fully qualified: inside a Window the simple name DialogResult binds to the
                // Window.DialogResult property (bool?), which has no OK member.
                if (openFileDialog.ShowDialog() == System.Windows.Forms.DialogResult.OK)
                {
                    pdfFilePath = openFileDialog.FileName;
                    ResultTextBox.Text += $"Selected PDF file: {pdfFilePath}\n";
                }
            }
        }

        /// <summary>Asks the user for the output folder and remembers its path.</summary>
        private void SelectOutputFolder_Click(object sender, RoutedEventArgs e)
        {
            using (FolderBrowserDialog folderBrowserDialog = new FolderBrowserDialog())
            {
                if (folderBrowserDialog.ShowDialog() == System.Windows.Forms.DialogResult.OK)
                {
                    outputFolderPath = folderBrowserDialog.SelectedPath;
                    ResultTextBox.Text += $"Selected output folder: {outputFolderPath}\n";
                }
            }
        }

        /// <summary>
        /// Runs the whole pipeline: split, recognize, rename, export. Errors are reported in the
        /// result box instead of escaping from this async void handler.
        /// </summary>
        private async void StartProcessing_Click(object sender, RoutedEventArgs e)
        {
            if (isProcessing)
            {
                return;
            }

            if (string.IsNullOrEmpty(pdfFilePath) || string.IsNullOrEmpty(outputFolderPath))
            {
                ResultTextBox.Text += "Please select both a PDF file and an output folder.\n";
                return;
            }

            isProcessing = true;
            try
            {
                SplitPdf(pdfFilePath, outputFolderPath);

                // Only pick up the page files SplitPdf writes (page_N.pdf); other PDFs that
                // happen to be in the output folder must not be recognized or renamed.
                string[] pdfPageFiles = Directory.GetFiles(outputFolderPath, "page_*.pdf");

                // Process pages in numeric order so page_10 does not come before page_2.
                Array.Sort(pdfPageFiles, (a, b) => GetPageNumber(a).CompareTo(GetPageNumber(b)));

                List<PdfPageInfo> pageInfos = new List<PdfPageInfo>();
                foreach (string pdfPageFile in pdfPageFiles)
                {
                    string recognizedText = await RecognizeTextFromPdfPage(pdfPageFile);

                    // Snapshot the folder so the name chosen by the rename step can be found afterwards.
                    HashSet<string> filesBefore = new HashSet<string>(
                        Directory.GetFiles(outputFolderPath, "*.pdf"),
                        StringComparer.OrdinalIgnoreCase);

                    RenameFileBasedOnText(pdfPageFile, recognizedText);

                    // Record the name the file has after renaming, not the old page_N.pdf name.
                    string newFileName = Path.GetFileName(FindRenamedFile(filesBefore, pdfPageFile));
                    pageInfos.Add(new PdfPageInfo { FileName = newFileName, RecognizedText = recognizedText });
                }

                ExportToExcel(pageInfos, Path.Combine(outputFolderPath, "PDF_Info.xlsx"));
                ResultTextBox.Text += "Processing completed. Information exported to Excel.\n";
            }
            catch (Exception ex)
            {
                // An unhandled exception in an async void handler would take the application down.
                ResultTextBox.Text += $"Processing failed: {ex.Message}\n";
            }
            finally
            {
                isProcessing = false;
            }
        }

        /// <summary>Returns the PDF that appeared in the output folder since the snapshot, or the original path if none did.</summary>
        private string FindRenamedFile(HashSet<string> filesBefore, string originalPath)
        {
            foreach (string candidate in Directory.GetFiles(outputFolderPath, "*.pdf"))
            {
                if (!filesBefore.Contains(candidate))
                {
                    return candidate;
                }
            }

            return originalPath;
        }

        /// <summary>Parses N from a page_N.pdf path; names that do not parse sort last.</summary>
        private static int GetPageNumber(string path)
        {
            string name = Path.GetFileNameWithoutExtension(path);
            int number;
            if (name.Length > "page_".Length && int.TryParse(name.Substring("page_".Length), out number))
            {
                return number;
            }

            return int.MaxValue;
        }
    }
}
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。