创建Maven项目,添加以下依赖
<!-- Apache Tika dependencies, all pinned to the same version (1.20). -->
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<!-- Core API: the Metadata / Parser / ContentHandler types are imported from this group's packages. -->
<!-- NOTE(review): tika-parsers presumably already pulls tika-core in transitively - confirm before removing. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.20</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<!-- Format-specific parser implementations; presumably what AutoDetectParser delegates to for .doc and similar files. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.20</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-app -->
<!-- NOTE(review): tika-app looks like the bundled standalone application artifact; it is likely redundant -->
<!-- alongside tika-parsers and may put duplicate classes on the classpath - verify whether it is needed. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId>
<version>1.20</version>
</dependency>
package cn.hadron.tikademo.util;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import java.io.File;
import java.io.FileInputStream;
/**
 * Utility for extracting the text body of a document file with Apache Tika.
 *
 * <p>The document type is detected by {@link AutoDetectParser}; the extracted
 * text is collected by a {@link BodyContentHandler} with a configurable
 * character limit.
 */
public class TikaUtil {

    /** Smallest character limit ever passed to the content handler (10 * 1024 * 1024). */
    private static final int MIN_WRITE_LIMIT = 10 * 1024 * 1024;

    /**
     * Extracts the text of the given file using the default character limit
     * of {@code 10 * 1024 * 1024}.
     *
     * @param filePath path of the document to parse
     * @return the extracted text, or {@code null} if the file does not exist
     * @throws Exception if the file cannot be opened or parsing fails
     */
    public static String parse(String filePath) throws Exception {
        return parse(filePath, MIN_WRITE_LIMIT);
    }

    /**
     * Extracts the text of the given file.
     *
     * <p>{@code limit} can only raise the character limit: any value that is
     * not greater than {@code 10 * 1024 * 1024} (including zero and negative
     * values) is replaced by {@code 10 * 1024 * 1024}.
     *
     * @param filePath path of the document to parse
     * @param limit    requested maximum number of characters to collect
     * @return the extracted text, or {@code null} if the file does not exist
     *         (a message is printed to standard output in that case)
     * @throws Exception if the file cannot be opened or parsing fails
     */
    public static String parse(String filePath, int limit) throws Exception {
        File file = new File(filePath);
        if (!file.exists()) {
            System.out.println("目标文件不存在!");
            return null;
        }

        // The requested limit is treated as a floor-clamped value: never below MIN_WRITE_LIMIT.
        BodyContentHandler handler = new BodyContentHandler(Math.max(limit, MIN_WRITE_LIMIT));
        Metadata meta = new Metadata();
        ParseContext context = new ParseContext();

        // try-with-resources guarantees the stream is closed even when parsing throws.
        try (FileInputStream input = new FileInputStream(file)) {
            new AutoDetectParser().parse(input, handler, meta, context);
        }
        return handler.toString();
    }

    /**
     * Demo entry point: parses a hard-coded sample document and prints its text.
     *
     * @param args ignored
     * @throws Exception if parsing the sample document fails
     */
    public static void main(String[] args) throws Exception {
        String content = TikaUtil.parse("D:\\tika\\a.doc");
        System.out.println(content);
    }
}
程序说明: BodyContentHandler的无参构造器默认最多读取10万个字符,如果文档内容超过该限制,则会抛出如下异常: org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException: Your document contained more than 100000 characters, and so your requested limit has been reached. To receive the full text of the document, increase your limit. (Text up to the limit is however available). 解决办法: 通过BodyContentHandler(int writeLimit)有参构造器,设置更大的字符数限制。比如10 * 1024 * 1024,可读取约1000万个字符的文档。
new BodyContentHandler(10 * 1024 * 1024);
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有