备忘:Apache Tika 内容分析与提取

编程

DEMO

import java.io.BufferedInputStream;

import java.io.BufferedOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import org.apache.tika.exception.TikaException;

import org.apache.tika.metadata.Metadata;

import org.apache.tika.parser.ParseContext;

import org.apache.tika.parser.Parser;

import org.apache.tika.parser.html.HtmlParser;

import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;

import org.apache.tika.parser.pdf.PDFParser;

import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.ContentHandler;

import org.xml.sax.SAXException;

public class TikaDemo {

// public static String PATH = "E:\test.docx";

// public static String PATH = "g:\丁聪生前访谈:画漫画有个屁用!_夏冬红_新浪博客.htm";

public static String PATH = "g:\你眷恋的 都已离去 歌词 - Google 搜索.htm";

// public static String PATH = "E:\summerbell的博客文章(32).pdf";

public static String OUTPATH = PATH + ".OUT";

/**

* @param args

* @throws TikaException

* @throws SAXException

* @throws IOException

*/

public static void main(String[] args) throws IOException, SAXException,

TikaException {

// Parser parser = new OOXMLParser();

// Parser parser = new PDFParser();

Parser parser = new HtmlParser();

/**

* */

// InputStream iStream = new BufferedInputStream(new FileInputStream(

// new File(PATH)));

// OutputStream oStream = new BufferedOutputStream(new FileOutputStream(

// new File(OUTPATH)));

// ContentHandler iHandler = new BodyContentHandler(oStream);

// parser.parse(iStream, iHandler, new Metadata(), new ParseContext());

/**

* 处理指定编码的html.

*/

InputStream iStream = new BufferedInputStream(new FileInputStream(

new File(PATH)));

OutputStream oStream = new BufferedOutputStream(new FileOutputStream(

new File(OUTPATH)));

ContentHandler iHandler = new BodyContentHandler(oStream);

Metadata meta = new Metadata();

meta.add(Metadata.CONTENT_ENCODING, "utf-8");

parser.parse(iStream, iHandler, meta, new ParseContext());

}

}

 

以上是《备忘:Apache Tika 内容分析与提取》的全部内容,来源链接: utcz.com/z/510550.html

回到顶部