Java实现Word/Pdf/TXT转html的示例

引言:

    最近公司在做一个教育培训学习及在线考试的项目，本人主要负责网络课程模块，包括课程分类、课程、课件的创建，以及在线学习和统计功能。课件涉及多种类型，如视频、音频、图文、外部链接及文档。其中就遇到一个问题：文档型课件如何在网页上展示和学习。因为系统要在线统计学习的课程、学习的人员和学习的时长，所以不能像传统做法那样把文档下载到本地学习，那样就不受系统控制了。最终的方案是：在上传文档型课件时，将文件转换成对应的 HTML 文件，以便在网页上浏览学习。

 下面主要针对 Word、PDF 和 TXT 文本文件进行转换。

一:Java实现将word转换为html

1:引入依赖

<dependency>

<groupId>fr.opensagres.xdocreport</groupId>

<artifactId>fr.opensagres.xdocreport.document</artifactId>

<version>1.0.5</version>

</dependency>

<dependency>

<groupId>fr.opensagres.xdocreport</groupId>

<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>

<version>1.0.5</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi</artifactId>

<version>3.12</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi-scratchpad</artifactId>

<version>3.12</version>

</dependency>

2:代码demo

package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

/**

* word 转换成html

*/

public class TestWordToHtml {

public static final String STORAGEPATH="C://works//files//";

public static final String IP="192.168.30.222";

public static final String PORT="8010";

public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {

TestWordToHtml wt=new TestWordToHtml();

//wt.Word2003ToHtml("甲骨文考证.doc");

wt.Word2007ToHtml("甲骨文考证.docx");

}

/**

* 2003版本word转换成html

* @throws IOException

* @throws TransformerException

* @throws ParserConfigurationException

*/

public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {

final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片 图片会保存在此路径

final String strRanString=getRandomNum();

String filepath =STORAGEPATH;

String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";

final String file = filepath + fileName;

InputStream input = new FileInputStream(new File(file));

HWPFDocument wordDocument = new HWPFDocument(input);

WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

//设置图片存放的位置

wordToHtmlConverter.setPicturesManager(new PicturesManager() {

public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

File imgPath = new File(imagepath);

if(!imgPath.exists()){//图片目录不存在则创建

imgPath.mkdirs();

}

File file = new File(imagepath +strRanString+suggestedName);

try {

OutputStream os = new FileOutputStream(file);

os.write(content);

os.close();

} catch (FileNotFoundException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

}

return "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;

// return imagepath +strRanString+suggestedName;

}

});

//解析word文档

wordToHtmlConverter.processDocument(wordDocument);

Document htmlDocument = wordToHtmlConverter.getDocument();

File htmlFile = new File(filepath +strRanString+htmlName);

OutputStream outStream = new FileOutputStream(htmlFile);

DOMSource domSource = new DOMSource(htmlDocument);

StreamResult streamResult = new StreamResult(outStream);

TransformerFactory factory = TransformerFactory.newInstance();

Transformer serializer = factory.newTransformer();

serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");

serializer.setOutputProperty(OutputKeys.INDENT, "yes");

serializer.setOutputProperty(OutputKeys.METHOD, "html");

serializer.transform(domSource, streamResult);

outStream.close();

System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

}

/**

* 2007版本word转换成html

* @throws IOException

*/

public void Word2007ToHtml(String fileName) throws IOException {

final String strRanString=getRandomNum();

String filepath = STORAGEPATH+strRanString;

String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";

File f = new File(STORAGEPATH+fileName);

if (!f.exists()) {

System.out.println("Sorry File does not Exists!");

} else {

if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

try {

// 1) 加载word文档生成 XWPFDocument对象

InputStream in = new FileInputStream(f);

XWPFDocument document = new XWPFDocument(in);

// 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)

File imageFolderFile = new File(filepath);

XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));

options.setExtractor(new FileImageExtractor(imageFolderFile));

options.URIResolver(new IURIResolver() {

public String resolve(String uri) {

//http://192.168.30.222:8010//uploadFile/....

return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;

}

});

options.setIgnoreStylesIfUnused(false);

options.setFragment(true);

// 3) 将 XWPFDocument转换成XHTML

OutputStream out = new FileOutputStream(new File(filepath + htmlName));

IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();

converter.convert(document,out, options);

//XHTMLConverter.getInstance().convert(document, out, options);

System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);

} catch (Exception e) {

e.printStackTrace();

}

} else {

System.out.println("Enter only MS Office 2007+ files");

}

}

}

/**

*功能说明:生成时间戳

*创建人:zsq

*创建时间:2019年12月7日 下午2:37:09

*

*/

public static String getRandomNum(){

Date dt = new Date();

SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

String str=sdf.format(dt);

return str;

}

}

二:Java实现将Pdf转换为html

1: 引入依赖

<dependency>

<groupId>net.sf.cssbox</groupId>

<artifactId>pdf2dom</artifactId>

<version>1.7</version>

</dependency>

<dependency>

<groupId>org.apache.pdfbox</groupId>

<artifactId>pdfbox</artifactId>

<version>2.0.12</version>

</dependency>

<dependency>

<groupId>org.apache.pdfbox</groupId>

<artifactId>pdfbox-tools</artifactId>

<version>2.0.12</version>

</dependency>

2:代码Demo

/**
 * Converts a PDF file to an HTML file using pdf2dom's {@code PDFDomTree} on
 * top of Apache PDFBox.
 */
public class PdfToHtml {

    /**
     * Converts one PDF document to HTML.
     *
     * <p>Failures are reported on standard error and do not propagate to the
     * caller.
     *
     * @param inPdfPath path of the PDF file to read
     * @param outputHtmlPath path of the HTML file to create (written as UTF-8)
     */
    public void pdfToHtmlTest(String inPdfPath, String outputHtmlPath) {
        // Resources declared in try(...) are closed automatically, which also
        // flushes the buffered writer. The PDF is loaded first so that a missing
        // or unreadable PDF does not leave an empty HTML file behind.
        try (PDDocument document = PDDocument.load(new File(inPdfPath));
                BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(new File(outputHtmlPath)), "utf-8"))) {
            PDFDomTree pdfDomTree = new PDFDomTree();
            pdfDomTree.writeText(document, out);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException {
        PdfToHtml ph = new PdfToHtml();
        String pdfPath = "C:\\works\\files\\武研中心行政考勤制度.pdf";
        String outputPath = "C:\\works\\files\\武研中心行政考勤制度.html";
        ph.pdfToHtmlTest(pdfPath, outputPath);
    }
}

三:Java实现将TXT转换为html

/**
 * Converts a GBK-encoded plain-text file into an HTML fragment.
 *
 * <p>Every input line is HTML-escaped, prefixed with three non-breaking
 * spaces and followed by a line break tag. The output file is written in GBK
 * as well. Problems are reported on standard output / standard error and do
 * not propagate to the caller.
 *
 * @param filePath path of the source text file
 * @param htmlPosition path of the HTML file to create
 */
public static void txtToHtml(String filePath, String htmlPosition) {
    try {
        File file = new File(filePath);
        if (!file.isFile()) { // isFile() is false for missing paths as well
            System.out.println("找不到指定的文件");
            return;
        }
        // try-with-resources closes (and flushes) both ends even when an
        // exception interrupts the copy loop.
        try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(file), "GBK"));
                BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), "GBK"))) {
            String lineTxt;
            while ((lineTxt = reader.readLine()) != null) {
                // Well-formed entity and break tag; the text itself is escaped so
                // characters such as '<' or '&' show up literally in the page.
                writer.write("&nbsp;&nbsp;&nbsp;" + escapeHtml(lineTxt) + "<br/>");
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
}

/** Replaces the characters that are significant in HTML markup with character references. */
private static String escapeHtml(String text) {
    StringBuilder sb = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':
                sb.append("&amp;");
                break;
            case '<':
                sb.append("&lt;");
                break;
            case '>':
                sb.append("&gt;");
                break;
            case '"':
                sb.append("&quot;");
                break;
            default:
                sb.append(c);
        }
    }
    return sb.toString();
}

以上是 Java实现Word/Pdf/TXT转html的示例 的全部内容, 来源链接: utcz.com/z/345965.html

回到顶部