I have successfully used poi to make a DOC to HTML converter - and now I need a DOCX to HTML converter, with a decent degree of accuracy in the formatting. I think that docx4j might be just the tool I'm looking for if I can get it to work.
My Maven looks like this:
- Code: Select all
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>DocumentConverter</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>15</source>
<target>15</target>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<maven.compiler.source>16</maven.compiler.source>
<maven.compiler.target>16</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
<version>8.2.8</version>
<!-- <scope>test</scope>-->
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.8.0</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.2.3</version>
</dependency>
<!-- logging config files; docx4j.properties -->
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j</artifactId>
<version>6.1.2</version>
</dependency>
<dependency>
<groupId>org.eclipse.persistence</groupId>
<artifactId>org.eclipse.persistence.moxy</artifactId>
<version>2.5.1</version>
</dependency>
</dependencies>
</project>
My code looks like this - as you can see, I've pared it right back to the essentials to see if I can find the problem…
- Code: Select all
import org.docx4j.Docx4J;
import org.docx4j.Docx4jProperties;
import org.docx4j.convert.out.HTMLSettings;
import org.docx4j.convert.out.html.AbstractHtmlExporter;
import org.docx4j.convert.out.html.HtmlExporterNG2;
import org.docx4j.model.fields.FieldUpdater;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.org.apache.poi.util.IOUtils;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import java.io.*;
public class DocXTest {
public static void docxToHTML(String filename, String outputname) throws FileNotFoundException, Docx4JException {
File file = new File(filename);
FileInputStream inputStream = new FileInputStream(file);
// WordprocessingMLPackage pkg = WordprocessingMLPackage.load(inputStream);
WordprocessingMLPackage wordMLPackage = Docx4J.load(new java.io.File(filename));
// String root = System.getProperty("user.dir");
//
// // Refresh the values of DOCPROPERTY fields
// FieldUpdater updater = new FieldUpdater(pkg);
// updater.update(true);
//
// AbstractHtmlExporter exporter = new HtmlExporterNG2();
// HTMLSettings htmlSettings = Docx4J.createHTMLSettings();
// htmlSettings.setImageDirPath(root +"/tmp/sample-docx.html_files");
// htmlSettings.setImageTargetUri(root +"/tmp/_files");
// htmlSettings.setWmlPackage(pkg);
//
// Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);
//
// OutputStream os = new FileOutputStream(root + "/" + outputname + ".html");
// Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_EXPORT_PREFER_XSL);
// IOUtils.closeQuietly(os);
//
// if (pkg.getMainDocumentPart().getFontTablePart() != null) {
// pkg.getMainDocumentPart().getFontTablePart()
// .deleteEmbeddedFontTempFiles();
// }
// // This would also do it, via finalize() methods
// htmlSettings = null;
// pkg = null;
}
public static void main(String[] args) throws Exception {
String parameter;
if (args.length < 1) {
System.out.println("No parameter supplied");
System.exit(1);
}
parameter = args[0];
docxToHTML(parameter, "testm");
}
}
Even with so minimal an implementation, with core heavy lifting removed, I get the following errors.
- Code: Select all
log4j:WARN No appenders could be found for logger (org.docx4j.utils.ResourceUtils).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Exception in thread "main" org.docx4j.openpackaging.exceptions.Docx4JException: Couldn't get [Content_Types].xml from ZipFile
at org.docx4j.openpackaging.io3.Load3.get(Load3.java:148)
at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:561)
at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:410)
at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:287)
at org.docx4j.openpackaging.packages.OpcPackage.load(OpcPackage.java:265)
at org.docx4j.openpackaging.packages.WordprocessingMLPackage.load(WordprocessingMLPackage.java:168)
at org.docx4j.Docx4J.load(Docx4J.java:233)
at DocXTest.docxToHTML(DocXTest.java:26)
at DocXTest.main(DocXTest.java:88)
Caused by: org.docx4j.openpackaging.exceptions.InvalidFormatException: Bad [Content_Types].xml
at org.docx4j.openpackaging.contenttype.ContentTypeManager.parseContentTypesFile(ContentTypeManager.java:871)
at org.docx4j.openpackaging.io3.Load3.get(Load3.java:146)
... 8 more
Caused by: java.lang.RuntimeException: javax.xml.bind.JAXBException: JAXB: Can't instantiate JAXB Reference Implementation
- with linked exception:
[java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper]
at org.docx4j.XmlUtils.marshaltoString(XmlUtils.java:901)
at org.docx4j.openpackaging.contenttype.ContentTypeManager.parseContentTypesFile(ContentTypeManager.java:851)
... 9 more
Caused by: javax.xml.bind.JAXBException: JAXB: Can't instantiate JAXB Reference Implementation
- with linked exception:
[java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper]
at org.docx4j.jaxb.NamespacePrefixMapperUtils.tryUsingRI(NamespacePrefixMapperUtils.java:95)
at org.docx4j.jaxb.NamespacePrefixMapperUtils.getPrefixMapper(NamespacePrefixMapperUtils.java:71)
at org.docx4j.XmlUtils.marshaltoString(XmlUtils.java:850)
... 10 more
Caused by: java.lang.ClassNotFoundException: org.docx4j.jaxb.ri.NamespacePrefixMapper
at java.base/jdk.internal.loader.BuiltinClassLoader.loadClass(BuiltinClassLoader.java:636)
at java.base/jdk.internal.loader.ClassLoaders$AppClassLoader.loadClass(ClassLoaders.java:182)
at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:519)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Class.java:375)
at org.docx4j.jaxb.NamespacePrefixMapperUtils.tryUsingRI(NamespacePrefixMapperUtils.java:79)
... 12 more
Process finished with exit code 1
I hope it's obvious, but the web has not been helpful so far. I don't think that I missed anything from https://docx4java.org/docx4j/Docx4j_GettingStarted.pdf
What have I missed?