Hi Jason,
I am attaching my code and the .docx file.
- 1134.docx
- Document File
- (62.06 KiB) Downloaded 286 times
public class DocxToHTMLParser {
/**
 * Command-line entry point. Creates a parser instance and runs the
 * conversion; the command-line arguments are not used and the returned
 * file reference is discarded.
 *
 * @param args command-line arguments (ignored)
 */
public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
    new DocxToHTMLParser().parseDocxToHTML();
}
/**
 * Converts each {@code .docx} file in the current working directory
 * ({@code user.dir}) to HTML and then writes a second HTML file in which
 * block-level elements carry sequential numeric {@code id} attributes.
 *
 * <p>For every matching document the raw docx4j export is written to
 * {@code output.html}, re-parsed with jsoup, and the numbered markup is
 * written to {@code outputWithID.html}. Both names are fixed, so when the
 * directory holds several {@code .docx} files each one overwrites the
 * output of the previous one.
 *
 * <p>Only direct children of {@code <body>} whose tag is in the numbered set
 * (see {@link #needsGeneratedId(String)}) are copied to the numbered file;
 * other top-level elements are left out of it.
 *
 * <p>Conversion and I/O failures are printed to standard error and end the
 * loop; they are not propagated to the caller.
 *
 * @return the last {@code outputWithID.html} file written, or {@code null}
 *         if the directory could not be listed, contained no {@code .docx}
 *         file, or the first conversion failed
 */
public File parseDocxToHTML() {
    HTMLSettings settings = Docx4J.createHTMLSettings();
    String inputFilePath = System.getProperty("user.dir");
    System.out.println("Input file path is : " + inputFilePath);
    File home = new File(inputFilePath);
    File newHtmlFile = null;

    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] candidates = home.listFiles();
    if (candidates == null) {
        System.err.println("Unable to list files in : " + inputFilePath);
        return null;
    }

    try {
        for (File file : candidates) {
            // Null-safe comparison in case the helper yields null for extension-less names.
            if (!file.isFile() || !".docx".equals(FileUtil.getFileNameExtension(file.getName()))) {
                continue;
            }

            // Load via the listed File itself rather than a bare name that
            // would be resolved against the process working directory.
            WordprocessingMLPackage wmlPackage = Docx4J.load(file);

            // The image URI is emitted relative to the HTML file, which lives in
            // 'home'; keep the image directory inside 'home' under the same name
            // so the generated links resolve. File.getName() replaces the manual
            // lastIndexOf("/") split, which did not handle Windows separators.
            File imageDir = new File(home, home.getName() + "_files");
            settings.setImageDirPath(imageDir.getPath());
            settings.setImageTargetUri(imageDir.getName());
            settings.setWmlPackage(wmlPackage);

            String val = "html, body, div, span, h1, h2, h3, h4, h5, h6, p, a, img, ol, ul, li, table, caption, tbody, tfoot, thead, tr, th, td "
                    + "{ margin: 0; padding: 0; border: 0;}" + "body {line-height: 1;} ";
            settings.setUserCSS(val);

            // Close the stream before reading the file back so that all bytes
            // are on disk and the file handle is not leaked.
            File rawHtmlFile = new File(home, "output.html");
            try (OutputStream outputStream = new FileOutputStream(rawHtmlFile)) {
                Docx4J.toHTML(settings, outputStream, Docx4J.FLAG_EXPORT_PREFER_XSL);
            }

            String input = FileUtils.readFileToString(rawHtmlFile, "UTF-8");
            Document doc = Jsoup.parse(input);

            StringBuilder redefined = new StringBuilder();
            int index = 1;
            for (Element element : doc.body().children()) {
                if (!needsGeneratedId(element.tagName())) {
                    continue;
                }
                element.attr("id", String.valueOf(index++));

                // Number every qualifying descendant of this top-level element,
                // continuing the same counter.
                for (Element descendant : element.children().select("*")) {
                    if (needsGeneratedId(descendant.tagName())) {
                        descendant.attr("id", String.valueOf(index++));
                    }
                }
                redefined.append(element.toString());
            }

            newHtmlFile = new File(home, "outputWithID.html");
            // Write with the same charset the HTML was read with instead of
            // the platform default.
            FileUtils.writeStringToFile(newHtmlFile, redefined.toString(), "UTF-8");
        }
    } catch (Docx4JException | IOException e) {
        e.printStackTrace();
    }
    return newHtmlFile;
}

/**
 * Tells whether an element with the given tag name receives a generated id.
 *
 * @param tagName element tag name as reported by jsoup
 * @return {@code true} for div, p, li, table, td and h1 through h6
 */
private static boolean needsGeneratedId(String tagName) {
    switch (tagName) {
        case "div":
        case "p":
        case "li":
        case "table":
        case "td":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
            return true;
        default:
            return false;
    }
}