I adapted the XhtmlImporter for docx files to work with pptx files in docx4j version 2.8.1.
Now I've updated to docx4j 3.0.0 and the XhtmlImporter is not working anymore, and I can't figure out how to fix this.
What I have figured out is that, using the same pptx and the same HTML, the XHTML converter of 2.8.1 produces a BlockBox with several children, whereas with the newer version I get a BlockBox with just one inline element after calling importer.renderer.layout().
After this, the traverseChildren method does not call traverse again, and therefore the HTML is not converted.
Any ideas what to change to get the XhtmlImporter working again?
Many thanks!
Here is my xhtml importer:
- Code: Select all
/*
* Copyright 2011-2012, Plutext Pty Ltd. This file is part of docx4j. docx4j is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in
* writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
* OF ANY KIND, either express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.xroot.xpptx;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.bind.DatatypeConverter;
import javax.xml.transform.TransformerException;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.docx4j.dml.CTRegularTextRun;
import org.docx4j.dml.CTSRgbColor;
import org.docx4j.dml.CTSolidColorFillProperties;
import org.docx4j.dml.CTTextAutonumberBullet;
import org.docx4j.dml.CTTextCharBullet;
import org.docx4j.dml.CTTextCharacterProperties;
import org.docx4j.dml.CTTextParagraph;
import org.docx4j.dml.CTTextParagraphProperties;
import org.docx4j.dml.CTTextSpacing;
import org.docx4j.dml.CTTextSpacingPercent;
import org.docx4j.dml.STTextAlignType;
import org.docx4j.dml.STTextAutonumberScheme;
import org.docx4j.dml.STTextStrikeType;
import org.docx4j.dml.STTextUnderlineType;
import org.docx4j.dml.TextFont;
import org.docx4j.model.properties.paragraph.Justification;
import org.docx4j.model.properties.run.Bold;
import org.docx4j.model.properties.run.FontColor;
import org.docx4j.model.properties.run.FontSize;
import org.docx4j.model.properties.run.Italics;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.org.xhtmlrenderer.css.constants.CSSName;
import org.docx4j.org.xhtmlrenderer.css.constants.IdentValue;
import org.docx4j.org.xhtmlrenderer.css.parser.PropertyValue;
import org.docx4j.org.xhtmlrenderer.css.style.CalculatedStyle;
import org.docx4j.org.xhtmlrenderer.css.style.DerivedValue;
import org.docx4j.org.xhtmlrenderer.css.style.FSDerivedValue;
import org.docx4j.org.xhtmlrenderer.docx.DocxRenderer;
import org.docx4j.org.xhtmlrenderer.render.BlockBox;
import org.docx4j.org.xhtmlrenderer.render.Box;
import org.docx4j.org.xhtmlrenderer.render.InlineBox;
import org.docx4j.org.xhtmlrenderer.resource.XMLResource;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.css.CSSValue;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
/**
* Convert XHTML + CSS to PresentationML content. Your XHTML must be well formed XML!
*
* Based on the XHTMLImporter from docx4j.
*
* @author jharrop
*/
public final class XhtmlImporter {
/** Logger used for conversion diagnostics. */
private static final Log LOG = LogFactory.getLog(XhtmlImporter.class);
/** Paragraphs generated by this importer; this is the list handed back by convert. */
private final List<CTTextParagraph> imports = new ArrayList<CTTextParagraph>();
/** Bullet character set on list items whose parent element is "ul". */
private static final String BULLET_POINT = "•";
/** Step used for the left margin and hanging indent of list items (presumably EMU -- TODO confirm unit). */
private static final int INDENT_DELTA = 166688;
/** Size set on every run before a CSS font-size is evaluated (presumably 1/100 pt -- TODO confirm unit). */
private static final int DEFAULT_FONT_SIZE = 800;
/** Paragraph that currently receives newly created text runs. */
private CTTextParagraph currentParagraph;
/** Character properties supplied by the caller of convert; passed to ComponentFactory.create for each run. */
private CTTextCharacterProperties characterProperties;
/** Paragraph properties supplied by the caller of convert; passed to ComponentFactory.create for each paragraph. */
private CTTextParagraphProperties paragraphProperties;
/** Renderer that lays out the parsed document and exposes the root box to traverse. */
private DocxRenderer renderer;
// A paragraph created for a div can be replaced by
// one created for a p within it, if it is still empty
private boolean paraStillEmpty;
// states the current vertical offset, e.g. for sub/sup tags
// (0 means "no offset"; it is only written to a run when non-zero)
private int currentBaseline = 0;
// states whether the current block is within an em- or i-tag
private boolean emphasized;
// states whether the current block is within a strong- or b-tag
private boolean strong;
// incremented when a ul/ol begins and decremented when it ends;
// used to derive margin and level of list items
private int numberingDepth = 0;
/**
 * Private constructor: instances are only created inside this class (see the static convert method).
 */
private XhtmlImporter() {
}
/**
 * Converts a well formed XHTML fragment into a list of text paragraph objects.
 * <p>
 * Line breaks and form feeds are stripped from the input and the remainder is wrapped in a single
 * {@code wrapper} root element (with an {@code nbsp} entity declared) before it is parsed, laid out
 * and traversed.
 *
 * @param content well formed HTML text
 * @param characterProperties the characterProperties of the element containing the text
 * @param paragraphProperties the paragraphProperties of the element containing the text
 * @return the generated paragraphs; an empty list if {@code content} is empty according to
 *         {@code StringUtils.isEmpty}; {@code null} if the conversion failed (the failure is logged)
 */
public static List<CTTextParagraph> convert(final String content,
        final CTTextCharacterProperties characterProperties, final CTTextParagraphProperties paragraphProperties) {
    if (StringUtils.isEmpty(content)) {
        return Collections.emptyList();
    }
    List<CTTextParagraph> result = null;
    try {
        // remove superfluous chars and wrap so that we have exactly one root node
        final String stripped = content.replaceAll("\\n|\\r|\\f", StringUtils.EMPTY);
        final String wrapped = "<!DOCTYPE wrapper [<!ENTITY nbsp \" \">]><wrapper>" + stripped + "</wrapper>";

        final XhtmlImporter importer = new XhtmlImporter();
        importer.characterProperties = characterProperties;
        importer.paragraphProperties = paragraphProperties;
        importer.renderer = new DocxRenderer();

        final Document dom = loadDocument(new InputSource(new BufferedReader(new StringReader(wrapped))));
        importer.renderer.setDocument(dom, null);
        importer.renderer.layout();
        importer.traverse(importer.renderer.getRootBox(), importer.imports, null);
        result = importer.imports;
    } catch (Exception e) {
        // Any failure (including the Docx4JException raised for malformed input) ends up here.
        LOG.error("Error converting HTML. Content: " + content, e);
    }
    return result;
}

/**
 * Parses the given source into a DOM document, translating XML parse failures into a
 * {@link Docx4JException} that carries the error location.
 *
 * @param source the XML to parse
 * @return the parsed document
 * @throws Docx4JException if the renderer reports a TransformerException while loading
 */
private static Document loadDocument(final InputSource source) throws Docx4JException {
    try {
        return XMLResource.load(source).getDocument();
    } catch (org.docx4j.org.xhtmlrenderer.util.XRRuntimeException xre) {
        final Throwable cause = xre.getCause();
        if (!(cause instanceof TransformerException)) {
            throw xre;
        }
        // eg content of elements must consist of well-formed character data or markup.
        final Throwable root = cause.getCause();
        if (root instanceof SAXParseException) {
            final SAXParseException parseError = (SAXParseException) root;
            throw new Docx4JException("issues at Line " + parseError.getLineNumber() + ", Col "
                    + parseError.getColumnNumber(), cause);
        }
        throw new Docx4JException(((TransformerException) cause).getLocationAsString(), cause);
    }
}
/**
 * Collects the CSS values of a calculated style into a map keyed by CSS property name.
 * Properties whose name starts with "-fs" are ignored; values that are neither a
 * {@code DerivedValue} nor an {@code IdentValue} (or are null) are skipped with a debug message.
 *
 * @param cs the calculated style to read
 * @return map from CSS property name to its primitive value
 */
private Map<String, CSSValue> getCascadedProperties(final CalculatedStyle cs) {
    final Map<String, CSSValue> result = new HashMap<String, CSSValue>();
    final int propertyCount = cs.getDerivedValues().length;
    for (int id = 0; id < propertyCount; id++) {
        final CSSName cssName = CSSName.getByID(id);
        final String key = cssName.toString();
        if (key.startsWith("-fs")) {
            continue;
        }
        // walks parents as necessary to get the value
        final FSDerivedValue derived = cs.valueByName(cssName);
        if (derived == null) {
            LOG.debug("Skipping " + key + " .. (null value)");
        } else if (derived instanceof DerivedValue) {
            result.put(key, ((DerivedValue) derived).getCSSPrimitiveValue());
        } else if (derived instanceof IdentValue) {
            result.put(key, ((IdentValue) derived).getCSSPrimitiveValue());
        } else {
            LOG.debug("Skipping " + key + " .. " + derived.getClass().getName());
        }
    }
    return result;
}
/**
 * Visits one box of the layout tree. Boxes that are not a {@code BlockBox} are skipped.
 * For a block box whose display is "inline" a paragraph is only started when none exists yet;
 * for any other block box a fresh paragraph is started (replacing a preceding paragraph that is
 * still empty), and "li" elements additionally receive list formatting. Afterwards the box's
 * children are processed.
 *
 * @param box the box to visit
 * @param contentContext the list receiving generated paragraphs
 * @param parent the box containing {@code box} (not used by this method)
 * @throws Docx4JException propagated from processing child boxes
 */
private void traverse(final Box box, final List<CTTextParagraph> contentContext, final Box parent)
        throws Docx4JException {
    LOG.debug("---------------------------------------------------------------------------" + box.getClass().getName());
    if (box instanceof BlockBox) {
        final Element element = box.getElement();
        final String tag = element.getNodeName();
        LOG.debug("BB<" + tag + " " + box.getStyle().toStringMine());
        handleNodeBegin(tag);

        final Map<String, CSSValue> styles = getCascadedProperties(box.getStyle());
        final boolean displayedInline = box.getStyle().getDisplayMine().equals("inline");
        if (displayedInline) {
            // Don't add a paragraph for an inline box, unless there is none to write into yet
            if (currentParagraph == null) {
                startParagraph(contentContext, styles);
            }
        } else {
            // Avoid creating paragraphs for html, body: drop the previous one if nothing was written to it
            if (paraStillEmpty && contentContext.size() > 0) {
                contentContext.remove(contentContext.size() - 1);
            }
            startParagraph(contentContext, styles);
            if (tag.equals("li")) {
                addNumbering(element, styles);
            }
        }
        traverseChildren((BlockBox) box, contentContext);
    } else {
        LOG.debug("Skip box which isn't a BlockBox.");
    }
}

/**
 * Appends a new, still empty paragraph to the context, makes it the current paragraph and
 * applies paragraph level styling derived from the given CSS values.
 */
private void startParagraph(final List<CTTextParagraph> contentContext, final Map<String, CSSValue> styles) {
    currentParagraph = new CTTextParagraph();
    contentContext.add(currentParagraph);
    currentParagraph.setPPr(addParagraphProperties(styles));
    currentParagraph.setEndParaRPr(getTextCharacterProperties(styles));
    paraStillEmpty = true;
}
/**
 * Processes the content of a block box and then signals the end of its element.
 * Block content is traversed child by child; inline content is dispatched to
 * {@code processInlineBox} for inline boxes and to {@code traverse} for nested block boxes.
 * Other content types are not processed.
 *
 * @param blockBox the box whose children are processed
 * @param contentContext the list receiving generated paragraphs
 * @throws Docx4JException propagated from traversing nested boxes
 */
private void traverseChildren(final BlockBox blockBox, final List<CTTextParagraph> contentContext)
        throws Docx4JException {
    LOG.debug("Processing children of " + blockBox.getElement().getNodeName());
    if (blockBox.getChildrenContentType() == BlockBox.CONTENT_BLOCK) {
        for (Object o : blockBox.getChildren()) {
            traverse((Box) o, contentContext, blockBox);
        }
    } else if (blockBox.getChildrenContentType() == BlockBox.CONTENT_INLINE) {
        if (blockBox.getInlineContent() != null) {
            for (Object o : blockBox.getInlineContent()) {
                if (o instanceof InlineBox) {
                    processInlineBox((InlineBox) o, contentContext);
                } else if (o instanceof BlockBox) {
                    traverse((Box) o, contentContext, blockBox);
                } else {
                    // Report the unsupported child itself (not its container), so the log
                    // shows which kind of content was dropped.
                    LOG.debug("Don't know what to do with " + (o == null ? "null" : o.getClass().getName()));
                }
            }
        }
    }
    // Resets the state that handleNodeBegin set for this element in traverse.
    handleNodeEnd(blockBox.getElement().getNodeName());
    // Use the node name, matching the "Processing children of" message above.
    LOG.debug("Done processing children of " + blockBox.getElement().getNodeName());
}
/** Sets the baseline offset applied to following runs to -25000 (used for "sub" elements). */
private void setSub() {
    final int subscriptOffset = -25000;
    this.currentBaseline = subscriptOffset;
}
/** Sets the baseline offset applied to following runs to 30000 (used for "sup" elements). */
private void setSup() {
    final int superscriptOffset = 30000;
    this.currentBaseline = superscriptOffset;
}
/**
 * Applies list formatting to the current paragraph: spacing before, hanging indent, left margin
 * and list level derived from the current list nesting depth, plus a bullet character for items
 * of a "ul" or automatic arabic numbering for items of an "ol".
 * <p>
 * The list level is clamped to the range 0..8 so that an "li" outside of any list (depth 0) or
 * a very deeply nested list cannot produce a level outside the range DrawingML permits.
 *
 * @param e the "li" element the current paragraph was created for
 * @param cssMap the CSS values of the element (currently not evaluated here)
 */
private void addNumbering(final Element e, final Map<String, CSSValue> cssMap) {
    CTTextSpacing textSpacing = new CTTextSpacing();
    CTTextSpacingPercent textSpacingPercent = new CTTextSpacingPercent();
    textSpacingPercent.setVal(20000);
    textSpacing.setSpcPct(textSpacingPercent);
    currentParagraph.getPPr().setSpcBef(textSpacing);
    currentParagraph.getPPr().setIndent(-1 * INDENT_DELTA);
    currentParagraph.getPPr().setMarL(numberingDepth * INDENT_DELTA);

    // Levels are zero based; keep them inside the valid 0..8 range.
    final int maxLevel = 8;
    final int level = Math.max(0, Math.min(maxLevel, numberingDepth - 1));
    currentParagraph.getPPr().setLvl(level);

    // Look the parent up once and tolerate a detached element.
    final String listType = e.getParentNode() == null ? null : e.getParentNode().getNodeName();
    if ("ul".equals(listType)) {
        // unordered list: bullets
        CTTextCharBullet textCharBullet = new CTTextCharBullet();
        textCharBullet.setChar(BULLET_POINT);
        currentParagraph.getPPr().setBuChar(textCharBullet);
    } else if ("ol".equals(listType)) {
        // ordered list: numbers
        TextFont font = new TextFont();
        font.setTypeface("+mj-lt");
        currentParagraph.getPPr().setBuFont(font);
        CTTextAutonumberBullet autonumber = new CTTextAutonumberBullet();
        autonumber.setType(STTextAutonumberScheme.ARABIC_PERIOD);
        currentParagraph.getPPr().setBuAutoNum(autonumber);
    }
}
/**
 * Handles one inline box: "p" and "br" elements start a new paragraph (replacing the last
 * paragraph of the context if it is still empty), the element's formatting state is switched on,
 * the box content is converted and the formatting state is switched off again. Boxes without an
 * element only have their content converted.
 *
 * @param inlineBox the inline box to convert
 * @param contentContext the list receiving generated paragraphs
 */
private void processInlineBox(final InlineBox inlineBox, final List<CTTextParagraph> contentContext) {
    final Map<String, CSSValue> styles = getCascadedProperties(inlineBox.getStyle());
    final Element element = inlineBox.getElement();
    final String tag = (element != null) ? element.getNodeName() : null;
    LOG.debug("Box \"" + inlineBox + "\", node \"" + tag + "\".");

    if (tag == null) {
        // No element, hence no formatting state to switch on or off.
        processInlineBoxContent(inlineBox, styles);
        return;
    }

    final boolean startsParagraph = tag.equals("p") || tag.equals("br");
    if (startsParagraph) {
        if (paraStillEmpty) {
            contentContext.remove(contentContext.size() - 1);
        }
        currentParagraph = new CTTextParagraph();
        contentContext.add(currentParagraph);
        currentParagraph.setPPr(addParagraphProperties(styles));
        currentParagraph.setEndParaRPr(getTextCharacterProperties(styles));
        paraStillEmpty = true;
    }

    handleNodeBegin(tag);
    processInlineBoxContent(inlineBox, styles);
    // reset flags etc.
    handleNodeEnd(tag);
}
/**
 * Switches on the formatting state associated with an element name: baseline offset for
 * "sup"/"sub", the emphasized flag for "em"/"i", the strong flag for "strong"/"b", and one more
 * level of list nesting for "ul"/"ol". Other names leave the state untouched.
 *
 * @param nodeName name of the element that begins; must not be null
 */
private void handleNodeBegin(final String nodeName) {
    if (nodeName.equals("sup")) {
        setSup();
        return;
    }
    if (nodeName.equals("sub")) {
        setSub();
        return;
    }
    final boolean italicTag = nodeName.equals("em") || nodeName.equals("i");
    if (italicTag) {
        emphasized = true;
        return;
    }
    final boolean boldTag = nodeName.equals("strong") || nodeName.equals("b");
    if (boldTag) {
        strong = true;
        return;
    }
    final boolean listTag = nodeName.equals("ul") || nodeName.equals("ol");
    if (listTag) {
        numberingDepth += 1;
    }
}
/**
 * Switches off the formatting state associated with an element name: the baseline offset is
 * reset to 0 for "sup"/"sub", the emphasized flag is cleared for "em"/"i", the strong flag is
 * cleared for "strong"/"b", and the list nesting depth is reduced by one for "ul"/"ol".
 *
 * @param nodeName name of the element that ends; must not be null
 */
private void handleNodeEnd(final String nodeName) {
    final boolean baselineTag = nodeName.equals("sup") || nodeName.equals("sub");
    if (baselineTag) {
        currentBaseline = 0;
        return;
    }
    final boolean italicTag = nodeName.equals("em") || nodeName.equals("i");
    if (italicTag) {
        emphasized = false;
        return;
    }
    final boolean boldTag = nodeName.equals("strong") || nodeName.equals("b");
    if (boldTag) {
        strong = false;
        return;
    }
    final boolean listTag = nodeName.equals("ul") || nodeName.equals("ol");
    if (listTag) {
        numberingDepth -= 1;
    }
}
/**
 * Converts the text of an inline box into a run of the current paragraph.
 * <p>
 * A box without a text node produces no run; if it belongs to a "br" element the current
 * paragraph is marked as no longer empty so that the line break is kept. A box that has neither
 * a text node nor an element is skipped (the caller explicitly allows element-less boxes).
 *
 * @param inlineBox the inline box whose content is converted
 * @param cssMap the CSS values used for run level styling
 */
private void processInlineBoxContent(final InlineBox inlineBox, final Map<String, CSSValue> cssMap) {
    if (inlineBox.getTextNode() == null) {
        // The element may be null as well; guard before asking for its name.
        final Element element = inlineBox.getElement();
        if (element != null && "br".equals(element.getNodeName())) {
            paraStillEmpty = false;
        } else {
            LOG.debug("InlineBox has no TextNode, so skipping");
        }
        return;
    }
    String theText = inlineBox.getTextNode().getTextContent();
    LOG.debug("Processing " + theText);
    paraStillEmpty = false;
    CTRegularTextRun run = new CTRegularTextRun();
    run.setT(theText);
    currentParagraph.getEGTextRun().add(run);
    // Run level styling
    run.setRPr(getTextCharacterProperties(cssMap));
}
/**
 * Creates paragraph properties via {@code ComponentFactory.create} from the importer's paragraph
 * properties and applies every CSS value of the map to them. List valued CSS properties are
 * applied item by item.
 *
 * @param cssMap CSS property names mapped to their values
 * @return the newly created paragraph properties
 */
private CTTextParagraphProperties addParagraphProperties(final Map<String, CSSValue> cssMap) {
    final CTTextParagraphProperties result = ComponentFactory.create(paragraphProperties);
    for (final Map.Entry<String, CSSValue> entry : cssMap.entrySet()) {
        final String property = entry.getKey();
        final CSSValue value = entry.getValue();
        final boolean isList = value instanceof PropertyValue
                && ((PropertyValue) value).getPropertyValueType() == PropertyValue.VALUE_TYPE_LIST;
        if (!isList) {
            // single value
            setPropertyFromCssName(result, property, value);
            continue;
        }
        // list: apply each item separately
        for (final Object item : ((PropertyValue) value).getValues()) {
            setPropertyFromCssName(result, property, (PropertyValue) item);
        }
    }
    return result;
}
/**
 * Creates character properties via {@code ComponentFactory.create} from the importer's character
 * properties, applies the state derived from surrounding tags (baseline offset, bold, italic),
 * sets the default font size and finally applies every CSS value of the map. List valued CSS
 * properties are applied item by item.
 *
 * @param cssMap CSS property names mapped to their values
 * @return the newly created character properties
 */
private CTTextCharacterProperties getTextCharacterProperties(final Map<String, CSSValue> cssMap) {
    final CTTextCharacterProperties result = ComponentFactory.create(characterProperties);

    // state derived from surrounding tags
    if (currentBaseline != 0) {
        result.setBaseline(currentBaseline);
    }
    if (strong) {
        result.setB(true);
    }
    if (emphasized) {
        result.setI(true);
    }
    // default size; a CSS font-size applied below replaces it
    result.setSz(DEFAULT_FONT_SIZE);

    for (final Map.Entry<String, CSSValue> entry : cssMap.entrySet()) {
        final String property = entry.getKey();
        final CSSValue value = entry.getValue();
        final boolean isList = value instanceof PropertyValue
                && ((PropertyValue) value).getPropertyValueType() == PropertyValue.VALUE_TYPE_LIST;
        if (!isList) {
            // single value
            setPropertyFromCssName(result, property, value);
            continue;
        }
        // list: apply each item separately
        for (final Object item : ((PropertyValue) value).getValues()) {
            setPropertyFromCssName(result, property, (PropertyValue) item);
        }
    }
    return result;
}
/**
 * Applies a single CSS property to paragraph properties. Only the CSS name defined by
 * {@code Justification.CSS_NAME} is evaluated; it sets the paragraph alignment. Calls with a
 * null name or value are ignored.
 *
 * @param paragraphProperties the properties to modify
 * @param name the CSS property name
 * @param value the CSS value
 */
private static void setPropertyFromCssName(final CTTextParagraphProperties paragraphProperties, final String name,
        final CSSValue value) {
    final boolean usable = name != null && value != null;
    if (usable && name.equals(Justification.CSS_NAME)) {
        // text-align
        paragraphProperties.setAlgn(getJustification(value));
    }
}
/**
 * Applies a single CSS property to character (run) properties. Evaluated properties are
 * font-weight (bold), font-style (italic), text-decoration (single strike-through / single
 * underline), color (solid fill) and font-size. font-family is not mapped.
 * <p>
 * Calls with a null name, a null value or a value without CSS text are ignored. A color is only
 * applied when its text has the form {@code #rrggbb}; any other form is left unset instead of
 * raising an exception, which would abort the whole conversion.
 *
 * @param characterProperties the properties to modify
 * @param name the CSS property name
 * @param value the CSS value
 */
private static void setPropertyFromCssName(final CTTextCharacterProperties characterProperties, final String name,
        final CSSValue value) {
    if (name == null || value == null) {
        return;
    }
    String cssText = value.getCssText();
    if (cssText == null) {
        // nothing to evaluate
        return;
    }
    if (name.equals(Bold.CSS_NAME)) {
        // font-weight
        characterProperties.setB(!IdentValue.NORMAL.toString().equals(cssText));
    } else if (name.equals(Italics.CSS_NAME)) {
        // font-style
        characterProperties.setI(!IdentValue.NORMAL.toString().equals(cssText));
    } else if (name.equals("text-decoration")) {
        // text-decoration; compared without lowercasing so the default locale has no influence
        if (cssText.equalsIgnoreCase("line-through")) {
            characterProperties.setStrike(STTextStrikeType.SNG_STRIKE);
        } else if (cssText.equalsIgnoreCase("underline")) {
            characterProperties.setU(STTextUnderlineType.SNG);
        }
    } else if (name.equals(FontColor.CSS_NAME)) {
        // color: only the six digit hex notation can be converted to an sRGB value
        if (cssText.matches("#[0-9a-fA-F]{6}")) {
            CTSolidColorFillProperties solidFill = new CTSolidColorFillProperties();
            CTSRgbColor color = new CTSRgbColor();
            color.setVal(DatatypeConverter.parseHexBinary(cssText.substring(1)));
            solidFill.setSrgbClr(color);
            characterProperties.setSolidFill(solidFill);
        }
    } else if (name.equals(FontSize.CSS_NAME)) {
        // font-size
        characterProperties.setSz(getFontSize(cssText));
    }
}
/**
 * Maps a CSS font-size keyword to the size value set on a run. Each keyword has a base size
 * (xx-small 10, x-small 12, small 14, medium 16, large 18, x-large 22, xx-large 30) which is
 * multiplied by 50; any other value, including null, is treated like "medium".
 * TODO: rough conversion, only keywords are understood.
 *
 * @param value the CSS text of the font-size property
 * @return the base size for the keyword multiplied by 50
 */
private static int getFontSize(final String value) {
    final String[] keywords = {"xx-small", "x-small", "small", "medium", "large", "x-large", "xx-large"};
    final int[] baseSizes = {10, 12, 14, 16, 18, 22, 30};
    // fallback for anything that is not a known keyword
    int base = 16;
    for (int i = 0; i < keywords.length; i++) {
        if (keywords[i].equals(value)) {
            base = baseSizes[i];
            break;
        }
    }
    return base * 50;
}
/**
 * Maps the CSS text of a text-align value to a text alignment type. "left", "center", "right"
 * and "justify" are recognised regardless of letter case; every other value, including a
 * missing CSS text, yields left alignment.
 * <p>
 * The comparison ignores case directly instead of lowercasing with the default locale, so
 * upper-case keywords are matched independently of the platform locale.
 *
 * @param cssValue the CSS value of the text-align property
 * @return the matching alignment, or left alignment as fallback
 */
private static STTextAlignType getJustification(final CSSValue cssValue) {
    final String value = cssValue == null ? null : cssValue.getCssText();
    if ("center".equalsIgnoreCase(value)) {
        return STTextAlignType.CTR;
    }
    if ("right".equalsIgnoreCase(value)) {
        return STTextAlignType.R;
    }
    if ("justify".equalsIgnoreCase(value)) {
        return STTextAlignType.JUST;
    }
    // "left" and everything unknown
    return STTextAlignType.L;
}
}