How to get all Text Element of a Paragraph with docx4j

by **m4tt3** » Wed Oct 29, 2014 7:30 pm

Hi,

I'm trying to create a Map whose key is a paragraph and whose value is the list of Text elements inside it. I use this map to count how many instances of a word occur in a part of the document. I need to do this because the document is not a plain docx document: it has other documents embedded in it (like an "include"). For this reason I can't simply read all the plain text of the document, because that would also pick up the text of the embedded documents.

So I've implemented this algorithm to traverse the document and create this map. This is the code:

Code: Select all:
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.bind.JAXBElement;

import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.SdtBlock;
import org.docx4j.wml.SdtPr;
import org.docx4j.wml.SdtRun;
import org.docx4j.wml.Text;

/**
 * Counts the occurrences of a search string in the main document part of a
 * docx file, skipping content controls whose alias marks them as an "include".
 *
 * <p>The document is walked once to build a map from each paragraph to the
 * {@link Text} elements found inside it. The texts of each paragraph are then
 * concatenated and searched, so a match can span several runs of the same
 * paragraph but never two paragraphs.
 *
 * <p>The search string is matched literally and case-insensitively, except for
 * these two-character codes: {@code ^t} (tab), {@code ^=} (en dash),
 * {@code ^+} (em dash), {@code ^s} (non-breaking space), {@code ^?} (any
 * character), {@code ^#} (any digit) and {@code ^$} (any letter).
 */
public class FindWordAndReplaceTest {

    /** Alias fragment that identifies a content control holding an included document. */
    private static final String INCLUDE_ALIAS_MARKER = "Include :";

    private final String toFind;

    /**
     * @param toFind the string to search for; may contain the {@code ^x} codes
     *               described on the class
     */
    public FindWordAndReplaceTest(String toFind) {
        this.toFind = toFind;
    }

    /**
     * Loads the given docx file and counts the occurrences of the search string.
     *
     * @param file the docx file to load
     * @return the number of matches found outside "include" content controls
     * @throws Docx4JException if the package cannot be loaded
     */
    public int wordOccurances(File file) throws Docx4JException {
        WordprocessingMLPackage wmlPackage = WordprocessingMLPackage.load(file);
        return findWord(wmlPackage, toFind);
    }

    /**
     * Builds the paragraph-to-texts map for the main document part and counts
     * the matches of {@code toFind} paragraph by paragraph.
     *
     * @return the total number of matches; 0 when {@code toFind} is null or empty
     */
    private int findWord(WordprocessingMLPackage doc, String toFind) {
        // An empty pattern would match at every position of every paragraph.
        if (toFind == null || toFind.length() == 0) {
            return 0;
        }

        HashMap<ContentAccessor, List<Text>> caMap = new HashMap<ContentAccessor, List<Text>>();
        collectBlockLevel(doc.getMainDocumentPart().getContent(), caMap);

        // Compile once: the pattern does not depend on the paragraph.
        Pattern pattern = toPattern(toFind);

        int wordOcc = 0;
        for (List<Text> texts : caMap.values()) {
            if (texts.isEmpty()) {
                continue;
            }
            StringBuilder builder = new StringBuilder();
            for (Text text : texts) {
                // Skip null values so they are not appended as the string "null".
                if (text.getValue() != null) {
                    builder.append(text.getValue());
                }
            }
            wordOcc += countMatches(pattern, builder);
        }
        return wordOcc;
    }

    /**
     * Walks a list of block-level children and registers every paragraph found
     * in {@code caMap}. Block content controls are entered recursively unless
     * they are "include" controls, so siblings that follow a nested control are
     * still visited.
     */
    private void collectBlockLevel(List<Object> content, HashMap<ContentAccessor, List<Text>> caMap) {
        for (Object raw : content) {
            Object child = unwrap(raw);
            if (child instanceof SdtBlock) {
                SdtBlock sdtBlock = (SdtBlock) child;
                if (!checkIfInclude(sdtBlock.getSdtPr()) && sdtBlock.getSdtContent() != null) {
                    collectBlockLevel(sdtBlock.getSdtContent().getContent(), caMap);
                }
            } else if (child instanceof ContentAccessor) {
                ContentAccessor caElement = (ContentAccessor) child;
                List<Text> texts = getAllTextfromContenAccessor(caElement, caMap);
                // Only paragraphs are keys; other containers register their own paragraphs.
                if (child instanceof P) {
                    caMap.put(caElement, texts);
                }
            }
        }
    }

    /**
     * Translates the search string into a regular expression. Literal parts are
     * quoted, so characters such as "." or "(" in the search string match
     * themselves instead of being read as regex syntax.
     */
    private static Pattern toPattern(String toFind) {
        StringBuilder regex = new StringBuilder();
        StringBuilder literal = new StringBuilder();
        int length = toFind.length();
        int i = 0;
        while (i < length) {
            char literalChar = toFind.charAt(i);
            String wildcard = null;
            int consumed = 1;
            if (literalChar == '^' && i + 1 < length) {
                consumed = 2;
                switch (toFind.charAt(i + 1)) {
                    case 't': literalChar = '\t'; break;
                    case '=': literalChar = '\u2013'; break;
                    case '+': literalChar = '\u2014'; break;
                    case 's': literalChar = '\u00A0'; break;
                    case '?': wildcard = "."; break;
                    case '#': wildcard = "\\d"; break;
                    case '$': wildcard = "\\p{L}"; break;
                    default: consumed = 1; break; // unknown code: keep the '^' as a literal
                }
            }
            if (wildcard == null) {
                literal.append(literalChar);
            } else {
                flushLiteral(literal, regex);
                regex.append(wildcard);
            }
            i += consumed;
        }
        flushLiteral(literal, regex);
        // UNICODE_CASE makes the case-insensitive match apply to non-ASCII letters too.
        return Pattern.compile(regex.toString(), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    /** Appends the pending literal text to {@code regex} in quoted form and clears it. */
    private static void flushLiteral(StringBuilder literal, StringBuilder regex) {
        if (literal.length() > 0) {
            regex.append(Pattern.quote(literal.toString()));
            literal.setLength(0);
        }
    }

    /** Counts the non-overlapping matches of {@code pattern} in {@code text}. */
    private static int countMatches(Pattern pattern, CharSequence text) {
        Matcher m = pattern.matcher(text);
        int count = 0;
        while (m.find()) {
            count++;
        }
        return count;
    }

    /**
     * Tells whether a content control is an "include" object, judging by the
     * first alias found in its properties.
     *
     * @param sdtPr the properties of the content control; may be null
     * @return true if the first alias contains the include marker
     */
    private boolean checkIfInclude(SdtPr sdtPr) {
        if (sdtPr == null) {
            return false;
        }
        for (Object raw : sdtPr.getRPrOrAliasOrLock()) {
            Object child = unwrap(raw);
            if (child instanceof SdtPr.Alias) {
                String val = ((SdtPr.Alias) child).getVal();
                return val != null && val.contains(INCLUDE_ALIAS_MARKER);
            }
        }
        return false;
    }

    /**
     * Collects the {@link Text} elements that belong directly to {@code ca}:
     * its own runs plus the runs of nested inline containers and inline content
     * controls. Paragraphs found deeper in the tree are not merged into the
     * result; they are registered in {@code caMap} under their own key.
     *
     * @return the texts of {@code ca}, in document order
     */
    private List<Text> getAllTextfromContenAccessor(ContentAccessor ca, HashMap<ContentAccessor, List<Text>> caMap) {
        List<Text> textList = new ArrayList<Text>();
        for (Object raw : ca.getContent()) {
            Object child = unwrap(raw);
            if (child instanceof Text) {
                textList.add((Text) child);
            } else if (child instanceof R) {
                appendRunText((R) child, textList);
            } else if (child instanceof SdtRun) {
                getAllTextFromSdtRun((SdtRun) child, textList, caMap);
            } else if (child instanceof SdtBlock) {
                // Block content control nested in a container such as a table cell.
                SdtBlock sdtBlock = (SdtBlock) child;
                if (!checkIfInclude(sdtBlock.getSdtPr()) && sdtBlock.getSdtContent() != null) {
                    collectBlockLevel(sdtBlock.getSdtContent().getContent(), caMap);
                }
            } else if (child instanceof P) {
                ContentAccessor caElement = (ContentAccessor) child;
                caMap.put(caElement, getAllTextfromContenAccessor(caElement, caMap));
            } else if (child instanceof ContentAccessor) {
                // Keep the text of nested non-paragraph containers instead of discarding it.
                textList.addAll(getAllTextfromContenAccessor((ContentAccessor) child, caMap));
            }
        }
        return textList;
    }

    /**
     * Appends the text carried by an inline content control to {@code textList},
     * unless the control is an "include" object.
     *
     * @param sdtRun   the inline content control to read
     * @param textList the list the texts are appended to
     * @param caMap    the paragraph map, updated if paragraphs are found inside
     * @return {@code textList}, for convenience
     */
    public List<Text> getAllTextFromSdtRun(SdtRun sdtRun, List<Text> textList, HashMap<ContentAccessor, List<Text>> caMap) {
        if (checkIfInclude(sdtRun.getSdtPr()) || sdtRun.getSdtContent() == null) {
            return textList;
        }
        for (Object raw : sdtRun.getSdtContent().getContent()) {
            Object item = unwrap(raw);
            if (item instanceof R) {
                appendRunText((R) item, textList);
            } else if (item instanceof SdtRun) {
                getAllTextFromSdtRun((SdtRun) item, textList, caMap);
            } else if (item instanceof P) {
                ContentAccessor caElement = (ContentAccessor) item;
                caMap.put(caElement, getAllTextfromContenAccessor(caElement, caMap));
            } else if (item instanceof ContentAccessor) {
                textList.addAll(getAllTextfromContenAccessor((ContentAccessor) item, caMap));
            }
        }
        return textList;
    }

    /**
     * Appends the content of one run to {@code textList}. Tabs and soft hyphens
     * have no Text element of their own, so a synthetic one is created for them.
     */
    private void appendRunText(R run, List<Text> textList) {
        for (Object raw : run.getContent()) {
            Object item = unwrap(raw);
            if (item instanceof R.Tab) {
                textList.add(syntheticText("\t"));
            } else if (item instanceof R.SoftHyphen) {
                textList.add(syntheticText("\u00AD"));
            } else if (item instanceof Text) {
                textList.add((Text) item);
            }
        }
    }

    /** Creates a detached Text element holding {@code value}. */
    private static Text syntheticText(String value) {
        Text text = new Text();
        text.setValue(value);
        return text;
    }

    /** Returns the value wrapped by a JAXBElement, or the object itself otherwise. */
    private static Object unwrap(Object o) {
        return (o instanceof JAXBElement) ? ((JAXBElement<?>) o).getValue() : o;
    }

    public static void main(String[] args) {
        // Two-argument constructor inserts the separator between directory and file name.
        File file = new File(System.getProperty("user.home"), "myDoc.docx");
        FindWordAndReplaceTest th = new FindWordAndReplaceTest("NORTH");
        try {
            System.out.println(th.wordOccurances(file));
        } catch (Docx4JException e) {
            e.printStackTrace();
        }
    }
}

I created this based on an analysis of some documents, but I don't know whether I've forgotten some docx4j elements. I would like to make this algorithm as general as possible, so that it works on all the documents I will have. Any suggestions?
Also, is there a smarter way to do this, for example with TraversalUtil? Any examples?

Thanks

How to get all Text Element of a Paragraph with docx4j

How to get all Text Element of a Paragraph with docx4j

Who is online