I am trying to create a Map whose key is a paragraph and whose value is the list of Text elements inside it. I use this map to count how many instances of a word occur inside a part of the document. I need to do this because the document is not a plain docx document: it embeds other documents (like an "include"). For this reason I can't simply read all the plain text of the document, because that would also contain the text of the embedded documents.
So I've implemented this algorithm to traverse the document and create this map. This is the code.
- Code: Select all
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.bind.JAXBElement;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.SdtBlock;
import org.docx4j.wml.SdtPr;
import org.docx4j.wml.SdtRun;
import org.docx4j.wml.Text;
/**
 * Counts how often a search term occurs in the paragraphs of a docx document.
 *
 * <p>The document body is traversed and, for every paragraph ({@link P}) found, the
 * {@link Text} elements of that paragraph are collected in document order. The
 * concatenated text of each paragraph is then searched separately, so a match never
 * spans two paragraphs. Content controls whose alias contains {@code "Include :"} are
 * treated as embedded documents and skipped entirely.
 *
 * <p>The search term may contain these Word-style codes: {@code ^t} (tab), {@code ^=}
 * (en dash), {@code ^+} (em dash), {@code ^s} (non-breaking space), {@code ^?} (any
 * character), {@code ^#} (any digit) and {@code ^$} (any letter). Everything else in
 * the term is matched literally and case-insensitively.
 */
public class FindWordAndReplaceTest {

    /** Substring of a content control's alias that marks it as an embedded ("include") document. */
    private static final String INCLUDE_ALIAS_MARKER = "Include :";

    private final String toFind;

    /**
     * @param toFind the search term, optionally containing the {@code ^} codes described
     *               on the class
     */
    public FindWordAndReplaceTest(String toFind) {
        this.toFind = toFind;
    }

    /**
     * Loads {@code file} and counts the occurrences of the search term in it.
     *
     * @param file the docx file to inspect
     * @return the number of matches summed over all paragraphs outside include blocks
     * @throws Docx4JException if the file cannot be loaded
     */
    public int wordOccurances(File file) throws Docx4JException {
        WordprocessingMLPackage wmlPackage = WordprocessingMLPackage.load(file);
        return findWord(wmlPackage, toFind);
    }

    /**
     * Builds the paragraph-to-text map for {@code doc} and counts the matches of
     * {@code toFind} in each paragraph's concatenated text.
     *
     * @return the total match count; {@code 0} for a {@code null} or empty search term
     */
    private int findWord(WordprocessingMLPackage doc, String toFind) {
        // An empty pattern would match at every position of every paragraph.
        if (toFind == null || toFind.isEmpty()) {
            return 0;
        }

        HashMap<ContentAccessor, List<Text>> caMap = new HashMap<ContentAccessor, List<Text>>();
        for (Object child : doc.getMainDocumentPart().getContent()) {
            collectBlockLevel(unwrap(child), caMap);
        }

        // The pattern does not depend on the paragraph, so compile it once.
        Pattern pattern = compileSearchPattern(toFind);
        int wordOcc = 0;
        for (List<Text> texts : caMap.values()) {
            if (texts.isEmpty()) {
                continue;
            }
            StringBuilder builder = new StringBuilder();
            for (Text text : texts) {
                String value = text.getValue();
                if (value != null) { // appending null would add the literal "null"
                    builder.append(value);
                }
            }
            wordOcc += countMatches(pattern, builder);
        }
        return wordOcc;
    }

    /**
     * Translates the search term into a case-insensitive regular expression. Only the
     * supported {@code ^} codes get a special meaning; all other characters (including
     * regex metacharacters such as {@code .} or {@code +}) are quoted and match literally.
     */
    private static Pattern compileSearchPattern(String toFind) {
        StringBuilder regex = new StringBuilder();
        StringBuilder literal = new StringBuilder();
        int i = 0;
        while (i < toFind.length()) {
            char c = toFind.charAt(i);
            String fragment = null;
            if (c == '^' && i + 1 < toFind.length()) {
                fragment = regexForCode(toFind.charAt(i + 1));
            }
            if (fragment == null) {
                literal.append(c);
                i++;
            } else {
                flushLiteral(literal, regex);
                regex.append(fragment);
                i += 2;
            }
        }
        flushLiteral(literal, regex);
        return Pattern.compile(regex.toString(), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    /**
     * Returns the regex fragment for the character following a {@code ^}, or
     * {@code null} if that character is not one of the supported codes.
     */
    private static String regexForCode(char code) {
        switch (code) {
            case 't':
                return Pattern.quote("\t");
            case '=':
                return Pattern.quote("\u2013");
            case '+':
                return Pattern.quote("\u2014");
            case 's':
                return Pattern.quote("\u00A0");
            case '?':
                return ".";
            case '#':
                return "\\d";
            case '$':
                return "\\p{L}";
            default:
                return null;
        }
    }

    /** Appends the pending literal text to {@code regex} in quoted form and clears it. */
    private static void flushLiteral(StringBuilder literal, StringBuilder regex) {
        if (literal.length() > 0) {
            regex.append(Pattern.quote(literal.toString()));
            literal.setLength(0);
        }
    }

    /** Counts the non-overlapping matches of {@code pattern} in {@code text}. */
    private static int countMatches(Pattern pattern, CharSequence text) {
        Matcher m = pattern.matcher(text);
        int count = 0;
        while (m.find()) {
            count++;
        }
        return count;
    }

    /** Returns the wrapped value if {@code o} is a {@link JAXBElement}, otherwise {@code o} itself. */
    private static Object unwrap(Object o) {
        if (o instanceof JAXBElement) {
            return ((JAXBElement<?>) o).getValue();
        }
        return o;
    }

    /**
     * Checks whether a content control is an include object, i.e. whether its alias
     * contains {@code "Include :"}. The first alias found decides.
     *
     * @param sdtPr the properties of the content control; may be {@code null}
     * @return {@code true} if the control is an include and must be skipped
     */
    private boolean checkIfInclude(SdtPr sdtPr) {
        if (sdtPr == null) {
            return false;
        }
        for (Object child : sdtPr.getRPrOrAliasOrLock()) {
            Object value = unwrap(child);
            if (value instanceof SdtPr.Alias) {
                String alias = ((SdtPr.Alias) value).getVal();
                return alias != null && alias.contains(INCLUDE_ALIAS_MARKER);
            }
        }
        return false;
    }

    /**
     * Handles one (already unwrapped) block-level element: block content controls are
     * descended into, other content containers are scanned for paragraphs.
     */
    private void collectBlockLevel(Object element, HashMap<ContentAccessor, List<Text>> caMap) {
        if (element instanceof SdtBlock) {
            collectFromSdtBlock((SdtBlock) element, caMap);
        } else if (element instanceof ContentAccessor) {
            // Text found directly in a non-paragraph block container belongs to no paragraph.
            collectNested((ContentAccessor) element, caMap);
        }
    }

    /**
     * Recursively visits the content of a block content control unless it is an include.
     * All children are visited, including those that follow a nested content control,
     * and nested controls are checked for the include marker as well.
     */
    private void collectFromSdtBlock(SdtBlock sdtBlock, HashMap<ContentAccessor, List<Text>> caMap) {
        if (checkIfInclude(sdtBlock.getSdtPr()) || sdtBlock.getSdtContent() == null) {
            return;
        }
        for (Object o : sdtBlock.getSdtContent().getContent()) {
            collectBlockLevel(unwrap(o), caMap);
        }
    }

    /**
     * Collects the text of {@code ca}. A paragraph is registered in {@code caMap} under
     * its own key and contributes nothing to the caller; for any other container the
     * collected text is returned so the caller can attach it to the enclosing paragraph.
     */
    private List<Text> collectNested(ContentAccessor ca, HashMap<ContentAccessor, List<Text>> caMap) {
        List<Text> texts = getAllTextfromContenAccessor(ca, caMap);
        if (ca instanceof P) {
            caMap.put(ca, texts);
            return new ArrayList<Text>();
        }
        return texts;
    }

    /**
     * Appends the text of a single run: {@link Text} children are added as they are,
     * tabs and soft hyphens are added as synthetic {@link Text} elements.
     */
    private static void appendRunText(R run, List<Text> textList) {
        for (Object o : run.getContent()) {
            Object value = unwrap(o);
            if (value instanceof R.Tab) {
                textList.add(syntheticText("\t"));
            } else if (value instanceof R.SoftHyphen) {
                textList.add(syntheticText("\u00AD"));
            } else if (value instanceof Text) {
                textList.add((Text) value);
            }
        }
    }

    /** Creates a detached {@link Text} holding {@code value}. */
    private static Text syntheticText(String value) {
        Text text = new Text();
        text.setValue(value);
        return text;
    }

    /**
     * Collects, in document order, the text that belongs directly to {@code ca}.
     * Paragraphs found below {@code ca} are registered in {@code caMap} as a side effect
     * and are not part of the returned list.
     *
     * @param ca    the container to scan
     * @param caMap receives every paragraph found below {@code ca}
     * @return the text elements of {@code ca} itself, including those of inline
     *         containers and inline content controls nested in it
     */
    private List<Text> getAllTextfromContenAccessor(ContentAccessor ca, HashMap<ContentAccessor, List<Text>> caMap) {
        List<Text> textList = new ArrayList<Text>();
        for (Object rawChild : ca.getContent()) {
            Object child = unwrap(rawChild);
            if (child instanceof Text) {
                textList.add((Text) child);
            } else if (child instanceof R) {
                appendRunText((R) child, textList);
            } else if (child instanceof SdtBlock) {
                // Block content controls can also occur inside other containers.
                collectFromSdtBlock((SdtBlock) child, caMap);
            } else if (child instanceof ContentAccessor) {
                // Keep the text of inline containers so it stays part of this paragraph.
                textList.addAll(collectNested((ContentAccessor) child, caMap));
            } else if (child instanceof SdtRun) {
                getAllTextFromSdtRun((SdtRun) child, textList, caMap);
            }
        }
        return textList;
    }

    /**
     * Appends the text of an inline content control to {@code textList}, unless the
     * control is an include. Paragraphs nested in the control are registered in
     * {@code caMap} instead.
     *
     * @param sdtRun   the inline content control
     * @param textList the list to append to
     * @param caMap    receives every paragraph found inside the control
     * @return {@code textList}
     */
    public List<Text> getAllTextFromSdtRun(SdtRun sdtRun, List<Text> textList, HashMap<ContentAccessor, List<Text>> caMap) {
        if (checkIfInclude(sdtRun.getSdtPr()) || sdtRun.getSdtContent() == null) {
            return textList;
        }
        for (Object rawChild : sdtRun.getSdtContent().getContent()) {
            Object child = unwrap(rawChild);
            if (child instanceof R) {
                appendRunText((R) child, textList);
            } else if (child instanceof ContentAccessor) {
                textList.addAll(collectNested((ContentAccessor) child, caMap));
            }
        }
        return textList;
    }

    public static void main(String[] args) {
        // Use the two-argument constructor so the path separator is inserted.
        File file = new File(System.getProperty("user.home"), "myDoc.docx");
        FindWordAndReplaceTest th = new FindWordAndReplaceTest("NORTH");
        try {
            System.out.println(th.wordOccurances(file));
        } catch (Docx4JException e) {
            e.printStackTrace();
        }
    }
}
I created this based on the analysis of some documents, but I don't know whether I have forgotten some docx4j elements. I would like to make this algorithm as general as possible so that it works on all the documents I will have. Any suggestions?
Also, is there a smarter way to do this, for example with TraversalUtil? Any examples?
Thanks