Export to Excel - Extract text word by word from a .pdf file using PDFBox

My objective is to extract the text from a .pdf file and write it to an Excel file without losing the text formatting. At the moment, I have succeeded in extracting the text word by word, with its font, size, and coordinates, into a two-dimensional array, but I have a little problem: distinct words in the same row are being combined into one word.

for example:

020 | 0542 | cd45

It is extracted as a single word: 0200542cd45

I cannot find the error. I need help.

This is the code:

public class printtextlocations extends pdftextstripper {      public static stringbuilder tword = new stringbuilder();     public static string[][] coordtab;     public static int p = 0;     public static string seek;     public static string[] seeka;     public static list<string> wordlist = new arraylist();     public static boolean is1stchar = true;     public static boolean linematch;     public static int pageno = 1;     public static double lastyval;      public printtextlocations() throws ioexception {         super.setsortbyposition(true);     }      public static void main(string[] args) throws exception {         pddocument document = null;         pdftextparser pdftext = new pdftextparser();         string file_name = "d:/test.pdf";         seeka = pdftext.pdftotext(file_name).split(" ");         seek = pdftext.pdftotext(file_name);         coordtab = new string [seeka.length*2][6];         try {             file input = new file(file_name);             document = pddocument.load(input);             if (document.isencrypted()) {                 document.decrypt("");             }             printtextlocations printer = new printtextlocations();             list allpages = document.getdocumentcatalog().getallpages();              (int = 0; < allpages.size(); i++) {                 pdpage page = (pdpage) allpages.get(i);                 pdstream contents = page.getcontents();                 if (contents != null) {                      printer.processstream(page, page.findresources(), page.getcontents().getstream());                 }                 pageno += 1;             }         } {             if (document != null) {                 (int k = 0; k<= p;k++){                     system.out.println(k+" : "+coordtab[k][0]+" | "+coordtab[k][1]+" | "+coordtab[k][2]+" | "+coordtab[k][3]+" | "+coordtab[k][4]+" | "+coordtab[k][5]);                 }                            myxls.close();                 document.close();             }         }     }     
 @override     protected void processtextposition(textposition text) {         string tchar = text.getcharacter();         string regex = "'' ";         char c = tchar.charat(0);         linematch = matchcharline(text);         if (!character.iswhitespace(c)) {             if ((!is1stchar) && (linematch == true)) {                 appendchar(tchar);             } else if (is1stchar == true) {                 setwordcoord(text, tchar);             }         } else {             endword();         }     }      protected void appendchar(string tchar) {         tword.append(tchar);         coordtab[p][3] = string.valueof(tword);         is1stchar = false;     }      protected void setwordcoord(textposition text, string tchar) {         tword.append(tchar);          coordtab[p][0] = ""+ pageno;         coordtab[p][1] = ""+ roundval(float.valueof(text.getx()));         coordtab[p][2] = ""+ roundval(float.valueof(text.gety()));         coordtab[p][3] = string.valueof(tword);         coordtab[p][4] = ""+text.getfontsize();         coordtab[p][5] = ""+text.getfont().getbasefont();          is1stchar = false;     }      protected void endword() {         string newword = tword.tostring().replaceall("[^\\x00-\\x7f]", "");         string sword = newword.substring(newword.lastindexof(' ') + 1);         if (!"".equals(sword)) {             if (arrays.aslist(seeka).contains(sword)) {                 wordlist.add(newword);             } else {                 wordlist.add(newword);             }         }         tword.delete(0, tword.length());         is1stchar = true;         p++;     }      protected boolean matchcharline(textposition text) {         double yval = roundval(float.valueof(text.gety()));         if (yval.doublevalue() == lastyval) {             return true;         }         lastyval = yval.doublevalue();         endword();         return false;     }      protected double roundval(float yval) {         decimalformat rounded = new decimalformat("###.##");         
string st = rounded.format(yval);         double yvaldub = double.parsedouble(st.replace(",", "."));         return yvaldub;     } }

the missing spaces

The mistake in this regard is that you assume the gaps between words are created by empty glyphs which are extracted as space characters.

This need not be the case!

These gaps can also be created by explicitly moving the x position forward.

As you only look at the y coordinate of the text origin (TextPosition.getY() in matchCharLine) and ignore the x coordinate (TextPosition.getX()), you miss such gaps.

another issue

You assume that the order in which you retrieve the TextPosition instances is the "correct" order.

This need not be the case!

The glyphs of a text may be drawn in a random order, so you might receive them in a random order.

A completely random order is seldom seen, but some degree of reordering is used every once in a while.

what do

The PDFBox PDFTextStripper class uses well-tested routines for adding spaces where a gap indicates one, and for sorting unordered glyphs. By overriding processTextPosition, though, you prevented this code from being used.

Thus, you should not replace the processTextPosition method but instead override the writeString(String, List<TextPosition>) and writeWordSeparator() methods.

For automatic sorting of the glyphs, use setSortByPosition(true).

Search This Blog

Premier