Skip navigation

This is a simple Java app that uses the PDFBox library to locate text within a PDF document. The app is designed to be run from the command line, originally by a Python script. Given a PDF as the first argument and a comma-separated list of words to look for as the second argument (pass SHOWMETHEMONEY to output every word), it will parse the entire document and produce a comma-delimited list in which each entry gives the page number in parentheses, the x/y coordinates of the top left corner of the first letter in brackets, and then the identified word. Words are identified by character groupings and by the placement of simple spaces and punctuation. For the most part, all of the magic here is performed by PDFBox, which is a fantastic library for parsing PDFs. This is a rough and rather featureless version of the one I used in production and could certainly use some improvement, though it’s a good place to start if you can’t find a working example.

Output will be similar to:
[(1)[190.3 : 286.8] WORD1, (1)[283.3 : 286.8] WORD2, ...]

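Dependencies: Apache PDFBox (the 1.x API, which provides org.apache.pdfbox.util.PDFTextStripper and org.apache.pdfbox.util.TextPosition).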

package printtextlocations;

import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

/**
 * Command-line tool that walks every page of a PDF and records where words start.
 *
 * <p>Usage: {@code PrintTextLocations <file.pdf> [word1,word2,...]}. Each glyph reported by
 * the text stripper is fed through {@link #processTextPosition(TextPosition)}, which groups
 * glyphs into words. A word ends at whitespace, at one of the punctuation characters in
 * {@link #PUNCTUATION_CHARS}, or when the rounded Y coordinate changes. Matching words are
 * collected in {@link #wordList} as {@code (page)[x : y] WORD} and the list is printed to
 * standard output when the document is closed.
 *
 * <p>All parsing state is held in static fields, so only one document can be processed at a
 * time per JVM.
 */
public class PrintTextLocations extends PDFTextStripper {

    /** Seek value that makes every extracted word match. */
    private static final String MATCH_ALL = "SHOWMETHEMONEY";

    /** Single characters that terminate a word (same set as the regex {@code [,.\[\](:;!?)/]}). */
    private static final String PUNCTUATION_CHARS = ",.[](:;!?)/";

    /** Enables the per-glyph diagnostic dump; set {@code -Dprinttextlocations.debug=true}. */
    private static final boolean DEBUG = Boolean.getBoolean("printtextlocations.debug");

    /** The word currently being assembled, including its {@code (page)[x : y] } prefix. */
    public static StringBuilder tWord = new StringBuilder();
    /** The raw second command-line argument. */
    public static String seek;
    /** {@link #seek} split on commas: the words to report. */
    public static String[] seekA;
    /** Matching words found so far, each formatted as {@code (page)[x : y] WORD}. */
    public static List<String> wordList = new ArrayList<String>();
    /** True while no glyph has been added to {@link #tWord} yet. */
    public static boolean is1stChar = true;
    /** Whether the most recent glyph had the same rounded Y value as the one before it. */
    public static boolean lineMatch;
    /** 1-based number of the page currently being processed. */
    public static int pageNo = 1;
    /** Rounded Y value of the most recently seen glyph. */
    public static double lastYVal;

    /**
     * Creates the stripper with sort-by-position enabled.
     *
     * @throws IOException if the superclass constructor fails
     */
    public PrintTextLocations()
            throws IOException {
        super.setSortByPosition(true);
    }

    /**
     * Parses the PDF named by {@code args[0]} and prints the matching word locations.
     *
     * @param args {@code args[0]} is the PDF path; {@code args[1]} is an optional
     *             comma-separated list of words to report. When it is omitted, or equals
     *             {@code SHOWMETHEMONEY}, every word is reported.
     * @throws IllegalArgumentException if no PDF path is given
     * @throws Exception if the document cannot be loaded, decrypted or processed
     */
    public static void main(String[] args)
            throws Exception {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException(
                    "Usage: PrintTextLocations <file.pdf> [word1,word2,...]");
        }
        // The original read args[1] unconditionally; fall back to "report everything".
        seek = args.length > 1 ? args[1] : MATCH_ALL;
        seekA = seek.split(",");

        PDDocument document = null;
        try {
            File input = new File(args[0]);
            document = PDDocument.load(input);
            if (document.isEncrypted()) {
                try {
                    // Try the empty password before giving up.
                    document.decrypt("");
                } catch (InvalidPasswordException e) {
                    System.err.println("Error: Document is encrypted with a password.");
                    // NOTE: System.exit does not run the finally block below.
                    System.exit(1);
                }
            }
            PrintTextLocations printer = new PrintTextLocations();
            List<?> allPages = document.getDocumentCatalog().getAllPages();

            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                PDStream contents = page.getContents();

                if (contents != null) {
                    printer.processStream(page, page.findResources(), contents.getStream());
                }
                // Flush the word still being built so the last word of a page is not lost
                // and cannot run on into the first word of the next page.
                printer.endWord();
                pageNo += 1;
            }
        } finally {
            if (document != null) {
                System.out.println(wordList);
                document.close();
            }
        }
    }

    /**
     * Receives one glyph from the stripper and either extends the current word, starts a
     * new one, or terminates the current word.
     *
     * @param text the glyph and its position
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        String tChar = text.getCharacter();
        if (DEBUG) {
            // Diagnostics go to stderr so stdout carries only the result list.
            System.err.println("String[" + text.getXDirAdj() + ","
                    + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
                    + text.getXScale() + " height=" + text.getHeightDir() + " space="
                    + text.getWidthOfSpace() + " width="
                    + text.getWidthDirAdj() + "]" + tChar);
        }
        if (tChar == null || tChar.length() == 0) {
            // Nothing to classify; charAt(0) below would throw.
            return;
        }
        // Must run for every glyph: it ends the current word when the line changes.
        lineMatch = matchCharLine(text);
        if (isPunctuation(tChar) || Character.isWhitespace(tChar.charAt(0))) {
            endWord();
        } else if (is1stChar) {
            setWordCoord(text, tChar);
        } else if (lineMatch) {
            appendChar(tChar);
        }
    }

    /**
     * Appends a glyph to the word being built.
     *
     * @param tChar the glyph text
     */
    protected void appendChar(String tChar) {
        tWord.append(tChar);
        is1stChar = false;
    }

    /**
     * Starts a new word: writes the {@code (page)[x : y] } prefix followed by the first glyph.
     *
     * @param text  position of the first glyph; its X and Y are rounded via {@link #roundVal}
     * @param tChar the first glyph's text
     */
    protected void setWordCoord(TextPosition text, String tChar) {
        tWord.append("(").append(pageNo).append(")[")
                .append(roundVal(Float.valueOf(text.getXDirAdj())))
                .append(" : ")
                .append(roundVal(Float.valueOf(text.getYDirAdj())))
                .append("] ")
                .append(tChar);
        is1stChar = false;
    }

    /**
     * Finishes the current word: strips non-ASCII characters, records the entry in
     * {@link #wordList} if the word is wanted, and resets the word buffer.
     */
    protected void endWord() {
        String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", "");
        // The bare word is whatever follows the last space of the "(page)[x : y] " prefix.
        String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1);
        if (sWord.length() > 0 && isWanted(sWord)) {
            wordList.add(newWord);
        }
        tWord.setLength(0);
        is1stChar = true;
    }

    /**
     * Checks whether a glyph is on the same line as the previous one, comparing Y values
     * rounded by {@link #roundVal}. On a line change the current word is ended and the new
     * Y value is remembered.
     *
     * @param text the glyph to test
     * @return {@code true} if the rounded Y value equals that of the previous glyph
     */
    protected boolean matchCharLine(TextPosition text) {
        double yVal = roundVal(Float.valueOf(text.getYDirAdj())).doubleValue();
        if (yVal == lastYVal) {
            return true;
        }
        lastYVal = yVal;
        endWord();
        return false;
    }

    /**
     * Rounds a coordinate to one decimal place, rounding halves to the even neighbour.
     *
     * <p>The previous implementation formatted the value with a default-locale
     * {@code DecimalFormat} and parsed the text back with {@code new Double(String)}, which
     * throws {@code NumberFormatException} in locales whose decimal separator is not
     * {@code '.'}. {@code Math.rint} applies the same half-even rounding without going
     * through text.
     *
     * @param yVal the value to round
     * @return the value rounded to one decimal place
     */
    protected Double roundVal(Float yVal) {
        return Double.valueOf(Math.rint(yVal.doubleValue() * 10.0d) / 10.0d);
    }

    /** Returns true if the glyph text is exactly one of the word-terminating punctuation marks. */
    private static boolean isPunctuation(String tChar) {
        return tChar.length() == 1 && PUNCTUATION_CHARS.indexOf(tChar.charAt(0)) >= 0;
    }

    /** Returns true if the word is one of the sought words or match-all mode is active. */
    private static boolean isWanted(String word) {
        if (MATCH_ALL.equals(seek)) {
            return true;
        }
        return seekA != null && Arrays.asList(seekA).contains(word);
    }
}

11 Trackbacks/Pingbacks

  1. By Homepage on 13 Sep 2020 at 4:26 am

    … [Trackback]

    […] Read More here: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  2. By this post on 22 Sep 2020 at 10:44 pm

    … [Trackback]

    […] Read More to that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  3. … [Trackback]

    […] Read More on that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  4. By unicvv ru on 23 Sep 2020 at 4:48 pm

    … [Trackback]

    […] Read More Info here to that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  5. By thenaturalpenguin.com on 24 Sep 2020 at 8:53 am

    … [Trackback]

    […] Read More on to that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  6. By click here to find out more on 24 Sep 2020 at 9:36 am

    … [Trackback]

    […] Read More on that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  7. … [Trackback]

    […] Read More Info here on that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  8. By bitcoin loophole review 2020 on 25 Sep 2020 at 7:28 pm

    … [Trackback]

    […] Info on that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  9. By buy dilaudid overnight delivery on 26 Sep 2020 at 5:00 pm

    … [Trackback]

    […] Information to that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

  10. By frontier airlines Phone Number on 27 Sep 2020 at 4:37 am

    … [Trackback]

    […] Read More to that Topic: jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/ […]

Comments are closed.