Index and Search PDF content using Lucene and PDFBox libraries

Code using Lucene

  • Java main class
package com.lucene.demo;

import java.io.File;
import java.net.URL;
import java.util.List;

/**
 * Demo entry point: indexes a PDF found on the classpath and runs a parsed
 * query and a fuzzy query against the resulting index.
 */
public class LuceneDemoApp {

    /**
     * Indexes {@code Example.pdf} into the {@code indexLocation} directory and
     * prints the results of two sample searches.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        LuceneTextSearchUtils utils = new LuceneTextSearchUtils();

        try {
            String fileName = "Example.pdf";
            LuceneDemoApp demo = new LuceneDemoApp();
            File pdfFile = new File(demo.getFilePath(fileName));
            if (!pdfFile.exists()) {
                System.out.println("File Doesn't exists");
                System.exit(-1);
            }
            // Open the store only after the input is validated: it is opened in
            // "create new" mode, so opening it earlier would wipe an existing
            // index even when there is nothing to put into it.
            utils.getFSStore("indexLocation", true);
            utils.insertDocument(pdfFile, "PDF");
            List<String> output = utils.searchQuery("The");
            System.out.println("search result: - " + output);
            List<String> result = utils.searchFuzzyQuery("*ward");
            System.out.println("search: " + result);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the index writer on every exit path.
            try {
                utils.closeIdxWriter();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Resolves a classpath resource to a file-system path.
     *
     * @param input resource name, looked up through this class's class loader
     * @return the file-system path of the resource; for resources that are not
     *         plain files (for example entries inside a jar) the raw URL path
     * @throws IllegalArgumentException if no such resource is on the classpath
     */
    public String getFilePath(String input) {
        URL path = this.getClass().getClassLoader().getResource(input);
        if (path == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new IllegalArgumentException("Resource not found on the classpath: " + input);
        }
        if (!"file".equals(path.getProtocol())) {
            return path.getPath();
        }
        try {
            // URL.getPath() is percent-encoded (a space becomes "%20"); going
            // through the URI decodes it into a usable file path.
            return new File(path.toURI()).getPath();
        } catch (java.net.URISyntaxException | IllegalArgumentException e) {
            // The URL is not a well-formed URI (e.g. it contains raw spaces);
            // in that case the raw path is already usable as-is.
            return path.getPath();
        }
    }

}
  • Utility class to work with Lucene index and documents
    • Search example - With IndexSearcher, simple search and Fuzzy Search.
package com.lucene.demo;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Small helper around a file-system based Lucene index: opens the store,
 * inserts documents and runs parsed / fuzzy searches on the
 * {@code pdfContent} field.
 *
 * <p>Call {@link #getFSStore(String, boolean)} before any other method and
 * {@link #closeIdxWriter()} when done.
 */
public class LuceneTextSearchUtils {

    /** Directory holding the index; set by {@link #getFSStore(String, boolean)}. */
    Directory dir;
    /** Analyzer used for both indexing and query parsing. */
    Analyzer analyzer;
    /** Configuration of {@link #idxWriter}; a config instance belongs to exactly one writer. */
    IndexWriterConfig idxWriterConfig;
    /** The single writer holding the index write lock. */
    IndexWriter idxWriter;

    /**
     * Opens (or creates) the index stored at the given path.
     *
     * @param pathForIndex directory of the index
     * @param createNew    {@code true} to start from an empty index,
     *                     {@code false} to append to an existing one (creating it if absent)
     * @throws Exception if the directory or the writer cannot be opened
     */
    public void getFSStore(String pathForIndex, boolean createNew) throws Exception {

        // Re-initialisation must not leak the previous writer (and its write lock).
        closeIdxWriter();
        if (dir != null) {
            dir.close();
        }

        dir = FSDirectory.open(Paths.get(pathForIndex));
        analyzer = new StandardAnalyzer();
        // Pass the analyzer explicitly so indexing and query parsing agree.
        idxWriterConfig = new IndexWriterConfig(analyzer);
        idxWriterConfig.setOpenMode(createNew ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
        idxWriter = new IndexWriter(dir, idxWriterConfig);
    }

    /**
     * Removes every document from the index.
     *
     * @throws Exception if the deletion cannot be written
     */
    public void clearStore() throws Exception {
        if (idxWriter != null && idxWriter.isOpen()) {
            // Reuse the open writer: a second writer on the same directory cannot
            // obtain the write lock, and a config cannot be shared between writers.
            idxWriter.deleteAll();
            idxWriter.commit();
        } else if (dir != null) {
            // Writer already closed: use a short-lived one with its own config.
            try (IndexWriter tmpWriter = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
                tmpWriter.deleteAll();
            }
        }
    }

    /**
     * Closes the index writer if it is open; safe to call repeatedly.
     *
     * @throws Exception if closing fails
     */
    public void closeIdxWriter() throws Exception {

        if (idxWriter != null && idxWriter.isOpen()) {
            idxWriter.close();
        }
    }

    /**
     * Converts the file into a Lucene document and writes it to the index.
     * In create mode the document is added; otherwise any document previously
     * indexed for the same path is replaced.
     *
     * @param file     file to index
     * @param fileType type of the file; only {@code "PDF"} (case-insensitive) is supported
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    public void insertDocument(File file, String fileType) throws IOException {

        if (idxWriterConfig == null) {
            System.err.println("IndexWriterConfig is not defined or initalized.");
            return;
        }
        if (idxWriter == null) {
            System.err.println("IndexWriter is not initialized.");
            return;
        }

        Document doc = null;
        if ("PDF".equalsIgnoreCase(fileType)) {
            PDFIndexer indexer = new PDFIndexer();
            doc = indexer.createDocument(file);
        }
        if (doc == null) {
            // Nothing to index; passing null on to the writer would fail.
            System.err.println("Unsupported file type: " + fileType);
            return;
        }

        // Field type for the file path. It is stored so that search results can
        // report it, and NOT tokenized so that the whole path is a single term
        // that the Term below can match exactly.
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setTokenized(false);
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

        doc.add(new Field("File_attributes", file.getPath(), fieldType));
        Term term = new Term("File_attributes", file.getPath());

        if (idxWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
            // if create mode then write the document
            idxWriter.addDocument(doc);
        } else {
            // Delete-then-add in one step, so re-indexing a file replaces its
            // old entry instead of merely removing it.
            idxWriter.updateDocument(term, doc);
            idxWriter.commit();
        }
    }

    /**
     * Parses the search string with the classic query parser and searches the
     * {@code pdfContent} field.
     *
     * @param searchString query in classic query parser syntax
     * @return the stored {@code File_attributes} value of every hit; empty if
     *         the store was never initialized
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public List<String> searchQuery(String searchString) throws Exception {

        List<String> str = new ArrayList<>();
        // if writer not initialized there is nothing to search
        if (idxWriter == null) {
            return str;
        }

        // Make pending writes visible to the reader without closing the writer.
        commitPendingChanges();

        try (IndexReader idxReader = DirectoryReader.open(dir)) {
            IndexSearcher idxSearcher = new IndexSearcher(idxReader);

            // using the classic query parser of lucene
            QueryParser queryParser = new QueryParser("pdfContent", analyzer);
            Query query = queryParser.parse(searchString);

            // Ask for every document; the hit count must be positive even for an empty index.
            int maxHits = Math.max(1, idxReader.numDocs());
            ScoreDoc[] hits = idxSearcher.search(query, maxHits).scoreDocs;
            System.out.println("Number of hits :- " + hits.length);
            for (ScoreDoc hit : hits) {
                Document document = idxSearcher.doc(hit.doc);
                str.add(document.get("File_attributes"));
            }
        }
        return str;
    }

    /**
     * Runs a fuzzy query for a single term on the {@code pdfContent} field and
     * returns at most ten hits.
     *
     * @param searchQuery term to match approximately; it is used literally,
     *                    not parsed
     * @return the stored {@code PDFAuthor} value of every hit ({@code null}
     *         entries for documents without that field); empty if the store
     *         was never initialized
     * @throws IOException    if the index cannot be read
     * @throws ParseException declared for compatibility with existing callers
     */
    public List<String> searchFuzzyQuery(String searchQuery) throws IOException, ParseException {

        List<String> res = new ArrayList<>();
        if (dir == null) {
            return res;
        }

        // Make pending writes visible to the reader.
        commitPendingChanges();

        try (IndexReader idxReader = DirectoryReader.open(dir)) {
            IndexSearcher idxSearcher = new IndexSearcher(idxReader);

            Term term = new Term("pdfContent", searchQuery);
            Query query = new FuzzyQuery(term);
            TopDocs hits = idxSearcher.search(query, 10);
            System.out.println("Number of hits := " + hits.totalHits);
            for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = idxSearcher.doc(hit.doc);
                System.out.println(searchQuery + "  = Matched= :  " + hit.score);
                res.add(doc.get("PDFAuthor"));
            }
        }
        return res;
    }

    /** Commits buffered changes if the writer is open, so that a newly opened reader sees them. */
    private void commitPendingChanges() throws IOException {
        if (idxWriter != null && idxWriter.isOpen()) {
            idxWriter.commit();
        }
    }
}
  • Reading the PDF file using PDFBox and parsing the content
package com.lucene.demo;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Builds a Lucene {@link Document} from a PDF file: file attributes, the
 * extracted text and selected PDF metadata.
 */
public class PDFIndexer {

    // File separator of the current platform
    private static final char FILE_SEP = File.separatorChar;

    /** Field type for values that are stored with the document but not indexed. */
    public static final FieldType NON_INDEX_FIELD = new FieldType();
    // configure the non-index field type once, when the class is loaded
    static {
        NON_INDEX_FIELD.setIndexOptions(IndexOptions.NONE);
        NON_INDEX_FIELD.setStored(true);
        NON_INDEX_FIELD.setTokenized(true);
        NON_INDEX_FIELD.freeze();
    }

    /**
     * Creates a Lucene document for the given PDF file.
     *
     * <p>Fields added: {@code filepath} (stored, not indexed),
     * {@code lastmodified}, {@code uuidstring} (indexed, not stored),
     * {@code pdfContent}, and, when present in the PDF metadata,
     * {@code PDFAuthor} and {@code PDFCreationDate}.
     *
     * @param pdfFile PDF file to read
     * @return the populated document
     * @throws IOException if the file cannot be read or parsed
     */
    public Document createDocument(File pdfFile) throws IOException {

        Document doc = new Document();
        String fileModifiedTimeStr =
                DateTools.timeToString(pdfFile.lastModified(), DateTools.Resolution.SECOND);
        String path = pdfFile.getPath();
        // Add the file path to a non index field
        doc.add(new Field("filepath", path, NON_INDEX_FIELD));
        // add the file attributes to index field
        doc.add(new StringField("lastmodified", fileModifiedTimeStr, Store.YES));

        // create UID for the indexer: path with separators replaced + modification time
        String uuid = path.replace(FILE_SEP, '-') + fileModifiedTimeStr;
        // don't store this uuid into the index
        // NOTE(review): TextField.TYPE_NOT_STORED tokenizes the value; if this is
        // meant to be looked up as an exact key, a StringField type would fit better.
        doc.add(new Field("uuidstring", uuid, TextField.TYPE_NOT_STORED));

        // read the pdf file using pdf box and add the data
        try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
            parsePDFAndAddContentToDocument(doc, inputStream, path);
        }
        return doc;
    }

    /**
     * Extracts text and metadata from the PDF stream and adds them to the document.
     *
     * @param doc         document to populate
     * @param inputStream stream positioned at the start of the PDF
     * @param path        path of the PDF (currently unused)
     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
     */
    private void parsePDFAndAddContentToDocument(Document doc, FileInputStream inputStream, String path) throws IOException {

        // PDDocument is closeable; load() throws IOException.
        // NOTE: if the pdf is password protected then it will throw
        // invalidpasswordExcepton and cannot index the file
        try (PDDocument pdfDocument = PDDocument.load(inputStream)) {

            StringWriter strWriter = new StringWriter();
            PDFTextStripper txtStripper = new PDFTextStripper();
            txtStripper.writeText(pdfDocument, strWriter);

            // read the content once and add it to the index as a text field
            String content = strWriter.toString();
            System.out.println("read content:- " + content);
            doc.add(new TextField("pdfContent", new StringReader(content)));

            // Metadata of the PDF that can be indexed
            PDDocumentInformation metaInfo = pdfDocument.getDocumentInformation();

            if (metaInfo != null) {
                // Both entries are optional in a PDF; a field cannot hold a null value.
                String author = metaInfo.getAuthor();
                if (author != null) {
                    doc.add(new TextField("PDFAuthor", author, Store.YES));
                }
                java.util.Calendar creationDate = metaInfo.getCreationDate();
                if (creationDate != null) {
                    // Same compact format as "lastmodified" instead of Calendar's debug dump.
                    String created = DateTools.timeToString(
                            creationDate.getTimeInMillis(), DateTools.Resolution.SECOND);
                    doc.add(new TextField("PDFCreationDate", created, Store.YES));
                }
                // other information check the documentation of PDF box
                // example: Creator, keywords, Producer, Subject, Title, Trapped, Modification date
            }
        }
    }
}
  • Dependencies required for the project
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>luceneexample</groupId>
  <artifactId>luceneexample</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <name>com.lucene.demo</name>
  <url>http://www.maven.org</url>

  <!-- Source encoding and Java language level -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>11</maven.compiler.source>
    <maven.compiler.target>11</maven.compiler.target>
  </properties>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.22.2</version>
      </plugin>
      <plugin>
        <artifactId>maven-failsafe-plugin</artifactId>
        <version>2.22.2</version>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- Test-only: JUnit 5 -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.7.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-engine</artifactId>
      <version>5.7.2</version>
      <scope>test</scope>
    </dependency>

    <!-- Lucene: core index/search API and the classic query parser -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>9.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>9.0.0</version>
    </dependency>

    <!-- PDFBox: PDF text and metadata extraction -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.22</version>
    </dependency>
  </dependencies>
</project>

The above code can be optimized further; it is only a demonstration of how to use Lucene.

Note:

  • In the above example, the input file must be available on the classpath.

Input file pdf content snapshot

image.png

Index Created by Lucene

image.png

Output:

read content:- Generate random text for search 
Sun moon night day  
Season weather the  
lord, word,  
ward, award 
thread, tread, heard 
Trust, mister 
Weak, tech 
The  
The  
The  
Three  
Four  
Numbers are in here. 

Number of hits :- 1
search result: - [...luceneexample\target\classes\Example.pdf]
Number of hits := 1 hits
*ward  = Matched= :  0.34652615
search: [thirumurthi]