Index and Search PDF content using Lucene and PDFBox libraries
Code using Lucene
- Java main class
package com.lucene.demo;
import java.io.File;
import java.net.URL;
import java.util.List;
/**
 * Demo entry point: indexes a PDF found on the classpath with Lucene and then
 * runs a parsed query and a fuzzy query against the resulting index.
 */
public class LuceneDemoApp {

    /**
     * Creates a new index in the {@code indexLocation} directory, indexes
     * {@code Example.pdf} and prints the results of two sample searches.
     *
     * <p>The process exits with status {@code -1} when the input file does not
     * exist. Any other failure is reported via its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        LuceneTextSearchUtils utils = new LuceneTextSearchUtils();
        boolean inputMissing = false;
        try {
            String fileName = "Example.pdf";
            utils.getFSStore("indexLocation", true);
            LuceneDemoApp demo = new LuceneDemoApp();
            File pdfFile = new File(demo.getFilePath(fileName));
            if (!pdfFile.exists()) {
                System.out.println("File Doesn't exists");
                // Defer the exit until after the finally block so the index
                // writer is released; System.exit() here would skip cleanup.
                inputMissing = true;
            } else {
                utils.insertDocument(pdfFile, "PDF");
                List<String> output = utils.searchQuery("The");
                System.out.println("search result: - " + output);
                List<String> result = utils.searchFuzzyQuery("*ward");
                System.out.println("search: " + result);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the index writer (and its directory lock),
            // including on failure paths.
            try {
                utils.closeIdxWriter();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (inputMissing) {
            System.exit(-1);
        }
    }

    /**
     * Resolves a classpath resource to a file-system path.
     *
     * @param input resource name, relative to the classpath root
     * @return the decoded file-system path of the resource; for resources that
     *         are not plain files (for example inside a jar) the raw URL path
     *         is returned
     * @throws IllegalArgumentException if no such resource is on the classpath
     */
    public String getFilePath(String input) {
        URL resource = this.getClass().getClassLoader().getResource(input);
        if (resource == null) {
            throw new IllegalArgumentException("Resource not found on classpath: " + input);
        }
        try {
            // URL.getPath() is percent-encoded (a space becomes "%20"), which
            // does not name a real file. Going through a URI decodes it.
            return new File(resource.toURI()).getPath();
        } catch (java.net.URISyntaxException | IllegalArgumentException e) {
            // Not a hierarchical file: URI; fall back to the raw URL path.
            return resource.getPath();
        }
    }
}
- Utility class to work with Lucene index and documents
- Search example - With IndexSearcher, simple search and Fuzzy Search.
package com.lucene.demo;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Small helper around a file-system backed Lucene index: opens the store,
 * indexes files, and runs parsed and fuzzy queries against the
 * {@code pdfContent} field.
 *
 * <p>{@link #getFSStore(String, boolean)} must be called before any other
 * method. The class keeps a single {@link IndexWriter} open until
 * {@link #closeIdxWriter()} is called.
 */
public class LuceneTextSearchUtils {
    Directory dir;
    Analyzer analyzer;
    IndexWriterConfig idxWriterConfig;
    IndexWriter idxWriter;

    /**
     * Opens (or creates) the index stored at the given path and prepares the
     * writer used by the other methods.
     *
     * @param pathForIndex directory holding the index files
     * @param createNew    {@code true} to start a new index
     *                     ({@code OpenMode.CREATE}), {@code false} to open an
     *                     existing one or create it if absent
     * @throws Exception if the directory or the writer cannot be opened
     */
    public void getFSStore(String pathForIndex, boolean createNew) throws Exception {
        // A still-open writer from a previous call holds the directory's write
        // lock; release it before opening a new writer.
        closeIdxWriter();
        dir = FSDirectory.open(Paths.get(pathForIndex));
        analyzer = new StandardAnalyzer();
        // Index with the same analyzer instance that searchQuery() parses with.
        idxWriterConfig = new IndexWriterConfig(analyzer);
        idxWriterConfig.setOpenMode(createNew ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
        idxWriter = new IndexWriter(dir, idxWriterConfig);
    }

    /**
     * Removes every document from the index. Does nothing when the store has
     * not been initialised.
     *
     * @throws Exception if the deletion cannot be written
     */
    public void clearStore() throws Exception {
        if (dir == null) {
            return;
        }
        if (idxWriter != null && idxWriter.isOpen()) {
            // Reuse the open writer: it already holds the write lock, and an
            // IndexWriterConfig instance cannot be shared by two writers.
            idxWriter.deleteAll();
            idxWriter.commit();
            return;
        }
        // The main writer is closed: use a short-lived writer with its own
        // config. Closing it commits the deletion.
        try (IndexWriter tempWriter = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            tempWriter.deleteAll();
        }
    }

    /**
     * Closes the index writer if it is open.
     *
     * @throws Exception if closing the writer fails
     */
    public void closeIdxWriter() throws Exception {
        if (idxWriter != null && idxWriter.isOpen()) {
            idxWriter.close();
        }
    }

    /**
     * Indexes one file. Only {@code "PDF"} (case-insensitive) is supported;
     * other types are reported on {@code System.err} and skipped.
     *
     * <p>In {@code CREATE} mode the document is simply added. Otherwise any
     * document previously indexed for the same path is replaced. The change is
     * committed before returning.
     *
     * @param file     file to index
     * @param fileType type tag of the file, e.g. {@code "PDF"}
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    public void insertDocument(File file, String fileType) throws IOException {
        if (idxWriterConfig == null || idxWriter == null || !idxWriter.isOpen()) {
            System.err.println("IndexWriterConfig is not defined or initalized.");
            return;
        }
        Document doc = null;
        if ("PDF".equalsIgnoreCase(fileType)) {
            PDFIndexer indexer = new PDFIndexer();
            doc = indexer.createDocument(file);
        }
        if (doc == null) {
            // Previously a null document reached addDocument() and failed there.
            System.err.println("Unsupported file type: " + fileType);
            return;
        }

        // Stored, NOT tokenized: the path is indexed as a single term so that
        // the exact-path Term below can match it when replacing a document.
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setTokenized(false);
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        doc.add(new Field("File_attributes", file.getPath(), fieldType));
        Term pathTerm = new Term("File_attributes", file.getPath());

        if (idxWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
            idxWriter.addDocument(doc);
        } else {
            // Delete-then-add in one step; the old code only deleted and never
            // wrote the new document.
            idxWriter.updateDocument(pathTerm, doc);
        }
        // Make the change visible to readers opened afterwards.
        idxWriter.commit();
    }

    /**
     * Parses {@code searchString} with the classic query parser and searches
     * the {@code pdfContent} field.
     *
     * @param searchString query in classic Lucene query syntax
     * @return the stored {@code File_attributes} value of every matching
     *         document, or {@code null} if the store was never initialised
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public List<String> searchQuery(String searchString) throws Exception {
        // if writer not initialized simply return null
        if (idxWriter == null) {
            return null;
        }
        // Flush pending changes so the reader sees them, but keep the writer
        // open so further inserts still work after a search.
        if (idxWriter.isOpen()) {
            idxWriter.commit();
        }
        try (IndexReader idxReader = DirectoryReader.open(dir)) {
            IndexSearcher idxSearcher = new IndexSearcher(idxReader);
            // using the classic query parser of lucene
            QueryParser queryParser = new QueryParser("pdfContent", analyzer);
            Query query = queryParser.parse(searchString);
            // The hit limit must be positive, even for an empty index.
            int maxHits = Math.max(1, idxReader.numDocs());
            ScoreDoc[] hits = idxSearcher.search(query, maxHits).scoreDocs;
            System.out.println("Number of hits :- " + hits.length);
            List<String> paths = new ArrayList<>();
            for (ScoreDoc hit : hits) {
                Document document = idxSearcher.doc(hit.doc);
                paths.add(document.get("File_attributes"));
            }
            return paths;
        }
    }

    /**
     * Runs a {@link FuzzyQuery} for a single term against the
     * {@code pdfContent} field and returns at most 10 matches.
     *
     * <p>The term is used verbatim: it is not analyzed, and wildcard
     * characters such as {@code *} are treated as ordinary characters.
     *
     * @param searchQuery the term to match approximately
     * @return the stored {@code PDFAuthor} value of each matching document
     *         ({@code null} entries for documents without that field)
     * @throws IOException           if the index cannot be read
     * @throws ParseException        never thrown by this implementation; kept
     *                               for signature compatibility
     * @throws IllegalStateException if the store was never initialised
     */
    public List<String> searchFuzzyQuery(String searchQuery) throws IOException, ParseException {
        if (dir == null) {
            throw new IllegalStateException("Index store is not initialized; call getFSStore first");
        }
        try (IndexReader idxReader = DirectoryReader.open(dir)) {
            IndexSearcher idxSearcher = new IndexSearcher(idxReader);
            Term term = new Term("pdfContent", searchQuery);
            Query query = new FuzzyQuery(term);
            TopDocs hits = idxSearcher.search(query, 10);
            List<String> res = new ArrayList<>();
            System.out.println("Number of hits := " + hits.totalHits);
            for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = idxSearcher.doc(hit.doc);
                System.out.println(searchQuery + " = Matched= : " + hit.score);
                res.add(doc.get("PDFAuthor"));
            }
            return res;
        }
    }
}
- Reading the PDF file using PDFBox and parsing its content
package com.lucene.demo;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Converts a PDF file into a Lucene {@link Document}, using PDFBox to extract
 * the text content and the document metadata.
 */
public class PDFIndexer {
    // File separator of the current platform
    private static final char FILE_SEP = System.getProperty("file.separator").charAt(0);

    /** Field type for values that are stored with the document but not indexed. */
    public static final FieldType NON_INDEX_FIELD = new FieldType();

    // configure and freeze the shared field type once, when the class is loaded
    static {
        NON_INDEX_FIELD.setIndexOptions(IndexOptions.NONE);
        NON_INDEX_FIELD.setStored(true);
        NON_INDEX_FIELD.setTokenized(true);
        NON_INDEX_FIELD.freeze();
    }

    /**
     * Builds a Lucene document for the given PDF file.
     *
     * <p>The document contains the fields {@code filepath} (stored only),
     * {@code lastmodified}, {@code uuidstring} (indexed, not stored) and
     * {@code pdfContent}. It also contains {@code PDFAuthor} and
     * {@code PDFCreationDate} when the PDF provides them.
     *
     * @param pdfFile the PDF file to read
     * @return the populated document
     * @throws IOException if the file cannot be read or parsed (PDFBox also
     *                     reports password-protected files this way)
     */
    public Document createDocument(File pdfFile) throws IOException {
        Document doc = new Document();
        String fileModifiedTimeStr =
                DateTools.timeToString(pdfFile.lastModified(), DateTools.Resolution.SECOND);
        String path = pdfFile.getPath();
        // Add the file path to a non index field
        doc.add(new Field("filepath", path, NON_INDEX_FIELD));
        // add the file attributes to index field
        doc.add(new StringField("lastmodified", fileModifiedTimeStr, Store.YES));
        // UID built from the path (separators replaced by '-') plus the
        // modification time; indexed but not stored
        String uuid = path.replace(FILE_SEP, '-') + fileModifiedTimeStr;
        doc.add(new Field("uuidstring", uuid, TextField.TYPE_NOT_STORED));
        // read the pdf file using pdf box and add the data
        try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
            parsePDFAndAddContentToDocument(doc, inputStream, path);
        }
        return doc;
    }

    /**
     * Extracts text and metadata from the PDF stream and adds them to
     * {@code doc}.
     *
     * @param doc         document receiving the fields
     * @param inputStream open stream over the PDF; closed by the caller
     * @param path        path of the PDF (currently unused)
     * @throws IOException if PDFBox cannot load or read the PDF
     */
    private void parsePDFAndAddContentToDocument(Document doc, FileInputStream inputStream, String path)
            throws IOException {
        // PDDocument is closeable; load() throws IOException
        try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
            StringWriter strWriter = new StringWriter();
            PDFTextStripper txtStripper = new PDFTextStripper();
            txtStripper.writeText(pdfDocument, strWriter);
            // materialise the extracted text once and reuse it
            String content = strWriter.getBuffer().toString();
            System.out.println("read content:- " + content);
            // index the content as a (non-stored) text field
            doc.add(new TextField("pdfContent", new StringReader(content)));

            // NOTE: if the pdf is password protected then load() throws
            // InvalidPasswordException and the file cannot be indexed

            // Metadata of the PDF that can be indexed
            PDDocumentInformation metaInfo = pdfDocument.getDocumentInformation();
            if (metaInfo != null) {
                // Both values are optional in a PDF. A null value would make
                // the TextField constructor / toString() call fail, so the
                // field is simply omitted when the metadata is absent.
                String author = metaInfo.getAuthor();
                if (author != null) {
                    doc.add(new TextField("PDFAuthor", author, Store.YES));
                }
                java.util.Calendar created = metaInfo.getCreationDate();
                if (created != null) {
                    // Calendar.toString() is a debugging dump, not a date;
                    // store the same compact form used for "lastmodified".
                    String createdStr = DateTools.timeToString(
                            created.getTimeInMillis(), DateTools.Resolution.SECOND);
                    doc.add(new TextField("PDFCreationDate", createdStr, Store.YES));
                }
                // other information: check the documentation of PDFBox
                // example: Creator, Keywords, Producer, Subject, Title, Trapped, Modification date
            }
        }
    }
}
- Dependencies required for the project
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the Lucene + PDFBox demo project. -->
<modelVersion>4.0.0</modelVersion>
<groupId>luceneexample</groupId>
<artifactId>luceneexample</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>com.lucene.demo</name>
<url>http://www.maven.org</url>
<properties>
<!-- Sources are UTF-8 and compiled for Java 11. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<build>
<plugins>
<!-- Test plugins pinned to explicit versions. -->
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.22.2</version>
</plugin>
</plugins>
</build>
<dependencies>
<!-- JUnit 5 (Jupiter) API and engine, test scope only. -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<!-- Lucene: indexing/search classes imported by LuceneTextSearchUtils and PDFIndexer. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>9.0.0</version>
</dependency>
<!-- Lucene classic QueryParser, used by LuceneTextSearchUtils.searchQuery. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>9.0.0</version>
</dependency>
<!-- PDFBox: used by PDFIndexer to extract text and metadata from PDF files. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.22</version>
</dependency>
</dependencies>
</project>
The above code can be optimized further; it is only a demonstration of how to use Lucene.
Note:
- In the above example, the input file must be available on the classpath.
Input file pdf content snapshot
Index Created by Lucene
Output:
read content:- Generate random text for search
Sun moon night day
Season weather the
lord, word,
ward, award
thread, tread, heard
Trust, mister
Weak, tech
The
The
The
Three
Four
Numbers are in here.
Number of hits :- 1
search result: - [...luceneexample\target\classes\Example.pdf]
Number of hits := 1 hits
*ward = Matched= : 0.34652615
search: [thirumurthi]