Facing org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException while converting PDF to JSON using Tika in AEM
Hi all,
I’m trying to convert a PDF stored in AEM DAM to a JSON tree structure (pages → paragraphs → lines) using Apache Tika’s PDFParser. Below is the servlet code I’m using:
However, when I try to parse certain PDFs, I’m getting the following exception:
org.apache.pdfbox.pdmodel.encryption.InvalidPasswordExceptionIt seems like the PDF might be password-protected, but I don’t have the password and some PDFs are not encrypted.
package com.core.servlets;
import com.day.cq.dam.api.Asset;
import com.day.cq.dam.api.Rendition;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.SlingHttpServletResponse;
import org.apache.sling.api.servlets.HttpConstants;
import org.apache.sling.api.servlets.SlingAllMethodsServlet;
import org.json.JSONArray;
import org.json.JSONObject;
import org.osgi.service.component.annotations.Component;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
import javax.servlet.Servlet;
import java.io.InputStream;
@Component(
service = Servlet.class,
property = {
"sling.servlet.methods=" + HttpConstants.METHOD_GET,
"sling.servlet.paths=" + "/bin/pdfToJson"
}
)
public class PdfToJsonServlet extends SlingAllMethodsServlet {
protected void doGet(SlingHttpServletRequest request, SlingHttpServletResponse response) {
String damPath = request.getParameter("path"); // DAM path of PDF
response.setContentType("application/json");
try {
if (damPath == null || damPath.isEmpty()) {
response.setStatus(400);
response.getWriter().write("{\"error\":\"Please provide a DAM path parameter\"}");
return;
}
Asset asset = request.getResourceResolver().getResource(damPath).adaptTo(Asset.class);
if (asset == null) {
response.setStatus(404);
response.getWriter().write("{\"error\":\"Asset not found\"}");
return;
}
Rendition rendition = asset.getOriginal();
try (InputStream is = rendition.adaptTo(InputStream.class)) {
// Use Tika PDFParser with XML output
ContentHandler handler = new ToXMLContentHandler();
PDFParser parser = new PDFParser();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
parser.parse(is, handler, metadata, context);
String xmlContent = handler.toString();
// Convert XML to JSON tree (simple page/paragraph/line approximation)
JSONObject pdfJson = new JSONObject();
JSONArray pagesArray = new JSONArray();
// Split pages by <div class="page">
String[] pages = xmlContent.split("<div class=\"page\"");
int pageNumber = 1;
for (String page : pages) {
if (page.trim().isEmpty()) continue;
JSONObject pageObj = new JSONObject();
pageObj.put("pageNumber", pageNumber++);
JSONArray paragraphsArray = new JSONArray();
// Split paragraphs by <p>
String[] paragraphs = page.split("<p>");
for (String para : paragraphs) {
para = para.replaceAll("</p>", "").trim();
if (para.isEmpty()) continue;
JSONObject paraObj = new JSONObject();
JSONArray linesArray = new JSONArray();
// Split lines by \n
String[] lines = para.split("\\n");
for (String line : lines) {
line = line.trim();
if (!line.isEmpty()) linesArray.put(line);
}
paraObj.put("lines", linesArray);
paragraphsArray.put(paraObj);
}
pageObj.put("paragraphs", paragraphsArray);
pagesArray.put(pageObj);
}
pdfJson.put("pages", pagesArray);
response.getWriter().write(pdfJson.toString());
}
} catch (Exception e) {
response.setStatus(500);
try {
response.getWriter().write("{\"error\":\"" + e.getMessage() + "\"}");
} catch (Exception ex) {
ex.printStackTrace();
}
}
}
}This is the code I have used.
Tried using pdfbox dependency, itext as well..but getting dependency injection error.
Can anyone suggest:
How to handle PDFs that might be encrypted?
How to skip or safely parse PDFs that require a password without failing the whole servlet?
Any code examples or best practices for handling this in AEM with Tika would be really helpful.
Thanks in advance!
Regards,
Karishma.