I need to use the Edit | Form Options | Export Data feature to convert PDFs to XML, and I need to be able to do this programmatically. I have done some research on the SDK, and it may be a solution. I have Adobe Acrobat Pro 2017 installed. I was able to work with the AcroExch COM object in PowerShell. This offered a lot of interesting functionality, but not a simple Export Data to XML. I see there may be a way to do this via a plugin written in C, but I am not sure whether that is possible either. I looked at my installation and I don't think the full SDK is available.
Any thoughts on how to proceed?
Thank you.
Solved! Go to Solution.
Total Likes
Hi @fmcaruso
You may want to give this a try:
/**
 * Extracts the string tokens from the first page of a PDF and writes them to an XML file
 * as a single {@code <Message>} element wrapped in a {@code <data>} root element.
 *
 * <p>The XML serializer is held in static fields, so {@link #initXML()}, {@link #process(String)}
 * and {@link #closeXML()} must be called in that order and share one output document.
 */
public class ConvertPDFToXML {
    /** Destination of the generated XML; must be set before {@link #initXML()} is called. */
    static StreamResult streamResult;
    /** SAX handler that serializes events to {@link #streamResult}; created by {@link #initXML()}. */
    static TransformerHandler handler;
    /** Reusable (empty) attribute list passed to every {@code startElement} call. */
    static AttributesImpl atts;

    /**
     * Converts the first page of a PDF to XML.
     *
     * @param args optional: {@code args[0]} is the input PDF path (default {@code C:\hello.pdf}),
     *             {@code args[1]} is the output XML path (default {@code data.xml})
     * @throws IOException if the PDF cannot be read or the XML cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = (args != null && args.length > 0) ? args[0] : "C:\\hello.pdf";
        String outputPath = (args != null && args.length > 1) ? args[1] : "data.xml";

        String text;
        PdfReader reader = new PdfReader(inputPath);
        try {
            // getPageContent handles a /Contents entry that is a single stream or an
            // array of streams, so no cast to PRIndirectReference is needed.
            text = extractStrings(reader.getPageContent(1));
        } finally {
            reader.close();
        }

        streamResult = new StreamResult(outputPath);
        try {
            initXML();
            process(text);
            closeXML();
        } catch (SAXException | ParserConfigurationException | TransformerConfigurationException e) {
            // Surface the failure to the caller instead of silently producing no/partial output.
            throw new IOException("Failed to write XML to " + outputPath, e);
        }
    }

    /** Concatenates the value of every string token found in a page content stream. */
    private static String extractStrings(byte[] contentBytes) throws IOException {
        PRTokeniser tokenizer = new PRTokeniser(contentBytes);
        StringBuilder collected = new StringBuilder();
        while (tokenizer.nextToken()) {
            if (tokenizer.getTokenType() == PRTokeniser.TK_STRING) {
                collected.append(tokenizer.getStringValue());
            }
        }
        return collected.toString();
    }

    /**
     * Creates the SAX serializer, starts the document and opens the {@code <data>} root element.
     * {@link #streamResult} must already be set.
     *
     * @throws ParserConfigurationException declared for callers; not thrown by the visible code
     * @throws TransformerConfigurationException if the transformer handler cannot be created
     * @throws SAXException if the handler rejects the start events
     */
    public static void initXML() throws ParserConfigurationException,
            TransformerConfigurationException, SAXException {
        SAXTransformerFactory tf = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        handler = tf.newTransformerHandler();

        Transformer serializer = handler.getTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
        serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");

        handler.setResult(streamResult);
        handler.startDocument();
        atts = new AttributesImpl();
        handler.startElement("", "", "data", atts);
    }

    /**
     * Writes one {@code <Message>} element containing the text of {@code s} up to the first
     * {@code |} character. Any text after the first {@code |} is not written.
     * Must be called after {@link #initXML()}.
     *
     * @param s the extracted text
     * @throws SAXException if the handler rejects an event
     */
    public static void process(String s) throws SAXException {
        String[] elements = s.split("\\|");
        // split() drops trailing empty strings, so input consisting only of '|' yields an
        // empty array; fall back to an empty message instead of failing on elements[0].
        String message = elements.length > 0 ? elements[0] : "";

        atts.clear();
        handler.startElement("", "", "Message", atts);
        handler.characters(message.toCharArray(), 0, message.length());
        handler.endElement("", "", "Message");
    }

    /**
     * Closes the {@code <data>} root element and ends the document.
     *
     * @throws SAXException if the handler rejects an event
     */
    public static void closeXML() throws SAXException {
        handler.endElement("", "", "data");
        handler.endDocument();
    }
}
Hi @fmcaruso
You may want to give this a try:
/**
 * Extracts the string tokens from the first page of a PDF and writes them to an XML file
 * as a single {@code <Message>} element wrapped in a {@code <data>} root element.
 *
 * <p>The XML serializer is held in static fields, so {@link #initXML()}, {@link #process(String)}
 * and {@link #closeXML()} must be called in that order and share one output document.
 */
public class ConvertPDFToXML {
    /** Destination of the generated XML; must be set before {@link #initXML()} is called. */
    static StreamResult streamResult;
    /** SAX handler that serializes events to {@link #streamResult}; created by {@link #initXML()}. */
    static TransformerHandler handler;
    /** Reusable (empty) attribute list passed to every {@code startElement} call. */
    static AttributesImpl atts;

    /**
     * Converts the first page of a PDF to XML.
     *
     * @param args optional: {@code args[0]} is the input PDF path (default {@code C:\hello.pdf}),
     *             {@code args[1]} is the output XML path (default {@code data.xml})
     * @throws IOException if the PDF cannot be read or the XML cannot be written
     */
    public static void main(String[] args) throws IOException {
        String inputPath = (args != null && args.length > 0) ? args[0] : "C:\\hello.pdf";
        String outputPath = (args != null && args.length > 1) ? args[1] : "data.xml";

        String text;
        PdfReader reader = new PdfReader(inputPath);
        try {
            // getPageContent handles a /Contents entry that is a single stream or an
            // array of streams, so no cast to PRIndirectReference is needed.
            text = extractStrings(reader.getPageContent(1));
        } finally {
            reader.close();
        }

        streamResult = new StreamResult(outputPath);
        try {
            initXML();
            process(text);
            closeXML();
        } catch (SAXException | ParserConfigurationException | TransformerConfigurationException e) {
            // Surface the failure to the caller instead of silently producing no/partial output.
            throw new IOException("Failed to write XML to " + outputPath, e);
        }
    }

    /** Concatenates the value of every string token found in a page content stream. */
    private static String extractStrings(byte[] contentBytes) throws IOException {
        PRTokeniser tokenizer = new PRTokeniser(contentBytes);
        StringBuilder collected = new StringBuilder();
        while (tokenizer.nextToken()) {
            if (tokenizer.getTokenType() == PRTokeniser.TK_STRING) {
                collected.append(tokenizer.getStringValue());
            }
        }
        return collected.toString();
    }

    /**
     * Creates the SAX serializer, starts the document and opens the {@code <data>} root element.
     * {@link #streamResult} must already be set.
     *
     * @throws ParserConfigurationException declared for callers; not thrown by the visible code
     * @throws TransformerConfigurationException if the transformer handler cannot be created
     * @throws SAXException if the handler rejects the start events
     */
    public static void initXML() throws ParserConfigurationException,
            TransformerConfigurationException, SAXException {
        SAXTransformerFactory tf = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        handler = tf.newTransformerHandler();

        Transformer serializer = handler.getTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
        serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");

        handler.setResult(streamResult);
        handler.startDocument();
        atts = new AttributesImpl();
        handler.startElement("", "", "data", atts);
    }

    /**
     * Writes one {@code <Message>} element containing the text of {@code s} up to the first
     * {@code |} character. Any text after the first {@code |} is not written.
     * Must be called after {@link #initXML()}.
     *
     * @param s the extracted text
     * @throws SAXException if the handler rejects an event
     */
    public static void process(String s) throws SAXException {
        String[] elements = s.split("\\|");
        // split() drops trailing empty strings, so input consisting only of '|' yields an
        // empty array; fall back to an empty message instead of failing on elements[0].
        String message = elements.length > 0 ? elements[0] : "";

        atts.clear();
        handler.startElement("", "", "Message", atts);
        handler.characters(message.toCharArray(), 0, message.length());
        handler.endElement("", "", "Message");
    }

    /**
     * Closes the {@code <data>} root element and ends the document.
     *
     * @throws SAXException if the handler rejects an event
     */
    public static void closeXML() throws SAXException {
        handler.endElement("", "", "data");
        handler.endDocument();
    }
}