hi
i too having the same issue as converting PDF file to XML format/file. this should be done using pure java code and till now i got the pdf file name by selecting the file using the choose file browser and storing it in a variable then with the help of PDFBOX i tried but the result is converted but not in XML format(i.e.,when i open the converted XML file it is in normal format but only the extension has been changed .PDF to .XML no other changed)so please let me know how to achive this
i have paste the code i have used
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFText2HTML;
//import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.pdmodel.font.PDFont.* ;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.util.*;
import org.pdfbox.pdmodel.*;
import com.activegrid.util.AGObject;
//import executesqljavaxsd.types.*;
import com.activegrid.util.Logger;
import com.activegrid.data.DataService;
import java.util.List;
import java.util.ArrayList;
public class pdf2xml {
public static final String DEFAULT_ENCODING =
null;
//"ISO-8859-1";
//"ISO-8859-6"; //arabic
//"US-ASCII";
//"UTF-8";
//"UTF-16";
//"UTF-16BE";
//"UTF-16LE";
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
private static final String SORT = "-sort";
private static final String HTML = "-html"; // jjb - added simple HTML output
private static String a;
public static void main( String[] args ) throws Exception
{
}
public static void abc(String inp)throws Exception
{
boolean toConsole = false;
boolean toHTML = false;
boolean sort = false;
String password = "";
String encoding = DEFAULT_ENCODING;
String pdfFile = inp;
String textFile = "C:/tut.xml";//file to store in XML format
int startPage = 1;
int endPage = Integer.MAX_VALUE;
a ="txt";
if( pdfFile == null )
{
usage();
}
else
{
Writer output = null;
PDDocument document = null;
try
{
try
{
//basically try to load it from a url first and if the URL
//is not recognized then try to load it from the file system.
URL url = new URL( pdfFile );
document = PDDocument.load( url );
String fileName = url.getFile();
if( textFile == null && fileName.length() >4 )
{
File outputFile =
new File( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
textFile = outputFile.getName();
}
}
catch( MalformedURLException e )
{
document = PDDocument.load( pdfFile );
if( textFile == null && pdfFile.length() >4 )
{
textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
}
}
//document.print();
if( document.isEncrypted() )
{
StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
document.openProtection( sdm );
AccessPermission ap = document.getCurrentAccessPermission();
if( ! ap.canExtractContent() )
{
throw new IOException( "You do not have permission to extract text" );
}
}
if( toConsole )
{
output = new OutputStreamWriter( System.out );
}
else
{
if( encoding != null )
{
output = new OutputStreamWriter(
new FileOutputStream( textFile ), encoding );
}
else
{
//use default encoding
output = new OutputStreamWriter(
new FileOutputStream( textFile ) );
}
}
PDFTextStripper stripper = null;
if(toHTML)
{
stripper = new PDFText2HTML();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition( sort );
stripper.setStartPage( startPage );
stripper.setEndPage( endPage );
stripper.writeText( document, output );
}
finally
{
if( output != null )
{
output.close();
}
if( document != null )
{
document.close();
}
}
}
}
/**
* This will print the usage requirements and exit.
*/
private static void usage()
{
System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
" -password <password> Password to decrypt document\n" +
" -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
" -console Send text to console instead of file\n" +
" -html Output in HTML format instead of raw text\n" +
" -sort Sort the text before writing\n" +
" -startPage <number> The first page to start extraction(1 based)\n" +
" -endPage <number> The last page to extract(inclusive)\n" +
" <PDF file> The PDF document to use\n" +
" [Text File] The file to write the text to\n"
);
System.exit( 1 );
}
public static java.lang.String pdf2xml(java.lang.String inp) {
try{
abc(inp);
}
catch( Exception e){}
a = inp;
java.lang.String out = a;
// your custom code goes here
return out;
}
}
reply me ASAP
regards
yuvaraj