[Egothor-tech] (no subject)
Matthew Tam
matthewkwtam at gmail.com
Thu Nov 9 11:18:17 GMT 2006
Unfortunately, I could not solve the problem by following those instructions.
It appears that the PDFParser.java I have downloaded (attached) works
in a different way.
After some hacking, I have temporarily solved the problem with the following changes:
1. Add 2 lines to $home/egothor.properties:
native.parser.pdf = /usr/local/bin/pdftotext
native.parser.pdf.meta = /usr/local/bin/pdfinfo
/* I found that PDFParser refers to these 2 keys to locate xpdf */
2. Add a few lines to setSource() in PDFParser.java after the
statement "mFilename = aSource;"
File f = new File(mFilename);
try {
FileInputStream s = new FileInputStream(f);
this.setInputStream(s);
} catch (java.io.FileNotFoundException e)
{
System.out.println("File: "+mFilename+" not found!");
} catch (Exception e) {}
/* This provides PDFParser with the input stream */
3. Add the following at the beginning of PDFParser.java
import java.io.FileInputStream;
Any comments?
---------------------------------------------------
On 11/9/06, Leo Galambos <leo.galambos at mff.cuni.cz> wrote:
> Matthew Tam wrote:
> > Hello,
> >
> > I tried to index to some pdf document and I got a problem. Basically,
> > a java.lang.NullPointerException was thrown when the PDFParser try to
> > convert the pdf file into a input stream.
> >
>
>
> Hello,
>
> I guess the issue is identical to this one:
> http://sourceforge.net/tracker/index.php?func=detail&aid=1505085&group_id=62395&atid=500424
>
> Cheers,
> Leo
>
> --
> Leo Galambos
> Faculty of Mathematics and Physics, DSE
> Malostranske namesti 25
> Prague 1
> CZE
>
> http://kocour.ms.mff.cuni.cz/~galambos/
>
> _______________________________________________
> Egothor-tech mailing list
> Egothor-tech at egothor.org
> http://www.egothor.org/mailman/listinfo/egothor-tech
>
-------------- next part --------------
package org.egothor.analyzer.pdf.nativewrapper;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.StringTokenizer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Extracts text and meta information from a PDF document by delegating to
 * external command-line programs whose locations are read from
 * <code>$HOME/egothor.properties</code> (keys {@link #KEY_PARSER} and
 * {@link #KEY_META}).
 *
 * <p>Usage: construct an instance, then call
 * {@link #setInputStream(InputStream)}. That call copies the stream into a
 * temporary PDF file, runs the text converter on it, and collects the meta
 * information. Afterwards {@link #getReader()}, {@link #getSummary()},
 * {@link #getTitle()} and the other getters can be used.</p>
 *
 * <p>NOTE(review): the constructor deletes <em>every</em> file in the temp
 * directory whose name starts with {@link #PREFIX}, which includes files
 * owned by other live instances of this class. Instances therefore should
 * not be used concurrently.</p>
 *
 * @author Edmond Nolan
 * @version 0.0.1
 */
public class PDFParser
{
    /**
     * Name of the source as passed to {@link #setSource(String)}.
     * It is stored only; nothing in this class reads it.
     */
    private String mFilename = null;

    /**
     * <code>EXIT_FAILURE</code> is a constant for -1
     */
    public static final int EXIT_FAILURE = -1;

    /**
     * <code>EXIT_SUCCESS</code> is a constant for 0
     */
    public static final int EXIT_SUCCESS = 0;

    /**
     * <code>SUMMARY_LENGTH</code> is the maximum length of the summary in
     * characters
     */
    public static final int SUMMARY_LENGTH = 256;

    /**
     * <code>SHORT_SUMMARY_LENGTH</code> is the maximum length, in
     * characters, of the summary prefix used as a fallback title
     */
    public static final int SHORT_SUMMARY_LENGTH = 60;

    /**
     * <code>SLEEP_TIME</code> is the number of milliseconds to wait until
     * the HTML file is generated by xpdf. Not used inside this class.
     */
    public static final int SLEEP_TIME = 100;

    /**
     * <code>native.parser.pdf.meta</code> is the key used to look up, in
     * $HOME/egothor.properties, the command that prints the PDF meta
     * information. The value is split on whitespace, so it may carry
     * arguments; the PDF file path is appended as the last argument.
     */
    public static final String KEY_META = "native.parser.pdf.meta";

    /**
     * <code>KEY_PARSER</code> is the key used when reading
     * the $HOME/egothor.properties file to discover the native
     * parser being used. The value is treated as a single executable path.
     */
    public static final String KEY_PARSER = "native.parser.pdf";

    /**
     * <code>EGOTHOR_PROPS</code> is a constant for "egothor.properties"
     */
    public static final String EGOTHOR_PROPS = "egothor.properties";

    /**
     * <code>egothor-native-tmp</code> is the prefix used for the temp
     * files created by this class.
     */
    public final static String PREFIX = "egothor-native-tmp";

    /**
     * <code>.pdf</code> is the suffix used for temp pdf
     * files recreated from the input streams.
     */
    public final static String SUFFIX_PDF = ".pdf";

    /**
     * <code>.txt</code> is the suffix used for temp txt
     * files used to hold PDF file text contents.
     */
    public final static String SUFFIX_TXT = ".txt";

    /**
     * <code>.html</code> is the suffix used for temp html
     * files used to hold the PDF meta information.
     */
    public final static String SUFFIX_HTML = ".html";

    /** Size of the buffer used when copying or draining streams. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Charset used to decode the output of the meta-information command.
     * ISO-8859-1 maps each byte to the char with the same value, which is
     * what the previously used DataInputStream.readLine() did.
     */
    private static final String META_CHARSET = "ISO-8859-1";

    /**
     * <code>log</code> is the logger used.
     */
    private static final Log log = LogFactory.getLog(PDFParser.class);

    /**
     * <code>mFullPropsName</code> holds the value of user.home
     * plus the file separator plus "egothor.properties"
     */
    private String mFullPropsName = null;

    /**
     * <code>mNativeProgram</code> holds the location of the
     * native PDF program used by this class. The location is
     * read from $HOME/egothor.properties
     */
    private String mNativeProgram = null;

    /**
     * <code>mHomeDir</code> holds the value of user.home
     */
    private String mHomeDir = null;

    /**
     * <code>mPropertyManager</code> is only ever assigned through
     * {@link #setPropertyManager(PropertyManager)}; the constructor uses
     * <code>PropertyManager.getInstance()</code> directly.
     */
    private PropertyManager mPropertyManager;

    /**
     * <code>mMetainfo</code> is the PDFMetaInfo instance.
     */
    private PDFMetaInfo mMetainfo;

    /**
     * <code>mInputStream</code> is the stream the PDF file is recreated
     * from.
     */
    private InputStream mInputStream;

    /**
     * <code>mTempFilePDF</code> is the handle used for temp
     * pdf files recreated from the input streams.
     */
    private File mTempFilePDF = null;

    /**
     * <code>mTempFileTXT</code> is the handle used for temp
     * text files which will contain the text of the pdf file
     */
    private File mTempFileTXT = null;

    //-------------------------------------------------------------
    /**
     * Placeholder entry point; does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        //TO-DO
    }

    //-------------------------------------------------------------
    /**
     * Removes temp files left over from earlier sessions, loads
     * <code>$HOME/egothor.properties</code> and reads the location of the
     * native parser from it.
     *
     * @throws Exception if loading the properties fails
     */
    public PDFParser() throws Exception
    {
        try
        {
            this.deleteTempFiles();
        }
        catch( Exception e )
        {
            // Cleanup is best effort; a failure must not prevent parsing.
            log.debug( "could not delete old temp files", e );
        }
        mHomeDir = System.getProperty( "user.home" );
        log.debug( "using home dir: " + mHomeDir );
        mFullPropsName = mHomeDir + File.separator;
        mFullPropsName += PDFParser.EGOTHOR_PROPS;
        log.debug( "egothor props: " + mFullPropsName );
        PropertyManager.getInstance().load( mFullPropsName );
        this.mNativeProgram
            = PropertyManager.getInstance().getProperty( PDFParser.KEY_PARSER );
        log.debug( "using parser: " + mNativeProgram );
        this.mMetainfo = new PDFMetaInfo();
    }

    //-------------------------------------------------------------
    /**
     * @return the stream most recently passed to
     *         {@link #setInputStream(InputStream)}, or null
     */
    public InputStream getInputStream()
    {
        return( mInputStream );
    }

    //-------------------------------------------------------------
    /**
     * Sets the PDF input and processes it immediately: creates the temp
     * files, copies the stream into the temp PDF file (closing the stream),
     * runs the native text converter and extracts the meta information.
     *
     * @param aInputStream the PDF content; must not be null
     * @throws IllegalArgumentException if <code>aInputStream</code> is null
     * @throws IllegalStateException if the native parser is not configured
     * @throws Exception if a temp file cannot be created or written, or the
     *         meta-information command cannot be started
     */
    public void setInputStream(InputStream aInputStream)
        throws Exception
    {
        if( aInputStream == null )
        {
            throw new IllegalArgumentException( "input stream must not be null" );
        }
        mInputStream = aInputStream;
        log.debug( "input stream set: " + mInputStream );
        this.createTempFiles();
        this.recreatePDFFile();
        this.parse();
        this.extractPDFMetaInfo();
    }

    //-------------------------------------------------------------
    /**
     * @return the PDFMetaInfo instance holding the extracted meta data
     */
    public PDFMetaInfo getMetainfo()
    {
        return mMetainfo;
    }

    //-------------------------------------------------------------
    /**
     * @param metainfo the PDFMetaInfo instance to use from now on
     */
    public void setMetainfo(PDFMetaInfo metainfo)
    {
        this.mMetainfo = metainfo;
    }

    //-------------------------------------------------------------
    /**
     * @return the PropertyManager set through
     *         {@link #setPropertyManager(PropertyManager)}, or null if
     *         none was set
     */
    public PropertyManager getPropertyManager()
    {
        return mPropertyManager;
    }

    //-------------------------------------------------------------
    /**
     * @param propertyManager the PropertyManager to store
     */
    public void setPropertyManager(PropertyManager propertyManager)
    {
        this.mPropertyManager = propertyManager;
    }

    //-------------------------------------------------------------
    /**
     * Recreates the PDF file from the input stream. Both the input stream
     * and the temp file stream are closed, also when copying fails.
     *
     * @throws Exception if reading the stream or writing the file fails
     */
    private void recreatePDFFile() throws Exception
    {
        InputStream lInputStream = getInputStream();
        FileOutputStream lOutputStream = null;
        try
        {
            lOutputStream = new FileOutputStream( getTempFilePDF() );
            byte[] lBuffer = new byte[BUFFER_SIZE];
            int lRead;
            while( (lRead = lInputStream.read( lBuffer )) != -1 )
            {
                lOutputStream.write( lBuffer, 0, lRead );
            }
            lOutputStream.flush();
        }
        finally
        {
            closeQuietly( lOutputStream );
            closeQuietly( lInputStream );
        }
    }

    //-------------------------------------------------------------
    /**
     * Creates a temp PDF file and a temp text file in the default temp
     * directory, named <code>egothor-native-tmp*.pdf</code> and
     * <code>egothor-native-tmp*.txt</code>. Both are registered for
     * deletion when the JVM exits.
     *
     * @throws IOException if a temp file cannot be created
     */
    private void createTempFiles() throws IOException
    {
        mTempFilePDF = File.createTempFile( PREFIX, SUFFIX_PDF );
        mTempFilePDF.deleteOnExit();
        mTempFileTXT = File.createTempFile( PREFIX, SUFFIX_TXT );
        mTempFileTXT.deleteOnExit();
    }

    //-------------------------------------------------------------
    /**
     * Deletes every file in <code>java.io.tmpdir</code> whose name starts
     * with {@link #PREFIX}. Files that cannot be deleted are skipped.
     */
    public void deleteTempFiles()
    {
        log.debug( "deleting temp file from previous session" );
        FileFilter lFilter =
            new FileFilter()
            {
                public boolean accept(File aFile)
                {
                    return( aFile.getName().startsWith( PDFParser.PREFIX ));
                }
            };
        File lTempDir = new File( System.getProperty( "java.io.tmpdir" ) );
        File[] lTempFiles = lTempDir.listFiles( lFilter );
        if( lTempFiles == null )
        {
            // listFiles() returns null when the path is not a readable directory.
            log.debug( "cannot list temp dir: " + lTempDir );
            return;
        }
        for (int i = 0; i < lTempFiles.length; i++)
        {
            try
            {
                lTempFiles[i].getCanonicalFile().delete();
            }
            catch( Exception e )
            {
                log.debug( "could not delete " + lTempFiles[i], e );
            }
        }
    }

    //-------------------------------------------------------------
    /**
     * Gets the <code>mTempFilePDF</code> attribute of the PDFParser object
     *
     * @return the temp PDF file, or null before
     *         {@link #setInputStream(InputStream)} was called
     */
    public File getTempFilePDF()
    {
        return( mTempFilePDF );
    }

    //-------------------------------------------------------------
    /**
     * Gets a handle for the HTML file named by {@link #getHTMLFileName()}.
     * This class does not create that file itself.
     *
     * @return a File for the HTML file name
     * @throws IOException if the canonical path of the temp PDF file
     *         cannot be resolved
     */
    public File getTempFileHTML() throws IOException
    {
        return( new File( getHTMLFileName() ) );
    }

    //-------------------------------------------------------------
    /**
     * Gets the <code>mTempFileTXT</code> attribute of the PDFParser object
     *
     * @return the temp text file, or null before
     *         {@link #setInputStream(InputStream)} was called
     */
    public File getTempFileTXT()
    {
        return( mTempFileTXT );
    }

    //-------------------------------------------------------------
    /**
     * Converts the temp PDF file to text using the native parser specified
     * in the $HOME/egothor.properties file. The command is
     * <code>&lt;parser&gt; &lt;pdf file&gt; &lt;txt file&gt;</code>; each
     * part is passed as its own argument so paths containing spaces work.
     * A failing converter is logged but does not abort processing.
     *
     * @throws IllegalStateException if no native parser is configured
     */
    private void parse()
    {
        String lProgram = getNativeProgram();
        log.debug( "parser executable: " + lProgram );
        if( lProgram == null || lProgram.trim().length() == 0 )
        {
            throw new IllegalStateException(
                "property '" + KEY_PARSER + "' is not set in " + mFullPropsName );
        }
        String[] lCommand =
        {
            lProgram.trim(),
            getTempFilePDF().getPath(),
            getTempFileTXT().getPath()
        };
        if( runCommand( lCommand ) != EXIT_SUCCESS )
        {
            log.error( "native parser failed for " + getTempFilePDF() );
        }
    }

    //-------------------------------------------------------------
    /**
     * @return the native parser location read from the properties file,
     *         or null if the property is not set
     */
    public String getNativeProgram()
    {
        return( mNativeProgram );
    }

    //-------------------------------------------------------------
    /**
     * Runs a command line and waits for it to finish. The string is split
     * into arguments on whitespace (quotes are not interpreted), exactly as
     * <code>Runtime.exec(String)</code> does.
     *
     * @param aStr the command line to execute
     * @return {@link #EXIT_SUCCESS} if the process ran and exited with
     *         code 0, otherwise {@link #EXIT_FAILURE}
     */
    public int runProcess( String aStr )
    {
        log.debug( "executing " + aStr );
        if( aStr == null )
        {
            log.error( "no command given" );
            return( EXIT_FAILURE );
        }
        StringTokenizer lTokens = new StringTokenizer( aStr );
        String[] lCommand = new String[lTokens.countTokens()];
        for( int i = 0; i < lCommand.length; i++ )
        {
            lCommand[i] = lTokens.nextToken();
        }
        return( runCommand( lCommand ) );
    }

    //-------------------------------------------------------------
    /**
     * Starts the given command, discards its output and waits until it
     * terminates.
     *
     * @param aCommand the program followed by its arguments
     * @return {@link #EXIT_SUCCESS} if the process exited with code 0,
     *         otherwise {@link #EXIT_FAILURE}
     */
    private int runCommand( String[] aCommand )
    {
        int lReturn = EXIT_FAILURE;
        Process lProcess = null;
        try
        {
            ProcessBuilder lBuilder = new ProcessBuilder( aCommand );
            // Merge stderr into stdout so one loop can drain both; an
            // undrained pipe could otherwise block the child forever.
            lBuilder.redirectErrorStream( true );
            lProcess = lBuilder.start();
            closeQuietly( lProcess.getOutputStream() );
            drain( lProcess.getInputStream() );
            int lExitCode = lProcess.waitFor();
            if( lExitCode == 0 )
            {
                lReturn = EXIT_SUCCESS;
            }
            else
            {
                log.error( "process exited with code " + lExitCode );
            }
        }
        catch( InterruptedException aInterrupted )
        {
            Thread.currentThread().interrupt();
            log.error( "caught: ", aInterrupted );
        }
        catch( Exception aException )
        {
            log.error( "caught: ", aException );
        }
        finally
        {
            if( lProcess != null )
            {
                lProcess.destroy();
            }
        }
        log.debug( "done / returning " + lReturn );
        return( lReturn );
    }

    //-------------------------------------------------------------
    /**
     * Reads the stream until end of file and discards the data.
     *
     * @param aStream the stream to empty
     * @throws IOException if reading fails
     */
    private static void drain( InputStream aStream ) throws IOException
    {
        byte[] lBuffer = new byte[BUFFER_SIZE];
        while( aStream.read( lBuffer ) != -1 )
        {
            // discard
        }
    }

    //-------------------------------------------------------------
    /**
     * Closes the resource, ignoring a null argument and logging (but not
     * propagating) a failure to close.
     *
     * @param aCloseable the resource to close, may be null
     */
    private static void closeQuietly( Closeable aCloseable )
    {
        if( aCloseable == null )
        {
            return;
        }
        try
        {
            aCloseable.close();
        }
        catch( IOException e )
        {
            log.debug( "close failed", e );
        }
    }

    //-------------------------------------------------------------
    /**
     * Derives the HTML file name from the canonical path of the temp PDF
     * file by replacing the last <code>.pdf</code> with <code>.html</code>.
     *
     * @return the HTML file name, or null if the PDF path contains no
     *         <code>.pdf</code>
     * @throws IOException if the canonical path cannot be resolved
     */
    public String getHTMLFileName() throws IOException
    {
        String lPath = getTempFilePDF().getCanonicalPath();
        // lastIndexOf: a directory named e.g. "my.pdfs" must not be mistaken
        // for the file suffix.
        int lEnd = lPath.lastIndexOf( PDFParser.SUFFIX_PDF );
        if( lEnd == -1 )
        {
            return( null );
        }
        return( lPath.substring( 0, lEnd ) + PDFParser.SUFFIX_HTML );
    }

    //-------------------------------------------------------------
    /**
     * Parses one <code>key: value</code> line of meta information and
     * stores the trimmed pair in the meta-info hashtable. The line is split
     * at the first colon only, so values may themselves contain colons
     * (for example timestamps). Lines without a colon or with an empty key
     * are ignored.
     *
     * @param aLine one line printed by the meta-information command
     */
    private void processLine( String aLine )
    {
        if( aLine == null )
        {
            return;
        }
        int lColon = aLine.indexOf( ':' );
        if( lColon == -1 )
        {
            return;
        }
        String lKey = aLine.substring( 0, lColon ).trim();
        String lValue = aLine.substring( lColon + 1 ).trim();
        if( lKey.length() == 0 )
        {
            return;
        }
        log.debug( "key, value pair = " + lKey + ", " + lValue );
        getMetainfo().getHashtable().put( lKey, lValue );
    }

    //-------------------------------------------------------------
    /**
     * Runs the meta-information command on the temp PDF file and stores
     * every <code>key: value</code> line it prints in the meta-info
     * hashtable. An I/O error while reading the output is logged and ends
     * the extraction without failing.
     *
     * <p>Only the command's standard output is read; its error output is
     * not consumed.</p>
     *
     * @throws Exception if the meta command is not configured, cannot be
     *         started, or the thread is interrupted while waiting for it
     */
    private void extractPDFMetaInfo() throws Exception
    {
        log.debug( "called" );
        String lMetaProgram =
            PropertyManager.getInstance().getProperty( PDFParser.KEY_META );
        if( lMetaProgram == null || lMetaProgram.trim().length() == 0 )
        {
            throw new IOException(
                "property '" + KEY_META + "' is not set in " + mFullPropsName );
        }
        // Split the configured command on whitespace (it may carry
        // arguments), but keep the PDF path as one argument even if it
        // contains spaces.
        StringTokenizer lTokens = new StringTokenizer( lMetaProgram );
        String[] lCommand = new String[lTokens.countTokens() + 1];
        int lIndex = 0;
        while( lTokens.hasMoreTokens() )
        {
            lCommand[lIndex++] = lTokens.nextToken();
        }
        lCommand[lIndex] = getTempFilePDF().getCanonicalPath();
        log.debug( "meta args: " + lMetaProgram + " " + lCommand[lIndex] );

        Process lProcess = new ProcessBuilder( lCommand ).start();
        BufferedReader lReader = null;
        try
        {
            closeQuietly( lProcess.getOutputStream() );
            lReader = new BufferedReader(
                new InputStreamReader( lProcess.getInputStream(), META_CHARSET ) );
            String lLine;
            while( (lLine = lReader.readLine()) != null )
            {
                processLine( lLine );
            }
            lProcess.waitFor();
        }
        catch( IOException e )
        {
            log.debug( "caught", e );
        }
        catch( InterruptedException e )
        {
            Thread.currentThread().interrupt();
            throw e;
        }
        finally
        {
            closeQuietly( lReader );
            lProcess.destroy();
        }
    }

    //-------------------------------------------------------------
    /**
     * Opens the temp text file produced by the native parser. The caller
     * is responsible for closing the returned reader.
     *
     * @return a new reader on the temp text file
     * @throws IOException if the file cannot be opened
     */
    public Reader getReader() throws IOException
    {
        return( new FileReader( getTempFileTXT() ) );
    }

    //-------------------------------------------------------------
    /**
     * @return the author as reported by the meta info
     */
    public String getAuthor()
    {
        return( getMetainfo().getAuthor() );
    }

    //-------------------------------------------------------------
    /**
     * Returns the first {@link #SUMMARY_LENGTH} characters of the
     * extracted text (fewer if the text is shorter).
     *
     * @return the summary, or an empty string if the text cannot be read
     */
    public String getSummary()
    {
        StringBuffer lBuffer = new StringBuffer();
        Reader lReader = null;
        try
        {
            lReader = getReader();
            int lChar;
            while( lBuffer.length() < PDFParser.SUMMARY_LENGTH
                   && (lChar = lReader.read()) != -1 )
            {
                lBuffer.append( (char) lChar );
            }
        }
        catch( Exception e )
        {
            log.debug( "could not read summary", e );
            return( "" );
        }
        finally
        {
            closeQuietly( lReader );
        }
        return( lBuffer.toString() );
    }

    //-------------------------------------------------------------
    /**
     * @return the keywords as reported by the meta info
     */
    public String getKeywords()
    {
        return( getMetainfo().getKeywords() );
    }

    //-------------------------------------------------------------
    /**
     * @return the producer as reported by the meta info
     */
    public String getProducer()
    {
        return( getMetainfo().getProducer() );
    }

    //-------------------------------------------------------------
    /**
     * Returns a title for the document. The value of the meta info's
     * subject is used; if it is null or blank, the first
     * {@link #SHORT_SUMMARY_LENGTH} characters of the summary are used
     * instead.
     * NOTE(review): the title is taken from getSubject(), not from a title
     * field - confirm this is intended.
     *
     * @return the title, possibly an empty string
     */
    public String getTitle()
    {
        String lTitle = getMetainfo().getSubject();
        if( lTitle != null && lTitle.trim().length() > 0 )
        {
            return( lTitle );
        }
        // Read the summary only when it is needed as a fallback.
        String lSummary = getSummary();
        int lLength = Math.min( lSummary.length(), PDFParser.SHORT_SUMMARY_LENGTH );
        return( lSummary.substring( 0, lLength ) );
    }

    //-------------------------------------------------------------
    /**
     * @return the creator as reported by the meta info
     */
    public String getCreator()
    {
        return( getMetainfo().getCreator() );
    }

    //-------------------------------------------------------------
    /**
     * Remembers the name of the source. This does not open or parse
     * anything; parsing is triggered only by
     * {@link #setInputStream(InputStream)}.
     *
     * @param aSource the source name to remember
     */
    public void setSource( String aSource )
    {
        mFilename = aSource;
    }
    //-------------------------------------------------------------
}
More information about the Egothor-tech
mailing list