[Egothor-tech] (no subject)

Matthew Tam matthewkwtam at gmail.com
Thu Nov 9 11:18:17 GMT 2006


Unfortunately, I could not solve the problem by following the instructions.
It appears that the PDFParser.java I have downloaded (attached) works
in a different way.

After a bit of hacking, I have temporarily solved the problem as follows:
1. Add 2 lines to $home/egothor.properties:

native.parser.pdf	= /usr/local/bin/pdftotext
native.parser.pdf.meta	= /usr/local/bin/pdfinfo

/* I found that PDFParser refers to these 2 keys to locate xpdf */

2. Add a few lines to setSource() in PDFParser.java after the
statement "mFilename = aSource;"

	File f = new File(mFilename);
	try {
		FileInputStream s = new FileInputStream(f);
		this.setInputStream(s);
	} catch (java.io.FileNotFoundException e)
	{
	   System.out.println("File: "+mFilename+" not found!");
	} catch (Exception e) {}

/* This provides PDFParser with the input stream */

3. Add the following at the beginning of PDFParser.java

import java.io.FileInputStream;

Any comments?

---------------------------------------------------
On 11/9/06, Leo Galambos <leo.galambos at mff.cuni.cz> wrote:
> Matthew Tam wrote:
> > Hello,
> >
> > I tried to index some PDF documents and ran into a problem.  Basically,
> > a java.lang.NullPointerException was thrown when the PDFParser tried to
> > convert the PDF file into an input stream.
> >
>
>
> Hello,
>
> I guess the issue is identical to this one:
> http://sourceforge.net/tracker/index.php?func=detail&aid=1505085&group_id=62395&atid=500424
>
> Cheers,
> Leo
>
> --
> Leo Galambos
> Faculty of Mathematics and Physics, DSE
> Malostranske namesti 25
> Prague 1
> CZE
>
> http://kocour.ms.mff.cuni.cz/~galambos/
>
> _______________________________________________
> Egothor-tech mailing list
> Egothor-tech at egothor.org
> http://www.egothor.org/mailman/listinfo/egothor-tech
>
-------------- next part --------------
package org.egothor.analyzer.pdf.nativewrapper;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * @author Edmond Nolan
 * @version 0.0.1
 */

public class PDFParser
{
  /**
   * <code>mFilename</code> is the source name handed to
   * <code>setSource()</code>. Nothing in this class reads it back.
   */
  private String mFilename = null;

  /**
   * <code>EXIT_FAILURE</code> is a constant for -1
   */
  public static final int EXIT_FAILURE = -1;

  /**
   * <code>EXIT_SUCCESS</code> is a constant for 0
   */
  public static final int EXIT_SUCCESS = 0;

  /**
   * <code>SUMMARY_LENGTH</code> is the maximum length of the summary
   * returned by <code>getSummary()</code>, in characters
   */
  public static final int SUMMARY_LENGTH = 256;

  /**
   * <code>SHORT_SUMMARY_LENGTH</code> is the maximum length, in
   * characters, of the summary prefix that <code>getTitle()</code>
   * falls back to when the document has no subject
   */
  public static final int SHORT_SUMMARY_LENGTH = 60;

  /**
   * <code>SLEEP_TIME</code> is the number of milliseconds to wait until
   * the HTML file is generated by xpdf. It is not referenced inside
   * this class.
   */
  public static final int SLEEP_TIME = 100;

  /**
   * <code>KEY_META</code> (<code>native.parser.pdf.meta</code>) is the
   * key used to look up, in $HOME/egothor.properties, the native command
   * that prints the pdf meta information
   */
  public static final String KEY_META = "native.parser.pdf.meta";

  /**
   * <code>mFullPropsName</code> will hold the value of user.home
   * plus the file separator plus "egothor.properties"
   */
  private String mFullPropsName = null;

  /**
   * <code>mNativeProgram</code> will hold location of the
   * native PDF program used by this class. The location is
   * read from $HOME/egothor.properties
   */
  private String mNativeProgram = null;

  /**
   * <code>mHomeDir</code> will hold the value of user.home
   */
  private String mHomeDir = null;

  /**
   * <code>KEY_PARSER</code> is the key used when reading
   * the $HOME/egothor.properties file to discover the native
   * parser being used
   */
  public static final String KEY_PARSER = "native.parser.pdf";

  /**
   * <code>EGOTHOR_PROPS</code> is a constant for "egothor.properties"
   */
  public static final String EGOTHOR_PROPS = "egothor.properties";

  /**
   * <code>log</code> is the logger used.
   */
  private static final Log log = LogFactory.getLog(PDFParser.class);

  /**
   * <code>mPropertyManager</code> is the PropertyManager instance. It is
   * only ever assigned through <code>setPropertyManager()</code>; the
   * constructor and the parsing code use
   * <code>PropertyManager.getInstance()</code> directly.
   */
  private PropertyManager mPropertyManager;

  /**
   * <code>mMetainfo</code> is the PDFMetaInfo instance.
   */
  private PDFMetaInfo mMetainfo;

  /**
   * <code>mInputStream</code> is the mInputStream instance, used as
   * we need to recreate the PDF file from an input stream.
   */
  private InputStream mInputStream;

  /**
   * <code>egothor-native-tmp</code> is the prefix used for all temp
   * files created by this class.
   */
  public final static String PREFIX = "egothor-native-tmp";

  /**
   * <code>pdf</code> is the suffix used for temp pdf
   * files recreated from the input streams.
   */
  public final static String SUFFIX_PDF = ".pdf";

  /**
   * <code>txt</code> is the suffix used for temp txt
   * files used to hold PDF file text contents.
   */
  public final static String SUFFIX_TXT = ".txt";

  /**
   * <code>html</code> is the suffix used for temp html
   * files used to hold the PDF meta information.
   */
  public final static String SUFFIX_HTML = ".html";

  /**
   * <code>COPY_BUFFER_SIZE</code> is the size, in bytes, of the buffer
   * used when copying or draining streams.
   */
  private static final int COPY_BUFFER_SIZE = 8192;

  /**
   * <code>META_CHARSET</code> maps every byte to the char with the same
   * value, which is what the previously used
   * <code>DataInputStream.readLine()</code> did.
   * NOTE(review): confirm the encoding the native meta tool really emits.
   */
  private static final String META_CHARSET = "ISO-8859-1";

  /**
   * <code>mTempFilePDF</code> is the handle used for temp
   * pdf files recreated from the input streams.
   */
  private File mTempFilePDF = null;

  /**
   * <code>mTempFileTXT</code> is the handle used for temp
   * text files which will contain the text of the pdf file
   */
  private File mTempFileTXT = null;

  //-------------------------------------------------------------

  public static void main(String[] args)
  {
    //TO-DO
  }

  //-------------------------------------------------------------

  /**
   * Removes stale temp files, loads $HOME/egothor.properties and reads
   * the location of the native parser from it.
   * NOTE(review): the clean-up removes every file in the temp dir whose
   * name starts with <code>PREFIX</code>, which includes the temp files
   * of any other PDFParser instance still in use -- confirm that only
   * one parser is alive at a time.
   * @throws Exception whatever the property manager throws while loading
   */
  public PDFParser() throws Exception
  {
    try
    {
      this.deleteTempFiles();
    }
    catch( Exception e )
    {
      // best effort: stale temp files must not prevent construction
      log.debug( "could not delete stale temp files", e );
    }
    mHomeDir = System.getProperty( "user.home" );
    log.debug( "using home dir: " + mHomeDir );
    mFullPropsName = mHomeDir + File.separator;
    mFullPropsName += PDFParser.EGOTHOR_PROPS;
    log.debug( "egothor props: " + mFullPropsName );

    PropertyManager.getInstance().load( mFullPropsName );
    this.mNativeProgram
      = PropertyManager.getInstance().getProperty( PDFParser.KEY_PARSER );
    log.debug( "using parser: " + mNativeProgram );

    this.mMetainfo = new PDFMetaInfo();
  }

  //-------------------------------------------------------------

  /**
   * Gets the stream last passed to <code>setInputStream()</code>
   * @return    The mInputStream value, may be null
   */
  public InputStream getInputStream()
  {
    return( mInputStream );
  }

  //-------------------------------------------------------------

  /**
   * Stores the stream and immediately processes it: the stream is copied
   * to a temp pdf file, the native parser converts that file to text and
   * the native meta tool is queried. The stream is consumed and closed.
   * @param aInputStream  the PDF document; must not be null
   * @throws IllegalArgumentException if <code>aInputStream</code> is null
   * @throws Exception if the temp files cannot be written or a native
   *         tool is not configured or cannot be started
   */
  public void setInputStream(InputStream aInputStream)
  throws Exception
  {
    if( aInputStream == null )
    {
      // fail here with a clear message instead of a bare
      // NullPointerException in the middle of the copy loop
      throw new IllegalArgumentException( "input stream must not be null" );
    }

    mInputStream = aInputStream;
    log.debug( "input stream set: " + mInputStream );

    this.createTempFiles();
    this.recreatePDFFile();
    this.parse();
    this.extractPDFMetaInfo();
  }

  //-------------------------------------------------------------

  /**
   * Gets the <code>mMetainfo</code> attribute
   * @return    The mMetainfo value
   */
  public PDFMetaInfo getMetainfo()
  {
    return mMetainfo;
  }

  //-------------------------------------------------------------

  /**
   * Sets the <code>mMetainfo</code> attribute
   * @param metainfo  The new mMetainfo value
   */
  public void setMetainfo(PDFMetaInfo metainfo)
  {
    this.mMetainfo = metainfo;
  }

  //-------------------------------------------------------------

  /**
   * Gets the <code>mPropertyManager</code> attribute
   * @return    The mPropertyManager value; null unless
   *            <code>setPropertyManager()</code> was called
   */
  public PropertyManager getPropertyManager()
  {
    return mPropertyManager;
  }

  //-------------------------------------------------------------

  /**
   * Sets the <code>mPropertyManager</code> attribute
   * @param propertyManager  The new mPropertyManager value
   */
  public void setPropertyManager(PropertyManager propertyManager)
  {
    this.mPropertyManager = propertyManager;
  }

  //-------------------------------------------------------------

  /**
   * Recreates the PDF file from input stream. The stream is copied in
   * blocks and both ends are closed even when the copy fails.
   * @throws Exception if no stream is set or the copy fails
   */
  private void recreatePDFFile() throws Exception
  {
    InputStream lInputStream = getInputStream();
    if( lInputStream == null )
    {
      throw new IllegalStateException( "no input stream has been set" );
    }

    FileOutputStream lOutputStream = null;
    try
    {
      lOutputStream = new FileOutputStream( getTempFilePDF() );

      byte[] lBuffer = new byte[ COPY_BUFFER_SIZE ];
      int lRead = lInputStream.read( lBuffer );
      while( lRead != -1 )
      {
        lOutputStream.write( lBuffer, 0, lRead );
        lRead = lInputStream.read( lBuffer );
      }

      lOutputStream.flush();
      // close explicitly so that a failing close is reported ...
      lOutputStream.close();
    }
    finally
    {
      // ... and quietly here, which is a no-op after a clean close
      closeQuietly( lInputStream );
      closeQuietly( lOutputStream );
    }
  }

  //-------------------------------------------------------------

  /**
   * Creates a temp PDF file in $TMP dir, named
   * <code>PREFIX</code> + generated part + <code>SUFFIX_PDF</code>,
   * and a temp text file. Both are registered for deletion on exit.
   * @throws IOException if a temp file cannot be created
   */
  private void createTempFiles() throws IOException
  {
    mTempFilePDF = File.createTempFile( PREFIX, SUFFIX_PDF );
    mTempFilePDF.deleteOnExit();

    mTempFileTXT = File.createTempFile( PREFIX, SUFFIX_TXT );
    mTempFileTXT.deleteOnExit();
  }

  //-------------------------------------------------------------

  /**
   * Delete the files created by <code>createTempFiles()</code>: every
   * file in java.io.tmpdir whose name starts with <code>PREFIX</code>.
   * Files that cannot be deleted are skipped.
   */

  public void deleteTempFiles()
  {
    log.debug( "deleting temp file from previous session" );
    FileFilter lFilter =
      new FileFilter()
      {
        public boolean accept(File aFile)
        {
          return( aFile.getName().startsWith( PDFParser.PREFIX ));
        }
      };

    File lTempDir = new File( System.getProperty( "java.io.tmpdir" ) );
    File[] lTempFiles = lTempDir.listFiles( lFilter );

    if( lTempFiles == null )
    {
      // listFiles() returns null when the path is not a directory or
      // an I/O error occurs
      log.debug( "cannot list temp dir: " + lTempDir );
      return;
    }

    for (int i = 0; i < lTempFiles.length; i++)
    {
      try
      {
        lTempFiles[i].getCanonicalFile().delete();
      }
      catch( Exception e)
      {
        // best effort: keep going with the remaining files
        log.debug( "could not delete " + lTempFiles[i], e );
      }
    }
  }

  //-------------------------------------------------------------

  /**
   * Gets the <code>mTempFilePDF</code> attribute of the PDFNativeParser object
   * @return    The mTempFilePDF value; null until a stream has been set
   */

  public File getTempFilePDF()
  {
    return( mTempFilePDF );
  }

  //-------------------------------------------------------------

  /**
   * Gets a handle for the html file that sits next to the temp pdf file
   * (see <code>getHTMLFileName()</code>). This class never creates
   * that file itself.
   * @return    The html file handle, or null when no name can be derived
   * @throws IOException if the canonical path cannot be resolved
   */

  public File getTempFileHTML() throws IOException
  {
    String lName = getHTMLFileName();
    if( lName == null )
    {
      return( null );
    }
    return( new File( lName ) );
  }

  //-------------------------------------------------------------

  /**
   * Gets the <code>mTempFileTXT</code> attribute of the PDFNativeParser object
   * @return    The mTempFileTXT value; null until a stream has been set
   */

  public File getTempFileTXT()
  {
    return( mTempFileTXT );
  }

  //-------------------------------------------------------------

  /**
   * Read the text from the PDF file using native parser as specified in
   * $HOME/egothor.properties file. The parser is invoked as
   * <code>program pdf-file txt-file</code> and this method returns only
   * after the parser has terminated. A failing conversion is logged and
   * otherwise ignored.
   * @throws IllegalStateException if no native parser is configured
   */

  private void parse()
  {
    String lProgram = getNativeProgram();
    log.debug( "parser executable: " + lProgram );

    if( lProgram == null || lProgram.trim().length() == 0 )
    {
      throw new IllegalStateException(
        "property " + KEY_PARSER + " is not set in " + mFullPropsName );
    }

    // One array element per argument: spaces in the program location or
    // in the temp dir need no quoting this way. Runtime.exec(String)
    // splits on whitespace and does not honour quotes.
    String[] lCommand = new String[]
    {
      lProgram,
      getTempFilePDF().getPath(),
      getTempFileTXT().getPath()
    };

    if( runCommand( lCommand ) != EXIT_SUCCESS )
    {
      log.error( "native parser failed for " + getTempFilePDF() );
    }
  }

  //-------------------------------------------------------------

  /**
   * Gets the location of the native parser
   * @return    The mNativeProgram value; null when the property
   *            <code>KEY_PARSER</code> was not found
   */
  public String getNativeProgram()
  {
    return( mNativeProgram );
  }

  //-------------------------------------------------------------

  /**
   * Runs a command line and waits until the process has terminated. The
   * line is split on whitespace, exactly as
   * <code>Runtime.exec(String)</code> splits it.
   * @param aStr  the command line
   * @return <code>EXIT_SUCCESS</code> if the process exited with status
   *         0, <code>EXIT_FAILURE</code> if it could not be started, was
   *         interrupted or exited with any other status
   */
  public int runProcess( String aStr )
  {
    log.debug( "executing " + aStr );

    String[] lCommand = null;
    try
    {
      lCommand = tokenize( aStr );
    }
    catch( Exception aException )
    {
      log.error( "caught: ", aException );
      return( EXIT_FAILURE );
    }
    return( runCommand( lCommand ) );
  }

  //-------------------------------------------------------------

  /**
   * Starts a process, discards its output and waits for it to end.
   * @param aCommand  program followed by its arguments
   * @return <code>EXIT_SUCCESS</code> or <code>EXIT_FAILURE</code>
   */
  private int runCommand( String[] aCommand )
  {
    int lReturn = EXIT_FAILURE;
    Process lProcess = null;

    try
    {
      lProcess = Runtime.getRuntime().exec( aCommand );

      // nothing is ever sent to the child
      closeQuietly( lProcess.getOutputStream() );

      // both pipes must be emptied or a chatty child blocks forever
      drainInBackground( lProcess.getErrorStream() );
      drain( lProcess.getInputStream() );

      int lExitCode = lProcess.waitFor();
      log.debug( "process exit code " + lExitCode );
      if( lExitCode == 0 )
      {
        lReturn = PDFParser.EXIT_SUCCESS;
      }
    }
    catch( InterruptedException aException )
    {
      log.error( "caught: ", aException );
      lProcess.destroy();
      // keep the interrupt visible to the caller
      Thread.currentThread().interrupt();
    }
    catch( Exception aException )
    {
      log.error( "caught: ", aException );
    }
    finally
    {
      closeProcessStreams( lProcess );
    }

    log.debug( "done / returning " + lReturn );
    return( lReturn );
  }

  //-------------------------------------------------------------

  /**
   * Derives the name of the html file from the canonical path of the
   * temp pdf file by replacing the trailing <code>SUFFIX_PDF</code> with
   * <code>SUFFIX_HTML</code>.
   * @return the html file name, or null if there is no temp pdf file or
   *         its path does not end with <code>SUFFIX_PDF</code>
   * @throws IOException if the canonical path cannot be resolved
   */
  public String getHTMLFileName() throws IOException
  {
    File lPdfFile = getTempFilePDF();
    if( lPdfFile == null )
    {
      return( null );
    }

    String lPath = lPdfFile.getCanonicalPath();
    // only the extension is replaced; a ".pdf" inside a directory name
    // must not cut the path short
    if( !lPath.endsWith( PDFParser.SUFFIX_PDF ) )
    {
      return( null );
    }

    int lEnd = lPath.length() - PDFParser.SUFFIX_PDF.length();
    return( lPath.substring( 0, lEnd ) + PDFParser.SUFFIX_HTML );
  }

  //-------------------------------------------------------------

  /**
   * Stores one <code>key: value</code> line of the meta tool output in
   * the meta info hashtable. The line is split at the first colon only,
   * so values that contain colons (dates, URLs) stay intact. Lines
   * without a colon or without a key are skipped.
   * @param aLine  one line of output, may be null
   */
  private void processLine( String aLine )
  {
    if( aLine == null )
    {
      return;
    }

    int lColon = aLine.indexOf( ':' );
    if( lColon < 0 )
    {
      return;
    }

    String lKey = aLine.substring( 0, lColon ).trim();
    String lValue = aLine.substring( lColon + 1 ).trim();
    if( lKey.length() == 0 )
    {
      return;
    }

    log.debug( "key, value pair = " + lKey + ", " + lValue );
    getMetainfo().getHashtable().put( lKey, lValue );
  }

  //-------------------------------------------------------------

  /**
   * Reads the PDF meta information and stores it in a hashtable. The
   * command configured under <code>KEY_META</code> is run with the
   * canonical path of the temp pdf file as its last argument, and each
   * line it prints is handed to <code>processLine()</code>.
   * @throws IllegalStateException if no meta command is configured
   * @throws Exception if the command cannot be started or the wait for
   *         it is interrupted
   */

  private void extractPDFMetaInfo() throws Exception
  {
    log.debug( "called" );

    String lMetaCommand =
      PropertyManager.getInstance().getProperty( PDFParser.KEY_META );
    if( lMetaCommand == null || lMetaCommand.trim().length() == 0 )
    {
      throw new IllegalStateException(
        "property " + KEY_META + " is not set in " + mFullPropsName );
    }

    // The property is split on whitespace, as Runtime.exec(String) used
    // to split it; the file path is appended as ONE argument so that a
    // temp dir containing spaces cannot break the command.
    String[] lConfigured = tokenize( lMetaCommand );
    String[] lCommand = new String[ lConfigured.length + 1 ];
    System.arraycopy( lConfigured, 0, lCommand, 0, lConfigured.length );
    lCommand[ lConfigured.length ] = getTempFilePDF().getCanonicalPath();

    log.debug(
      "meta args: " + lMetaCommand + " " + lCommand[ lConfigured.length ] );

    Process lProcess = Runtime.getRuntime().exec( lCommand );
    BufferedReader lReader = null;

    try
    {
      closeQuietly( lProcess.getOutputStream() );
      drainInBackground( lProcess.getErrorStream() );

      lReader = new BufferedReader(
        new InputStreamReader( lProcess.getInputStream(), META_CHARSET ) );

      String lLine = lReader.readLine();
      while( lLine != null )
      {
        processLine( lLine );
        lLine = lReader.readLine();
      }

      log.debug( "meta tool exit code " + lProcess.waitFor() );
    }
    catch (IOException e)
    {
      // keep whatever was parsed so far
      log.debug( "caught", e );
    }
    finally
    {
      closeQuietly( lReader );
      closeProcessStreams( lProcess );
    }
  }

  //-------------------------------------------------------------

  /**
   * Opens the temp text file written by the native parser. The caller
   * owns the returned reader and has to close it.
   * @return a new reader on the temp text file
   * @throws IOException if the file cannot be opened
   */
  public Reader getReader() throws IOException
  {
    return( new FileReader( getTempFileTXT() ) );
  }

  //-------------------------------------------------------------

  /**
   * Gets the author recorded in the meta info
   * @return    The author value of the meta info
   */
  public String getAuthor()
  {
    return( getMetainfo().getAuthor() );
  }

  //-------------------------------------------------------------

  /**
   * Gets the first <code>SUMMARY_LENGTH</code> characters of the
   * extracted text.
   * @return the summary; shorter if the text is shorter, "" if the text
   *         cannot be read
   */
  public String getSummary()
  {
    StringBuffer lBuffer = new StringBuffer();
    Reader lReader = null;

    try
    {
      lReader = getReader();

      int lChar = lReader.read();
      while( lChar != -1 && lBuffer.length() < PDFParser.SUMMARY_LENGTH )
      {
        lBuffer.append( (char) lChar );
        lChar = lReader.read();
      }
    }
    catch( Exception e )
    {
      log.debug( "cannot read summary", e );
      return( "" );
    }
    finally
    {
      // the reader wraps an open file handle
      closeQuietly( lReader );
    }

    return( lBuffer.toString() );
  }

  //-------------------------------------------------------------

  /**
   * Gets the keywords recorded in the meta info
   * @return    The keywords value of the meta info
   */
  public String getKeywords()
  {
    return( getMetainfo().getKeywords() );
  }

  //-------------------------------------------------------------

  /**
   * Gets the producer recorded in the meta info
   * @return    The producer value of the meta info
   */
  public String getProducer()
  {
    return( getMetainfo().getProducer() );
  }

  //-------------------------------------------------------------

  /**
   * Gets a title for the document: the subject from the meta info or,
   * when that is null or blank, the first
   * <code>SHORT_SUMMARY_LENGTH</code> characters of the summary.
   * NOTE(review): the subject, not a title field, is used here --
   * confirm that this is intended.
   * @return the title, possibly ""
   */
  public String getTitle()
  {
    String lTitle = getMetainfo().getSubject();
    if( lTitle != null && lTitle.trim().length() > 0 )
    {
      return( lTitle );
    }

    // the text file is only read when the fallback is really needed
    String lSummary = getSummary();
    if( lSummary.length() > PDFParser.SHORT_SUMMARY_LENGTH )
    {
      return( lSummary.substring( 0, PDFParser.SHORT_SUMMARY_LENGTH ) );
    }
    return( lSummary );
  }

  //-------------------------------------------------------------

  /**
   * Gets the creator recorded in the meta info
   * @return    The creator value of the meta info
   */
  public String getCreator()
  {
    return( getMetainfo().getCreator() );
  }

  //-------------------------------------------------------------

  /**
   * Records the name of the source. Nothing is opened or parsed here;
   * parsing is triggered by <code>setInputStream()</code> only.
   * NOTE(review): callers that only call this method never supply a
   * stream -- confirm which of the two calls the indexer relies on.
   * @param aSource  the source name
   */
  public void setSource( String aSource )
  {
    mFilename = aSource;
  }

  //-------------------------------------------------------------

  /**
   * Splits a command line on whitespace.
   * @param aCommandLine  the command line, must not be null
   * @return the tokens in order, possibly an empty array
   */
  private static String[] tokenize( String aCommandLine )
  {
    StringTokenizer lTokenizer = new StringTokenizer( aCommandLine );
    String[] lTokens = new String[ lTokenizer.countTokens() ];
    for( int i = 0; i < lTokens.length; i++ )
    {
      lTokens[i] = lTokenizer.nextToken();
    }
    return( lTokens );
  }

  //-------------------------------------------------------------

  /**
   * Reads a stream to its end and throws the data away.
   * @param aStream  the stream to empty
   */
  private static void drain( InputStream aStream )
  {
    byte[] lBuffer = new byte[ COPY_BUFFER_SIZE ];
    try
    {
      while( aStream.read( lBuffer ) != -1 )
      {
        // discard
      }
    }
    catch( IOException e )
    {
      // the stream was closed underneath us: nothing left to drain
    }
  }

  //-------------------------------------------------------------

  /**
   * Empties a stream on a daemon thread so that the caller can read a
   * second stream of the same process at the same time.
   * @param aStream  the stream to empty
   */
  private static void drainInBackground( final InputStream aStream )
  {
    Thread lThread = new Thread(
      new Runnable()
      {
        public void run()
        {
          drain( aStream );
        }
      },
      "egothor-pdf-drain" );
    lThread.setDaemon( true );
    lThread.start();
  }

  //-------------------------------------------------------------

  /**
   * Closes the three standard streams of a process, ignoring failures.
   * @param aProcess  the process, may be null
   */
  private static void closeProcessStreams( Process aProcess )
  {
    if( aProcess == null )
    {
      return;
    }
    closeQuietly( aProcess.getOutputStream() );
    closeQuietly( aProcess.getInputStream() );
    closeQuietly( aProcess.getErrorStream() );
  }

  //-------------------------------------------------------------

  /**
   * Closes a stream, ignoring null and any failure to close.
   * @param aStream  the stream, may be null
   */
  private static void closeQuietly( InputStream aStream )
  {
    if( aStream == null )
    {
      return;
    }
    try
    {
      aStream.close();
    }
    catch( IOException e )
    {
      log.debug( "close failed", e );
    }
  }

  //-------------------------------------------------------------

  /**
   * Closes a stream, ignoring null and any failure to close.
   * @param aStream  the stream, may be null
   */
  private static void closeQuietly( OutputStream aStream )
  {
    if( aStream == null )
    {
      return;
    }
    try
    {
      aStream.close();
    }
    catch( IOException e )
    {
      log.debug( "close failed", e );
    }
  }

  //-------------------------------------------------------------

  /**
   * Closes a reader, ignoring null and any failure to close.
   * @param aReader  the reader, may be null
   */
  private static void closeQuietly( Reader aReader )
  {
    if( aReader == null )
    {
      return;
    }
    try
    {
      aReader.close();
    }
    catch( IOException e )
    {
      log.debug( "close failed", e );
    }
  }

  //-------------------------------------------------------------
}


More information about the Egothor-tech mailing list