Show
Ignore:
Timestamp:
06/19/04 00:21:27 (20 years ago)
Author:
jalet
Message:

Native PDF parser greatly improved.
GhostScript based PDF parser completely removed because native code
is now portable across Python versions.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • pykota/trunk/pykota/pdlanalyzer.py

    r1551 r1552  
    2222# 
    2323# $Log$ 
     24# Revision 1.9  2004/06/18 22:21:27  jalet 
     25# Native PDF parser greatly improved. 
     26# GhostScript based PDF parser completely removed because native code 
     27# is now portable across Python versions. 
     28# 
    2429# Revision 1.8  2004/06/18 20:49:46  jalet 
    2530# "ERROR:" prefix added 
     
    5156import sys 
    5257import os 
     58import re 
    5359import struct 
    5460import tempfile 
     
    7581        """Count pages in a DSC compliant PostScript document.""" 
    7682        pagecount = 0 
    77         while 1 : 
    78             line = self.infile.readline() 
    79             if not line : 
    80                 break 
     83        for line in self.infile.xreadlines() :  
    8184            if line.startswith("%%Page: ") : 
    8285                pagecount += 1 
     
    8790        """Initialize PDF Analyzer.""" 
    8891        self.infile = infile 
    89         try : 
    90             if float(sys.version[:3]) >= 2.3 : 
    91                 self.getJobSize = self.native_getJobSize 
    92             else :     
    93                 self.getJobSize = self.gs_getJobSize 
    94         except : 
    95             self.getJobSize = self.gs_getJobSize 
    9692                 
    97     def native_getJobSize(self) :     
    98         """Counts pages in a PDF document natively.""" 
     93    def getJobSize(self) :     
     94        """Counts pages in a PDF document.""" 
     95        regexp = re.compile(r"(/Type) ?(/Page)[/ \r\n]") 
    9996        pagecount = 0 
    100         content = [] 
    101         while 1 :      
    102             line = self.infile.readline() 
    103             if not line : 
    104                 break 
    105             line = line.strip() 
    106             content.append(line) 
    107             if line.endswith("endobj") : 
    108                 pagecount += " /".join([x.strip() for x in " ".join(content).split("/")]).count(" /Type /Page ") 
    109                 content = [] 
    110         return pagecount     
    111          
    112     def gs_getJobSize(self) :     
    113         """Counts pages in a PDF document using GhostScript to convert PDF to PS.""" 
    114         MEGABYTE = 1024*1024 
    115         child = popen2.Popen4("gs -q -dNOPAUSE -dBATCH -dSAFER -sDEVICE=pswrite -sOutputFile=- -c save pop -f - 2>/dev/null") 
    116         try : 
    117             data = self.infile.read(MEGABYTE)     
    118             while data : 
    119                 child.tochild.write(data) 
    120                 data = self.infile.read(MEGABYTE) 
    121             child.tochild.flush() 
    122             child.tochild.close()     
    123         except (IOError, OSError), msg :     
    124             raise PDLAnalyzerError, "Unable to convert PDF input to PS with GhostScript : %s" % msg 
    125          
    126         psanalyzer = PostScriptAnalyzer(child.fromchild) 
    127         pagecount = psanalyzer.getJobSize() 
    128         child.fromchild.close() 
    129         try : 
    130             retcode = child.wait() 
    131         except OSError, msg :     
    132             self.filter.logger.log_message(_("Problem while waiting for PDF to PS converter (GhostScript pid %s) to exit : %s") % (child.pid, msg)) 
    133         else :     
    134             if os.WIFEXITED(retcode) : 
    135                 status = os.WEXITSTATUS(retcode) 
    136             else :     
    137                 status = retcode 
    138             if status :     
    139                 raise PDLAnalyzerError, "PDF to PS converter (GhostScript pid %s) exit code is %s" % (child.pid, repr(status)) 
     97        for line in self.infile.xreadlines() :  
     98            pagecount += len(regexp.findall(line)) 
    14099        return pagecount     
    141100         
     
    458417        else :     
    459418            # normal file 
    460             self.infile = open(self.filename, "rbU") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
     419            self.infile = open(self.filename, "rb") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
    461420            self.mustclose = 1 
    462421            return 
     
    464423        # Use a temporary file, always seekable contrary to standard input. 
    465424        # This also has the benefit to let us use the "U" mode (new in Python 2.3) 
    466         self.infile = tempfile.TemporaryFile(mode="w+bU")   # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
     425        self.infile = tempfile.TemporaryFile(mode="w+b")   # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
    467426        while 1 : 
    468427            data = infile.read(MEGABYTE)