Changeset 1550

Show
Ignore:
Timestamp:
06/18/04 19:48:04 (20 years ago)
Author:
jalet
Message:

Added native fast PDF parsing method

Location:
pykota/trunk
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • pykota/trunk/NEWS

    r1549 r1550  
    27 27          printers. Thanks to Ryan Suarez. 
    28 28         
    29         - Smart PDL analyzer now recognizes PDF too. 
     29        - Smart PDL analyzer now recognizes PDF too : two 
     30          PDF parsing methods are used : a native one which 
     31          is very fast but requires Python 2.3 or above, and 
     32          a slower one which uses Ghostscript. PyKota detects 
     33          automatically which method to use depending on your 
     34          version of Python. 
    30 35         
    31 36        - All tracebacks now include PyKota's version number. 
  • pykota/trunk/pykota/pdlanalyzer.py

    r1547 r1550  
    22 22# 
    23 23# $Log$ 
     24# Revision 1.7  2004/06/18 17:48:04  jalet 
     25# Added native fast PDF parsing method 
     26# 
    24 27# Revision 1.6  2004/06/18 14:00:16  jalet 
    25 28# Added PDF support in smart PDL analyzer (through GhostScript for now) 
     
    81 84        """Initialize PDF Analyzer.""" 
    82 85        self.infile = infile 
    83          
    84     def getJobSize(self) :     
    85         """Counts pages in a PDF document. TODO : don't use GhostScript in the future.""" 
     86        try : 
     87            if float(sys.version[:3]) >= 2.3 : 
     88                self.getJobSize = self.native_getJobSize 
     89            else :     
     90                self.getJobSize = self.gs_getJobSize 
     91        except : 
     92            self.getJobSize = self.gs_getJobSize 
     93                 
     94    def native_getJobSize(self) :     
     95        """Counts pages in a PDF document natively.""" 
     96        pagecount = 0 
     97        content = [] 
     98        while 1 :      
     99            line = self.infile.readline() 
     100            if not line : 
     101                break 
     102            line = line.strip() 
     103            content.append(line) 
     104            if line.endswith("endobj") : 
     105                pagecount += " /".join([x.strip() for x in " ".join(content).split("/")]).count(" /Type /Page ") 
     106                content = [] 
     107        return pagecount     
     108         
     109    def gs_getJobSize(self) :     
     110        """Counts pages in a PDF document using GhostScript to convert PDF to PS.""" 
    86 111        MEGABYTE = 1024*1024 
    87 112        child = popen2.Popen4("gs -q -dNOPAUSE -dBATCH -dSAFER -sDEVICE=pswrite -sOutputFile=- -c save pop -f - 2>/dev/null") 
     
    421 446    def openFile(self) :     
    422 447        """Opens the job's data stream for reading.""" 
    423         self.mustclose = 1 
     448        self.mustclose = 0  # by default we don't want to close the file when finished 
    424 449        if hasattr(self.filename, "read") and hasattr(self.filename, "seek") : 
    425 450            # filename is in fact a file-like object  
    426             self.infile = self.filename 
    427             self.mustclose = 0  # we don't want to close this file when finished 
     451            infile = self.filename 
    428 452        elif self.filename == "-" : 
    429 453            # we must read from stdin 
    430             # but since stdin is not seekable, we have to use a temporary 
    431             # file instead. 
    432             self.infile = tempfile.TemporaryFile() 
    433             while 1 : 
    434                 data = sys.stdin.read(MEGABYTE)  
    435                 if not data : 
    436                     break 
    437                 self.infile.write(data) 
    438             self.infile.flush()     
    439             self.infile.seek(0) 
     454            infile = sys.stdin 
    440 455        else :     
    441 456            # normal file 
    442             self.infile = open(self.filename, "rb") 
     457            self.infile = open(self.filename, "rbU") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
     458            self.mustclose = 1 
     459            return 
     460             
     461        # Use a temporary file, always seekable contrary to standard input. 
     462        # This also has the benefit to let us use the "U" mode (new in Python 2.3) 
     463        self.infile = tempfile.TemporaryFile(mode="w+bU")   # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 
     464        while 1 : 
     465            data = infile.read(MEGABYTE)  
     466            if not data : 
     467                break 
     468            self.infile.write(data) 
     469        self.infile.flush()     
     470        self.infile.seek(0) 
    443 471             
    444 472    def closeFile(self) :