Show
Ignore:
Timestamp:
06/22/08 01:38:04 (14 years ago)
Author:
jerome
Message:

Did some work to improve the PDF parser: added a very fast method
(26 times faster than the original one) which doesn't work with some
"strange" documents like the PCL developers' guide, and a slow method
which extracts objects from PDF documents and correctly handles object
versioning (more cleanup work is needed).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • pkpgcounter/trunk/pkpgpdls/pdf.py

    r564 r3384  
    2020# 
    2121 
    22 """This modules implements a page counter for PDF documents.""" 
     22"""This modules implements a page counter for PDF documents. 
     23 
     24   Some informations taken from PDF Reference v1.7 by Adobe. 
     25""" 
    2326 
    2427import re 
    2528 
    2629import pdlparser 
     30 
     31PDFWHITESPACE = chr(0) \ 
     32                + chr(9) \ 
     33                + chr(10) \ 
     34                + chr(12) \ 
     35                + chr(13) \ 
     36                + chr(32) 
     37                  
     38PDFDELIMITERS = r"()<>[]{}/%"                  
     39PDFCOMMENT = r"%"        # Up to next EOL 
     40 
     41PDFPAGEMARKER = "<< /Type /Page " # Where spaces are any whitespace char 
     42 
     43PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]" # an example. MUST be present in Page objects 
     44PDFOBJREGEX = r"\s+(\d+)\s+(\d+)\s+(obj\s*.+\s*endobj)" # Doesn't work as expected 
    2745 
    2846class PDFObject : 
     
    106124                pagecount += count 
    107125        return pagecount     
     126         
     127    def veryFastAndNotAlwaysCorrectgetJobSize(self) :     
     128        """Counts pages in a PDF document.""" 
     129        newpageregexp = re.compile(r"/Type\s*/Page[/>\s]") 
     130        return len(newpageregexp.findall(self.infile.read())) 
     131 
     132    def thisOneIsSlowButCorrectgetJobSize(self) : 
     133        """Counts pages in a PDF document.""" 
     134        oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)", \ 
     135                             re.DOTALL) 
     136        objtokeep = {} 
     137        for (smajor, sminor, content) in oregexp.findall(self.infile.read()) : 
     138            major = int(smajor) 
     139            minor = int(sminor) 
     140            (prevmin, prevcont) = objtokeep.get(major, (None, None)) 
     141            if (minor >= prevmin) : # Handles both None and real previous minor 
     142                objtokeep[major] = (minor, content) 
     143                #if prevmin is not None : 
     144                #    self.logdebug("Object %i.%i overwritten with %i.%i" \ 
     145                #                     % (major, prevmin, \ 
     146                #                        major, minor)) 
     147                #else : 
     148                #    self.logdebug("Object %i.%i OK" % (major, minor)) 
     149        npregexp = re.compile(r"/Type\s*/Page[/>\s]") 
     150        pagecount = 0 
     151        for (major, (minor, content)) in objtokeep.items() : 
     152            count = len(npregexp.findall(content)) 
     153            if count : 
     154                emptycount = content.count("obj\n<< \n/Type /Page \n>> \nendobj") + content.count("obj\n<< \n/Type /Page \n\n>> \nendobj") # TODO : make this clean 
     155                if not emptycount : 
     156                    self.logdebug("%i.%i : %s\n" % (major, minor, repr(content))) 
     157                pagecount += count - emptycount 
     158        return pagecount