root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3385

Revision 3385, 5.6 kB (checked in by jerome, 14 years ago)

Rebuilt the PDF parsing engine to correctly account for redacted parts.
Around 25% slower than previous method, but more accurate.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Auth Date Id Rev
Line 
1# -*- coding: UTF-8 -*-
2#
3# pkpgcounter : a generic Page Description Language parser
4#
5# (c) 2003, 2004, 2005, 2006, 2007, 2008 Jerome Alet <alet@librelogiciel.com>
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program.  If not, see <http://www.gnu.org/licenses/>.
18#
19# $Id$
20#
21
22"""This module implements a page counter for PDF documents.
23
24   Some information taken from PDF Reference v1.7 by Adobe.
25"""
26
27import re
28
29import pdlparser
30
# The PDF whitespace characters, listed by character code :
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join(map(chr, (0, 9, 10, 12, 13, 32)))

# Characters which delimit tokens in PDF.
PDFDELIMITERS = r"()<>[]{}/%"

# A comment starts with this character and runs up to the next EOL.
PDFCOMMENT = r"%"

# What a page object begins with, where each space stands for
# any whitespace character.
PDFPAGEMARKER = "<< /Type /Page "

# Only an example of a media size entry : one MUST be present
# in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"

# Regular expression meant to extract objects.
# Doesn't work as expected.
PDFOBJREGEX = r"\s+(\d+)\s+(\d+)\s+(obj\s*.+\s*endobj)"
45
class PDFObject:
    """Container for the data describing a single PDF object."""
    def __init__(self, major, minor, description):
        """Stores the object's numbers and description.

           major and minor are kept both as received and converted
           with int() (attributes majori and minori), so they must be
           acceptable to int().
           The comments, content and kids lists start empty, and
           parent starts as None.
        """
        self.description = description
        self.major, self.minor = major, minor
        self.majori, self.minori = int(major), int(minor)
        self.parent = None
        # Each instance gets its own fresh lists.
        self.comments, self.content, self.kids = [], [], []
59       
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

       The public class attributes below are presumably read by the
       pdlparser.PDLParser base class, which is not visible from this
       module : verify their exact use there.
    """
    # Ghostscript command line template using the tiff24nc device, with
    # %(dpi)i, %(outfname)s and %(infname)s placeholders.
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    required = [ "gs" ]
    openmode = "rU"
    format = "PDF"

    # Regular expression to extract objects from a PDF document, as
    # (major number, minor number, content) groups.
    _objectregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)",
                               re.DOTALL)

    # Regular expression indicating a new page.
    _newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")

    # Regular expression indicating an empty page
    # (usually to delete an existing one with a lower minor number).
    _emptypageregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj")

    def isValid(self):
        """Returns True if data is PDF, else False.

           Data is considered to be PDF if self.firstblock contains
           the "%PDF-" marker anywhere, or if a "\\033%-12345X" header
           is found in its first 128 characters together with a case
           insensitive "LANGUAGE=PDF" statement anywhere in the block.
        """
        firstblock = self.firstblock
        # A block which starts with "%PDF-", directly or right after
        # the "\033%-12345X" header, necessarily contains "%PDF-", so
        # this single test covers these cases as well.
        if "%PDF-" in firstblock:
            return True
        return ("\033%-12345X" in firstblock[:128]) \
               and ("LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document.

           This method works great in the general case,
           and is around 30 times faster than the active
           one.
           Unfortunately it doesn't take into account documents
           with redacted pages (only made with FrameMaker ?)

           Reads the whole of self.infile and returns the number
           of new page markers found in it.
        """
        return len(self._newpageregexp.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

           A faster way seems to be possible by extracting the
           "/Type/Pages/Count xxxx" value where there's no /Parent
           (i.e. the root of the page tree)
           Unfortunately no regexp is known to work for this currently.

           At least the actual method below is accurate, even if 25%
           slower than the old one.

           Reads the whole of self.infile and returns the number of
           pages as an integer.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        # With equal minor numbers, the last one found wins.
        objtokeep = {}
        for (smajor, sminor, content) in self._objectregexp.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # The first time a major number is seen is tested for
            # explicitly : comparing an integer to None only works
            # by accident in Python 2, and fails in Python 3.
            if (previous is None) or (minor >= previous[0]):
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (minor, content) in objtokeep.values():
            count = len(self._newpageregexp.findall(content))
            if count:
                # Objects without any page are not searched
                # for empty pages at all.
                pagecount += count - len(self._emptypageregexp.findall(content))
        return pagecount
Note: See TracBrowser for help on using the browser.