root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3558

Revision 3474, 5.1 kB (checked in by jerome, 16 years ago)

Changed copyright years.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Auth Date Id Rev
Line 
1# -*- coding: utf-8 -*-
2#
3# pkpgcounter : a generic Page Description Language parser
4#
5# (c) 2003-2009 Jerome Alet <alet@librelogiciel.com>
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program.  If not, see <http://www.gnu.org/licenses/>.
18#
19# $Id$
20#
21
"""This module implements a page counter for PDF documents.

   Some information taken from PDF Reference v1.7 by Adobe.
"""
26
27import re
28
29import pdlparser
30
# The six white-space characters of PDF syntax, in code point order:
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join(chr(code) for code in (0, 9, 10, 12, 13, 32))

# Characters that delimit tokens in PDF syntax.
PDFDELIMITERS = r"()<>[]{}/%"

# An example only, kept for reference : a MediaBox entry of this
# shape MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"
39
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

    Page counting works on the raw text of the file : indirect objects
    are extracted with regular expressions, superseded revisions of an
    object are discarded, and the remaining Page objects are counted.
    """
    # Ghostscript command line template rendering the input file to a
    # 24 bits colour TIFF file. The %(dpi)i, %(outfname)s and
    # %(infname)s placeholders are filled in elsewhere (presumably by
    # the base class; not visible from this module).
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    # External programs the command above depends on.
    required = [ "gs" ]
    # Mode string for opening the input file : text mode with universal
    # newlines (a Python 2 mode string).
    openmode = "rU"
    # Name of the format handled by this parser.
    format = "PDF"

    def isValid(self):
        """Returns True if data is PDF, else False.

        The first block of data is recognized as PDF when either :

          - it contains the "%PDF-" marker anywhere (this also covers
            the cases where the block starts with the marker, with or
            without a leading "ESC%-12345X" sequence) ;
          - or the "ESC%-12345X" sequence appears within its first
            128 characters and "LANGUAGE=PDF" appears anywhere in the
            block, whatever its case.
        """
        firstblock = self.firstblock
        if "%PDF-" in firstblock:
            return True
        return ("\033%-12345X" in firstblock[:128]) \
               and ("LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document, quickly but less accurately.

        Every "/Type /Page" entry found in the whole input is counted
        as one page. According to the original author this works great
        in the general case and is around 30 times faster than
        getJobSize().

        Unfortunately it doesn't take into account documents with
        redacted pages (only made with FrameMaker ?), where an existing
        PDF object is replaced with one having the same major number
        and a higher minor number : such pages get counted too.

        Returns the number of matches as an integer.
        """
        newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")
        return len(newpageregexp.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

        The whole input is read, then :

          1. all "major minor obj ... endobj" objects are extracted ;
          2. for each major number only the object with the highest
             minor number is kept (the last one met wins on equal
             minor numbers), since this is the way in PDF to replace
             an existing object ;
          3. the kept objects which are pages are counted, minus the
             empty ones, which are only used to redact existing pages.

        A faster way seems to be possible by extracting the
        "/Type/Pages/Count xxxx" value where there's no /Parent
        (i.e. the root of the page tree), but the original author
        couldn't make a regexp work for this.

        This method is accurate, even if about 25% slower than
        veryFastAndNotAlwaysCorrectgetJobSize(), and leaves room to
        extract other information later, like orientation and size.

        Returns the number of pages as an integer.
        """
        # Regular expression to extract objects from a PDF document :
        # the groups are the major number, the minor number, and the
        # object's text from "obj" up to the first following "endobj".
        oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)",
                             re.DOTALL)

        # Regular expression indicating a new page. The trailing
        # character class rejects "/Type /Pages" (page tree nodes).
        npregexp = re.compile(r"/Type\s*/Page[/>\s]")

        # Regular expression indicating an empty page
        # (usually to delete an existing one with a lower minor number)
        epregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj")

        # First we build a mapping of objects to keep : major number
        # to (minor number, content) for the revision to keep.
        objtokeep = {}
        for (smajor, sminor, content) in oregexp.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # Explicit test for "never seen before" : comparing an
            # integer with None only works by accident in Python 2
            # and raises TypeError in Python 3.
            if (previous is None) or (minor >= previous[0]):
                objtokeep[major] = (minor, content)

        # Now that all superseded objects are gone, we can count the
        # ones which are new pages, minus the ones which are empty and
        # not displayed pages (in fact pages used to redact existing
        # content).
        pagecount = 0
        for (minor, content) in objtokeep.values():
            count = len(npregexp.findall(content))
            if count:
                pagecount += count - len(epregexp.findall(content))
        return pagecount
Note: See TracBrowser for help on using the browser.