root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 491

Revision 491, 4.5 kB (checked in by jerome, 16 years ago)

Major code cleaning. Now clearer, although probably a bit slower since
a file can be opened several times.
Now universal newline opening mode is only used when needed (PS, PDF and plain
text), and binary opening mode is used for the other formats.
This means we will be able to remove mmap calls wherever possible, finally.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Auth Date Id Rev
Line 
1#! /usr/bin/env python
2# -*- coding: ISO-8859-15 -*-
3#
4# pkpgcounter : a generic Page Description Language parser
5#
6# (c) 2003, 2004, 2005, 2006, 2007 Jerome Alet <alet@librelogiciel.com>
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program.  If not, see <http://www.gnu.org/licenses/>.
19#
20# $Id$
21#
22
23"""This module implements a page counter for PDF documents."""
24
25import re
26
27import pdlparser
28
class PDFObject(object) :
    """A single object read from a PDF file.

    Attributes:
      major, minor     -- the two numbers of the object header, kept
                          exactly as they were passed in.
      majori, minori   -- the same two numbers converted with int().
      description      -- free text attached to the object by the caller.
      comments         -- list, empty at creation, filled by the caller.
      content          -- list, empty at creation, filled by the caller.
      parent           -- None at creation.
      kids             -- list, empty at creation.
    """
    def __init__(self, major, minor, description) :
        """Initialize the PDF object.

        major and minor must be acceptable to int(), otherwise the
        exception raised by int() propagates to the caller.
        """
        self.major = major
        self.minor = minor
        self.majori = int(major)
        self.minori = int(minor)
        self.description = description
        # Each instance gets its own fresh containers.
        self.comments = []
        self.content = []
        self.parent = None
        self.kids = []

    def __repr__(self) :
        """Returns a short textual form of the object, useful when debugging."""
        return "PDFObject(%r, %r, %r)" % (self.major, self.minor, self.description)
42       
class Parser(pdlparser.PDLParser) :
    """A parser for PDF documents."""
    # Command template with %(dpi)i and %(fname)s placeholders.
    # NOTE(review): presumably expanded and run by the base class to render
    # the input to TIFF through Ghostscript -- confirm in pdlparser.
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r%(dpi)i -sOutputFile="%(fname)s" -' ]
    # NOTE(review): presumably the mode the base class uses to open the
    # input file -- confirm in pdlparser.
    openmode = "rU"

    def isValid(self) :
        """Returns True if data is PDF, else False.

        Only self.firstblock is examined. The data is accepted when it
        contains "%PDF-" anywhere, or when the "\\033%-12345X" sequence
        appears in its first 128 characters together with "LANGUAGE=PDF"
        (in any letter case) anywhere in the block.
        """
        block = self.firstblock
        # Searching for "%PDF-" anywhere also covers the marker at offset 0
        # and the marker placed right after "\033%-12345X", so no separate
        # startswith() tests are needed.
        haspdfmarker = block.find("%PDF-") != -1
        announcedaspdf = (block[:128].find("\033%-12345X") != -1) \
                         and (block.upper().find("LANGUAGE=PDF") != -1)
        if haspdfmarker or announcedaspdf :
            self.logdebug("DEBUG: Input file is in the PDF format.")
            return True
        return False

    def _keepLatest(self, objects, obj) :
        """Stores obj into objects unless a higher minor number is already kept.

        objects maps the major number, exactly as read from the file, to
        the PDFObject kept for it. When both minor numbers are equal the
        object met last wins; use > instead of >= to keep the first one.
        """
        previous = objects.setdefault(obj.major, obj)
        if obj.minori >= previous.minori :
            objects[obj.major] = obj

    def getJobSize(self) :
        """Counts pages in a PDF document.

        self.infile is read line by line and split into objects, each one
        starting at an "N G obj" header and ending at the next "endobj".
        Both keywords are recognized wherever they appear on a line, so
        objects written on a single line, or whose "endobj" follows other
        text, are kept too, and an object still open when the input ends
        is kept as well. Returns the number of "/Type /Page" entries found
        in the kept objects.
        """
        # First we start with a generic PDF parser.
        lastcomment = None
        objects = {}
        current = None      # the object being read, None between objects
        # The optional last character means that a "<", "/" or whitespace
        # directly following "obj" is consumed by the match and so is not
        # part of the object's content.
        objre = re.compile(r"\s?(\d+)\s+(\d+)\s+obj[<\s/]?")
        endmarker = "endobj"
        for line in self.infile :
            rest = line.strip()
            if rest.startswith("% ") :
                if current is not None :
                    current.comments.append(rest)
                else :
                    # Remembered as the description of the next objects.
                    lastcomment = rest[2:]
                continue
            # A single line may hold the end of an object, whole objects
            # and the beginning of another one, so we loop until it is
            # entirely consumed.
            while rest :
                if current is None :
                    header = objre.search(rest)
                    if header is None :
                        break   # nothing of interest outside of objects
                    current = PDFObject(header.group(1), header.group(2), lastcomment)
                    rest = rest[header.end():]
                else :
                    endpos = rest.find(endmarker)
                    if endpos == -1 :
                        current.content.append(rest)
                        break
                    # What precedes the end marker, like a closing ">>",
                    # still belongs to the object.
                    current.content.append(rest[:endpos])
                    self._keepLatest(objects, current)
                    current = None
                    rest = rest[endpos + len(endmarker):]
        if current is not None :
            # Input ended before the end marker : keep what we have.
            self._keepLatest(objects, current)

        # Now we check each PDF object we've just created.
        newpageregexp = re.compile(r"(/Type)\s?(/Page)[/>\s]", re.I)
        pagecount = 0
        for obj in objects.values() :
            content = "".join(obj.content)
            count = len(newpageregexp.findall(content))
            if count and (content != r"<</Type /Page>>") : # Empty pages which are not rendered ?
                pagecount += count
        return pagecount
Note: See TracBrowser for help on using the browser.