1 | # -*- coding: utf-8 -*- |
---|
2 | # |
---|
3 | # pkpgcounter : a generic Page Description Language parser |
---|
4 | # |
---|
5 | # (c) 2003-2009 Jerome Alet <alet@librelogiciel.com> |
---|
6 | # This program is free software: you can redistribute it and/or modify |
---|
7 | # it under the terms of the GNU General Public License as published by |
---|
8 | # the Free Software Foundation, either version 3 of the License, or |
---|
9 | # (at your option) any later version. |
---|
10 | # |
---|
11 | # This program is distributed in the hope that it will be useful, |
---|
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
14 | # GNU General Public License for more details. |
---|
15 | # |
---|
16 | # You should have received a copy of the GNU General Public License |
---|
17 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
18 | # |
---|
19 | # $Id$ |
---|
20 | # |
---|
21 | |
---|
22 | """This module implements a page counter for PDF documents. |
---|
23 | |
---|
24 | Some information taken from PDF Reference v1.7 by Adobe. |
---|
25 | """ |
---|
26 | |
---|
27 | import re |
---|
28 | |
---|
29 | import pdlparser |
---|
30 | |
---|
# Character codes treated as whitespace in PDF syntax, in order:
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join(chr(code) for code in (0, 9, 10, 12, 13, 32))

# Characters acting as delimiters in PDF syntax.
PDFDELIMITERS = "()<>[]{}/%"

# An example of a media size entry. MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"
---|
39 | |
---|
class Parser(pdlparser.PDLParser) :
    """A parser for PDF documents.

    Recognizes PDF data from the first block of the input and counts
    pages by scanning the raw file contents with regular expressions.

    The attributes self.firstblock and self.infile used below are not
    defined in this class; they are presumably provided by
    pdlparser.PDLParser (TODO confirm against the base class).
    """
    # Ghostscript command line template; the %(dpi)i, %(outfname)s and
    # %(infname)s placeholders are filled in outside of this class.
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    # Presumably the external programs needed by totiffcommands.
    required = [ "gs" ]
    # NOTE(review): the "U" flag was removed from open() in Python 3.11 ;
    # confirm how the base class uses this value before porting.
    openmode = "rU"
    format = "PDF"

    # Extracts "<major> <minor> obj ... endobj" objects from a PDF
    # document (object number and generation number in PDF terms).
    _objectregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)", \
                               re.DOTALL)

    # Indicates a new page. The trailing character class prevents
    # "/Type /Pages" (the page tree nodes) from matching.
    _newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")

    # Indicates an empty page object (usually there to delete an
    # existing one with a lower minor number).
    _emptypageregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj")

    def isValid(self) :
        """Returns True if data is PDF, else False.

        The data is considered to be PDF if either :

          - the "%PDF-" marker appears anywhere in the first block
            (this includes the cases where the block starts with it,
            with or without a leading "\\033%-12345X" sequence) ;

          - or "\\033%-12345X" appears in the first 128 characters
            and "LANGUAGE=PDF" appears, in any letter case, in the
            first block.
        """
        firstblock = self.firstblock
        if "%PDF-" in firstblock :
            return True
        return ("\033%-12345X" in firstblock[:128]) \
               and ("LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self) :
        """Counts pages in a PDF document, the fast but naive way.

        Reads the whole input and returns the number of "/Type /Page"
        markers it contains.

        Unlike getJobSize(), this method doesn't take into account
        documents with redacted pages, where an existing PDF object
        is replaced with one with the same major number and a higher
        minor number : such pages are counted several times.
        """
        return len(self._newpageregexp.findall(self.infile.read()))

    def getJobSize(self) :
        """Counts pages in a PDF document.

        Reads the whole input, keeps for each object major number only
        the object with the highest minor number, then returns the
        number of page markers found in the kept objects, minus the
        number of empty page objects.

        A faster way might be to extract the "/Type/Pages/Count xxxx"
        value where there's no /Parent (i.e. the root of the page
        tree), but this is not implemented here.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        objtokeep = {}
        for (smajor, sminor, content) in self._objectregexp.findall(self.infile.read()) :
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # The explicit None test is needed : comparing an integer
            # with None raises TypeError with Python 3. On equal minor
            # numbers the latest object found in the file wins.
            if (previous is None) or (minor >= previous[0]) :
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (minor, content) in objtokeep.values() :
            count = len(self._newpageregexp.findall(content))
            if count :
                pagecount += count - len(self._emptypageregexp.findall(content))
        return pagecount
---|