[3410] | 1 | # -*- coding: utf-8 -*- |
---|
[191] | 2 | # |
---|
| 3 | # pkpgcounter : a generic Page Description Language parser |
---|
| 4 | # |
---|
[3578] | 5 | # (c) 2003-2019 Jerome Alet <alet@librelogiciel.com> |
---|
[463] | 6 | # This program is free software: you can redistribute it and/or modify |
---|
[191] | 7 | # it under the terms of the GNU General Public License as published by |
---|
[463] | 8 | # the Free Software Foundation, either version 3 of the License, or |
---|
[191] | 9 | # (at your option) any later version. |
---|
[3436] | 10 | # |
---|
[191] | 11 | # This program is distributed in the hope that it will be useful, |
---|
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 14 | # GNU General Public License for more details. |
---|
[3436] | 15 | # |
---|
[191] | 16 | # You should have received a copy of the GNU General Public License |
---|
[463] | 17 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
[191] | 18 | # |
---|
| 19 | # $Id$ |
---|
| 20 | # |
---|
[193] | 21 | |
---|
[3384] | 22 | """This module implements a page counter for PDF documents. |
---|
[355] | 23 | |
---|
[3384] | 24 | Some information taken from PDF Reference v1.7 by Adobe. |
---|
| 25 | """ |
---|
| 26 | |
---|
[193] | 27 | import re |
---|
| 28 | |
---|
[235] | 29 | import pdlparser |
---|
[193] | 30 | |
---|
# The six characters listed here as PDF whitespace, by code point:
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join(chr(code) for code in (0, 9, 10, 12, 13, 32))

# Characters acting as delimiters in PDF syntax.
PDFDELIMITERS = r"()<>[]{}/%"

# Only an example of the entry's layout: a /MediaBox entry MUST be
# present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"
---|
[3436] | 39 | |
---|
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

    The methods below read ``self.firstblock`` (the beginning of the
    document) and ``self.infile`` (a readable file object); neither is
    set in this class, so both are presumably provided by the
    ``pdlparser.PDLParser`` base class -- confirm there.
    """
    # Command line(s) used to rasterize the document to TIFF with Ghostscript.
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    # External programs needed by the commands above.
    required = [ "gs" ]
    # NOTE(review): the "U" flag is rejected by open() from Python 3.11 on;
    # confirm how the base class uses this value before porting.
    openmode = "rU"
    format = "PDF"

    # Matches a whole PDF object: "<major> <minor> obj ... endobj".
    # In PDF terms, major is the object number and minor the generation
    # number. Groups: (major, minor, body from "obj" to "endobj").
    _objectregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)",
                               re.DOTALL)

    # Matches the dictionary entry which marks an object as a page.
    # The trailing character class rejects "/Type /Pages".
    _newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")

    # Matches an empty page object, usually written to delete an
    # existing page which has a lower minor number.
    _emptypageregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj")

    def isValid(self):
        """Returns True if data is PDF, else False.

        The data is accepted when the first block contains the "%PDF-"
        marker anywhere (which also covers a marker at the very start,
        with or without a leading PJL Universal Exit Language sequence),
        or when a UEL sequence is found within the first 128 characters
        and the first block mentions "LANGUAGE=PDF" in any letter case.
        """
        firstblock = self.firstblock
        uel = "\033%-12345X"
        return ("%PDF-" in firstblock) \
            or ((uel in firstblock[:128])
                and ("LANGUAGE=PDF" in firstblock.upper()))

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document, quickly but not always accurately.

        Reads the whole of self.infile and returns the number of page
        markers found in it.

        According to the original author this works great in the
        general case and is much faster than getJobSize().
        Unfortunately it doesn't take into account documents with
        redacted pages (only made with FrameMaker ?), where an existing
        PDF object is replaced with one with the same major number and
        a higher minor number: such pages are counted several times.
        """
        return len(self._newpageregexp.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

        Reads the whole of self.infile and returns the number of page
        objects which remain once replaced objects and empty
        (redacting) pages have been discarded.

        A faster way seems to be possible by extracting the
        "/Type/Pages/Count xxxx" value where there's no /Parent
        (i.e. the root of the page tree), but no working regexp
        has been found for this yet.

        This method is accurate, even if slower than
        veryFastAndNotAlwaysCorrectgetJobSize(), and keeping the
        objects' contents will allow the extraction of other
        information when needed, like orientation and size.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        objtokeep = {}
        for (smajor, sminor, content) in self._objectregexp.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # The explicit None test is required : ordering an int
            # against None only worked by accident in Python 2 and
            # raises TypeError in Python 3. With equal minor numbers
            # the object found last in the file wins, as before.
            if (previous is None) or (minor >= previous[0]):
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (minor, content) in objtokeep.values():
            count = len(self._newpageregexp.findall(content))
            if count:
                pagecount += count - len(self._emptypageregexp.findall(content))
        return pagecount
---|