def getJobSize(self) :
    """Counts pages in a PDF document.

    The lines of self.infile are scanned to collect every indirect
    object ("<major> <minor> obj" ... "endobj") into a PDFObject.
    When the same major number is met several times, the object kept
    is the last one whose minor number is greater than or equal to the
    one already stored : this is how an object gets replaced in PDF.

    Each object kept is then searched for "/Type /Page" entries.
    An object made only of an empty "<< /Type /Page >>" dictionary
    is not counted.

    Returns the number of pages found, as an integer.
    """
    # First we start with a generic PDF parser.
    lastcomment = None
    objects = {}
    obj = None      # object being collected, None when outside of any object
    major = minor = None
    # The match stops right after the "obj" keyword, so the first
    # character of the object's body (e.g. the "<" of "<<") is kept.
    objre = re.compile(r"\s?(\d+)\s+(\d+)\s+obj")
    endmarkers = ("endobj", ">> endobj", ">>endobj")
    for line in self.infile :
        line = line.strip()
        if line.startswith("% ") :
            if obj is not None :
                obj.comments.append(line)
            else :
                lastcomment = line[2:]
            continue

        result = objre.search(line)
        if result is not None :
            # New object begins here : what follows the header on the
            # same line is handled below like any other body line, so
            # an object written on a single line is not lost.
            (major, minor) = result.groups()
            obj = PDFObject(major, minor, lastcomment)
            line = line[result.end():].strip()
        elif obj is None :
            continue    # between two objects : nothing to collect

        if line.startswith(endmarkers) :
            finished = True
        elif line.endswith("endobj") :
            # The object is closed at the end of a body line :
            # keep the body part, drop the keyword.
            obj.content.append(line[:-len("endobj")].rstrip())
            finished = True
        else :
            obj.content.append(line)
            finished = False

        if finished :
            # only overwrite older versions of this object
            # same minor seems to be possible, so the latest one
            # found in the file will be the one we keep.
            # if we want the first one, just use > instead of >=
            oldobject = objects.setdefault(major, obj)
            if int(minor) >= oldobject.minori :
                objects[major] = obj
            obj = None

    # Now we check each PDF object we've just created.
    newpageregexp = re.compile(r"(/Type)\s?(/Page)[/>\s]", re.I)
    # An object reduced to an empty page dictionary, whatever the
    # whitespace used inside it.
    emptypageregexp = re.compile(r"^<<\s*/Type\s*/Page\s*>>$")
    pagecount = 0
    for obj in objects.values() :
        content = "".join(obj.content)
        count = len(newpageregexp.findall(content))
        if count and emptypageregexp.match(content) is None : # Empty pages which are not rendered ?
            pagecount += count
    return pagecount
| 76 | def veryFastAndNotAlwaysCorrectgetJobSize(self) : |
| 77 | """Counts pages in a PDF document. |
132 | | def thisOneIsSlowButCorrectgetJobSize(self) : |
133 | | """Counts pages in a PDF document.""" |
| 88 | def getJobSize(self) : |
| 89 | """Counts pages in a PDF document. |
| 90 | |
| 91 | A faster way seems to be possible by extracting the |
| 92 | "/Type/Pages/Count xxxx" value where there's no /Parent |
| 93 | (i.e. the root of the page tree) |
| 94 | Unfortunately I can't make a regexp work for this currently. |
| 95 | |
| 96 | At least the actual method below is accurate, even if 25% |
| 97 | slower than the old one. |
| 98 | """ |
| 99 | # Regular expression to extract objects from a PDF document |
| 102 | |
| 103 | # Regular expression indicating a new page |
| 104 | npregexp = re.compile(r"/Type\s*/Page[/>\s]") |
| 105 | |
| 106 | # Regular expression indicating an empty page |
| 107 | # (usually to delete an existing one with a lower minor number) |
| 108 | epregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj") |
| 109 | |
| 110 | # First we build a mapping of objects to keep because |
| 111 | # if two objects with the same major number are found, |
| 112 | # we only keep the one with the higher minor number : |
| 113 | # this is the way in PDF to replace existing objects. |