Changeset 3436 for pkpgcounter/trunk/pkpgpdls/pdf.py
- Timestamp: 10/06/08 00:22:07 (16 years ago)
- Files: 1 modified
Legend:
- Unmodified
- Added
- Removed
pkpgcounter/trunk/pkpgpdls/pdf.py
r3410 r3436 8 8 # the Free Software Foundation, either version 3 of the License, or 9 9 # (at your option) any later version. 10 # 10 # 11 11 # This program is distributed in the hope that it will be useful, 12 12 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 14 # GNU General Public License for more details. 15 # 15 # 16 16 # You should have received a copy of the GNU General Public License 17 17 # along with this program. If not, see <http://www.gnu.org/licenses/>. … … 37 37 PDFDELIMITERS = r"()<>[]{}/%" 38 38 PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]" # an example. MUST be present in Page objects 39 39 40 40 class Parser(pdlparser.PDLParser) : 41 41 """A parser for PDF documents.""" … … 44 44 openmode = "rU" 45 45 format = "PDF" 46 def isValid(self) : 46 def isValid(self) : 47 47 """Returns True if data is PDF, else False.""" 48 48 if self.firstblock.startswith("%PDF-") or \ … … 51 51 (self.firstblock.find("%PDF-") != -1) : 52 52 return True 53 else : 53 else : 54 54 return False 55 56 def veryFastAndNotAlwaysCorrectgetJobSize(self) : 55 56 def veryFastAndNotAlwaysCorrectgetJobSize(self) : 57 57 """Counts pages in a PDF document. 58 58 59 59 This method works great in the general case, 60 60 and is around 30 times faster than the active … … 70 70 def getJobSize(self) : 71 71 """Counts pages in a PDF document. 72 72 73 73 A faster way seems to be possible by extracting the 74 74 "/Type/Pages/Count xxxx" value where there's no /Parent 75 75 (i.e. the root of the page tree) 76 76 Unfortunately I can't make a regexp work for this currently. 77 77 78 78 At least the actual method below is accurate, even if 25% 79 79 slower than the old one. 
But we will be able to extract … … 84 84 oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s*.+?\s*?endobj)", \ 85 85 re.DOTALL) 86 86 87 87 # Regular expression indicating a new page 88 88 npregexp = re.compile(r"/Type\s*/Page[/>\s]") 89 90 # Regular expression indicating an empty page 89 90 # Regular expression indicating an empty page 91 91 # (usually to delete an existing one with a lower minor number) 92 epregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj") 93 92 epregexp = re.compile(r"obj\s*<<\s*/Type\s*/Page\s*>>\s*endobj") 93 94 94 # First we build a mapping of objects to keep because 95 95 # if two objects with the same major number are found, … … 109 109 #else : 110 110 # self.logdebug("Object %i.%i OK" % (major, minor)) 111 112 # Now that we have deleted all unneeded objects, we 111 112 # Now that we have deleted all unneeded objects, we 113 113 # can count the ones which are new pages, minus the ones 114 114 # which are empty and not displayed pages (in fact pages