Context Navigation

pdf.py @ 3385

Revision 3385, 5.6 kB (checked in by jerome, 16 years ago)
Rebuilt the PDF parsing engine to correctly account for redacted parts. Around 25% slower than previous method, but more accurate.
Property svn:eol-style set to `native` Property svn:keywords set to `Auth Date Id Rev`

Line
1	# -- coding: UTF-8 --
2	#
3	# pkpgcounter : a generic Page Description Language parser
4	#
5	# (c) 2003, 2004, 2005, 2006, 2007, 2008 Jerome Alet <alet@librelogiciel.com>
6	# This program is free software: you can redistribute it and/or modify
7	# it under the terms of the GNU General Public License as published by
8	# the Free Software Foundation, either version 3 of the License, or
9	# (at your option) any later version.
10	#
11	# This program is distributed in the hope that it will be useful,
12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	# GNU General Public License for more details.
15	#
16	# You should have received a copy of the GNU General Public License
17	# along with this program. If not, see <http://www.gnu.org/licenses/>.
18	#
19	# $Id$
20	#
21
22	"""This module implements a page counter for PDF documents.
23
24	Some information taken from PDF Reference v1.7 by Adobe.
25	"""
26
27	import re
28
29	import pdlparser
30
# Whitespace characters, given by code point: NUL, TAB, LF, FF, CR, SPACE.
PDFWHITESPACE = "".join([chr(code) for code in (0, 9, 10, 12, 13, 32)])

# Characters acting as token delimiters.
PDFDELIMITERS = "()<>[]{}/%"

# A comment starts here and runs up to the next EOL.
PDFCOMMENT = "%"

# Marker of a page object; each space stands for any whitespace char.
PDFPAGEMARKER = "<< /Type /Page "

# An example only. This entry MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"

# Pattern meant to extract whole objects. Doesn't work as expected.
PDFOBJREGEX = r"\s+(\d+)\s+(\d+)\s+(obj\s.+\sendobj)"
45
class PDFObject:
    """Plain record describing a single PDF object.

    The major and minor numbers are kept both exactly as received and
    converted to integers. Comments, content and kids start out as
    empty lists and the parent starts out unset.
    """

    def __init__(self, major, minor, description):
        """Record the object's numbers and description.

        major and minor must be accepted by int(); any exception raised
        by that conversion propagates to the caller.
        """
        # Values exactly as given by the caller.
        self.major, self.minor = major, minor
        # Integer counterparts of the two numbers above.
        self.majori, self.minori = int(major), int(minor)
        self.description = description
        # Each instance gets its own, initially empty, containers.
        self.comments, self.content = [], []
        # No parent is known yet.
        self.parent = None
        self.kids = []
59
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

    Pages are counted by scanning the raw text of the document with
    regular expressions; no real PDF object model is built.
    """
    # Ghostscript command line template; the %(...)s placeholders are
    # filled in elsewhere (presumably by the base class -- confirm in
    # pdlparser).
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    # NOTE(review): presumably the external programs this parser needs.
    required = [ "gs" ]
    # NOTE(review): presumably the mode used to open the input file;
    # the "U" flag is Python 2 only -- confirm against pdlparser.
    openmode = "rU"
    format = "PDF"

    # Matches "/Type /Page" followed by "/", ">" or whitespace. The
    # trailing character class prevents "/Type /Pages" from matching.
    _newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")

    # Extracts objects from a PDF document as
    # (major number, minor number, "obj ... endobj" text) groups.
    _objectregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)",
                               re.DOTALL)

    # Matches an empty page object (usually present to delete an
    # existing page which has a lower minor number).
    _emptypageregexp = re.compile(r"obj\s<<\s/Type\s/Page\s>>\s*endobj")

    def isValid(self):
        """Returns True if data is PDF, else False.

        The data is accepted when self.firstblock contains "%PDF-"
        anywhere, or when a PJL header is found within its first 128
        characters together with a case-insensitive "LANGUAGE=PDF".
        """
        firstblock = self.firstblock
        # A "%PDF-" marker anywhere in the block also covers the cases
        # where the block starts with it, with or without a PJL prefix.
        if "%PDF-" in firstblock:
            return True
        return ("\033%-12345X" in firstblock[:128]) \
               and ("LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document.

        This method works great in the general case, and is around
        30 times faster than the active one.
        Unfortunately it doesn't take into account documents with
        redacted pages (only made with FrameMaker ?)

        Reads the whole of self.infile and returns the number of page
        markers found in it.
        """
        return len(self._newpageregexp.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

        A faster way seems to be possible by extracting the
        "/Type/Pages/Count xxxx" value where there's no /Parent
        (i.e. the root of the page tree).
        Unfortunately no working regexp exists for this currently.

        At least the method below is accurate, even if 25% slower
        than the old one.

        Reads the whole of self.infile and returns the page count as
        an integer.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        objtokeep = {}
        for match in self._objectregexp.finditer(self.infile.read()):
            (smajor, sminor, content) = match.groups()
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # The "first time seen" case is tested explicitly instead
            # of comparing an integer with None, which only works by
            # accident in Python 2 and raises TypeError in Python 3.
            # With equal minor numbers the later object wins.
            if (previous is None) or (minor >= previous[0]):
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (minor, content) in objtokeep.values():
            count = len(self._newpageregexp.findall(content))
            if count:
                emptycount = len(self._emptypageregexp.findall(content))
                pagecount += count - emptycount
        return pagecount

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3385

Download in other formats: