Context Navigation

pdf.py @ 3384

Revision 3384, 6.7 kB (checked in by jerome, 16 years ago)
Did some work to improve the PDF parser: a very fast method (26 times faster than the original one) doesn't work with some "strange" documents like the PCL developers' guide, while a slow method, which extracts objects from PDF documents, correctly handles object versioning (more cleaning work is needed).
Property svn:eol-style set to `native` Property svn:keywords set to `Auth Date Id Rev`

Line
1	# -- coding: UTF-8 --
2	#
3	# pkpgcounter : a generic Page Description Language parser
4	#
5	# (c) 2003, 2004, 2005, 2006, 2007, 2008 Jerome Alet <alet@librelogiciel.com>
6	# This program is free software: you can redistribute it and/or modify
7	# it under the terms of the GNU General Public License as published by
8	# the Free Software Foundation, either version 3 of the License, or
9	# (at your option) any later version.
10	#
11	# This program is distributed in the hope that it will be useful,
12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	# GNU General Public License for more details.
15	#
16	# You should have received a copy of the GNU General Public License
17	# along with this program. If not, see <http://www.gnu.org/licenses/>.
18	#
19	# $Id$
20	#
21
22	"""This module implements a page counter for PDF documents.
23
24	Some information taken from PDF Reference v1.7 by Adobe.
25	"""
26
27	import re
28
29	import pdlparser
30
# The six byte values treated as whitespace here: NUL, TAB, LF, FF, CR, SPACE.
PDFWHITESPACE = "".join(map(chr, (0, 9, 10, 12, 13, 32)))

# Characters acting as delimiters.
PDFDELIMITERS = r"()<>[]{}/%"

# A comment starts with this character and runs up to the next end of line.
PDFCOMMENT = r"%"

# Marker of a page object; each space stands for any whitespace character.
PDFPAGEMARKER = "<< /Type /Page "

# Only an example of the syntax; a MediaBox MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"

# Kept as found: the original author notes that it doesn't work as expected.
PDFOBJREGEX = r"\s+(\d+)\s+(\d+)\s+(obj\s.+\sendobj)"
45
class PDFObject:
    """Plain record describing one object read from a PDF document."""

    def __init__(self, major, minor, description):
        """Store the object's identifiers and create its empty containers.

        major and minor are kept both as received (major, minor) and
        converted with int() (majori, minori); description is stored
        unchanged.
        """
        # Identifiers exactly as given by the caller...
        self.major, self.minor = major, minor
        # ...and their integer forms, converted in the same order.
        self.majori, self.minori = int(major), int(minor)
        self.description = description
        # Each instance gets its own fresh lists.
        self.comments, self.content = [], []
        self.parent = None
        self.kids = []
59
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents."""
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    required = [ "gs" ]
    # NOTE(review): the "U" flag was removed from open() in Python 3.11;
    # confirm how the base class consumes openmode before porting.
    openmode = "rU"
    format = "PDF"

    def isValid(self):
        """Returns True if data is PDF, else False.

        The first block of the input is accepted when it contains the
        "%PDF-" signature anywhere, or when an "ESC%-12345X" prefix
        appears in its first 128 characters together with a
        case-insensitive "LANGUAGE=PDF" (presumably a PJL job header).
        """
        head = self.firstblock
        # This also covers a block that *starts* with "%PDF-" or with
        # "ESC%-12345X%PDF-", since both contain the signature.
        if "%PDF-" in head:
            return True
        return ("\033%-12345X" in head[:128]) and ("LANGUAGE=PDF" in head.upper())

    def _keepLatestVersion(self, objects, obj):
        """Stores obj in objects unless a newer version is already there.

        objects maps the object's major identifier (as a string) to a
        PDFObject.  An equal minor overwrites as well, so the version
        found last in the file is the one kept; use > instead of >= to
        keep the first one instead.
        """
        previous = objects.setdefault(obj.major, obj)
        if obj.minori >= previous.minori:
            objects[obj.major] = obj

    def getJobSize(self):
        """Counts pages in a PDF document.

        Reads self.infile line by line, groups the lines into PDFObject
        instances delimited by "N G obj" and "endobj", keeps only the
        latest version of each object, then counts the "/Type /Page"
        markers found in the retained objects.

        Returns the number of pages found, as an integer.
        """
        objre = re.compile(r"\s?(\d+)\s+(\d+)\s+obj[<\s/]?")
        terminators = ("endobj", ">> endobj", ">>endobj")
        lastcomment = None
        objects = {}
        current = None  # the object being read, or None when outside any object
        for line in self.infile:
            line = line.strip()
            if line.startswith("% "):
                if current is not None:
                    current.comments.append(line)
                else:
                    # Remembered as the description of the next object.
                    lastcomment = line[2:]
                continue

            result = objre.search(line)
            if result is not None:
                # A new object begins here.
                current = PDFObject(result.group(1), result.group(2), lastcomment)
                remainder = line[result.end():]
                if remainder.endswith("endobj"):
                    # The object also ends on this very line: store it now,
                    # otherwise it would be discarded when the next object
                    # starts, and its pages would never be counted.
                    current.content.append(remainder[:-len("endobj")])
                    self._keepLatestVersion(objects, current)
                    current = None
                else:
                    current.content.append(remainder)
            elif line.startswith(terminators):
                # End of the current object, if any.
                if current is not None:
                    self._keepLatestVersion(objects, current)
                    current = None
            elif current is not None:
                current.content.append(line)

        # Now we check each PDF object we've just kept.
        newpageregexp = re.compile(r"(/Type)\s?(/Page)[/>\s]", re.I)
        pagecount = 0
        for obj in objects.values():
            content = "".join(obj.content)
            count = len(newpageregexp.findall(content))
            # Objects reduced to a bare page marker are skipped
            # (empty pages which are not rendered ?)
            if count and (content != r"<</Type /Page>>"):
                pagecount += count
        return pagecount

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document.

        Counts every "/Type /Page" marker in the whole input, without
        looking at object boundaries or object versions.
        """
        newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")
        return len(newpageregexp.findall(self.infile.read()))

    def thisOneIsSlowButCorrectgetJobSize(self):
        """Counts pages in a PDF document.

        Extracts every "N G obj ... endobj" span from the whole input
        with a single regular expression, keeps for each major number
        the span with the highest minor number (the last one found wins
        on equal minors), then counts the "/Type /Page" markers in the
        retained spans, minus the spans recognized as empty pages.
        """
        oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)",
                             re.DOTALL)
        objtokeep = {}
        for (smajor, sminor, content) in oregexp.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # Explicit None test: ordering an int against None only worked
            # by accident on Python 2 and raises TypeError on Python 3.
            if (previous is None) or (minor >= previous[0]):
                objtokeep[major] = (minor, content)

        npregexp = re.compile(r"/Type\s*/Page[/>\s]")
        pagecount = 0
        for (major, (minor, content)) in objtokeep.items():
            count = len(npregexp.findall(content))
            if count:
                # TODO : make this clean
                emptycount = content.count("obj\n<< \n/Type /Page \n>> \nendobj") \
                           + content.count("obj\n<< \n/Type /Page \n\n>> \nendobj")
                if not emptycount:
                    self.logdebug("%i.%i : %s\n" % (major, minor, repr(content)))
                pagecount += count - emptycount
        return pagecount

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3384

Download in other formats: