Context Navigation

pdf.py @ 3384

Revision 3384, 6.7 kB (checked in by jerome, 16 years ago)
Did some work to improve PDF parser : A very fast method (26 times faster than the original one) doesn't work with some "strange" documents like PCL developers' guide. A slow method, which extracts objects from PDF documents and correctly handles object versioning (more cleaning work is needed)
Property svn:eol-style set to `native` Property svn:keywords set to `Auth Date Id Rev`

Rev	Line
[564]	1	# -- coding: UTF-8 --
[191]	2	#
	3	# pkpgcounter : a generic Page Description Language parser
	4	#
[564]	5	# (c) 2003, 2004, 2005, 2006, 2007, 2008 Jerome Alet <alet@librelogiciel.com>
[463]	6	# This program is free software: you can redistribute it and/or modify
[191]	7	# it under the terms of the GNU General Public License as published by
[463]	8	# the Free Software Foundation, either version 3 of the License, or
[191]	9	# (at your option) any later version.
[463]	10	#
[191]	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU General Public License
[463]	17	# along with this program. If not, see <http://www.gnu.org/licenses/>.
[191]	18	#
	19	# $Id$
	20	#
[193]	21
[3384]	22	"""This module implements a page counter for PDF documents.
[355]	23
[3384]	24	Some information was taken from the PDF Reference v1.7 by Adobe.
	25	"""
	26
[193]	27	import re
	28
[235]	29	import pdlparser
[193]	30
# Whitespace characters, listed by character code :
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join([chr(code) for code in (0, 9, 10, 12, 13, 32)])

# Delimiter characters, and the character which starts a comment.
PDFDELIMITERS = r"()<>[]{}/%"
PDFCOMMENT = r"%" # Up to next EOL

# What the beginning of a Page object looks like.
PDFPAGEMARKER = "<< /Type /Page " # Where spaces are any whitespace char

# An example only. MUST be present in Page objects
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"

# Regular expression meant to match a whole "N M obj ... endobj" object.
PDFOBJREGEX = r"\s+(\d+)\s+(\d+)\s+(obj\s.+\sendobj)" # Doesn't work as expected
	45
class PDFObject :
    """A class for PDF objects.

    Holds what was collected about one "major minor obj ... endobj"
    object while reading a PDF file.
    """
    def __init__(self, major, minor, description) :
        """Initialize the PDF object.

        major and minor are the two numbers found in front of the
        'obj' keyword (presumably the object and generation numbers
        of the PDF Reference). Both are kept as received and also as
        integers, so each must be acceptable to int().
        description is any text to associate with the object, it is
        stored unchanged.
        """
        # The identifiers, as received and as integers.
        self.major, self.minor = major, minor
        self.majori, self.minori = int(major), int(minor)
        self.description = description
        # Relationships with other objects : nothing is known yet.
        self.parent = None
        self.kids = []
        # Comment lines and content lines are added later by the caller.
        self.comments = []
        self.content = []
	59
class Parser(pdlparser.PDLParser) :
    """A parser for PDF documents.

    Three page counting methods are provided :

      - getJobSize() reads the file line by line and rebuilds the
        list of PDF objects before counting the Page objects.
      - veryFastAndNotAlwaysCorrectgetJobSize() simply counts the
        page markers in the whole file, ignoring object versions.
      - thisOneIsSlowButCorrectgetJobSize() extracts every object
        with a regular expression, keeps the latest version of each
        one, then counts the Page objects.

    The class attributes below are presumably used by the
    pdlparser.PDLParser base class, which is not defined in this
    module.
    """
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    required = [ "gs" ]
    openmode = "rU"
    format = "PDF"

    def isValid(self) :
        """Returns True if data is PDF, else False.

        The data is seen as PDF when self.firstblock either contains
        the "%PDF-" marker anywhere, or contains the "\\033%-12345X"
        prefix in its first 128 characters together with a
        "LANGUAGE=PDF" statement (searched case insensitively).
        """
        firstblock = self.firstblock
        # A marker found anywhere also covers the case of a block
        # which starts with it, with or without the escape prefix.
        if "%PDF-" in firstblock :
            return True
        return ("\033%-12345X" in firstblock[:128]) \
               and ("LANGUAGE=PDF" in firstblock.upper())

    def _keepObject(self, objects, obj) :
        """Stores obj into the objects dictionary.

        The dictionary is indexed by the integer major number. An
        object already stored is only replaced if the new minor
        number is greater than or equal to the stored one : the same
        minor seems to be possible, so the latest one found in the
        file will be the one we keep. If we want the first one
        instead, just use > instead of >=
        """
        previous = objects.get(obj.majori)
        if (previous is None) or (obj.minori >= previous.minori) :
            objects[obj.majori] = obj

    def getJobSize(self) :
        """Counts pages in a PDF document.

        The file in self.infile is read line by line to rebuild the
        PDF objects, then the page markers found in the objects we
        have kept are counted. Returns the number of pages as an
        integer.

        An object ends on a line which begins with 'endobj',
        '>> endobj' or '>>endobj', or on a line which ends with
        'endobj', including the line where the object begins : this
        way an object entirely written on a single line is not lost.
        An object which is not terminated before the next one begins
        is ignored.
        """
        # First we start with a generic PDF parser.
        lastcomment = None
        objects = {}
        obj = None # The object being read, None when outside of any object.
        objre = re.compile(r"\s?(\d+)\s+(\d+)\s+obj[<\s/]?")
        for line in self.infile :
            line = line.strip()
            if line.startswith("% ") :
                if obj is not None :
                    obj.comments.append(line)
                else :
                    # Will be used as the description of the next object.
                    lastcomment = line[2:]
                continue

            result = objre.search(line)
            if result is not None :
                # New object begins here, only what follows the
                # object's header is part of its content.
                obj = PDFObject(result.group(1), result.group(2), lastcomment)
                line = line[result.end():]
            elif obj is None :
                continue # Outside of any object : nothing to keep.
            elif line.startswith("endobj") \
                 or line.startswith(">> endobj") \
                 or line.startswith(">>endobj") :
                # Handle previous object
                self._keepObject(objects, obj)
                obj = None
                continue

            # Here line is some content for the current object.
            if line.endswith("endobj") :
                # The object ends on the same line as its content.
                obj.content.append(line[:-len("endobj")])
                self._keepObject(objects, obj)
                obj = None
            else :
                obj.content.append(line)

        # Now we check each PDF object we've just created.
        newpageregexp = re.compile(r"(/Type)\s?(/Page)[/>\s]", re.I)
        pagecount = 0
        for obj in objects.values() :
            content = "".join(obj.content)
            count = len(newpageregexp.findall(content))
            if count and (content != r"<</Type /Page>>") : # Empty pages which are not rendered ?
                pagecount += count
        return pagecount

    def veryFastAndNotAlwaysCorrectgetJobSize(self) :
        """Counts pages in a PDF document.

        Reads the whole file at once and returns the number of page
        markers found in it. Objects are not extracted at all, so
        several versions of the same page are all counted.
        """
        newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")
        return len(newpageregexp.findall(self.infile.read()))

    def thisOneIsSlowButCorrectgetJobSize(self) :
        """Counts pages in a PDF document.

        Reads the whole file at once, extracts all the objects with
        a regular expression, and only keeps the latest version of
        each object before counting the page markers. Returns the
        number of pages as an integer.
        """
        oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)", \
                             re.DOTALL)
        objtokeep = {}
        for (smajor, sminor, content) in oregexp.findall(self.infile.read()) :
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # Explicit test : None can't be compared to an integer
            # with all versions of Python.
            if (previous is None) or (minor >= previous[0]) :
                objtokeep[major] = (minor, content)

        npregexp = re.compile(r"/Type\s*/Page[/>\s]")
        pagecount = 0
        for (major, (minor, content)) in objtokeep.items() :
            count = len(npregexp.findall(content))
            if count :
                # Objects which only contain the page marker are not counted.
                emptycount = content.count("obj\n<< \n/Type /Page \n>> \nendobj") + content.count("obj\n<< \n/Type /Page \n\n>> \nendobj") # TODO : make this clean
                if not emptycount :
                    self.logdebug("%i.%i : %s\n" % (major, minor, repr(content)))
                pagecount += count - emptycount
        return pagecount

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3384

Download in other formats: