Context Navigation

pdf.py @ 3578

Revision 3578, 5.1 kB (checked in by jerome, 5 years ago)
Clarified dependency wrt PIL/Pillow. Updated copyright years. Regenerated manual page.
Property svn:eol-style set to `native` Property svn:keywords set to `Auth Date Id Rev`

Rev	Line
[3410]	1	# -- coding: utf-8 --
[191]	2	#
	3	# pkpgcounter : a generic Page Description Language parser
	4	#
[3578]	5	# (c) 2003-2019 Jerome Alet <alet@librelogiciel.com>
[463]	6	# This program is free software: you can redistribute it and/or modify
[191]	7	# it under the terms of the GNU General Public License as published by
[463]	8	# the Free Software Foundation, either version 3 of the License, or
[191]	9	# (at your option) any later version.
[3436]	10	#
[191]	11	# This program is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	# GNU General Public License for more details.
[3436]	15	#
[191]	16	# You should have received a copy of the GNU General Public License
[463]	17	# along with this program. If not, see <http://www.gnu.org/licenses/>.
[191]	18	#
	19	# $Id$
	20	#
[193]	21
[3384]	22	"""This module implements a page counter for PDF documents.
[355]	23
[3384]	24	Some information taken from PDF Reference v1.7 by Adobe.
	25	"""
	26
[193]	27	import re
	28
[235]	29	import pdlparser
[193]	30
# Characters treated as white space when tokenizing PDF syntax.
PDFWHITESPACE = (
    chr(0)      # NUL
    + chr(9)    # horizontal tab
    + chr(10)   # line feed
    + chr(12)   # form feed
    + chr(13)   # carriage return
    + chr(32)   # space
)

# Characters that delimit PDF syntactic entities (strings, arrays, names,
# dictionaries, comments).
PDFDELIMITERS = r"()<>[]{}/%"

# An example only: a /MediaBox entry MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"
[3436]	39
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

    Page counting is done by scanning the raw file content with regular
    expressions; no real PDF object model is built.
    """

    # Command template(s) invoking Ghostscript's tiff24nc device. The
    # placeholders %(dpi)i, %(outfname)s and %(infname)s are filled in by
    # whoever runs the command (not visible in this class).
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    # External executables this parser depends on.
    required = [ "gs" ]
    # NOTE(review): the "U" flag was removed from open() in Python 3.11;
    # confirm how the base class consumes this before porting.
    openmode = "rU"
    format = "PDF"

    # Extracts "<major> <minor> obj ... endobj" objects from a PDF document.
    _OBJECT_RE = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)",
                            re.DOTALL)
    # Indicates a new page (matches /Page but not /Pages).
    _NEW_PAGE_RE = re.compile(r"/Type\s*/Page[/>\s]")
    # Indicates an empty page (usually written to delete an existing one
    # with a lower minor number).
    _EMPTY_PAGE_RE = re.compile(r"obj\s<<\s/Type\s/Page\s>>\s*endobj")

    def isValid(self):
        """Returns True if data is PDF, else False.

        The first block of the file is accepted when it contains a
        "%PDF-" marker anywhere (this also covers a marker at the very
        start, or right after a PJL "\\033%-12345X" prefix), or when a PJL
        prefix appears within the first 128 characters together with a
        case-insensitive "LANGUAGE=PDF" statement.
        """
        firstblock = self.firstblock
        if "%PDF-" in firstblock:
            return True
        return ("\033%-12345X" in firstblock[:128]
                and "LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document.

        This method works great in the general case,
        and is around 30 times faster than the active
        one.
        Unfortunately it doesn't take into account documents
        with redacted pages (only made with FrameMaker ?)
        where an existing PDF object is replaced with one
        with the same major number and a higher minor number.
        """
        return len(self._NEW_PAGE_RE.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

        A faster way seems to be possible by extracting the
        "/Type/Pages/Count xxxx" value where there's no /Parent
        (i.e. the root of the page tree)
        Unfortunately I can't make a regexp work for this currently.

        At least the actual method below is accurate, even if 25%
        slower than the old one. But we will be able to extract
        other information as well when needed, like orientation
        and size.

        Returns the number of page objects that remain after replaced
        objects have been discarded, minus the empty pages.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        objtokeep = {}
        for (smajor, sminor, content) in self._OBJECT_RE.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # Explicit None test: comparing an int with None only worked
            # by accident under Python 2 and raises TypeError on Python 3.
            # On equal minor numbers the later occurrence wins.
            if previous is None or minor >= previous[0]:
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (_minor, content) in objtokeep.values():
            count = len(self._NEW_PAGE_RE.findall(content))
            if count:
                pagecount += count - len(self._EMPTY_PAGE_RE.findall(content))
        return pagecount

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3578

Download in other formats: