Context Navigation

pdf.py @ 3523

Revision 3474, 5.1 kB (checked in by jerome, 16 years ago)
Changed copyright years.
Property svn:eol-style set to `native` Property svn:keywords set to `Auth Date Id Rev`

Line
1	# -- coding: utf-8 --
2	#
3	# pkpgcounter : a generic Page Description Language parser
4	#
5	# (c) 2003-2009 Jerome Alet <alet@librelogiciel.com>
6	# This program is free software: you can redistribute it and/or modify
7	# it under the terms of the GNU General Public License as published by
8	# the Free Software Foundation, either version 3 of the License, or
9	# (at your option) any later version.
10	#
11	# This program is distributed in the hope that it will be useful,
12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	# GNU General Public License for more details.
15	#
16	# You should have received a copy of the GNU General Public License
17	# along with this program. If not, see <http://www.gnu.org/licenses/>.
18	#
19	# $Id$
20	#
21
22	"""This module implements a page counter for PDF documents.
23
24	Some information taken from PDF Reference v1.7 by Adobe.
25	"""
26
27	import re
28
29	import pdlparser
30
# Whitespace characters, listed by character code:
# NUL (0), TAB (9), LF (10), FF (12), CR (13) and SPACE (32).
PDFWHITESPACE = "".join(map(chr, (0, 9, 10, 12, 13, 32)))

# Delimiter characters.
PDFDELIMITERS = r"()<>[]{}/%"

# This is only an example of the entry's layout; a MediaBox entry
# MUST be present in Page objects.
PDFMEDIASIZE = "/MediaBox [xmin ymin xmax ymax]"
39
class Parser(pdlparser.PDLParser):
    """A parser for PDF documents.

    Pages are counted by scanning the raw text of the document with
    regular expressions; the document is never fully parsed.
    """

    # Ghostscript command line used to render the document as TIFF.
    # The %(dpi)i, %(outfname)s and %(infname)s placeholders are filled
    # in by code outside this class (presumably the base class -- TODO
    # confirm).
    totiffcommands = [ 'gs -sDEVICE=tiff24nc -dPARANOIDSAFER -dNOPAUSE -dBATCH -dQUIET -r"%(dpi)i" -sOutputFile="%(outfname)s" "%(infname)s"' ]
    required = [ "gs" ]
    openmode = "rU"
    format = "PDF"

    # Extracts "<major> <minor> obj ... endobj" objects as
    # (major, minor, content) string triples; DOTALL lets an object
    # body span several lines.
    _objectregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)",
                               re.DOTALL)

    # Indicates a new page.  The character class after "Page" must match
    # "/", ">" or whitespace, so "/Type /Pages" nodes are not counted.
    _newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")

    # Indicates an empty page (usually written to delete an existing
    # page which has a lower minor number).
    _emptypageregexp = re.compile(r"obj\s<<\s/Type\s/Page\s>>\s*endobj")

    def isValid(self):
        """Returns True if data is PDF, else False.

        The data is accepted when self.firstblock contains "%PDF-"
        anywhere, or when it has the "ESC%-12345X" sequence within its
        first 128 characters together with "LANGUAGE=PDF" (compared
        case insensitively) anywhere in the block.
        """
        firstblock = self.firstblock
        if "%PDF-" in firstblock:
            # This single test also covers data which starts with
            # "%PDF-" or with "\033%-12345X%PDF-".
            return True
        return ("\033%-12345X" in firstblock[:128]
                and "LANGUAGE=PDF" in firstblock.upper())

    def veryFastAndNotAlwaysCorrectgetJobSize(self):
        """Counts pages in a PDF document.

        This method simply counts every "/Type /Page" marker found in
        the data read from self.infile.

        According to the original author it works great in the general
        case and is much faster than getJobSize().  Unfortunately it
        doesn't take into account documents with redacted pages (only
        made with FrameMaker ?) where an existing PDF object is
        replaced with one with the same major number and a higher
        minor number.
        """
        return len(self._newpageregexp.findall(self.infile.read()))

    def getJobSize(self):
        """Counts pages in a PDF document.

        Reads the whole of self.infile, keeps for each object major
        number only the object with the highest minor number, then
        returns the number of new page markers found in the kept
        objects minus the number of empty page objects.

        A faster way seems to be possible by extracting the
        "/Type/Pages/Count xxxx" value where there's no /Parent
        (i.e. the root of the page tree), but no working regexp
        exists for this currently.

        According to the original author this method is accurate, even
        if slower than veryFastAndNotAlwaysCorrectgetJobSize(), and it
        will allow other information to be extracted when needed, like
        orientation and size.
        """
        # First we build a mapping of objects to keep because
        # if two objects with the same major number are found,
        # we only keep the one with the higher minor number :
        # this is the way in PDF to replace existing objects.
        objtokeep = {}
        for (smajor, sminor, content) in self._objectregexp.findall(self.infile.read()):
            major = int(smajor)
            minor = int(sminor)
            previous = objtokeep.get(major)
            # The "is None" test is explicit because comparing an
            # integer with None raises TypeError under Python 3.
            # With equal minor numbers the later object wins.
            if previous is None or minor >= previous[0]:
                objtokeep[major] = (minor, content)

        # Now that we have deleted all unneeded objects, we
        # can count the ones which are new pages, minus the ones
        # which are empty and not displayed pages (in fact pages
        # used to redact existing content).
        pagecount = 0
        for (minor, content) in objtokeep.values():
            count = len(self._newpageregexp.findall(content))
            if count:
                emptycount = len(self._emptypageregexp.findall(content))
                pagecount += count - emptycount
        return pagecount

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pkpgcounter / trunk / pkpgpdls / pdf.py @ 3523

Download in other formats: