Context Navigation

← Previous Change
Next Change →

Changeset 3385 for pkpgcounter/trunk

Timestamp:

06/22/08 11:45:35 (16 years ago)

Author:

jerome

Message:

Rebuilt the PDF parsing engine to correctly account for redacted parts.
Around 25% slower than previous method, but more accurate.

Files:

1 modified

pkpgcounter/trunk/pkpgpdls/pdf.py (modified) (2 diffs)

Legend:

Unmodified
Added
Removed

pkpgcounter/trunk/pkpgpdls/pdf.py

r3384	r3385
74	74	return False
75	75
76		def getJobSize(self) :
77		"""Counts pages in a PDF document."""
78		# First we start with a generic PDF parser.
79		lastcomment = None
80		objects = {}
81		inobject = 0
82		objre = re.compile(r"\s?(\d+)\s+(\d+)\s+obj[<\s/]?")
83		for line in self.infile :
84		line = line.strip()
85		if line.startswith("% ") :
86		if inobject :
87		obj.comments.append(line)
88		else :
89		lastcomment = line[2:]
90		else :
91		# New object begins here
92		result = objre.search(line)
93		if result is not None :
94		(major, minor) = line[result.start():result.end()].split()[:2]
95		obj = PDFObject(major, minor, lastcomment)
96		obj.content.append(line[result.end():])
97		inobject = 1
98		elif line.startswith("endobj") \
99		or line.startswith(">> endobj") \
100		or line.startswith(">>endobj") :
101		# Handle previous object, if any
102		if inobject :
103		# only overwrite older versions of this object
104		# same minor seems to be possible, so the latest one
105		# found in the file will be the one we keep.
106		# if we want the first one, just use > instead of >=
107		oldobject = objects.setdefault(major, obj)
108		if int(minor) >= oldobject.minori :
109		objects[major] = obj
110		# self.logdebug("Object(%i, %i) overwritten with Object(%i, %i)" % (oldobject.majori, oldobject.minori, obj.majori, obj.minori))
111		# self.logdebug("Object(%i, %i)" % (obj.majori, obj.minori))
112		inobject = 0
113		else :
114		if inobject :
115		obj.content.append(line)
116
117		# Now we check each PDF object we've just created.
118		newpageregexp = re.compile(r"(/Type)\s?(/Page)[/>\s]", re.I)
119		pagecount = 0
120		for obj in objects.values() :
121		content = "".join(obj.content)
122		count = len(newpageregexp.findall(content))
123		if count and (content != r"<</Type /Page>>") : # Empty pages which are not rendered ?
124		pagecount += count
125		return pagecount
	76	def veryFastAndNotAlwaysCorrectgetJobSize(self) :
	77	"""Counts pages in a PDF document.
126	78
127		def veryFastAndNotAlwaysCorrectgetJobSize(self) :
128		"""Counts pages in a PDF document."""
	79	This method works great in the general case,
	80	and is around 30 times faster than the active
	81	one.
	82	Unfortunately it doesn't take into account documents
	83	with redacted pages (only made with FrameMaker ?)
	84	"""
129	85	newpageregexp = re.compile(r"/Type\s*/Page[/>\s]")
130	86	return len(newpageregexp.findall(self.infile.read()))
131	87
132		def thisOneIsSlowButCorrectgetJobSize(self) :
133		"""Counts pages in a PDF document."""
	88	def getJobSize(self) :
	89	"""Counts pages in a PDF document.
	90
	91	A faster way seems to be possible by extracting the
	92	"/Type/Pages/Count xxxx" value where there's no /Parent
	93	(i.e. the root of the page tree)
	94	Unfortunately I can't make a regexp work for this currently.
	95
	96	At least the actual method below is accurate, even if 25%
	97	slower than the old one.
	98	"""
	99	# Regular expression to extract objects from a PDF document
134	100	oregexp = re.compile(r"\s+(\d+)\s+(\d+)\s+(obj\s.+?\s?endobj)", \
135	101	re.DOTALL)
	102
	103	# Regular expression indicating a new page
	104	npregexp = re.compile(r"/Type\s*/Page[/>\s]")
	105
	106	# Regular expression indicating an empty page
	107	# (usually to delete an existing one with a lower minor number)
	108	epregexp = re.compile(r"obj\s<<\s/Type\s/Page\s>>\s*endobj")
	109
	110	# First we build a mapping of objects to keep because
	111	# if two objects with the same major number are found,
	112	# we only keep the one with the higher minor number :
	113	# this is the way in PDF to replace existing objects.
136	114	objtokeep = {}
137	115	for (smajor, sminor, content) in oregexp.findall(self.infile.read()) :
…	…
147	125	#else :
148	126	# self.logdebug("Object %i.%i OK" % (major, minor))
149		npregexp = re.compile(r"/Type\s*/Page[/>\s]")
	127
	128	# Now that we have deleted all unneeded objects, we
	129	# can count the ones which are new pages, minus the ones
	130	# which are empty and not displayed pages (in fact pages
	131	# used to redact existing content).
150	132	pagecount = 0
151	133	for (major, (minor, content)) in objtokeep.items() :
152	134	count = len(npregexp.findall(content))
153	135	if count :
154		emptycount = ~~content.count("obj\n<< \n/Type /Page \n>> \nendobj") + content.count("obj\n<< \n/Type /Page \n\n>> \nendobj") # TODO : make this clean~~
155		if not emptycount :
156		self.logdebug("%i.%i : %s\n" % (major, minor, repr(content)))
	136	emptycount = len(epregexp.findall(content))
	137	#if not emptycount :
	138	# self.logdebug("%i.%i : %s\n" % (major, minor, repr(content)))
157	139	pagecount += count - emptycount
158	140	return pagecount

Context Navigation

Changeset 3385 for pkpgcounter/trunk

Legend:

pkpgcounter/trunk/pkpgpdls/pdf.py

Download in other formats: