Context Navigation

← Previous Change
Next Change →

pdf.py

Timestamp:

07/20/05 14:45:04 (19 years ago)

Author:

jerome

Message:

Fixed the PDF parser for PDF documents which contain several versions of the same PDF object.

Files:

1 modified

pkpgcounter/trunk/pkpgpdls/pdf.py (modified) (3 diffs)

Legend:

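Unmodified: lines numbered in both the r235 and r240 columns
Added: lines numbered only in the r240 column
Removed: lines numbered only in the r235 column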

pkpgcounter/trunk/pkpgpdls/pdf.py

r235	r240
27	27	import pdlparser
28	28
	29	class PDFObject :
	30	"""A class for PDF objects."""
	31	def __init__(self, major, minor, description) :
	32	"""Initialize the PDF object."""
	33	self.major = major
	34	self.minor = minor
	35	self.description = description
	36	self.comments = []
	37	self.content = []
	38	self.parent = None
	39	self.kids = []
	40
29	41	class Parser(pdlparser.PDLParser) :
30	42	"""A parser for PDF documents."""
…	…
43	55	def getJobSize(self) :
44	56	"""Counts pages in a PDF document."""
	57	# First we start with a generic PDF parser.
	58	lastcomment = None
	59	objects = {}
	60	while 1 :
	61	line = self.infile.readline()
	62	if not line :
	63	break
	64	# now workaround the unavailability of "Universal New Line"
	65	# under Python <2.3.
	66	line = line.strip().replace("\r\n", " ").replace("\r", " ")
	67	if line.startswith("% ") :
	68	lastcomment = line[2:]
	69	if line.endswith(" obj") :
	70	# New object begins here
	71	(n0, n1, dummy) = line.split()
	72	(major, minor) = map(int, (n0, n1))
	73	obj = PDFObject(major, minor, lastcomment)
	74	while 1 :
	75	line = self.infile.readline()
	76	if not line :
	77	break
	78	line = line.strip()
	79	if line.startswith("% ") :
	80	obj.comments.append(line)
	81	elif line.startswith("endobj") :
	82	break
	83	else :
	84	obj.content.append(line)
	85	try :
	86	# try to find a different version of this object
	87	oldobject = objects[major]
	88	except KeyError :
	89	# not found, so we add it
	90	objects[major] = obj
	91	else :
	92	# only overwrite older versions of this object
	93	# same minor seems to be possible, so the latest one
	94	# found in the file will be the one we keep.
	95	# if we want the first one, just use > instead of >=
	96	if minor >= oldobject.minor :
	97	objects[major] = obj
	98
	99	# Now we check each PDF object we've just created.
45	100	self.iscolor = None
46	101	newpageregexp = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]", re.I)
47	102	colorregexp = re.compile(r"(/ColorSpace) ?(/DeviceRGB|/DeviceCMYK)[/ \t\r\n]", re.I)
48	103	pagecount = 0
49		for line in self.infile.xreadlines() :
50		pagecount += len(newpageregexp.findall(line))
51		if colorregexp.match(line) :
	104	for object in objects.values() :
	105	content = "".join(object.content)
	106	pagecount += len(newpageregexp.findall(content))
	107	if colorregexp.match(content) :
52	108	self.iscolor = 1
53	109	if self.debug :
54		sys.stderr.write("ColorSpace : %s\n" % line)
	110	sys.stderr.write("ColorSpace : %s\n" % content)
55	111	return pagecount
56	112
…	…
65	121	mustclose = 0
66	122	else :
67		infile = open(arg, "rb")
	123	infile = open(arg, "rU")
68	124	mustclose = 1
69	125	try :

Context Navigation

Changeset 240 for pkpgcounter/trunk/pkpgpdls/pdf.py

Legend:

pkpgcounter/trunk/pkpgpdls/pdf.py

Download in other formats: