Context Navigation

pdlanalyzer.py @ 1552

Revision 1552, 19.6 kB (checked in by jalet, 20 years ago)
Native PDF parser greatly improved. GhostScript-based PDF parser completely removed because native code is now portable across Python versions.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`

Line
1	# PyKota
2	# -- coding: ISO-8859-15 --
3	#
4	# PyKota - Print Quotas for CUPS and LPRng
5	#
6	# (c) 2003-2004 Jerome Alet <alet@librelogiciel.com>
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
20	#
21	# $Id$
22	#
23	# $Log$
24	# Revision 1.9 2004/06/18 22:21:27 jalet
25	# Native PDF parser greatly improved.
26	# GhostScript based PDF parser completely removed because native code
27	# is now portable across Python versions.
28	#
29	# Revision 1.8 2004/06/18 20:49:46 jalet
30	# "ERROR:" prefix added
31	#
32	# Revision 1.7 2004/06/18 17:48:04 jalet
33	# Added native fast PDF parsing method
34	#
35	# Revision 1.6 2004/06/18 14:00:16 jalet
36	# Added PDF support in smart PDL analyzer (through GhostScript for now)
37	#
38	# Revision 1.5 2004/06/18 10:09:05 jalet
39	# Resets file pointer to start of file in all cases
40	#
41	# Revision 1.4 2004/06/18 06:16:14 jalet
42	# Fixes PostScript detection code for incorrect drivers
43	#
44	# Revision 1.3 2004/05/21 20:40:08 jalet
45	# All the code for pkpgcounter is now in pdlanalyzer.py
46	#
47	# Revision 1.2 2004/05/19 19:09:36 jalet
48	# Speed improvement
49	#
50	# Revision 1.1 2004/05/18 09:59:54 jalet
51	# pkpgcounter is now just a wrapper around the PDLAnalyzer class
52	#
53	#
54	#
55
56	import sys
57	import os
58	import re
59	import struct
60	import tempfile
61	import popen2
62
# Block sizes, in bytes, used when reading the job's data stream:
# one kilobyte is read for format detection, one megabyte at a time
# when buffering or copying the stream.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE << 10
65
class PDLAnalyzerError(Exception):
    """An exception for PDL Analyzer related stuff.

    The text given at construction time is kept in the 'message'
    attribute and is what both str() and repr() return.
    """

    def __init__(self, message=""):
        """Stores message and forwards it to the base Exception."""
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        """Returns the bare message, without the class name."""
        return self.message

    __repr__ = __str__
74
class PostScriptAnalyzer:
    """Page counter for DSC compliant PostScript documents."""

    def __init__(self, infile):
        """Initialize PostScript Analyzer.

        infile is the job's data stream; only its readline() method
        is used, reading starts at the stream's current position.
        """
        self.infile = infile

    def getJobSize(self):
        """Count pages in a DSC compliant PostScript document.

        Returns the number of lines which begin with the '%%Page: '
        DSC comment, or 0 if there is none.
        """
        pagecount = 0
        # readline() is used instead of the deprecated xreadlines()
        # because every file-like object provides it.
        readline = self.infile.readline
        while 1:
            line = readline()
            if not line:  # EOF
                break
            if line.startswith("%%Page: "):
                pagecount += 1
        return pagecount
87
class PDFAnalyzer:
    """Page counter for PDF documents."""

    # Matches the '/Type /Page' entry of a page object's dictionary.
    # The name /Page must be a complete name: it has to be followed by
    # a whitespace, by one of the PDF delimiters or by the end of the
    # line, which rejects '/Pages' (the page tree nodes) but accepts
    # '/Type/Page>>' when the entry is the last one of its dictionary.
    pageregexp = re.compile(r"/Type\s*/Page(?=[\s()<>\[\]{}/%]|$)")

    def __init__(self, infile):
        """Initialize PDF Analyzer.

        infile is the job's data stream; only its readline() method
        is used, reading starts at the stream's current position.
        """
        self.infile = infile

    def getJobSize(self):
        """Counts pages in a PDF document.

        Returns the number of '/Type /Page' entries found while
        scanning the stream line by line, or 0 if there is none.
        Only uncompressed page dictionaries are visible to this scan.
        """
        findall = self.pageregexp.findall
        pagecount = 0
        # readline() is used instead of the deprecated xreadlines()
        # because every file-like object provides it.
        readline = self.infile.readline
        while 1:
            line = readline()
            if not line:  # EOF
                break
            pagecount += len(findall(line))
        return pagecount
100
class PCLAnalyzer:
    """Page counter for PCL5 documents."""

    def __init__(self, infile):
        """Initialize PCL Analyzer.

        infile is the job's data stream; only its read() method is used.
        """
        self.infile = infile

    def skip(self, nb):
        """Skips the next nb bytes of the stream.

        New datablocks are read for as long as the end of the skipped
        area lies beyond the buffered data, so a skip may span any
        number of blocks. Skipping past the end of the stream simply
        leaves the buffer empty: the next readone() then reports EOF.
        """
        newpos = self.pos + nb
        while newpos >= self.len:
            newpos -= self.len
            self.data = self.infile.read(MEGABYTE)
            self.len = len(self.data)
            if not self.len:  # EOF
                newpos = 0
                break
        self.pos = newpos

    def readone(self):
        """Reads a new byte.

        Returns the next byte of the stream, refilling the buffer
        when it is exhausted, or None once the stream is at EOF.
        """
        if self.pos >= self.len:
            self.data = self.infile.read(MEGABYTE)
            self.len = len(self.data)
            self.pos = 0
            if not self.len:  # EOF
                return None
        char = self.data[self.pos]
        self.pos += 1
        return char

    def getJobSize(self):
        """Count pages in a PCL5 document.

        Returns the number of copies multiplied by the number of form
        feed characters found outside of binary data blocks. When no
        form feed is found, the number of printer resets is used to
        estimate the number of pages instead (see the final comment).
        A stream which ends in the middle of an escape sequence is
        treated as ending just before that sequence.
        """
        #
        # Algorithm from pclcount
        # (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
        # published under the terms of the GNU General Public Licence v2.
        #
        # Backported from C to Python by Jerome Alet, then enhanced
        # with more PCL tags detected. I think all the necessary PCL tags
        # are recognized to correctly handle PCL5 files wrt their number
        # of pages. The documentation used for this was :
        #
        # HP PCL/PJL Reference Set
        # PCL5 Printer Language Technical Quick Reference Guide
        # http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf
        #
        # Maps each supported two bytes tag to the characters which may
        # terminate its numeric argument :
        #
        # <ESC>*b###W -> Start of a raster data row/block
        # <ESC>*b###V -> Start of a raster data plane
        # <ESC>*c###W -> Start of a user defined pattern
        # <ESC>*i###W -> Start of a viewing illuminant block
        # <ESC>*l###W -> Start of a color lookup table
        # <ESC>*m###W -> Start of a download dither matrix block
        # <ESC>*v###W -> Start of a configure image data block
        # <ESC>(s###W -> Start of a characters description block
        # <ESC>)s###W -> Start of a fonts description block
        # <ESC>(f###W -> Start of a symbol set block
        # <ESC>&b###W -> Start of configuration data block
        # <ESC>&l###X -> Number of copies
        # <ESC>&n###W -> Starts an alphanumeric string ID block
        # <ESC>&p###X -> Start of a non printable characters block
        #
        tagsends = {"&n": "W",
                    "&b": "W",
                    "*i": "W",
                    "*l": "W",
                    "*m": "W",
                    "*v": "W",
                    "*c": "W",
                    "(f": "W",
                    "*b": "VW",
                    "(s": "W",
                    ")s": "W",
                    "&p": "X",
                    "&l": "X"}
        self.data = ""
        self.pos = self.len = 0
        copies = 1
        pagecount = resets = 0
        while 1:
            char = self.readone()
            if not char:  # EOF ?
                break
            if char == "\014":
                pagecount += 1
                continue
            if char != "\033":
                continue

            tagstart = self.readone()
            if not tagstart:  # stream truncated right after <ESC>
                break
            if tagstart in "E9=YZ":  # one byte PCL tag
                if tagstart == "E":
                    resets += 1
                continue  # skip to next tag

            tagnext = self.readone()
            if not tagnext:  # stream truncated inside the tag
                break
            tag = tagstart + tagnext
            tagend = tagsends.get(tag)
            if tagend is None:
                continue  # Unsupported PCL tag

            # Now read the numeric argument
            size = 0
            while 1:
                char = self.readone()
                if (not char) or (not char.isdigit()):
                    break
                size = (size * 10) + int(char)

            if char and (char in tagend):
                if tag == "&l":
                    copies = size
                else:
                    # doing a read will prevent the seek
                    # for unseekable streams.
                    # we just ignore the block anyway.
                    if tag == "&n":
                        # we have to take care of the operation id byte
                        # which is before the string itself
                        size += 1
                    self.skip(size)

        # if pagecount is still 0, we will return the number
        # of resets instead of the number of form feed characters.
        # but the number of resets is always at least 2 with a valid
        # pcl file : one at the very start and one at the very end
        # of the job's data. So we subtract 2 from the number of
        # resets. And since on our test data we needed to subtract
        # 1 more, we finally subtract 3, and will test several
        # PCL files with this. If there are fewer than 3 resets, then
        # the file is probably not a valid PCL file, so we return 0
        if (not pagecount) and (resets > 2):
            pagecount = resets - 3
        return copies * pagecount
231
class PCLXLAnalyzer:
    """Page counter for PCLXL (aka PCL6) documents. Not enabled yet."""

    def __init__(self, infile):
        """Initialize PCLXL Analyzer.

        infile is the job's data stream.

        Always raises PDLAnalyzerError for now, because PCLXL support
        is not finished.
        """
        # Refusing the job is deliberate : better no page count than a
        # wrong one. The statements below are not reachable yet, they
        # are what the constructor will do once support is enabled.
        raise PDLAnalyzerError("PCLXL (aka PCL6) is not supported yet.")
        self.infile = infile
        self.islittleendian = None
        self._readHeader()
        self._initTags()

    def _readHeader(self):
        """Reads the stream up to the PCLXL header line and sets endianness.

        The header line has ' HP-PCL XL;' right after its first
        character, ')' meaning little endian and '(' big endian.
        Raises PDLAnalyzerError if no such line is found.
        """
        while 1:
            line = self.infile.readline()
            if not line:
                raise PDLAnalyzerError("This file doesn't seem to be PCLXL (aka PCL6)")
            if line[1:12] == " HP-PCL XL;":
                if line[0] == ")":
                    self.littleendian()
                elif line[0] == "(":
                    self.bigendian()
                return

    def _initTags(self):
        """Builds the table which maps each tag byte to its handler.

        Each handler is called without argument and returns the number
        of bytes of data which follow the tag and must be skipped, or
        None when there is nothing to skip.
        """
        self.tags = [lambda: None] * 256
        self.tags[0x28] = self.bigendian     # big endian
        self.tags[0x29] = self.littleendian  # little endian
        self.tags[0x43] = self.beginPage     # BeginPage
        self.tags[0x44] = self.endPage       # EndPage

        self.tags[0xc0] = lambda: 1  # ubyte
        self.tags[0xc1] = lambda: 2  # uint16
        self.tags[0xc2] = lambda: 4  # uint32
        self.tags[0xc3] = lambda: 2  # sint16
        self.tags[0xc4] = lambda: 4  # sint32
        self.tags[0xc5] = lambda: 4  # real32

        self.tags[0xc8] = self.array_8   # ubyte_array
        self.tags[0xc9] = self.array_16  # uint16_array
        self.tags[0xca] = self.array_32  # uint32_array
        self.tags[0xcb] = self.array_16  # sint16_array
        self.tags[0xcc] = self.array_32  # sint32_array
        self.tags[0xcd] = self.array_32  # real32_array

        self.tags[0xd0] = lambda: 2  # ubyte_xy
        self.tags[0xd1] = lambda: 4  # uint16_xy
        self.tags[0xd2] = lambda: 8  # uint32_xy
        self.tags[0xd3] = lambda: 4  # sint16_xy
        self.tags[0xd4] = lambda: 8  # sint32_xy
        self.tags[0xd5] = lambda: 8  # real32_xy

        # The box datatypes have their own tags, 0xe0 to 0xe5 : they
        # must not overwrite the xy datatypes above.
        self.tags[0xe0] = lambda: 4   # ubyte_box
        self.tags[0xe1] = lambda: 8   # uint16_box
        self.tags[0xe2] = lambda: 16  # uint32_box
        self.tags[0xe3] = lambda: 8   # sint16_box
        self.tags[0xe4] = lambda: 16  # sint32_box
        self.tags[0xe5] = lambda: 16  # real32_box

        self.tags[0xf8] = lambda: 1  # attr_ubyte
        self.tags[0xf9] = lambda: 2  # attr_uint16

        self.tags[0xfa] = self.embeddedData       # dataLength
        self.tags[0xfb] = self.embeddedDataSmall  # dataLengthByte

    def debug(self, msg):
        """Outputs a debug message on stderr."""
        sys.stderr.write("%s\n" % msg)
        sys.stderr.flush()

    def beginPage(self):
        """Indicates the beginning of a new page."""
        self.pagecount += 1
        self.debug("Begin page %i at %s" % (self.pagecount, self.infile.tell()))

    def endPage(self):
        """Indicates the end of a page."""
        self.debug("End page %i at %s" % (self.pagecount, self.infile.tell()))

    def handleArray(self, itemsize):
        """Handles arrays.

        Reads the tag which gives the datatype of the array's length,
        then the length itself, and returns the size in bytes of the
        array's contents : number of items multiplied by itemsize.
        Returns None if the datatype tag has no known size.
        Raises PDLAnalyzerError if the length is not 1, 2 or 4 bytes long.
        """
        pos = self.infile.tell()
        datatype = self.infile.read(1)
        length = self.tags[ord(datatype)]()
        if length is None:
            self.debug("Bogus array length at %s" % pos)
            return None
        sarraysize = self.infile.read(length)
        codes = {1: "B", 2: "H", 4: "I"}
        if length not in codes:
            raise PDLAnalyzerError("Error on array size at %s" % self.infile.tell())
        if self.islittleendian:
            fmt = "<" + codes[length]
        else:
            fmt = ">" + codes[length]
        arraysize = struct.unpack(fmt, sarraysize)[0]
        self.debug("Array at %s, itemsize %s, datatype 0x%02x, size %s" % (pos, itemsize, ord(datatype), arraysize))
        return arraysize * itemsize

    def array_8(self):
        """Handles arrays of one byte items."""
        return self.handleArray(1)

    def array_16(self):
        """Handles arrays of two bytes items."""
        return self.handleArray(2)

    def array_32(self):
        """Handles arrays of four bytes items."""
        return self.handleArray(4)

    def embeddedDataSmall(self):
        """Handle small amounts of data.

        The size of the data is read from a single byte and returned.
        """
        pos = self.infile.tell()
        val = ord(self.infile.read(1))
        self.debug("smalldatablock at %s (0x%02x)" % (pos, val))
        return val

    def embeddedData(self):
        """Handle normal amounts of data.

        The size of the data is read from a four bytes unsigned
        integer in the current endianness, and returned.
        """
        if self.islittleendian:
            fmt = "<I"
        else:
            fmt = ">I"
        pos = self.infile.tell()
        val = struct.unpack(fmt, self.infile.read(4))[0]
        self.debug("datablock at %s (0x%08x)" % (pos, val))
        return val

    def littleendian(self):
        """Toggles to little endianness."""
        self.islittleendian = 1  # little endian

    def bigendian(self):
        """Toggles to big endianness."""
        self.islittleendian = 0  # big endian

    def getJobSize(self):
        """Counts pages in a PCLXL (PCL6) document.

        Each byte read is used as a tag : its handler is called, and
        the number of bytes it returns, if any, is skipped.
        Returns the number of BeginPage tags encountered.
        """
        self.pagecount = 0
        while 1:
            char = self.infile.read(1)
            if not char:
                break
            length = self.tags[ord(char)]()
            if length:
                self.infile.read(length)
        return self.pagecount
381
class PDLAnalyzer:
    """Generic PDL Analyzer class."""

    # Escape sequence looked for within the first 128 bytes of the
    # data to recognize a PJL header in front of the job.
    PJLMARKER = "\033%-12345X"

    def __init__(self, filename):
        """Initializes the PDL analyzer.

        filename is the name of the file or '-' for stdin.
        filename can also be a file-like object which
        supports read() and seek().
        """
        self.filename = filename

    def getJobSize(self):
        """Returns the job's size.

        The data stream is opened, its format is detected, and the
        matching handler counts the pages. The stream is released in
        all cases.

        Raises PDLAnalyzerError if the format is unknown, and lets any
        exception raised by the handler go through.
        """
        self.openFile()
        try:
            try:
                pdlhandler = self.detectPDLHandler()
            except PDLAnalyzerError:
                # sys.exc_info() works with every Python version,
                # contrary to both forms of the 'except' clause.
                msg = sys.exc_info()[1]
                raise PDLAnalyzerError("ERROR : Unknown file format for %s (%s)" % (self.filename, msg))
            return pdlhandler(self.infile).getJobSize()
        finally:
            self.closeFile()

    def openFile(self):
        """Opens the job's data stream for reading.

        A named file is opened directly. Standard input and file-like
        objects are copied into a temporary file first, which is what
        the analysis then reads.

        After the call :
          self.infile    is the stream to analyze.
          self.mustclose is true if self.infile was opened here and
                         so must be closed when finished.
          self.source    is the caller's stream the data was copied
                         from, or None for a named file.
        """
        self.mustclose = 0
        self.source = None
        if hasattr(self.filename, "read") and hasattr(self.filename, "seek"):
            # filename is in fact a file-like object
            self.source = self.filename
        elif self.filename == "-":
            # we must read from stdin
            self.source = sys.stdin
        else:
            # normal file
            self.infile = open(self.filename, "rb")  # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2
            self.mustclose = 1
            return

        # Use a temporary file, always seekable contrary to standard input.
        # This also has the benefit to let us use the "U" mode (new in Python 2.3)
        self.infile = tempfile.TemporaryFile(mode="w+b")  # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2
        self.mustclose = 1  # the temporary copy is ours
        while 1:
            data = self.source.read(MEGABYTE)
            if not data:
                break
            self.infile.write(data)
        self.infile.flush()
        self.infile.seek(0)

    def closeFile(self):
        """Closes the job's data stream if we can close it.

        Streams opened by openFile(), temporary copy included, are
        closed. A stream which belongs to the caller is never closed :
        its file pointer is reset to the start instead, if possible.
        """
        if self.mustclose:
            self.infile.close()
        source = getattr(self, "source", None)
        if source is not None:
            # we don't have to close this one, so ensure its file
            # pointer is reset to the start of the file in case the
            # process wants to read the file again.
            try:
                source.seek(0)
            except (IOError, OSError, ValueError):
                pass  # probably stdin, which is not seekable

    def _hasPJLLanguage(self, data, markers):
        """Returns 1 if data has a PJL header and contains one of markers, else 0.

        The PJL header is looked for in the first 128 bytes only, the
        markers are looked for in the whole of data.
        """
        if data[:128].find(self.PJLMARKER) == -1:
            return 0
        for marker in markers:
            if data.find(marker) != -1:
                return 1
        return 0

    def isPostScript(self, data):
        """Returns 1 if data is PostScript, else 0."""
        if data.startswith("%!") or \
           data.startswith("\004%!") or \
           data.startswith("\033%-12345X%!PS") or \
           (data.find("%!PS-Adobe") != -1):
            return 1
        return self._hasPJLLanguage(data, ("LANGUAGE=POSTSCRIPT",
                                           "LANGUAGE = POSTSCRIPT",
                                           "LANGUAGE = Postscript"))

    def isPDF(self, data):
        """Returns 1 if data is PDF, else 0."""
        # finding the marker anywhere also covers the cases where data
        # begins with it, with or without a PJL prefix.
        if data.find("%PDF-") != -1:
            return 1
        # here the language name is matched whatever its case.
        if (data[:128].find(self.PJLMARKER) != -1) and \
           (data.upper().find("LANGUAGE=PDF") != -1):
            return 1
        return 0

    def isPCL(self, data):
        """Returns 1 if data is PCL, else 0.

        PCLXL data with a PJL header also matches, because
        'LANGUAGE=PCLXL' contains 'LANGUAGE=PCL' : isPCLXL() has to be
        tried first.
        """
        if data.startswith("\033E\033"):
            return 1
        return self._hasPJLLanguage(data, ("LANGUAGE=PCL",
                                           "LANGUAGE = PCL",
                                           "LANGUAGE = Pcl"))

    def isPCLXL(self, data):
        """Returns 1 if data is PCLXL aka PCL6, else 0."""
        if data.find(" HP-PCL XL;") == -1:
            return 0
        return self._hasPJLLanguage(data, ("LANGUAGE=PCLXL",
                                           "LANGUAGE = PCLXL"))

    def detectPDLHandler(self):
        """Tries to autodetect the document format.

        Only the first kilobyte of data is examined, and the stream
        is left positioned at its start.

        Returns the correct PDL handler class.
        Raises PDLAnalyzerError if the format is unknown.
        """
        # Try to detect file type by reading first block of datas
        self.infile.seek(0)
        firstblock = self.infile.read(KILOBYTE)
        self.infile.seek(0)
        if self.isPostScript(firstblock):
            return PostScriptAnalyzer
        if self.isPCLXL(firstblock):  # must be tried before PCL
            return PCLXLAnalyzer
        if self.isPCL(firstblock):
            return PCLAnalyzer
        if self.isPDF(firstblock):
            return PDFAnalyzer
        raise PDLAnalyzerError("Analysis of first data block failed.")
512
def main():
    """Entry point for PDL Analyzer.

    Each command line argument is the name of a file to analyze, or
    '-' for standard input. Standard input is also analyzed when no
    argument is given, or when it is not a terminal and '-' was not
    given explicitly.

    The total of all the sizes is printed on standard output. Files
    which can't be analyzed are reported on standard error and don't
    count in the total.
    """
    # work on a copy : sys.argv is left untouched.
    arguments = sys.argv[1:]
    if (not arguments) or ((not sys.stdin.isatty()) and ("-" not in arguments)):
        arguments.append("-")

    totalsize = 0
    for arg in arguments:
        try:
            totalsize += PDLAnalyzer(arg).getJobSize()
        except PDLAnalyzerError:
            # sys.exc_info() works with every Python version,
            # contrary to both forms of the 'except' clause.
            sys.stderr.write("ERROR: %s\n" % sys.exc_info()[1])
            sys.stderr.flush()
    sys.stdout.write("%s\n" % totalsize)


if __name__ == "__main__":
    main()

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pykota / trunk / pykota / pdlanalyzer.py @ 1552

Download in other formats: