Context Navigation

pdlanalyzer.py @ 1699

Revision 1699, 30.8 kB (checked in by jalet, 20 years ago)
First draft of PCL3GUI analyzer.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`

Line
1	# PyKota
2	# -- coding: ISO-8859-15 --
3	#
4	# PyKota - Print Quotas for CUPS and LPRng
5	#
6	# (c) 2003-2004 Jerome Alet <alet@librelogiciel.com>
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
20	#
21	# $Id$
22	#
23	# $Log$
24	# Revision 1.38 2004/09/02 22:08:37 jalet
25	# First draft of PCL3GUI analyzer.
26	#
27	# Revision 1.37 2004/09/02 21:22:49 jalet
28	# One more PCL tag
29	#
30	# Revision 1.36 2004/09/01 22:31:49 jalet
31	# Some more work on ESC/P2 analyzer to avoid missing \r\n sequences. Not
32	# exactly optimal though...
33	#
34	# Revision 1.35 2004/08/30 23:10:24 jalet
35	# Improved the ESC/P2 analyzer so that more GhostScript devices are supported
36	#
37	# Revision 1.34 2004/08/27 09:08:22 jalet
38	# Improvement in PostScript parser to avoid being fooled by clever "students"
39	#
40	# Revision 1.33 2004/08/27 09:02:34 jalet
41	# Forgot to remove some special debugging code...
42	#
43	# Revision 1.32 2004/08/27 08:58:50 jalet
44	# Relax checks for PCL5 header to accommodate strange printer drivers
45	#
46	# Revision 1.31 2004/08/22 08:25:33 jalet
47	# Improved ESC/P2 miniparser thanks to Paulo Silva
48	#
49	# Revision 1.30 2004/08/21 23:16:57 jalet
50	# First draft of ESC/P2 (mini-)parser.
51	#
52	# Revision 1.29 2004/08/11 16:25:38 jalet
53	# Fixed index problem in PCLXL parser when retrieving number of copies for
54	# each page
55	#
56	# Revision 1.28 2004/08/10 23:01:49 jalet
57	# Fixed number of copies in PCL5 parser
58	#
59	# Revision 1.27 2004/08/09 18:14:22 jalet
60	# Added workaround for number of copies and some PostScript drivers
61	#
62	# Revision 1.26 2004/07/22 13:49:51 jalet
63	# Added support for binary PostScript through GhostScript if native DSC
64	# compliant PostScript analyzer doesn't find any page. This is much
65	# slower though, so native analyzer is tried first.
66	#
67	# Revision 1.25 2004/07/10 14:06:36 jalet
68	# Fix for Python2.1 incompatibilities
69	#
70	# Revision 1.24 2004/07/05 21:00:39 jalet
71	# Fix for number of copies for each page in PCLXL parser
72	#
73	# Revision 1.23 2004/07/03 08:21:59 jalet
74	# Testsuite for PDL Analyzer added
75	#
76	# Revision 1.22 2004/06/29 14:21:41 jalet
77	# Smallish optimization
78	#
79	# Revision 1.21 2004/06/28 23:11:26 jalet
80	# Code de-factorization in PCLXL parser
81	#
82	# Revision 1.20 2004/06/28 22:38:41 jalet
83	# Increased speed by a factor of 2 in PCLXL parser
84	#
85	# Revision 1.19 2004/06/28 21:20:30 jalet
86	# PCLXL support now works !
87	#
88	# Revision 1.18 2004/06/27 22:59:37 jalet
89	# More work on PCLXL parser
90	#
91	# Revision 1.17 2004/06/26 23:20:01 jalet
92	# Additional speedup for GhostScript generated PCL5 files
93	#
94	# Revision 1.16 2004/06/26 15:31:00 jalet
95	# mmap reintroduced in PCL5 parser
96	#
97	# Revision 1.15 2004/06/26 14:14:31 jalet
98	# Now uses Psyco if it is available
99	#
100	# Revision 1.14 2004/06/25 09:50:28 jalet
101	# More debug info in PCLXL parser
102	#
103	# Revision 1.13 2004/06/25 08:10:08 jalet
104	# Another fix for PCL5 parser
105	#
106	# Revision 1.12 2004/06/24 23:09:53 jalet
107	# Fix for number of copies in PCL5 parser
108	#
109	# Revision 1.11 2004/06/23 22:07:50 jalet
110	# Fixed PCL5 parser according to the sources of rastertohp
111	#
112	# Revision 1.10 2004/06/18 22:24:03 jalet
113	# Removed old comments
114	#
115	# Revision 1.9 2004/06/18 22:21:27 jalet
116	# Native PDF parser greatly improved.
117	# GhostScript based PDF parser completely removed because native code
118	# is now portable across Python versions.
119	#
120	# Revision 1.8 2004/06/18 20:49:46 jalet
121	# "ERROR:" prefix added
122	#
123	# Revision 1.7 2004/06/18 17:48:04 jalet
124	# Added native fast PDF parsing method
125	#
126	# Revision 1.6 2004/06/18 14:00:16 jalet
127	# Added PDF support in smart PDL analyzer (through GhostScript for now)
128	#
129	# Revision 1.5 2004/06/18 10:09:05 jalet
130	# Resets file pointer to start of file in all cases
131	#
132	# Revision 1.4 2004/06/18 06:16:14 jalet
133	# Fixes PostScript detection code for incorrect drivers
134	#
135	# Revision 1.3 2004/05/21 20:40:08 jalet
136	# All the code for pkpgcounter is now in pdlanalyzer.py
137	#
138	# Revision 1.2 2004/05/19 19:09:36 jalet
139	# Speed improvement
140	#
141	# Revision 1.1 2004/05/18 09:59:54 jalet
142	# pkpgcounter is now just a wrapper around the PDLAnalyzer class
143	#
144	#
145	#
146
147	import sys
148	import os
149	import re
150	from struct import unpack
151	import tempfile
152	import mmap
153	import popen2
154
# Size units (in bytes) used when reading job data block by block.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE << 10
157
class PDLAnalyzerError(Exception) :
    """Error raised by the PDL analyzers.

    The text given at construction time is kept in the 'message'
    attribute and is what both str() and repr() return.
    """
    def __init__(self, message = "") :
        """Stores the message and initializes the base exception with it."""
        Exception.__init__(self, message)
        self.message = message

    def __str__(self) :
        """Returns the bare error message."""
        return self.message

    # repr() deliberately gives the same text as str()
    __repr__ = __str__
166
class PostScriptAnalyzer :
    """Counts pages in PostScript documents.

    DSC comments are parsed natively first; GhostScript is only used
    as a fallback when the native parser doesn't find any page.
    """
    def __init__(self, infile) :
        """Initialize PostScript Analyzer.

        infile is the job's data stream : it must support seek(),
        read() and iteration over its lines.
        """
        self.infile = infile
        self.copies = 1 # highest number of copies requested so far

    def throughGhostScript(self) :
        """Get the count through GhostScript, useful for non-DSC compliant PS files.

        The whole document is fed to GhostScript's bbox device and
        the "%%HiResBoundingBox:" lines it outputs are counted by grep.

        Returns the number of such lines multiplied by self.copies.
        Raises PDLAnalyzerError if the data can't be sent to the
        command or if its answer can't be read as a number.
        """
        self.infile.seek(0)
        # The '|' must reach the shell unescaped : with a backslash in front
        # of it the shell passes it to gs as a plain argument, no pipeline is
        # built and the first line read back is never a number.
        command = 'gs -sDEVICE=bbox -dNOPAUSE -dBATCH -dQUIET - 2>&1 | grep -c "%%HiResBoundingBox:" 2>/dev/null'
        child = popen2.Popen4(command)
        try :
            data = self.infile.read(MEGABYTE)
            while data :
                child.tochild.write(data)
                data = self.infile.read(MEGABYTE)
            child.tochild.flush()
            child.tochild.close()
        except (IOError, OSError) :
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")

        pagecount = 0
        try :
            pagecount = int(child.fromchild.readline().strip())
        except (IOError, OSError, AttributeError, ValueError) :
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")
        child.fromchild.close()

        try :
            child.wait()
        except OSError :
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")
        return pagecount * self.copies

    def _updateCopies(self, line, index) :
        """Raises self.copies to the integer found in the index'th field of line.

        Lines with a missing or non numeric field are ignored, and
        the number of copies is never lowered.
        """
        try :
            number = int(line.strip().split()[index])
        except (ValueError, IndexError) :
            return # malformed line : keep the current value
        if number > self.copies :
            self.copies = number

    def natively(self) :
        """Count pages in a DSC compliant PostScript document.

        Returns the number of "%%Page: " comments multiplied by the
        highest number of copies found in the document (1 if none).
        """
        self.infile.seek(0)
        pagecount = 0
        # iterate over the file itself : xreadlines() is deprecated
        # and most file-like objects don't provide it.
        for line in self.infile :
            if line.startswith("%%Page: ") :
                pagecount += 1
            elif line.startswith("%%BeginNonPPDFeature: NumCopies ") :
                # handle # of copies set by some Windows printer driver
                self._updateCopies(line, 2)
            elif line.startswith("1 dict dup /NumCopies ") :
                # handle # of copies set by mozilla/kprinter
                self._updateCopies(line, 4)
        return pagecount * self.copies

    def getJobSize(self) :
        """Count pages in PostScript document.

        The much slower GhostScript based method is only tried
        when the native one returns 0.
        """
        return self.natively() or self.throughGhostScript()
231
class PDFAnalyzer :
    """Counts pages in PDF documents by looking for page objects."""
    def __init__(self, infile) :
        """Initialize PDF Analyzer.

        infile is the job's data stream, read line by line
        from its current position.
        """
        self.infile = infile

    def getJobSize(self) :
        """Counts pages in a PDF document.

        Each "/Type /Page" (the space is optional) followed by
        '/' or a whitespace character counts for one page, so
        "/Type /Pages" entries are not counted.
        """
        regexp = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]")
        pagecount = 0
        # iterate over the file itself : xreadlines() is deprecated
        # and most file-like objects don't provide it.
        for line in self.infile :
            pagecount += len(regexp.findall(line))
        return pagecount
244
class ESCP2Analyzer :
    """Counts pages in ESC/P2 documents by counting byte markers."""
    def __init__(self, infile) :
        """Initialize ESC/P2 Analyzer with the job's data stream."""
        self.infile = infile

    def getJobSize(self) :
        """Counts pages in an ESC/P2 document.

        Several markers are tried, the first one giving a
        usable count wins.
        """
        occurrences = self.infile.read().count

        # some software or printer drivers end each page
        # with "\r\fESCAPE" or "\r\n\fESCAPE"
        formfeeds = max(occurrences("\r\f\033"), occurrences("\r\n\f\033"))
        if formfeeds :
            return formfeeds

        # ghostscript's stcolor for example seems to output
        # ESC + @ + \f for each page plus one
        resetthenfeed = occurrences("\033@\f")
        if resetthenfeed > 1 :
            return resetthenfeed - 1

        # while ghostscript's escp driver outputs \f + ESC + @ instead
        feedthenreset = occurrences("\f\033@")
        if feedthenreset :
            return feedthenreset

        # with Gimpprint, at least, for each page there are
        # two Reset Printer sequences (ESC + @)
        return int(occurrences("\033@") / 2)
284
class PCLAnalyzer :
    def __init__(self, infile) :
        """Initialize PCL Analyzer."""
        # infile must be a real file : getJobSize() maps it in memory
        # through its file descriptor.
        self.infile = infile

    def getJobSize(self) :
        """Count pages in a PCL5 document.

        Should also work for PCL3 and PCL4 documents.

        Algorithm from pclcount
        (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
        published under the terms of the GNU General Public Licence v2.

        Backported from C to Python by Jerome Alet, then enhanced
        with more PCL tags detected. I think all the necessary PCL tags
        are recognized to correctly handle PCL5 files wrt their number
        of pages. The documentation used for this was :

        HP PCL/PJL Reference Set
        PCL5 Printer Language Technical Quick Reference Guide
        http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf

        The whole file is scanned byte by byte, from its very first
        byte, until reading past its end raises IndexError.

        Returns the number of pages, copies included.
        """
        infileno = self.infile.fileno()
        # index 6 of the fstat() result is the file's size
        minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        # Maps each supported two bytes tag to the character(s) which
        # may end its numeric argument.
        tagsends = { "&n" : "W",
                     "&b" : "W",
                     "*i" : "W",
                     "*l" : "W",
                     "*m" : "W",
                     "*v" : "W",
                     "*c" : "W",
                     "(f" : "W",
                     "(s" : "W",
                     ")s" : "W",
                     "&p" : "X",
                     "&l" : "XH",
                     "&a" : "G",
                     "*g" : "W",
                     # "*b" : "VW", # treated specially because it occurs very often
                   }
        pagecount = resets = ejects = backsides = 0
        tag = None
        copies = {} # number of copies, indexed by the page count at the time it was set
        pos = 0
        try :
            while 1 :
                char = minfile[pos] ; pos += 1
                if char == "\014" :
                    # Form Feed : one more page
                    pagecount += 1
                elif char == "\033" :
                    #
                    # <ESC>*b###W -> Start of a raster data row/block
                    # <ESC>*b###V -> Start of a raster data plane
                    # <ESC>*c###W -> Start of a user defined pattern
                    # <ESC>*i###W -> Start of a viewing illuminant block
                    # <ESC>*l###W -> Start of a color lookup table
                    # <ESC>*m###W -> Start of a download dither matrix block
                    # <ESC>*v###W -> Start of a configure image data block
                    # <ESC>(s###W -> Start of a characters description block
                    # <ESC>)s###W -> Start of a fonts description block
                    # <ESC>(f###W -> Start of a symbol set block
                    # <ESC>&b###W -> Start of configuration data block
                    # <ESC>&l###X -> Number of copies for current page
                    # <ESC>&n###W -> Starts an alphanumeric string ID block
                    # <ESC>&p###X -> Start of a non printable characters block
                    # <ESC>&a2G -> Back side when duplex mode as generated by rastertohp
                    # <ESC>*g###W -> Needed for planes in PCL3 output
                    # <ESC>&l0H -> Eject if NumPlanes > 1, as generated by rastertohp
                    #
                    tagstart = minfile[pos] ; pos += 1
                    if tagstart in "E9=YZ" : # one byte PCL tag
                        if tagstart == "E" :
                            resets += 1
                        continue # skip to next tag
                    tag = tagstart + minfile[pos] ; pos += 1
                    if tag == "*b" :
                        tagend = "VW"
                    else :
                        try :
                            tagend = tagsends[tag]
                        except KeyError :
                            continue # Unsupported PCL tag
                    # Now read the numeric argument
                    size = 0
                    while 1 :
                        char = minfile[pos] ; pos += 1
                        if not char.isdigit() :
                            break
                        size = (size * 10) + int(char)
                    # here char is the first non digit character after the
                    # argument : anything not expected for this tag is ignored.
                    if char in tagend :
                        if (tag == "&l") and (char == "X") : # copies for current page
                            copies[pagecount] = size
                        elif (tag == "&l") and (char == "H") and (size == 0) :
                            ejects += 1 # Eject
                        elif (tag == "&a") and (size == 2) :
                            backsides += 1 # Back side in duplex mode
                        else :
                            # we just ignore the block.
                            if tag == "&n" :
                                # we have to take care of the operation id byte
                                # which is before the string itself
                                size += 1
                            # jump over the block's data so that it is not parsed
                            pos += size
        except IndexError : # EOF ?
            # the loops above can only be left this way
            minfile.close() # reached EOF

        # if pagecount is still 0, we will use the number
        # of resets instead of the number of form feed characters.
        # but the number of resets is always at least 2 with a valid
        # pcl file : one at the very start and one at the very end
        # of the job's data. So we subtract 2 from the number of
        # resets. And since on our test data we needed to subtract
        # 1 more, we finally subtract 3, and will test several
        # PCL files with this. If resets <= 2, then the file is
        # probably not a valid PCL file, so we use 0
        if not pagecount :
            # (resets > 2) is 0 or 1, so this gives either 0 or (resets - 3)
            pagecount = (pagecount or ((resets - 3) * (resets > 2)))
        else :
            # here we add counters for other ways new pages may have
            # been printed and ejected by the printer
            pagecount += ejects + backsides

        # now handle number of copies for each page (may differ).
        # in duplex mode, number of copies may be sent only once.
        # NB : range() is computed once, before pagecount gets modified.
        for pnum in range(pagecount) :
            # if no number of copies defined, take the preceding one else the one set before any page else 1.
            nb = copies.get(pnum, copies.get(pnum-1, copies.get(0, 1)))
            pagecount += (nb - 1)
        return pagecount
415
class PCL3GUIAnalyzer :
    """Counts pages in PCL3GUI documents (first draft)."""
    def __init__(self, infile) :
        """Initialize PCL3GUI Analyzer with the job's data stream."""
        self.infile = infile

    def getJobSize(self) :
        """Count pages in a PCL3GUI document.

        Not much documentation is available, so we simply count
        the occurrences of <ESC>*r1A, which is the start of
        graphical data.

        This is FAR from being accurate. PCL3 in fact resembles PCL5,
        and the PCL parser should be made better, but some
        documentation is definitely lacking.
        """
        startofgraphics = "\033*r1A"
        return self.infile.read().count(startofgraphics)
434
class PCLXLAnalyzer :
    """Counts pages in PCLXL (aka PCL6) documents.

    self.tags is a 256 entries table indexed by tag value. An entry
    is either 0 (nothing to skip), the number of bytes to skip after
    the tag, or a method which is called and returns that number.
    """
    def __init__(self, infile) :
        """Initialize PCLXL Analyzer.

        Reads infile line by line up to the " HP-PCL XL;" stream
        header, whose first byte gives the endianness, then builds
        the table of tags.

        Raises PDLAnalyzerError if the header is not found or
        if its endianness marker is unknown.
        """
        self.infile = infile
        self.endianness = None
        headerfound = 0
        line = self.infile.readline()
        while line :
            if line[1:12] == " HP-PCL XL;" :
                headerfound = 1
                marker = ord(line[0])
                if marker == 0x29 :
                    self.littleEndian()
                elif marker == 0x28 :
                    self.bigEndian()
                # elif marker == 0x27 : TODO : What can we do here ?
                #
                else :
                    raise PDLAnalyzerError("Unknown endianness marker 0x%02x at start !" % marker)
                # nothing more is read : getJobSize() starts
                # parsing right after the header line.
                break
            line = self.infile.readline()
        if not headerfound :
            raise PDLAnalyzerError("This file doesn't seem to be PCLXL (aka PCL6)")

        # Initialize table of tags
        tags = [ 0 ] * 256

        # Tags which need some special processing.
        # GhostScript's sources tell us that HP printers
        # only accept little endianness, but we can handle both.
        for (tagvalue, method) in ( (0x28, self.bigEndian),          # BigEndian
                                    (0x29, self.littleEndian),       # LittleEndian
                                    (0x43, self.beginPage),          # BeginPage
                                    (0x44, self.endPage),            # EndPage
                                    (0xc8, self.array_8),            # ubyte_array
                                    (0xc9, self.array_16),           # uint16_array
                                    (0xca, self.array_32),           # uint32_array
                                    (0xcb, self.array_16),           # sint16_array
                                    (0xcc, self.array_32),           # sint32_array
                                    (0xcd, self.array_32),           # real32_array
                                    (0xfa, self.embeddedData),       # dataLength
                                    (0xfb, self.embeddedDataSmall),  # dataLengthByte
                                  ) :
            tags[tagvalue] = method

        # Tags which are followed by a fixed number of bytes.
        for (tagvalue, size) in ( (0xc0, 1),    # ubyte
                                  (0xc1, 2),    # uint16
                                  (0xc2, 4),    # uint32
                                  (0xc3, 2),    # sint16
                                  (0xc4, 4),    # sint32
                                  (0xc5, 4),    # real32
                                  (0xd0, 2),    # ubyte_xy
                                  (0xd1, 4),    # uint16_xy
                                  (0xd2, 8),    # uint32_xy
                                  (0xd3, 4),    # sint16_xy
                                  (0xd4, 8),    # sint32_xy
                                  (0xd5, 8),    # real32_xy
                                  (0xe0, 4),    # ubyte_box
                                  (0xe1, 8),    # uint16_box
                                  (0xe2, 16),   # uint32_box
                                  (0xe3, 8),    # sint16_box
                                  (0xe4, 16),   # sint32_box
                                  (0xe5, 16),   # real32_box
                                  (0xf8, 1),    # attr_ubyte
                                  (0xf9, 2),    # attr_uint16
                                ) :
            tags[tagvalue] = size
        self.tags = tags

    def beginPage(self) :
        """Indicates the beginning of a new page : counts it, nothing to skip."""
        self.pagecount = self.pagecount + 1
        return 0

    def endPage(self) :
        """Indicates the end of a page.

        If the EndPage operator is preceded by a PageCopies attribute,
        records the number of copies for the current page. Nothing to skip.
        """
        here = self.pos
        data = self.minfile
        if (ord(data[here-3]) == 0xf8) and (ord(data[here-2]) == 0x31) :
            # The EndPage operator is preceded by a PageCopies attribute
            # So set number of copies for current page.
            # From what I read in PCLXL documentation, the number
            # of copies is an unsigned 16 bits integer
            (nbcopies,) = unpack(self.endianness + "H", data[here-5:here-3])
            self.copies[self.pagecount] = nbcopies
        return 0

    def array_8(self) :
        """Handles arrays of 8 bits values.

        Reads the number of elements which follows the array tag,
        leaves self.pos right after it, and returns the number
        of bytes used by the array's elements.
        """
        data = self.minfile
        start = self.pos + 1
        length = self.tags[ord(data[start - 1])]
        if callable(length) :
            self.pos = start
            length = length()
            start = self.pos
        end = start + length
        self.pos = end
        if length == 1 :
            fmt = "B"
        elif length == 2 :
            fmt = self.endianness + "H"
        elif length == 4 :
            fmt = self.endianness + "I"
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)
        return unpack(fmt, data[start:end])[0]

    def array_16(self) :
        """Handles arrays of 16 bits values.

        Same as array_8(), but each element uses 2 bytes.
        """
        data = self.minfile
        start = self.pos + 1
        length = self.tags[ord(data[start - 1])]
        if callable(length) :
            self.pos = start
            length = length()
            start = self.pos
        end = start + length
        self.pos = end
        if length == 1 :
            fmt = "B"
        elif length == 2 :
            fmt = self.endianness + "H"
        elif length == 4 :
            fmt = self.endianness + "I"
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)
        return 2 * unpack(fmt, data[start:end])[0]

    def array_32(self) :
        """Handles arrays of 32 bits values.

        Same as array_8(), but each element uses 4 bytes.
        """
        data = self.minfile
        start = self.pos + 1
        length = self.tags[ord(data[start - 1])]
        if callable(length) :
            self.pos = start
            length = length()
            start = self.pos
        end = start + length
        self.pos = end
        if length == 1 :
            fmt = "B"
        elif length == 2 :
            fmt = self.endianness + "H"
        elif length == 4 :
            fmt = self.endianness + "I"
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)
        return 4 * unpack(fmt, data[start:end])[0]

    def embeddedDataSmall(self) :
        """Handle small amounts of data : the length is a single byte."""
        here = self.pos
        self.pos = here + 1
        return ord(self.minfile[here])

    def embeddedData(self) :
        """Handle normal amounts of data : the length is a 32 bits unsigned integer."""
        start = self.pos
        end = start + 4
        self.pos = end
        (length,) = unpack(self.endianness + "I", self.minfile[start:end])
        return length

    def littleEndian(self) :
        """Toggles to little endianness."""
        self.endianness = "<" # struct's prefix for little endian
        return 0

    def bigEndian(self) :
        """Toggles to big endianness."""
        self.endianness = ">" # struct's prefix for big endian
        return 0

    def getJobSize(self) :
        """Counts pages in a PCLXL (PCL6) document.

        Algorithm by Jerome Alet.

        The documentation used for this was :

        HP PCL XL Feature Reference
        Protocol Class 2.0
        http://www.hpdevelopersolutions.com/downloads/64/358/xl_ref20r22.pdf

        Parsing starts at infile's current position and goes on
        until reading past the end of the file raises IndexError.

        Returns the number of pages, copies included.
        """
        infileno = self.infile.fileno()
        self.copies = {}
        # index 6 of the fstat() result is the file's size
        self.minfile = minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        tags = self.tags
        self.pagecount = 0
        self.pos = pos = self.infile.tell()
        try :
            while 1 :
                length = tags[ord(minfile[pos])]
                pos += 1
                if length :
                    if callable(length) :
                        # methods work on self.pos and return what's left to skip
                        self.pos = pos
                        length = length()
                        pos = self.pos
                    pos += length
        except IndexError : # EOF ?
            self.minfile.close() # reached EOF

        # now handle number of copies for each page (may differ).
        # if no number of copies defined, take 1, as explained
        # in PCLXL documentation.
        # NB : if number of copies is 0, the page won't be output
        # but the formula below is still correct : we want
        # to decrease the total number of pages in this case.
        nbpages = self.pagecount
        extra = 0
        for pnum in range(1, nbpages + 1) :
            extra += self.copies.get(pnum, 1) - 1
        self.pagecount = nbpages + extra
        return self.pagecount
650
class PDLAnalyzer :
    """Generic PDL Analyzer class.

    Detects the Page Description Language used by a print job and
    delegates the counting of pages to the matching analyzer class.
    """
    def __init__(self, filename) :
        """Initializes the PDL analyzer.

        filename is the name of the file or '-' for stdin.
        filename can also be a file-like object which
        supports read() and seek().
        """
        self.filename = filename
        try :
            import psyco
        except ImportError :
            pass # Psyco is not installed
        else :
            # Psyco is installed, tell it to compile
            # the CPU intensive methods : PCL and PCLXL
            # parsing will greatly benefit from this,
            # for PostScript and PDF the difference is
            # barely noticeable since they are already
            # almost optimal, and much more speedy anyway.
            psyco.bind(PostScriptAnalyzer.getJobSize)
            psyco.bind(PDFAnalyzer.getJobSize)
            psyco.bind(ESCP2Analyzer.getJobSize)
            psyco.bind(PCLAnalyzer.getJobSize)
            psyco.bind(PCLXLAnalyzer.getJobSize)

    def getJobSize(self) :
        """Returns the job's size.

        Raises PDLAnalyzerError if the format of the document
        can't be detected. The data stream is released in all cases.
        """
        self.openFile()
        try :
            try :
                pdlhandler = self.detectPDLHandler()
            except PDLAnalyzerError :
                msg = sys.exc_info()[1]
                raise PDLAnalyzerError("ERROR : Unknown file format for %s (%s)" % (self.filename, msg))
            return pdlhandler(self.infile).getJobSize()
        finally :
            self.closeFile()

    def openFile(self) :
        """Opens the job's data stream for reading.

        A named file is opened directly. Data coming from a file-like
        object or from stdin is first copied into a temporary file,
        which is always seekable.
        """
        self.mustclose = 0 # set to 1 only when we open a named file ourselves
        if hasattr(self.filename, "read") and hasattr(self.filename, "seek") :
            # filename is in fact a file-like object
            source = self.filename
        elif self.filename == "-" :
            # we must read from stdin
            source = sys.stdin
        else :
            # normal file
            self.infile = open(self.filename, "rb")
            self.mustclose = 1
            return

        # Use a temporary file, always seekable contrary to standard input.
        self.infile = tempfile.TemporaryFile(mode="w+b")
        while 1 :
            data = source.read(MEGABYTE)
            if not data :
                break
            self.infile.write(data)
        self.infile.flush()
        self.infile.seek(0)

    def closeFile(self) :
        """Closes the job's data stream if we can close it.

        Files we created ourselves (the named file or the temporary
        copy) are closed. A file-like object received from the caller
        is never closed : its file pointer is reset to the start
        instead, if possible.
        """
        if self.mustclose :
            self.infile.close()
            return

        source = self.filename
        if self.infile is not source :
            # the temporary copy is private to us : release it now
            # instead of waiting for the garbage collector.
            self.infile.close()
        if hasattr(source, "seek") :
            # ensure the file pointer of the caller's file is reset
            # to the start of the file in case the process wants
            # to read the file again. Rewinding the temporary copy
            # would be of no use to the caller.
            try :
                source.seek(0)
            except (IOError, OSError, ValueError) :
                pass # not seekable or already closed : best effort only

    def _hasPJLHeader(self, data) :
        """Returns true if the PJL Universal Exit Language sequence is in the first 128 bytes of data."""
        return data[:128].find("\033%-12345X") != -1

    def isPostScript(self, data) :
        """Returns 1 if data is PostScript, else 0."""
        if data.startswith("%!") or \
           data.startswith("\004%!") or \
           data.startswith("\033%-12345X%!PS") or \
           (self._hasPJLHeader(data) and \
             ((data.find("LANGUAGE=POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = Postscript") != -1))) or \
           (data.find("%!PS-Adobe") != -1) :
            return 1
        else :
            return 0

    def isPDF(self, data) :
        """Returns 1 if data is PDF, else 0."""
        if data.startswith("%PDF-") or \
           data.startswith("\033%-12345X%PDF-") or \
           (self._hasPJLHeader(data) and (data.upper().find("LANGUAGE=PDF") != -1)) or \
           (data.find("%PDF-") != -1) :
            return 1
        else :
            return 0

    def isPCL(self, data) :
        """Returns 1 if data is PCL, else 0.

        Any document with a PJL header is accepted, which is why
        detectPDLHandler() tries the more specific PJL based formats
        before this one.
        """
        if data.startswith("\033E\033") or self._hasPJLHeader(data) :
            return 1
        else :
            return 0

    def isPCL3GUI(self, data) :
        """Returns 1 if data is PCL3GUI, else 0."""
        if data.find("@PJL ENTER LANGUAGE=PCL3GUI") != -1 :
            return 1
        else :
            return 0

    def isPCLXL(self, data) :
        """Returns 1 if data is PCLXL aka PCL6, else 0."""
        if (self._hasPJLHeader(data) and \
            (data.find(" HP-PCL XL;") != -1) and \
            ((data.find("LANGUAGE=PCLXL") != -1) or \
             (data.find("LANGUAGE = PCLXL") != -1))) :
            return 1
        else :
            return 0

    def isESCP2(self, data) :
        """Returns 1 if data is ESC/P2, else 0."""
        if data.startswith("\033@") or \
           data.startswith("\033*") or \
           data.startswith("\n\033@") :
            return 1
        else :
            return 0

    def detectPDLHandler(self) :
        """Tries to autodetect the document format.

        Only the first 4 kilobytes of data are examined, and the
        file pointer is reset to the start of the file afterwards.

        Returns the correct PDL handler class.
        Raises PDLAnalyzerError if the format is unknown.
        """
        # Try to detect file type by reading first block of datas
        self.infile.seek(0)
        firstblock = self.infile.read(4 * KILOBYTE)
        self.infile.seek(0)
        if self.isPostScript(firstblock) :
            return PostScriptAnalyzer
        elif self.isPCLXL(firstblock) :
            return PCLXLAnalyzer
        elif self.isPDF(firstblock) :
            return PDFAnalyzer
        elif self.isPCL3GUI(firstblock) :
            return PCL3GUIAnalyzer
        elif self.isPCL(firstblock) :
            return PCLAnalyzer
        elif self.isESCP2(firstblock) :
            return ESCP2Analyzer
        else :
            raise PDLAnalyzerError("Analysis of first data block failed.")
813
def main() :
    """Entry point for PDL Analyzer.

    Prints the total number of pages of all the files named on the
    command line. Standard input ('-') is added to the list when no
    argument is given, or when stdin is not a terminal and '-' was
    not already there. Files which can't be analyzed are reported
    on stderr and don't count.
    """
    args = sys.argv
    if (len(args) < 2) or ((not sys.stdin.isatty()) and ("-" not in args[1:])) :
        args.append("-")

    totalsize = 0
    for arg in args[1:] :
        try :
            totalsize += PDLAnalyzer(arg).getJobSize()
        except PDLAnalyzerError :
            sys.stderr.write("ERROR: %s\n" % sys.exc_info()[1])
            sys.stderr.flush()
    sys.stdout.write("%s\n" % totalsize)

if __name__ == "__main__" :
    main()

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pykota / trunk / pykota / pdlanalyzer.py @ 1699

Download in other formats: