Context Navigation

pdlanalyzer.py @ 1700

Revision 1700, 30.9 kB (checked in by jalet, 20 years ago)
Comments
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`

Line
1	# PyKota
2	# -- coding: ISO-8859-15 --
3	#
4	# PyKota - Print Quotas for CUPS and LPRng
5	#
6	# (c) 2003-2004 Jerome Alet <alet@librelogiciel.com>
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
20	#
21	# $Id$
22	#
23	# $Log$
24	# Revision 1.39 2004/09/02 23:30:05 jalet
25	# Comments
26	#
27	# Revision 1.38 2004/09/02 22:08:37 jalet
28	# First draft of PCL3GUI analyzer.
29	#
30	# Revision 1.37 2004/09/02 21:22:49 jalet
31	# One more PCL tag
32	#
33	# Revision 1.36 2004/09/01 22:31:49 jalet
34	# Some more work on ESC/P2 analyzer to avoid missing \r\n sequences. Not
35	# exactly optimal though...
36	#
37	# Revision 1.35 2004/08/30 23:10:24 jalet
38	# Improved the ESC/P2 analyzer so that more GhostScript devices are supported
39	#
40	# Revision 1.34 2004/08/27 09:08:22 jalet
41	# Improvement in PostScript parser to avoid being fooled by clever "students"
42	#
43	# Revision 1.33 2004/08/27 09:02:34 jalet
44	# Forgot to remove some special debugging code...
45	#
46	# Revision 1.32 2004/08/27 08:58:50 jalet
47	# Relax checks for PCL5 header to accommodate strange printer drivers
48	#
49	# Revision 1.31 2004/08/22 08:25:33 jalet
50	# Improved ESC/P2 miniparser thanks to Paulo Silva
51	#
52	# Revision 1.30 2004/08/21 23:16:57 jalet
53	# First draft of ESC/P2 (mini-)parser.
54	#
55	# Revision 1.29 2004/08/11 16:25:38 jalet
56	# Fixed index problem in PCLXL parser when retrieving number of copies for
57	# each page
58	#
59	# Revision 1.28 2004/08/10 23:01:49 jalet
60	# Fixed number of copies in PCL5 parser
61	#
62	# Revision 1.27 2004/08/09 18:14:22 jalet
63	# Added workaround for number of copies and some PostScript drivers
64	#
65	# Revision 1.26 2004/07/22 13:49:51 jalet
66	# Added support for binary PostScript through GhostScript if native DSC
67	# compliant PostScript analyzer doesn't find any page. This is much
68	# slower though, so native analyzer is tried first.
69	#
70	# Revision 1.25 2004/07/10 14:06:36 jalet
71	# Fix for Python2.1 incompatibilities
72	#
73	# Revision 1.24 2004/07/05 21:00:39 jalet
74	# Fix for number of copies for each page in PCLXL parser
75	#
76	# Revision 1.23 2004/07/03 08:21:59 jalet
77	# Testsuite for PDL Analyzer added
78	#
79	# Revision 1.22 2004/06/29 14:21:41 jalet
80	# Smallish optimization
81	#
82	# Revision 1.21 2004/06/28 23:11:26 jalet
83	# Code de-factorization in PCLXL parser
84	#
85	# Revision 1.20 2004/06/28 22:38:41 jalet
86	# Increased speed by a factor of 2 in PCLXL parser
87	#
88	# Revision 1.19 2004/06/28 21:20:30 jalet
89	# PCLXL support now works !
90	#
91	# Revision 1.18 2004/06/27 22:59:37 jalet
92	# More work on PCLXL parser
93	#
94	# Revision 1.17 2004/06/26 23:20:01 jalet
95	# Additional speedup for GhostScript generated PCL5 files
96	#
97	# Revision 1.16 2004/06/26 15:31:00 jalet
98	# mmap reintroduced in PCL5 parser
99	#
100	# Revision 1.15 2004/06/26 14:14:31 jalet
101	# Now uses Psyco if it is available
102	#
103	# Revision 1.14 2004/06/25 09:50:28 jalet
104	# More debug info in PCLXL parser
105	#
106	# Revision 1.13 2004/06/25 08:10:08 jalet
107	# Another fix for PCL5 parser
108	#
109	# Revision 1.12 2004/06/24 23:09:53 jalet
110	# Fix for number of copies in PCL5 parser
111	#
112	# Revision 1.11 2004/06/23 22:07:50 jalet
113	# Fixed PCL5 parser according to the sources of rastertohp
114	#
115	# Revision 1.10 2004/06/18 22:24:03 jalet
116	# Removed old comments
117	#
118	# Revision 1.9 2004/06/18 22:21:27 jalet
119	# Native PDF parser greatly improved.
120	# GhostScript based PDF parser completely removed because native code
121	# is now portable across Python versions.
122	#
123	# Revision 1.8 2004/06/18 20:49:46 jalet
124	# "ERROR:" prefix added
125	#
126	# Revision 1.7 2004/06/18 17:48:04 jalet
127	# Added native fast PDF parsing method
128	#
129	# Revision 1.6 2004/06/18 14:00:16 jalet
130	# Added PDF support in smart PDL analyzer (through GhostScript for now)
131	#
132	# Revision 1.5 2004/06/18 10:09:05 jalet
133	# Resets file pointer to start of file in all cases
134	#
135	# Revision 1.4 2004/06/18 06:16:14 jalet
136	# Fixes PostScript detection code for incorrect drivers
137	#
138	# Revision 1.3 2004/05/21 20:40:08 jalet
139	# All the code for pkpgcounter is now in pdlanalyzer.py
140	#
141	# Revision 1.2 2004/05/19 19:09:36 jalet
142	# Speed improvement
143	#
144	# Revision 1.1 2004/05/18 09:59:54 jalet
145	# pkpgcounter is now just a wrapper around the PDLAnalyzer class
146	#
147	#
148	#
149
150	import sys
151	import os
152	import re
153	from struct import unpack
154	import tempfile
155	import mmap
156	import popen2
157
# Size units, in bytes, used as chunk sizes when reading job data.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE * KILOBYTE
160
class PDLAnalyzerError(Exception):
    """Error raised when a print job can't be analyzed.

    The text given at construction time is kept in the 'message'
    attribute, and is what both str() and repr() return.
    """
    def __init__(self, message = ""):
        Exception.__init__(self, message)
        self.message = message

    def __str__(self):
        return self.message

    __repr__ = __str__
169
class PostScriptAnalyzer :
    """Page counter for PostScript documents.

    DSC compliant documents are counted natively. When the native
    method finds no page at all, the document is handed over to
    GhostScript instead.
    """
    def __init__(self, infile) :
        """Initialize PostScript Analyzer.

        infile is the seekable input stream which contains the job's datas.
        """
        self.infile = infile
        self.copies = 1

    def throughGhostScript(self) :
        """Get the count through GhostScript, useful for non-DSC compliant PS files.

        The whole document is piped into GhostScript's bbox device, and
        the number of bounding boxes it reports gives the number of pages.

        Returns the number of pages multiplied by the number of copies.
        Raises PDLAnalyzerError if GhostScript can't be fed, or if its
        answer can't be read back.
        """
        errormsg = "Problem during analysis of Binary PostScript document."
        self.infile.seek(0)
        # The pipe character must reach the shell unescaped : an escaped
        # one would be passed to gs as a plain argument and grep would
        # never be run.
        command = 'gs -sDEVICE=bbox -dNOPAUSE -dBATCH -dQUIET - 2>&1 | grep -c "%%HiResBoundingBox:" 2>/dev/null'
        child = popen2.Popen4(command)
        try :
            data = self.infile.read(MEGABYTE)
            while data :
                child.tochild.write(data)
                data = self.infile.read(MEGABYTE)
            child.tochild.flush()
            child.tochild.close()
        except (IOError, OSError) :
            raise PDLAnalyzerError(errormsg)

        try :
            # grep -c outputs a single line containing the number of matches
            pagecount = int(child.fromchild.readline().strip())
        except (IOError, OSError, AttributeError, ValueError) :
            raise PDLAnalyzerError(errormsg)
        child.fromchild.close()

        try :
            child.wait()
        except OSError :
            raise PDLAnalyzerError(errormsg)
        return pagecount * self.copies

    def _updateCopies(self, line, index) :
        """Raise self.copies to the number found in field 'index' of line, if greater.

        Lines where this field is missing or is not an integer are ignored.
        """
        try :
            number = int(line.strip().split()[index])
        except (ValueError, IndexError) :
            return
        if number > self.copies :
            self.copies = number

    def natively(self) :
        """Count pages in a DSC compliant PostScript document.

        Returns the number of '%%Page:' comments multiplied by the
        highest number of copies requested anywhere in the document.
        """
        self.infile.seek(0)
        pagecount = 0
        for line in self.infile.xreadlines() :
            if line.startswith("%%Page: ") :
                pagecount += 1
            elif line.startswith("%%BeginNonPPDFeature: NumCopies ") :
                # handle # of copies set by some Windows printer driver
                self._updateCopies(line, 2)
            elif line.startswith("1 dict dup /NumCopies ") :
                # handle # of copies set by mozilla/kprinter
                self._updateCopies(line, 4)
        return pagecount * self.copies

    def getJobSize(self) :
        """Count pages in PostScript document.

        The fast native method is tried first, GhostScript is only
        used when it finds no page.
        """
        return self.natively() or self.throughGhostScript()
234
class PDFAnalyzer :
    """Page counter for PDF documents."""
    def __init__(self, infile) :
        """Remember the input stream which contains the PDF job."""
        self.infile = infile

    def getJobSize(self) :
        """Return the number of pages found in the PDF document.

        Each '/Type /Page' entry followed by a delimiter counts for one
        page. The stream is scanned line by line.
        """
        findpages = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]").findall
        total = 0
        for chunk in self.infile.xreadlines() :
            total = total + len(findpages(chunk))
        return total
247
class ESCP2Analyzer :
    """Page counter for ESC/P2 documents."""
    def __init__(self, infile) :
        """Remember the input stream which contains the ESC/P2 job."""
        self.infile = infile

    def getJobSize(self) :
        """Return the number of pages found in an ESC/P2 document.

        Several heuristics are tried in turn, the first one which gives
        a usable result wins.
        """
        data = self.infile.read()

        # some software or printer drivers end each page with
        # "\r\fESCAPE" or "\r\n\fESCAPE"
        formfeeds = max(data.count("\r\f\033"), data.count("\r\n\f\033"))
        if formfeeds :
            return formfeeds

        # ghostscript's stcolor for example seems to output
        # ESC + @ + \f for each page plus one
        resetthenfeed = data.count("\033@\f")
        if resetthenfeed > 1 :
            return resetthenfeed - 1

        # ghostscript's escp driver outputs \f + ESC + @ instead
        feedthenreset = data.count("\f\033@")
        if feedthenreset :
            return feedthenreset

        # with Gimpprint, at least, for each page there are
        # two Reset Printer sequences (ESC + @)
        return int(data.count("\033@") / 2)
287
class PCLAnalyzer :
    """Page counter for PCL5 documents (see getJobSize for details)."""
    def __init__(self, infile) :
        """Initialize PCL Analyzer.

        infile is the input stream which contains the job's datas. It
        must be a real file, because getJobSize() maps it into memory
        through its file descriptor.
        """
        self.infile = infile

    def getJobSize(self) :
        """Count pages in a PCL5 document.

        Should also work for PCL3 and PCL4 documents.

        Algorithm from pclcount
        (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
        published under the terms of the GNU General Public Licence v2.

        Backported from C to Python by Jerome Alet, then enhanced
        with more PCL tags detected. I think all the necessary PCL tags
        are recognized to correctly handle PCL5 files wrt their number
        of pages. The documentation used for this was :

        HP PCL/PJL Reference Set
        PCL5 Printer Language Technical Quick Reference Guide
        http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf

        Returns the number of pages, number of copies included.
        """
        infileno = self.infile.fileno()
        # the whole file is mapped read-only : index 6 of fstat()'s result is the file's size
        minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        # for each supported two characters tag, the characters which
        # may terminate its numeric argument.
        tagsends = { "&n" : "W",
                     "&b" : "W",
                     "*i" : "W",
                     "*l" : "W",
                     "*m" : "W",
                     "*v" : "W",
                     "*c" : "W",
                     "(f" : "W",
                     "(s" : "W",
                     ")s" : "W",
                     "&p" : "X",
                     "&l" : "XH",
                     "&a" : "G", # TODO : 0 means next side, 1 front side, 2 back side
                     "*g" : "W",
                     # "*b" : "VW", # treated specially because it occurs very often
                   }
        pagecount = resets = ejects = backsides = 0
        tag = None
        copies = {} # number of copies, indexed by the page count at the time it was set
        pos = 0
        try :
            # this loop only ends when reading past the end of the
            # mapped file raises IndexError.
            while 1 :
                char = minfile[pos] ; pos += 1
                if char == "\014" : # form feed
                    pagecount += 1
                elif char == "\033" :
                    #
                    # <ESC>*b###W -> Start of a raster data row/block
                    # <ESC>*b###V -> Start of a raster data plane
                    # <ESC>*c###W -> Start of a user defined pattern
                    # <ESC>*i###W -> Start of a viewing illuminant block
                    # <ESC>*l###W -> Start of a color lookup table
                    # <ESC>*m###W -> Start of a download dither matrix block
                    # <ESC>*v###W -> Start of a configure image data block
                    # <ESC>(s###W -> Start of a characters description block
                    # <ESC>)s###W -> Start of a fonts description block
                    # <ESC>(f###W -> Start of a symbol set block
                    # <ESC>&b###W -> Start of configuration data block
                    # <ESC>&l###X -> Number of copies for current page
                    # <ESC>&n###W -> Starts an alphanumeric string ID block
                    # <ESC>&p###X -> Start of a non printable characters block
                    # <ESC>&a2G -> Back side when duplex mode as generated by rastertohp
                    # <ESC>*g###W -> Needed for planes in PCL3 output
                    # <ESC>&l0H -> Eject if NumPlanes > 1, as generated by rastertohp
                    #
                    tagstart = minfile[pos] ; pos += 1
                    if tagstart in "E9=YZ" : # one byte PCL tag
                        if tagstart == "E" :
                            resets += 1
                        continue # skip to next tag
                    tag = tagstart + minfile[pos] ; pos += 1
                    if tag == "*b" :
                        tagend = "VW"
                    else :
                        try :
                            tagend = tagsends[tag]
                        except KeyError :
                            continue # Unsupported PCL tag
                    # Now read the numeric argument
                    size = 0
                    while 1 :
                        char = minfile[pos] ; pos += 1
                        if not char.isdigit() :
                            break
                        size = (size * 10) + int(char)
                    # char is now the first non-digit character after the argument
                    if char in tagend :
                        if (tag == "&l") and (char == "X") : # copies for current page
                            copies[pagecount] = size
                        elif (tag == "&l") and (char == "H") and (size == 0) :
                            ejects += 1 # Eject
                        elif (tag == "&a") and (size == 2) :
                            backsides += 1 # Back side in duplex mode
                        else :
                            # we just ignore the block : its datas are skipped
                            # so that they are not parsed as PCL commands.
                            if tag == "&n" :
                                # we have to take care of the operation id byte
                                # which is before the string itself
                                size += 1
                            pos += size
        except IndexError : # EOF ?
            minfile.close() # reached EOF

        # if pagecount is still 0, we will use the number
        # of resets instead of the number of form feed characters.
        # but the number of resets is always at least 2 with a valid
        # pcl file : one at the very start and one at the very end
        # of the job's data. So we subtract 2 from the number of
        # resets. And since on our test data we needed to subtract
        # 1 more, we finally subtract 3, and will test several
        # PCL files with this. If resets < 2, then the file is
        # probably not a valid PCL file, so we use 0
        if not pagecount :
            pagecount = (pagecount or ((resets - 3) * (resets > 2)))
        else :
            # here we add counters for other ways new pages may have
            # been printed and ejected by the printer
            pagecount += ejects + backsides

        # now handle number of copies for each page (may differ).
        # in duplex mode, number of copies may be sent only once.
        # NB : range() is evaluated once, so increasing pagecount
        # below doesn't change the number of iterations.
        for pnum in range(pagecount) :
            # if no number of copies defined, take the preceding one else the one set before any page else 1.
            nb = copies.get(pnum, copies.get(pnum-1, copies.get(0, 1)))
            pagecount += (nb - 1)
        return pagecount
418
class PCL3GUIAnalyzer :
    """Page counter for PCL3GUI documents."""
    def __init__(self, infile) :
        """Initialize PCL3GUI Analyzer.

        infile is the input stream which contains the job's datas.
        """
        self.infile = infile

    def getJobSize(self) :
        """Count pages in a PCL3GUI document.

        Not much documentation available, so we will count occurrences
        of <ESC>*r1A which is start of graphical data.

        This is FAR from being accurate. PCL3 resembles PCL5 in fact,
        and PCL parser should be made better, but some documentation
        definitely lacks.

        Returns the number of such sequences found in the whole stream.
        """
        data = self.infile.read()
        # The sequence really is ESC * r 1 A : without the star it
        # would never match a Start Raster Graphics command.
        # TODO : Allowed values 0, 1, 2, 3 after r
        pagecount = data.count("\033*r1A")
        return pagecount
437
class PCLXLAnalyzer :
    """Page counter for PCLXL (aka PCL6) documents.

    The binary stream is walked tag by tag. self.tags maps each tag
    byte either to 0 (nothing to skip), to a number of bytes to skip,
    or to a method which handles the tag and returns the number of
    bytes to skip.
    """
    def __init__(self, infile) :
        """Initialize PCLXL Analyzer.

        infile is the input stream which contains the job's datas. It
        is read line by line up to the PCLXL stream header, which
        gives the initial endianness.

        Raises PDLAnalyzerError if the header is not found or if its
        endianness marker is unknown.
        """
        self.infile = infile
        self.endianness = None
        found = 0
        while not found :
            line = self.infile.readline()
            if not line :
                break
            if line[1:12] == " HP-PCL XL;" :
                found = 1
                # the header's first byte is the binding format marker
                endian = ord(line[0])
                if endian == 0x29 :
                    self.littleEndian()
                elif endian == 0x28 :
                    self.bigEndian()
                # elif endian == 0x27 : TODO : What can we do here ?
                #
                else :
                    raise PDLAnalyzerError("Unknown endianness marker 0x%02x at start !" % endian)
        if not found :
            raise PDLAnalyzerError("This file doesn't seem to be PCLXL (aka PCL6)")
        else :
            # Initialize table of tags
            self.tags = [ 0 ] * 256

            # GhostScript's sources tell us that HP printers
            # only accept little endianness, but we can handle both.
            self.tags[0x28] = self.bigEndian # BigEndian
            self.tags[0x29] = self.littleEndian # LittleEndian

            self.tags[0x43] = self.beginPage # BeginPage
            self.tags[0x44] = self.endPage # EndPage

            self.tags[0xc0] = 1 # ubyte
            self.tags[0xc1] = 2 # uint16
            self.tags[0xc2] = 4 # uint32
            self.tags[0xc3] = 2 # sint16
            self.tags[0xc4] = 4 # sint32
            self.tags[0xc5] = 4 # real32

            self.tags[0xc8] = self.array_8 # ubyte_array
            self.tags[0xc9] = self.array_16 # uint16_array
            self.tags[0xca] = self.array_32 # uint32_array
            self.tags[0xcb] = self.array_16 # sint16_array
            self.tags[0xcc] = self.array_32 # sint32_array
            self.tags[0xcd] = self.array_32 # real32_array

            self.tags[0xd0] = 2 # ubyte_xy
            self.tags[0xd1] = 4 # uint16_xy
            self.tags[0xd2] = 8 # uint32_xy
            self.tags[0xd3] = 4 # sint16_xy
            self.tags[0xd4] = 8 # sint32_xy
            self.tags[0xd5] = 8 # real32_xy

            self.tags[0xe0] = 4 # ubyte_box
            self.tags[0xe1] = 8 # uint16_box
            self.tags[0xe2] = 16 # uint32_box
            self.tags[0xe3] = 8 # sint16_box
            self.tags[0xe4] = 16 # sint32_box
            self.tags[0xe5] = 16 # real32_box

            self.tags[0xf8] = 1 # attr_ubyte
            self.tags[0xf9] = 2 # attr_uint16

            self.tags[0xfa] = self.embeddedData # dataLength
            self.tags[0xfb] = self.embeddedDataSmall # dataLengthByte

    def beginPage(self) :
        """Indicates the beginning of a new page.

        Returns 0 because there's no data to skip after this tag.
        """
        self.pagecount += 1
        return 0

    def endPage(self) :
        """Indicates the end of a page.

        Records the number of copies for the current page if the
        operator is preceded by a PageCopies attribute.
        Returns 0 because there's no data to skip after this tag.
        """
        pos = self.pos
        minfile = self.minfile
        # self.pos is just after the EndPage tag, so pos-3 and pos-2
        # are the two bytes which precede the tag itself.
        if (ord(minfile[pos-3]) == 0xf8) and (ord(minfile[pos-2]) == 0x31) :
            # The EndPage operator is preceded by a PageCopies attribute
            # So set number of copies for current page.
            # From what I read in PCLXL documentation, the number
            # of copies is an unsigned 16 bits integer
            self.copies[self.pagecount] = unpack(self.endianness + "H", minfile[pos-5:pos-3])[0]
        return 0

    # NB : the three array_* methods below are intentionally not
    # factorized, see the "Code de-factorization" entry in the log.

    def array_8(self) :
        """Handles arrays of one byte elements.

        Reads the array's length, moves self.pos just after it, and
        returns the size in bytes of the array's datas to skip.
        Raises PDLAnalyzerError if the length is not 1, 2 or 4 bytes wide.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_16(self) :
        """Handles arrays of two bytes elements.

        Reads the array's length, moves self.pos just after it, and
        returns the size in bytes of the array's datas to skip, which
        is twice the number of elements.
        Raises PDLAnalyzerError if the length is not 1, 2 or 4 bytes wide.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return 2 * unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return 2 * unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return 2 * unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_32(self) :
        """Handles arrays of four bytes elements.

        Reads the array's length, moves self.pos just after it, and
        returns the size in bytes of the array's datas to skip, which
        is four times the number of elements.
        Raises PDLAnalyzerError if the length is not 1, 2 or 4 bytes wide.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return 4 * unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return 4 * unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return 4 * unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def embeddedDataSmall(self) :
        """Handle small amounts of data.

        The length is a single byte : it is returned as the number of
        bytes to skip, and self.pos is moved just after it.
        """
        pos = self.pos
        length = ord(self.minfile[pos])
        self.pos = pos + 1
        return length

    def embeddedData(self) :
        """Handle normal amounts of data.

        The length is an unsigned 32 bits integer : it is returned as
        the number of bytes to skip, and self.pos is moved just after it.
        """
        pos = self.pos
        pos4 = pos + 4
        self.pos = pos4
        return unpack(self.endianness + "I", self.minfile[pos:pos4])[0]

    def littleEndian(self) :
        """Toggles to little endianness."""
        self.endianness = "<" # little endian
        return 0

    def bigEndian(self) :
        """Toggles to big endianness."""
        self.endianness = ">" # big endian
        return 0

    def getJobSize(self) :
        """Counts pages in a PCLXL (PCL6) document.

        Algorithm by Jerome Alet.

        The documentation used for this was :

        HP PCL XL Feature Reference
        Protocol Class 2.0
        http://www.hpdevelopersolutions.com/downloads/64/358/xl_ref20r22.pdf

        Returns the number of pages, number of copies included.
        """
        infileno = self.infile.fileno()
        self.copies = {}
        # the whole file is mapped read-only : index 6 of fstat()'s result is the file's size
        self.minfile = minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        tags = self.tags
        self.pagecount = 0
        # parsing starts just after the header read by __init__
        self.pos = pos = self.infile.tell()
        try :
            try :
                # this loop only ends when reading past the end of
                # the mapped file raises IndexError.
                while 1 :
                    char = minfile[pos]
                    pos += 1
                    length = tags[ord(char)]
                    if not length :
                        continue
                    if callable(length) :
                        # tag handlers work on self.pos and may move it
                        self.pos = pos
                        length = length()
                        pos = self.pos
                    pos += length
            except IndexError : # EOF ?
                pass # reached EOF
        finally :
            # release the mapping even if a tag handler fails
            minfile.close()

        # now handle number of copies for each page (may differ).
        for pnum in range(1, self.pagecount + 1) :
            # if no number of copies defined, take 1, as explained
            # in PCLXL documentation.
            # NB : if number of copies is 0, the page won't be output
            # but the formula below is still correct : we want
            # to decrease the total number of pages in this case.
            self.pagecount += (self.copies.get(pnum, 1) - 1)

        return self.pagecount
653
class PDLAnalyzer :
    """Generic PDL Analyzer class."""
    def __init__(self, filename) :
        """Initializes the PDL analyzer.

        filename is the name of the file or '-' for stdin.
        filename can also be a file-like object which
        supports read() and seek().
        """
        self.filename = filename
        try :
            import psyco
        except ImportError :
            pass # Psyco is not installed
        else :
            # Psyco is installed, tell it to compile
            # the CPU intensive methods : PCL and PCLXL
            # parsing will greatly benefit from this,
            # for PostScript and PDF the difference is
            # barely noticeable since they are already
            # almost optimal, and much more speedy anyway.
            psyco.bind(PostScriptAnalyzer.getJobSize)
            psyco.bind(PDFAnalyzer.getJobSize)
            psyco.bind(ESCP2Analyzer.getJobSize)
            psyco.bind(PCLAnalyzer.getJobSize)
            psyco.bind(PCLXLAnalyzer.getJobSize)

    def getJobSize(self) :
        """Returns the job's size.

        Raises PDLAnalyzerError if the document's format is unknown
        or if the analysis fails.
        """
        self.openFile()
        try :
            try :
                pdlhandler = self.detectPDLHandler()
            except PDLAnalyzerError :
                msg = sys.exc_info()[1]
                raise PDLAnalyzerError("ERROR : Unknown file format for %s (%s)" % (self.filename, msg))
            return pdlhandler(self.infile).getJobSize()
        finally :
            # whatever happens, release what openFile() acquired
            self.closeFile()

    def openFile(self) :
        """Opens the job's data stream for reading.

        After this call, self.infile is a seekable binary file : either
        the named file itself, or a temporary copy of the datas read
        from stdin or from the file-like object.
        """
        self.mustclose = 0 # 1 only when self.infile is the named file itself
        self.source = None # the caller's stream, if we work on a copy of it
        if hasattr(self.filename, "read") and hasattr(self.filename, "seek") :
            # filename is in fact a file-like object
            self.source = self.filename
        elif self.filename == "-" :
            # we must read from stdin
            self.source = sys.stdin
        else :
            # normal file
            self.infile = open(self.filename, "rb")
            self.mustclose = 1
            return

        # Use a temporary file, always seekable contrary to standard input.
        self.infile = tempfile.TemporaryFile(mode="w+b")
        while 1 :
            data = self.source.read(MEGABYTE)
            if not data :
                break
            self.infile.write(data)
        self.infile.flush()
        self.infile.seek(0)

    def closeFile(self) :
        """Closes the job's data stream.

        self.infile is always closed, since it is either the file we
        opened ourselves or our temporary copy. The caller's own stream
        is never closed : it is rewound instead when possible.
        """
        self.infile.close()
        source = getattr(self, "source", None)
        if source is not None :
            # ensure the file pointer is reset to the
            # start of the file in case the process wants
            # to read the file again.
            try :
                source.seek(0)
            except (AttributeError, IOError, OSError, ValueError) :
                pass # probably stdin, which is not seekable

    def isPostScript(self, data) :
        """Returns 1 if data is PostScript, else 0."""
        if data.startswith("%!") or \
           data.startswith("\004%!") or \
           data.startswith("\033%-12345X%!PS") or \
           ((data[:128].find("\033%-12345X") != -1) and \
             ((data.find("LANGUAGE=POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = Postscript") != -1))) or \
           (data.find("%!PS-Adobe") != -1) :
            return 1
        else :
            return 0

    def isPDF(self, data) :
        """Returns 1 if data is PDF, else 0."""
        if data.startswith("%PDF-") or \
           data.startswith("\033%-12345X%PDF-") or \
           ((data[:128].find("\033%-12345X") != -1) and (data.upper().find("LANGUAGE=PDF") != -1)) or \
           (data.find("%PDF-") != -1) :
            return 1
        else :
            return 0

    def isPCL(self, data) :
        """Returns 1 if data is PCL, else 0."""
        if data.startswith("\033E\033") or \
           (data[:128].find("\033%-12345X") != -1) :
            return 1
        else :
            return 0

    def isPCL3GUI(self, data) :
        """Returns 1 if data is PCL3GUI, else 0."""
        if data.find("@PJL ENTER LANGUAGE=PCL3GUI") != -1 :
            return 1
        else :
            return 0

    def isPCLXL(self, data) :
        """Returns 1 if data is PCLXL aka PCL6, else 0."""
        if ((data[:128].find("\033%-12345X") != -1) and \
             (data.find(" HP-PCL XL;") != -1) and \
             ((data.find("LANGUAGE=PCLXL") != -1) or \
              (data.find("LANGUAGE = PCLXL") != -1))) :
            return 1
        else :
            return 0

    def isESCP2(self, data) :
        """Returns 1 if data is ESC/P2, else 0."""
        if data.startswith("\033@") or \
           data.startswith("\033*") or \
           data.startswith("\n\033@") :
            return 1
        else :
            return 0

    def detectPDLHandler(self) :
        """Tries to autodetect the document format.

        Returns the correct PDL handler class.
        Raises PDLAnalyzerError if the format is unknown.
        """
        # Try to detect file type by reading first block of datas
        self.infile.seek(0)
        firstblock = self.infile.read(4 * KILOBYTE)
        self.infile.seek(0)
        # The order of the tests matters : the PCL test matches
        # any job which begins with the PJL marker, so it is
        # tried after all the more specific ones.
        if self.isPostScript(firstblock) :
            return PostScriptAnalyzer
        elif self.isPCLXL(firstblock) :
            return PCLXLAnalyzer
        elif self.isPDF(firstblock) :
            return PDFAnalyzer
        elif self.isPCL3GUI(firstblock) :
            return PCL3GUIAnalyzer
        elif self.isPCL(firstblock) :
            return PCLAnalyzer
        elif self.isESCP2(firstblock) :
            return ESCP2Analyzer
        else :
            raise PDLAnalyzerError("Analysis of first data block failed.")
816
def main() :
    """Entry point for PDL Analyzer.

    Each command line argument is analyzed in turn, '-' meaning
    standard input, and the sum of all the sizes is printed. Files
    which can't be analyzed are reported on stderr and count for 0.
    """
    args = sys.argv
    if len(args) < 2 :
        # no argument at all : analyze standard input
        args.append("-")
    elif (not sys.stdin.isatty()) and ("-" not in args[1:]) :
        # standard input is not a terminal : analyze it too
        args.append("-")

    totalsize = 0
    for name in args[1:] :
        try :
            totalsize += PDLAnalyzer(name).getJobSize()
        except PDLAnalyzerError :
            sys.stderr.write("ERROR: %s\n" % sys.exc_info()[1])
            sys.stderr.flush()
    print("%s" % totalsize)

if __name__ == "__main__" :
    main()

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pykota / trunk / pykota / pdlanalyzer.py @ 1700

Download in other formats: