Context Navigation

pdlanalyzer.py @ 1700

Revision 1700, 30.9 kB (checked in by jalet, 20 years ago)
Comments
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`

Rev	Line
[1482]	1	# PyKota
	2	# -- coding: ISO-8859-15 --
	3	#
	4	# PyKota - Print Quotas for CUPS and LPRng
	5	#
	6	# (c) 2003-2004 Jerome Alet <alet@librelogiciel.com>
	7	# This program is free software; you can redistribute it and/or modify
	8	# it under the terms of the GNU General Public License as published by
	9	# the Free Software Foundation; either version 2 of the License, or
	10	# (at your option) any later version.
	11	#
	12	# This program is distributed in the hope that it will be useful,
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	# GNU General Public License for more details.
	16	#
	17	# You should have received a copy of the GNU General Public License
	18	# along with this program; if not, write to the Free Software
	19	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	20	#
	21	# $Id$
	22	#
	23	# $Log$
[1700]	24	# Revision 1.39 2004/09/02 23:30:05 jalet
	25	# Comments
	26	#
[1699]	27	# Revision 1.38 2004/09/02 22:08:37 jalet
	28	# First draft of PCL3GUI analyzer.
	29	#
[1698]	30	# Revision 1.37 2004/09/02 21:22:49 jalet
	31	# One more PCL tag
	32	#
[1690]	33	# Revision 1.36 2004/09/01 22:31:49 jalet
	34	# Some more work on ESC/P2 analyzer to avoid missing \r\n sequences. Not
	35	# exactly optimal though...
	36	#
[1686]	37	# Revision 1.35 2004/08/30 23:10:24 jalet
	38	# Improved the ESC/P2 analyzer so that more GhostScript devices are supported
	39	#
[1683]	40	# Revision 1.34 2004/08/27 09:08:22 jalet
	41	# Improvement in PostScript parser to avoid being fooled by clever "students"
	42	#
[1682]	43	# Revision 1.33 2004/08/27 09:02:34 jalet
	44	# Forgot to remove some special debugging code...
	45	#
[1681]	46	# Revision 1.32 2004/08/27 08:58:50 jalet
	47	# Relax checks for PCL5 header to accommodate strange printer drivers
	48	#
[1677]	49	# Revision 1.31 2004/08/22 08:25:33 jalet
	50	# Improved ESC/P2 miniparser thanks to Paulo Silva
	51	#
[1676]	52	# Revision 1.30 2004/08/21 23:16:57 jalet
	53	# First draft of ESC/P2 (mini-)parser.
	54	#
[1675]	55	# Revision 1.29 2004/08/11 16:25:38 jalet
	56	# Fixed index problem in PCLXL parser when retrieving number of copies for
	57	# each page
	58	#
[1674]	59	# Revision 1.28 2004/08/10 23:01:49 jalet
	60	# Fixed number of copies in PCL5 parser
	61	#
[1673]	62	# Revision 1.27 2004/08/09 18:14:22 jalet
	63	# Added workaround for number of copies and some PostScript drivers
	64	#
[1622]	65	# Revision 1.26 2004/07/22 13:49:51 jalet
	66	# Added support for binary PostScript through GhostScript if native DSC
	67	# compliant PostScript analyzer doesn't find any page. This is much
	68	# slower though, so native analyzer is tried first.
	69	#
[1599]	70	# Revision 1.25 2004/07/10 14:06:36 jalet
	71	# Fix for Python2.1 incompatibilities
	72	#
[1591]	73	# Revision 1.24 2004/07/05 21:00:39 jalet
	74	# Fix for number of copies for each page in PCLXL parser
	75	#
[1588]	76	# Revision 1.23 2004/07/03 08:21:59 jalet
	77	# Testsuite for PDL Analyzer added
	78	#
[1580]	79	# Revision 1.22 2004/06/29 14:21:41 jalet
	80	# Smallish optimization
	81	#
[1577]	82	# Revision 1.21 2004/06/28 23:11:26 jalet
	83	# Code de-factorization in PCLXL parser
	84	#
[1576]	85	# Revision 1.20 2004/06/28 22:38:41 jalet
	86	# Increased speed by a factor of 2 in PCLXL parser
	87	#
[1575]	88	# Revision 1.19 2004/06/28 21:20:30 jalet
	89	# PCLXL support now works !
	90	#
[1574]	91	# Revision 1.18 2004/06/27 22:59:37 jalet
	92	# More work on PCLXL parser
	93	#
[1573]	94	# Revision 1.17 2004/06/26 23:20:01 jalet
	95	# Additional speedup for GhostScript generated PCL5 files
	96	#
[1572]	97	# Revision 1.16 2004/06/26 15:31:00 jalet
	98	# mmap reintroduced in PCL5 parser
	99	#
[1570]	100	# Revision 1.15 2004/06/26 14:14:31 jalet
	101	# Now uses Psyco if it is available
	102	#
[1568]	103	# Revision 1.14 2004/06/25 09:50:28 jalet
	104	# More debug info in PCLXL parser
	105	#
[1567]	106	# Revision 1.13 2004/06/25 08:10:08 jalet
	107	# Another fix for PCL5 parser
	108	#
[1566]	109	# Revision 1.12 2004/06/24 23:09:53 jalet
	110	# Fix for number of copies in PCL5 parser
	111	#
[1564]	112	# Revision 1.11 2004/06/23 22:07:50 jalet
	113	# Fixed PCL5 parser according to the sources of rastertohp
	114	#
[1553]	115	# Revision 1.10 2004/06/18 22:24:03 jalet
	116	# Removed old comments
	117	#
[1552]	118	# Revision 1.9 2004/06/18 22:21:27 jalet
	119	# Native PDF parser greatly improved.
	120	# GhostScript based PDF parser completely removed because native code
	121	# is now portable across Python versions.
	122	#
[1551]	123	# Revision 1.8 2004/06/18 20:49:46 jalet
	124	# "ERROR:" prefix added
	125	#
[1550]	126	# Revision 1.7 2004/06/18 17:48:04 jalet
	127	# Added native fast PDF parsing method
	128	#
[1547]	129	# Revision 1.6 2004/06/18 14:00:16 jalet
	130	# Added PDF support in smart PDL analyzer (through GhostScript for now)
	131	#
[1544]	132	# Revision 1.5 2004/06/18 10:09:05 jalet
	133	# Resets file pointer to start of file in all cases
	134	#
[1543]	135	# Revision 1.4 2004/06/18 06:16:14 jalet
	136	# Fixes PostScript detection code for incorrect drivers
	137	#
[1487]	138	# Revision 1.3 2004/05/21 20:40:08 jalet
	139	# All the code for pkpgcounter is now in pdlanalyzer.py
	140	#
[1485]	141	# Revision 1.2 2004/05/19 19:09:36 jalet
	142	# Speed improvement
	143	#
[1482]	144	# Revision 1.1 2004/05/18 09:59:54 jalet
	145	# pkpgcounter is now just a wrapper around the PDLAnalyzer class
	146	#
	147	#
	148	#
	149
	150	import sys
[1547]	151	import os
[1552]	152	import re
[1580]	153	from struct import unpack
[1482]	154	import tempfile
[1572]	155	import mmap
[1622]	156	import popen2
[1482]	157
# Size units, in bytes, used for the chunked reads of the job's data.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE << 10
	160
	161	class PDLAnalyzerError(Exception):
	162	"""An exception for PDL Analyzer related stuff."""
	163	def __init__(self, message = ""):
	164	self.message = message
	165	Exception.__init__(self, message)
	166	def __repr__(self):
	167	return self.message
	168	__str__ = __repr__
	169
class PostScriptAnalyzer:
    """Page counter for PostScript documents."""

    def __init__(self, infile):
        """Initialize PostScript Analyzer.

        infile is the job's data stream; it must support seek(),
        read() and line by line iteration.
        """
        self.infile = infile
        self.copies = 1     # highest number of copies requested so far

    def throughGhostScript(self):
        """Get the count through GhostScript, useful for non-DSC compliant PS files.

        The whole document is sent to a 'gs | grep -c' shell pipeline,
        and the number of "%%HiResBoundingBox:" lines counted by grep
        is taken as the number of pages.

        Returns this number multiplied by self.copies.
        Raises PDLAnalyzerError if the pipeline can't be fed, if its
        answer is not an integer, or if waiting for it fails.
        """
        self.infile.seek(0)
        # The pipe character must reach the shell unescaped : with '\|'
        # the shell hands a literal '|' over to gs as an argument, and
        # grep is never started as the second stage of a pipeline.
        command = 'gs -sDEVICE=bbox -dNOPAUSE -dBATCH -dQUIET - 2>&1 | grep -c "%%HiResBoundingBox:" 2>/dev/null'
        child = popen2.Popen4(command)
        try:
            data = self.infile.read(MEGABYTE)
            while data:
                child.tochild.write(data)
                data = self.infile.read(MEGABYTE)
            child.tochild.flush()
            child.tochild.close()
        except (IOError, OSError):
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")

        try:
            pagecount = int(child.fromchild.readline().strip())
        except (IOError, OSError, AttributeError, ValueError):
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")
        child.fromchild.close()

        try:
            child.wait()
        except OSError:
            raise PDLAnalyzerError("Problem during analysis of Binary PostScript document.")
        return pagecount * self.copies

    def _updateCopies(self, line, index):
        """Raise self.copies to the integer found in field 'index' of line.

        Lines where this field is missing or is not an integer are
        ignored. Only the highest value seen in the document is kept.
        """
        try:
            number = int(line.strip().split()[index])
        except (ValueError, IndexError):
            return
        if number > self.copies:
            self.copies = number

    def natively(self):
        """Count pages in a DSC compliant PostScript document.

        Each line starting with "%%Page: " counts for one page.
        Returns the number of such lines multiplied by the highest
        number of copies found, so 0 means that no page comment was
        found at all.
        """
        self.infile.seek(0)
        pagecount = 0
        # Iterating over the file replaces the deprecated xreadlines().
        for line in self.infile:
            if line.startswith("%%Page: "):
                pagecount += 1
            elif line.startswith("%%BeginNonPPDFeature: NumCopies "):
                # handle # of copies set by some Windows printer driver
                self._updateCopies(line, 2)
            elif line.startswith("1 dict dup /NumCopies "):
                # handle # of copies set by mozilla/kprinter
                self._updateCopies(line, 4)
        return pagecount * self.copies

    def getJobSize(self):
        """Count pages in PostScript document.

        The native DSC analyzer is tried first; GhostScript is only
        used when the native analyzer returns 0.
        """
        return self.natively() or self.throughGhostScript()
	234
class PDFAnalyzer:
    """Page counter for PDF documents."""

    def __init__(self, infile):
        """Initialize PDF Analyzer.

        infile is the job's data stream; it is read line by line.
        """
        self.infile = infile

    def getJobSize(self):
        """Counts pages in a PDF document.

        Every "/Type /Page" or "/Type/Page" marker which is followed
        by a slash, a space, a tab or an end of line character counts
        for one page; this excludes "/Type /Pages". The stream is read
        from its current position.
        """
        regexp = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]")
        pagecount = 0
        # Iterating over the file replaces the deprecated xreadlines().
        for line in self.infile:
            pagecount += len(regexp.findall(line))
        return pagecount
[1547]	247
class ESCP2Analyzer:
    """Page counter for ESC/P2 documents."""

    def __init__(self, infile):
        """Initialize ESC/P2 Analyzer."""
        self.infile = infile

    def getJobSize(self):
        """Counts pages in an ESC/P2 document.

        The whole stream is read at once, then several page markers
        are searched for; the first heuristic which finds something
        gives the result.
        """
        data = self.infile.read()

        # Some software or printer drivers end each page with
        # "\r\fESCAPE" or "\r\n\fESCAPE" : when present, this
        # is the preferred marker.
        formfeeds = max(data.count("\r\f\033"), data.count("\r\n\f\033"))
        if formfeeds:
            return formfeeds

        # ghostscript's stcolor for example seems to output
        # ESC + @ + \f for each page plus one.
        stcolor = data.count("\033@\f")
        if stcolor > 1:
            return stcolor - 1

        # while ghostscript's escp driver outputs \f + ESC + @ instead.
        escp = data.count("\f\033@")
        if escp:
            return escp

        # with Gimpprint, at least, for each page there are
        # two Reset Printer sequences (ESC + @).
        return int(data.count("\033@") / 2)
[1676]	287
class PCLAnalyzer:
    """Page counter for PCL5 documents."""

    def __init__(self, infile):
        """Initialize PCL Analyzer.

        infile must have a file descriptor, because getJobSize()
        maps the file in memory through fileno().
        """
        self.infile = infile

    def getJobSize(self):
        """Count pages in a PCL5 document.

        Should also work for PCL3 and PCL4 documents.

        Algorithm from pclcount
        (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
        published under the terms of the GNU General Public Licence v2.

        Backported from C to Python by Jerome Alet, then enhanced
        with more PCL tags detected. I think all the necessary PCL tags
        are recognized to correctly handle PCL5 files wrt their number
        of pages. The documentation used for this was :

        HP PCL/PJL Reference Set
        PCL5 Printer Language Technical Quick Reference Guide
        http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf
        """
        fd = self.infile.fileno()
        minfile = mmap.mmap(fd, os.fstat(fd)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)

        # For each two characters tag we care about, the characters
        # which may end its numeric argument :
        #
        #   <ESC>*b###W -> Start of a raster data row/block
        #   <ESC>*b###V -> Start of a raster data plane
        #   <ESC>*c###W -> Start of a user defined pattern
        #   <ESC>*i###W -> Start of a viewing illuminant block
        #   <ESC>*l###W -> Start of a color lookup table
        #   <ESC>*m###W -> Start of a download dither matrix block
        #   <ESC>*v###W -> Start of a configure image data block
        #   <ESC>(s###W -> Start of a characters description block
        #   <ESC>)s###W -> Start of a fonts description block
        #   <ESC>(f###W -> Start of a symbol set block
        #   <ESC>&b###W -> Start of configuration data block
        #   <ESC>&l###X -> Number of copies for current page
        #   <ESC>&n###W -> Starts an alphanumeric string ID block
        #   <ESC>&p###X -> Start of a non printable characters block
        #   <ESC>&a2G   -> Back side when duplex mode as generated by rastertohp
        #   <ESC>*g###W -> Needed for planes in PCL3 output
        #   <ESC>&l0H   -> Eject if NumPlanes > 1, as generated by rastertohp
        #
        # "*b" (ends with V or W) is not in the table : it is treated
        # specially in the loop because it occurs very often.
        tagsends = {
            "&n": "W",
            "&b": "W",
            "*i": "W",
            "*l": "W",
            "*m": "W",
            "*v": "W",
            "*c": "W",
            "(f": "W",
            "(s": "W",
            ")s": "W",
            "&p": "X",
            "&l": "XH",
            "&a": "G",  # TODO : 0 means next side, 1 front side, 2 back side
            "*g": "W",
        }

        pagecount = resets = ejects = backsides = 0
        copies = {}     # form feeds seen so far -> number of copies asked
        pos = 0
        try:
            # There's no explicit end of loop : reading past the end
            # of the mapped file raises IndexError.
            while 1:
                char = minfile[pos]
                pos += 1
                if char == "\014":
                    pagecount += 1
                    continue
                if char != "\033":
                    continue

                tagstart = minfile[pos]
                pos += 1
                if tagstart in "E9=YZ":     # one byte PCL tag
                    if tagstart == "E":
                        resets += 1
                    continue                # skip to next tag

                tag = tagstart + minfile[pos]
                pos += 1
                if tag == "*b":
                    tagend = "VW"
                else:
                    tagend = tagsends.get(tag)
                    if tagend is None:
                        continue            # Unsupported PCL tag

                # Now read the numeric argument, up to and including
                # the first character which is not a digit.
                size = 0
                while 1:
                    terminator = minfile[pos]
                    pos += 1
                    if not terminator.isdigit():
                        break
                    size = (size * 10) + int(terminator)
                if terminator not in tagend:
                    continue

                if (tag == "&l") and (terminator == "X"):
                    # copies for current page
                    copies[pagecount] = size
                elif (tag == "&l") and (terminator == "H") and (size == 0):
                    ejects += 1             # Eject
                elif (tag == "&a") and (size == 2):
                    backsides += 1          # Back side in duplex mode
                else:
                    # we just ignore the block.
                    if tag == "&n":
                        # we have to take care of the operation id byte
                        # which is before the string itself
                        size += 1
                    pos += size
        except IndexError:
            minfile.close()                 # reached EOF

        if pagecount:
            # here we add counters for other ways new pages may have
            # been printed and ejected by the printer
            pagecount += ejects + backsides
        elif resets > 2:
            # No form feed at all : we use the number of resets instead.
            # The number of resets is always at least 2 with a valid
            # pcl file : one at the very start and one at the very end
            # of the job's data. And since on our test data we needed
            # to subtract 1 more, we finally subtract 3. With less
            # than 3 resets the file is probably not a valid PCL file,
            # so the count stays at 0.
            pagecount = resets - 3

        # now handle number of copies for each page (may differ).
        # in duplex mode, number of copies may be sent only once.
        total = pagecount
        for pnum in range(pagecount):
            # if no number of copies defined, take the preceding one
            # else the one set before any page else 1.
            default = copies.get(0, 1)
            previous = copies.get(pnum - 1, default)
            total += copies.get(pnum, previous) - 1
        return total
	418
class PCL3GUIAnalyzer:
    """Page counter for PCL3GUI documents."""

    def __init__(self, infile):
        """Initialize PCL3GUI Analyzer."""
        self.infile = infile

    def getJobSize(self):
        """Count pages in a PCL3GUI document.

        Not much documentation available, so we will count occurrences
        of <ESC>*r1A which is start of graphical data.

        This is FAR from being accurate. PCL3 resembles PCL5 in fact,
        and PCL parser should be made better, but some documentation
        definitely lacks.
        """
        data = self.infile.read()
        # The marker must include the '*' : the sequence is <ESC>*r1A,
        # as said above, and not <ESC>r1A.
        # TODO : Allowed values 0, 1, 2, 3 after r
        return data.count("\033*r1A")
	437
class PCLXLAnalyzer:
    """Page counter for PCLXL (aka PCL6) documents."""

    def __init__(self, infile):
        """Initialize PCLXL Analyzer.

        Reads infile line by line up to the stream header, which is
        the first line whose characters 1 to 11 are " HP-PCL XL;".
        The first character of this line gives the byte order : 0x29
        for little endian, 0x28 for big endian.

        Raises PDLAnalyzerError if there's no such line, or if the
        byte order marker is unknown.
        """
        self.infile = infile
        self.endianness = None
        while 1:
            line = self.infile.readline()
            if not line:
                raise PDLAnalyzerError("This file doesn't seem to be PCLXL (aka PCL6)")
            if line[1:12] == " HP-PCL XL;":
                break

        marker = ord(line[0])
        if marker == 0x29:
            self.littleEndian()
        elif marker == 0x28:
            self.bigEndian()
        # elif marker == 0x27 : TODO : What can we do here ?
        else:
            raise PDLAnalyzerError("Unknown endianness marker 0x%02x at start !" % marker)

        # Table of tags, indexed by the tag's byte value. An entry is
        # either 0 (nothing to do), a number of bytes to skip after
        # the tag, or a method to call which returns the number of
        # bytes to skip.
        tags = [0] * 256

        # GhostScript's sources tell us that HP printers
        # only accept little endianness, but we can handle both.
        tags[0x28] = self.bigEndian         # BigEndian
        tags[0x29] = self.littleEndian      # LittleEndian

        tags[0x43] = self.beginPage         # BeginPage
        tags[0x44] = self.endPage           # EndPage

        fixedsizes = (
            (0xc0, 1),      # ubyte
            (0xc1, 2),      # uint16
            (0xc2, 4),      # uint32
            (0xc3, 2),      # sint16
            (0xc4, 4),      # sint32
            (0xc5, 4),      # real32
            (0xd0, 2),      # ubyte_xy
            (0xd1, 4),      # uint16_xy
            (0xd2, 8),      # uint32_xy
            (0xd3, 4),      # sint16_xy
            (0xd4, 8),      # sint32_xy
            (0xd5, 8),      # real32_xy
            (0xe0, 4),      # ubyte_box
            (0xe1, 8),      # uint16_box
            (0xe2, 16),     # uint32_box
            (0xe3, 8),      # sint16_box
            (0xe4, 16),     # sint32_box
            (0xe5, 16),     # real32_box
            (0xf8, 1),      # attr_ubyte
            (0xf9, 2),      # attr_uint16
        )
        for (code, size) in fixedsizes:
            tags[code] = size

        tags[0xc8] = self.array_8           # ubyte_array
        tags[0xc9] = self.array_16          # uint16_array
        tags[0xca] = self.array_32          # uint32_array
        tags[0xcb] = self.array_16          # sint16_array
        tags[0xcc] = self.array_32          # sint32_array
        tags[0xcd] = self.array_32          # real32_array

        tags[0xfa] = self.embeddedData      # dataLength
        tags[0xfb] = self.embeddedDataSmall # dataLengthByte

        self.tags = tags

    def beginPage(self):
        """Indicates the beginning of a new page."""
        self.pagecount += 1
        return 0

    def endPage(self):
        """Indicates the end of a page."""
        here = self.pos
        data = self.minfile
        if (ord(data[here - 3]) == 0xf8) and (ord(data[here - 2]) == 0x31):
            # The EndPage operator is preceded by a PageCopies attribute
            # So set number of copies for current page.
            # From what I read in PCLXL documentation, the number
            # of copies is an unsigned 16 bits integer
            (nbcopies,) = unpack(self.endianness + "H", data[here - 5:here - 3])
            self.copies[self.pagecount] = nbcopies
        return 0

    def array_8(self):
        """Handles arrays of 1 byte items.

        The byte at self.pos tells how the number of items is stored.
        Returns the number of bytes used by the items, and leaves
        self.pos just after the number of items.
        """
        start = self.pos + 1
        width = self.tags[ord(self.minfile[self.pos])]
        if callable(width):
            self.pos = start
            width = width()
            start = self.pos
        end = start + width
        self.pos = end
        if width == 1:
            return unpack("B", self.minfile[start:end])[0]
        if width == 2:
            return unpack(self.endianness + "H", self.minfile[start:end])[0]
        if width == 4:
            return unpack(self.endianness + "I", self.minfile[start:end])[0]
        raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_16(self):
        """Handles arrays of 2 bytes items.

        Same as array_8(), but each item uses 2 bytes.
        """
        start = self.pos + 1
        width = self.tags[ord(self.minfile[self.pos])]
        if callable(width):
            self.pos = start
            width = width()
            start = self.pos
        end = start + width
        self.pos = end
        if width == 1:
            return 2 * unpack("B", self.minfile[start:end])[0]
        if width == 2:
            return 2 * unpack(self.endianness + "H", self.minfile[start:end])[0]
        if width == 4:
            return 2 * unpack(self.endianness + "I", self.minfile[start:end])[0]
        raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_32(self):
        """Handles arrays of 4 bytes items.

        Same as array_8(), but each item uses 4 bytes.
        """
        start = self.pos + 1
        width = self.tags[ord(self.minfile[self.pos])]
        if callable(width):
            self.pos = start
            width = width()
            start = self.pos
        end = start + width
        self.pos = end
        if width == 1:
            return 4 * unpack("B", self.minfile[start:end])[0]
        if width == 2:
            return 4 * unpack(self.endianness + "H", self.minfile[start:end])[0]
        if width == 4:
            return 4 * unpack(self.endianness + "I", self.minfile[start:end])[0]
        raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def embeddedDataSmall(self):
        """Handle small amounts of data.

        The length of the data is the value of the byte at self.pos.
        """
        size = ord(self.minfile[self.pos])
        self.pos += 1
        return size

    def embeddedData(self):
        """Handle normal amounts of data.

        The length of the data is the unsigned 32 bits integer
        stored at self.pos.
        """
        start = self.pos
        self.pos = start + 4
        return unpack(self.endianness + "I", self.minfile[start:self.pos])[0]

    def littleEndian(self):
        """Toggles to little endianness."""
        self.endianness = "<"   # struct's prefix for little endian
        return 0

    def bigEndian(self):
        """Toggles to big endianness."""
        self.endianness = ">"   # struct's prefix for big endian
        return 0

    def getJobSize(self):
        """Counts pages in a PCLXL (PCL6) document.

        Algorithm by Jerome Alet.

        The documentation used for this was :

        HP PCL XL Feature Reference
        Protocol Class 2.0
        http://www.hpdevelopersolutions.com/downloads/64/358/xl_ref20r22.pdf
        """
        fd = self.infile.fileno()
        self.copies = {}
        self.minfile = minfile = mmap.mmap(fd, os.fstat(fd)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        tags = self.tags
        self.pagecount = 0
        # Parsing starts where the constructor stopped reading.
        self.pos = pos = self.infile.tell()
        try:
            # There's no explicit end of loop : reading past the end
            # of the mapped file raises IndexError.
            while 1:
                action = tags[ord(minfile[pos])]
                pos += 1
                if callable(action):
                    # Tag methods read and update self.pos, then
                    # return the number of bytes to skip.
                    self.pos = pos
                    skip = action()
                    pos = self.pos + skip
                elif action:
                    pos += action
        except IndexError:
            self.minfile.close()    # reached EOF

        # now handle number of copies for each page (may differ).
        # if no number of copies defined, take 1, as explained
        # in PCLXL documentation.
        # NB : if number of copies is 0, the page won't be output
        # but the formula below is still correct : we want
        # to decrease the total number of pages in this case.
        total = self.pagecount
        for pnum in range(1, self.pagecount + 1):
            total += self.copies.get(pnum, 1) - 1
        self.pagecount = total
        return total
[1487]	653
class PDLAnalyzer:
    """Generic PDL Analyzer class."""

    def __init__(self, filename):
        """Initializes the PDL analyzer.

        filename is the name of the file or '-' for stdin.
        filename can also be a file-like object which
        supports read() and seek().
        """
        self.filename = filename
        try:
            import psyco
        except ImportError:
            pass    # Psyco is not installed
        else:
            # Psyco is installed, tell it to compile
            # the CPU intensive methods : PCL and PCLXL
            # parsing will greatly benefit from this,
            # for PostScript and PDF the difference is
            # barely noticeable since they are already
            # almost optimal, and much more speedy anyway.
            psyco.bind(PostScriptAnalyzer.getJobSize)
            psyco.bind(PDFAnalyzer.getJobSize)
            psyco.bind(ESCP2Analyzer.getJobSize)
            psyco.bind(PCLAnalyzer.getJobSize)
            psyco.bind(PCLXLAnalyzer.getJobSize)

    def getJobSize(self):
        """Returns the job's size.

        Raises PDLAnalyzerError if the file format is unknown.
        closeFile() is always called once the file was opened,
        whatever happens during the analysis.
        """
        self.openFile()
        try:
            try:
                pdlhandler = self.detectPDLHandler()
            except PDLAnalyzerError:
                # sys.exc_info() gives the current exception with
                # any version of Python.
                msg = sys.exc_info()[1]
                raise PDLAnalyzerError("ERROR : Unknown file format for %s (%s)" % (self.filename, msg))
            return pdlhandler(self.infile).getJobSize()
        finally:
            self.closeFile()

    def openFile(self):
        """Opens the job's data stream for reading.

        A named file is opened directly, and will be closed by
        closeFile(). A file-like object or the standard input is
        copied into a temporary file which becomes self.infile;
        closeFile() doesn't close this one.
        """
        self.mustclose = 0  # by default we don't want to close the file when finished
        if hasattr(self.filename, "read") and hasattr(self.filename, "seek"):
            # filename is in fact a file-like object
            source = self.filename
        elif self.filename == "-":
            # we must read from stdin
            source = sys.stdin
        else:
            # normal file
            self.infile = open(self.filename, "rb")
            self.mustclose = 1
            return

        # Use a temporary file, always seekable contrary to standard input.
        self.infile = tempfile.TemporaryFile(mode="w+b")
        while 1:
            data = source.read(MEGABYTE)
            if not data:
                break
            self.infile.write(data)
        self.infile.flush()
        self.infile.seek(0)

    def closeFile(self):
        """Closes the job's data stream if we can close it."""
        if self.mustclose:
            self.infile.close()
            return
        # if we don't have to close the file, then
        # ensure the file pointer is reset to the
        # start of the file in case the process wants
        # to read the file again.
        try:
            self.infile.seek(0)
        except (IOError, OSError, ValueError):
            pass    # can't seek : nothing more can be done

    def isPostScript(self, data):
        """Returns 1 if data is PostScript, else 0."""
        if data.startswith("%!") \
           or data.startswith("\004%!") \
           or data.startswith("\033%-12345X%!PS"):
            return 1
        if data.find("%!PS-Adobe") != -1:
            return 1
        if data[:128].find("\033%-12345X") == -1:
            return 0
        # The header is there : look for the language switch,
        # with the spellings used by incorrect drivers as well.
        for language in ("LANGUAGE=POSTSCRIPT",
                         "LANGUAGE = POSTSCRIPT",
                         "LANGUAGE = Postscript"):
            if data.find(language) != -1:
                return 1
        return 0

    def isPDF(self, data):
        """Returns 1 if data is PDF, else 0."""
        if data.startswith("%PDF-") or data.startswith("\033%-12345X%PDF-"):
            return 1
        if data.find("%PDF-") != -1:
            return 1
        if (data[:128].find("\033%-12345X") != -1) \
           and (data.upper().find("LANGUAGE=PDF") != -1):
            return 1
        return 0

    def isPCL(self, data):
        """Returns 1 if data is PCL, else 0.

        The check is relaxed : the "\\033%-12345X" header anywhere
        in the first 128 characters is enough.
        """
        if data.startswith("\033E\033"):
            return 1
        if data[:128].find("\033%-12345X") != -1:
            return 1
        return 0

    def isPCL3GUI(self, data):
        """Returns 1 if data is PCL3GUI, else 0."""
        if data.find("@PJL ENTER LANGUAGE=PCL3GUI") == -1:
            return 0
        return 1

    def isPCLXL(self, data):
        """Returns 1 if data is PCLXL aka PCL6, else 0."""
        if data[:128].find("\033%-12345X") == -1:
            return 0
        if data.find(" HP-PCL XL;") == -1:
            return 0
        if (data.find("LANGUAGE=PCLXL") != -1) \
           or (data.find("LANGUAGE = PCLXL") != -1):
            return 1
        return 0

    def isESCP2(self, data):
        """Returns 1 if data is ESC/P2, else 0."""
        for start in ("\033@", "\033*", "\n\033@"):
            if data.startswith(start):
                return 1
        return 0

    def detectPDLHandler(self):
        """Tries to autodetect the document format.

        Only the first 4 kilobytes of data are examined, and the file
        pointer is reset to the start of the file afterwards.

        Returns the correct PDL handler class.
        Raises PDLAnalyzerError if the format is unknown.
        """
        # Try to detect file type by reading first block of data
        self.infile.seek(0)
        firstblock = self.infile.read(4 * KILOBYTE)
        self.infile.seek(0)
        # The order of the tests matters : isPCL() accepts any data
        # containing the header that other formats may also begin with.
        if self.isPostScript(firstblock):
            return PostScriptAnalyzer
        if self.isPCLXL(firstblock):
            return PCLXLAnalyzer
        if self.isPDF(firstblock):
            return PDFAnalyzer
        if self.isPCL3GUI(firstblock):
            return PCL3GUIAnalyzer
        if self.isPCL(firstblock):
            return PCLAnalyzer
        if self.isESCP2(firstblock):
            return ESCP2Analyzer
        raise PDLAnalyzerError("Analysis of first data block failed.")
	816
	817	def main() :
	818	"""Entry point for PDL Analyzer."""
	819	if (len(sys.argv) < 2) or ((not sys.stdin.isatty()) and ("-" not in sys.argv[1:])) :
	820	sys.argv.append("-")
	821
	822	totalsize = 0
	823	for arg in sys.argv[1:] :
	824	try :
	825	parser = PDLAnalyzer(arg)
	826	totalsize += parser.getJobSize()
	827	except PDLAnalyzerError, msg :
[1551]	828	sys.stderr.write("ERROR: %s\n" % msg)
[1487]	829	sys.stderr.flush()
	830	print "%s" % totalsize
	831
	832	if __name__ == "__main__" :
[1577]	833	main()

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pykota / trunk / pykota / pdlanalyzer.py @ 1700

Download in other formats: