Context Navigation

pdlanalyzer.py @ 1622

Revision 1622, 26.1 kB (checked in by jalet, 20 years ago)
Added support for binary PostScript through GhostScript if the native DSC-compliant PostScript analyzer doesn't find any page. This is much slower though, so the native analyzer is tried first.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`

Rev	Line
[1482]	1	# PyKota
	2	# -- coding: ISO-8859-15 --
	3	#
	4	# PyKota - Print Quotas for CUPS and LPRng
	5	#
	6	# (c) 2003-2004 Jerome Alet <alet@librelogiciel.com>
	7	# This program is free software; you can redistribute it and/or modify
	8	# it under the terms of the GNU General Public License as published by
	9	# the Free Software Foundation; either version 2 of the License, or
	10	# (at your option) any later version.
	11	#
	12	# This program is distributed in the hope that it will be useful,
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	# GNU General Public License for more details.
	16	#
	17	# You should have received a copy of the GNU General Public License
	18	# along with this program; if not, write to the Free Software
	19	# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
	20	#
	21	# $Id$
	22	#
	23	# $Log$
[1622]	24	# Revision 1.26 2004/07/22 13:49:51 jalet
	25	# Added support for binary PostScript through GhostScript if native DSC
	26	# compliant PostScript analyzer doesn't find any page. This is much
	27	# slower though, so native analyzer is tried first.
	28	#
[1599]	29	# Revision 1.25 2004/07/10 14:06:36 jalet
	30	# Fix for Python2.1 incompatibilities
	31	#
[1591]	32	# Revision 1.24 2004/07/05 21:00:39 jalet
	33	# Fix for number of copies for each page in PCLXL parser
	34	#
[1588]	35	# Revision 1.23 2004/07/03 08:21:59 jalet
	36	# Testsuite for PDL Analyzer added
	37	#
[1580]	38	# Revision 1.22 2004/06/29 14:21:41 jalet
	39	# Smallish optimization
	40	#
[1577]	41	# Revision 1.21 2004/06/28 23:11:26 jalet
	42	# Code de-factorization in PCLXL parser
	43	#
[1576]	44	# Revision 1.20 2004/06/28 22:38:41 jalet
	45	# Increased speed by a factor of 2 in PCLXL parser
	46	#
[1575]	47	# Revision 1.19 2004/06/28 21:20:30 jalet
	48	# PCLXL support now works !
	49	#
[1574]	50	# Revision 1.18 2004/06/27 22:59:37 jalet
	51	# More work on PCLXL parser
	52	#
[1573]	53	# Revision 1.17 2004/06/26 23:20:01 jalet
	54	# Additionnal speedup for GhostScript generated PCL5 files
	55	#
[1572]	56	# Revision 1.16 2004/06/26 15:31:00 jalet
	57	# mmap reintroduced in PCL5 parser
	58	#
[1570]	59	# Revision 1.15 2004/06/26 14:14:31 jalet
	60	# Now uses Psyco if it is available
	61	#
[1568]	62	# Revision 1.14 2004/06/25 09:50:28 jalet
	63	# More debug info in PCLXL parser
	64	#
[1567]	65	# Revision 1.13 2004/06/25 08:10:08 jalet
	66	# Another fix for PCL5 parser
	67	#
[1566]	68	# Revision 1.12 2004/06/24 23:09:53 jalet
	69	# Fix for number of copies in PCL5 parser
	70	#
[1564]	71	# Revision 1.11 2004/06/23 22:07:50 jalet
	72	# Fixed PCL5 parser according to the sources of rastertohp
	73	#
[1553]	74	# Revision 1.10 2004/06/18 22:24:03 jalet
	75	# Removed old comments
	76	#
[1552]	77	# Revision 1.9 2004/06/18 22:21:27 jalet
	78	# Native PDF parser greatly improved.
	79	# GhostScript based PDF parser completely removed because native code
	80	# is now portable across Python versions.
	81	#
[1551]	82	# Revision 1.8 2004/06/18 20:49:46 jalet
	83	# "ERROR:" prefix added
	84	#
[1550]	85	# Revision 1.7 2004/06/18 17:48:04 jalet
	86	# Added native fast PDF parsing method
	87	#
[1547]	88	# Revision 1.6 2004/06/18 14:00:16 jalet
	89	# Added PDF support in smart PDL analyzer (through GhostScript for now)
	90	#
[1544]	91	# Revision 1.5 2004/06/18 10:09:05 jalet
	92	# Resets file pointer to start of file in all cases
	93	#
[1543]	94	# Revision 1.4 2004/06/18 06:16:14 jalet
	95	# Fixes PostScript detection code for incorrect drivers
	96	#
[1487]	97	# Revision 1.3 2004/05/21 20:40:08 jalet
	98	# All the code for pkpgcounter is now in pdlanalyzer.py
	99	#
[1485]	100	# Revision 1.2 2004/05/19 19:09:36 jalet
	101	# Speed improvement
	102	#
[1482]	103	# Revision 1.1 2004/05/18 09:59:54 jalet
	104	# pkpgcounter is now just a wrapper around the PDLAnalyzer class
	105	#
	106	#
	107	#
	108
	109	import sys
[1547]	110	import os
[1552]	111	import re
[1580]	112	from struct import unpack
[1482]	113	import tempfile
[1572]	114	import mmap
[1622]	115	import popen2
[1482]	116
# Block sizes, in bytes, used by the analyzers when reading input data.
KILOBYTE = 1 << 10
MEGABYTE = KILOBYTE << 10
	119
	120	class PDLAnalyzerError(Exception):
	121	"""An exception for PDL Analyzer related stuff."""
	122	def __init__(self, message = ""):
	123	self.message = message
	124	Exception.__init__(self, message)
	125	def __repr__(self):
	126	return self.message
	127	__str__ = __repr__
	128
class PostScriptAnalyzer :
    """Counts the pages of a PostScript document."""
    def __init__(self, infile) :
        """Initialize PostScript Analyzer.

        infile is a seekable file object holding the document.
        """
        self.infile = infile

    def throughGhostScript(self) :
        """Get the count through GhostScript, useful for non-DSC compliant PS files.

        The whole document is fed to 'gs' with the bbox device and the
        number of "%%HiResBoundingBox:" lines it emits is counted by grep.

        Returns the page count as an integer.
        Raises PDLAnalyzerError if feeding the child, reading its answer
        or waiting for it fails.
        """
        errormessage = "Problem during analysis of Binary PostScript document."
        self.infile.seek(0)
        # The pipe must reach the shell unescaped : a backslash-escaped pipe
        # would be handed to gs as a literal argument and grep would never run.
        command = 'gs -sDEVICE=bbox -dNOPAUSE -dBATCH -dQUIET - 2>&1 | grep -c "%%HiResBoundingBox:" 2>/dev/null'
        child = popen2.Popen4(command)
        completed = 0
        try :
            try :
                data = self.infile.read(MEGABYTE)
                while data :
                    child.tochild.write(data)
                    data = self.infile.read(MEGABYTE)
                child.tochild.flush()
                # closing our end signals EOF so that gs terminates
                child.tochild.close()
            except (IOError, OSError) :
                raise PDLAnalyzerError(errormessage)

            try :
                pagecount = int(child.fromchild.readline().strip())
            except (IOError, OSError, AttributeError, ValueError) :
                raise PDLAnalyzerError(errormessage)
            child.fromchild.close()

            try :
                child.wait()
            except OSError :
                raise PDLAnalyzerError(errormessage)
            completed = 1
        finally :
            if not completed :
                # don't leave open pipes or an unreaped child behind us
                self._discardChild(child)
        return pagecount

    def _discardChild(self, child) :
        """Best effort cleanup of a failed child : closes both pipes then reaps it."""
        for pipe in (child.tochild, child.fromchild) :
            try :
                pipe.close()
            except (IOError, OSError) :
                pass
        try :
            child.wait()
        except OSError :
            pass

    def natively(self) :
        """Count pages in a DSC compliant PostScript document.

        Returns the number of lines starting with "%%Page: ".
        """
        self.infile.seek(0)
        pagecount = 0
        # direct iteration replaces the deprecated xreadlines()
        for line in self.infile :
            if line.startswith("%%Page: ") :
                pagecount += 1
        return pagecount

    def getJobSize(self) :
        """Count pages in PostScript document.

        The fast native DSC scan is tried first; GhostScript is only
        used when that scan doesn't find any page.
        """
        return self.natively() or self.throughGhostScript()
	174
class PDFAnalyzer :
    """Counts the pages of a PDF document."""
    def __init__(self, infile) :
        """Initialize PDF Analyzer.

        infile is a file object holding the document; it is read
        from its current position.
        """
        self.infile = infile

    def getJobSize(self) :
        """Counts pages in a PDF document.

        Returns the number of "/Type /Page" occurrences, with at most one
        space between both names, which are followed by a delimiter.
        The trailing delimiter keeps "/Type /Pages" from being counted.
        """
        regexp = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]")
        pagecount = 0
        # direct iteration replaces the deprecated xreadlines()
        for line in self.infile :
            pagecount += len(regexp.findall(line))
        return pagecount
[1547]	187
class PCLAnalyzer :
    """Counts the pages of a PCL3/4/5 document."""
    def __init__(self, infile) :
        """Initialize PCL Analyzer.

        infile must be a real file object : getJobSize() needs its fileno().
        """
        self.infile = infile

    def getJobSize(self) :
        """Count pages in a PCL5 document.

        Should also work for PCL3 and PCL4 documents.

        Algorithm from pclcount
        (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
        published under the terms of the GNU General Public Licence v2.

        Backported from C to Python by Jerome Alet, then enhanced
        with more PCL tags detected. I think all the necessary PCL tags
        are recognized to correctly handle PCL5 files wrt their number
        of pages. The documentation used for this was :

        HP PCL/PJL Reference Set
        PCL5 Printer Language Technical Quick Reference Guide
        http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf

        Returns the number of pages, copies included, as an integer.
        """
        # The whole file is memory mapped read-only and scanned byte by byte.
        infileno = self.infile.fileno()
        minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        # Maps each supported two characters tag to the characters which
        # may terminate its numeric argument.
        tagsends = { "&n" : "W",
                     "&b" : "W",
                     "*i" : "W",
                     "*l" : "W",
                     "*m" : "W",
                     "*v" : "W",
                     "*c" : "W",
                     "(f" : "W",
                     "(s" : "W",
                     ")s" : "W",
                     "&p" : "X",
                     "&l" : "XH",
                     "&a" : "G",
                     # "*b" : "VW", # treated specially because it occurs very often
                   }
        pagecount = resets = ejects = backsides = 0
        tag = None
        copies = {}     # number of copies, keyed by the count of form feeds seen so far
        pos = 0
        try :
            # There's no explicit exit from this loop : indexing past the
            # end of the mapping raises IndexError, which ends the scan.
            while 1 :
                char = minfile[pos] ; pos += 1
                if char == "\014" :
                    # form feed : one more page
                    pagecount += 1
                elif char == "\033" :
                    #
                    # <ESC>*b###W -> Start of a raster data row/block
                    # <ESC>*b###V -> Start of a raster data plane
                    # <ESC>*c###W -> Start of a user defined pattern
                    # <ESC>*i###W -> Start of a viewing illuminant block
                    # <ESC>*l###W -> Start of a color lookup table
                    # <ESC>*m###W -> Start of a download dither matrix block
                    # <ESC>*v###W -> Start of a configure image data block
                    # <ESC>(s###W -> Start of a characters description block
                    # <ESC>)s###W -> Start of a fonts description block
                    # <ESC>(f###W -> Start of a symbol set block
                    # <ESC>&b###W -> Start of configuration data block
                    # <ESC>&l###X -> Number of copies for current page
                    # <ESC>&n###W -> Starts an alphanumeric string ID block
                    # <ESC>&p###X -> Start of a non printable characters block
                    # <ESC>&a2G -> Back side when duplex mode as generated by rastertohp
                    # <ESC>&l0H -> Eject if NumPlanes > 1, as generated by rastertohp
                    #
                    tagstart = minfile[pos] ; pos += 1
                    if tagstart in "E9=YZ" : # one byte PCL tag
                        if tagstart == "E" :
                            resets += 1
                        continue # skip to next tag
                    tag = tagstart + minfile[pos] ; pos += 1
                    if tag == "*b" :
                        tagend = "VW"
                    else :
                        try :
                            tagend = tagsends[tag]
                        except KeyError :
                            continue # Unsupported PCL tag
                    # Now read the numeric argument
                    size = 0
                    while 1 :
                        char = minfile[pos] ; pos += 1
                        if not char.isdigit() :
                            break
                        size = (size * 10) + int(char)
                    # Here char is the first non digit character, which was
                    # already consumed : nothing else is done with it when it
                    # isn't one of the expected terminators.
                    if char in tagend :
                        if (tag == "&l") and (char == "X") : # copies for current page
                            copies[pagecount] = size
                        elif (tag == "&l") and (char == "H") and (size == 0) :
                            ejects += 1 # Eject
                        elif (tag == "&a") and (size == 2) :
                            backsides += 1 # Back side in duplex mode
                        else :
                            # we just ignore the block.
                            # NOTE(review): <ESC>&l#H with a non zero value and
                            # <ESC>&a#G with a value other than 2 also end up
                            # here and skip 'size' bytes - confirm this is intended.
                            if tag == "&n" :
                                # we have to take care of the operation id byte
                                # which is before the string itself
                                size += 1
                            pos += size
        except IndexError : # EOF ?
            minfile.close() # reached EOF

        # if pagecount is still 0, we will use the number
        # of resets instead of the number of form feed characters.
        # but the number of resets is always at least 2 with a valid
        # pcl file : one at the very start and one at the very end
        # of the job's data. So we subtract 2 from the number of
        # resets. And since on our test data we needed to subtract
        # 1 more, we finally subtract 3, and will test several
        # PCL files with this. If resets <= 2, the expression
        # below evaluates to 0.
        if not pagecount :
            pagecount = (pagecount or ((resets - 3) * (resets > 2)))
        else :
            # here we add counters for other ways new pages may have
            # been printed and ejected by the printer
            pagecount += ejects + backsides

        # now handle number of copies for each page (may differ).
        # in duplex mode, number of copies may be sent only once.
        # range() is evaluated once, so growing pagecount inside the
        # loop doesn't change the number of iterations.
        for pnum in range(pagecount) :
            # if no number of copies defined, take the preceding one else 1.
            nb = copies.get(pnum, copies.get(pnum-1, 1))
            pagecount += (nb - 1)
        return pagecount
	316
class PCLXLAnalyzer :
    """Counts the pages of a PCLXL (aka PCL6) document."""
    def __init__(self, infile) :
        """Initialize PCLXL Analyzer.

        Reads infile line by line up to the " HP-PCL XL;" stream header,
        whose first byte gives the endianness, then builds the table of tags.
        infile is left positioned just after the header line, which is
        where getJobSize() starts parsing.

        Raises PDLAnalyzerError if no stream header is found or if its
        endianness marker is unknown.
        """
        self.infile = infile
        self.endianness = None
        found = 0
        while not found :
            line = self.infile.readline()
            if not line :
                break
            if line[1:12] == " HP-PCL XL;" :
                found = 1
                endian = ord(line[0])
                if endian == 0x29 :
                    self.littleEndian()
                elif endian == 0x28 :
                    self.bigEndian()
                # elif endian == 0x27 : TODO : What can we do here ?
                #
                else :
                    raise PDLAnalyzerError("Unknown endianness marker 0x%02x at start !" % endian)
        if not found :
            raise PDLAnalyzerError("This file doesn't seem to be PCLXL (aka PCL6)")

        # Initialize table of tags. For each possible byte value an entry is :
        #   0             -> nothing to skip after this tag
        #   an integer    -> number of bytes to skip after this tag
        #   a bound method -> called with self.pos set just after the tag,
        #                     may move self.pos, and returns the number
        #                     of bytes to skip
        self.tags = [ 0 ] * 256

        # GhostScript's sources tell us that HP printers
        # only accept little endianness, but we can handle both.
        self.tags[0x28] = self.bigEndian # BigEndian
        self.tags[0x29] = self.littleEndian # LittleEndian

        self.tags[0x43] = self.beginPage # BeginPage
        self.tags[0x44] = self.endPage # EndPage

        self.tags[0xc0] = 1 # ubyte
        self.tags[0xc1] = 2 # uint16
        self.tags[0xc2] = 4 # uint32
        self.tags[0xc3] = 2 # sint16
        self.tags[0xc4] = 4 # sint32
        self.tags[0xc5] = 4 # real32

        self.tags[0xc8] = self.array_8 # ubyte_array
        self.tags[0xc9] = self.array_16 # uint16_array
        self.tags[0xca] = self.array_32 # uint32_array
        self.tags[0xcb] = self.array_16 # sint16_array
        self.tags[0xcc] = self.array_32 # sint32_array
        self.tags[0xcd] = self.array_32 # real32_array

        self.tags[0xd0] = 2 # ubyte_xy
        self.tags[0xd1] = 4 # uint16_xy
        self.tags[0xd2] = 8 # uint32_xy
        self.tags[0xd3] = 4 # sint16_xy
        self.tags[0xd4] = 8 # sint32_xy
        self.tags[0xd5] = 8 # real32_xy

        self.tags[0xe0] = 4 # ubyte_box
        self.tags[0xe1] = 8 # uint16_box
        self.tags[0xe2] = 16 # uint32_box
        self.tags[0xe3] = 8 # sint16_box
        self.tags[0xe4] = 16 # sint32_box
        self.tags[0xe5] = 16 # real32_box

        self.tags[0xf8] = 1 # attr_ubyte
        self.tags[0xf9] = 2 # attr_uint16

        self.tags[0xfa] = self.embeddedData # dataLength
        self.tags[0xfb] = self.embeddedDataSmall # dataLengthByte

    def beginPage(self) :
        """Indicates the beginning of a new page."""
        self.pagecount += 1
        return 0

    def endPage(self) :
        """Indicates the end of a page.

        If the operator is directly preceded by a PageCopies attribute,
        records the number of copies for the current page, keyed by the
        current (1-based) page count.
        """
        pos = self.pos
        minfile = self.minfile
        if (ord(minfile[pos-3]) == 0xf8) and (ord(minfile[pos-2]) == 0x31) :
            # The EndPage operator is preceded by a PageCopies attribute
            # So set number of copies for current page.
            # From what I read in PCLXL documentation, the number
            # of copies is an unsigned 16 bits integer
            self.copies[self.pagecount] = unpack(self.endianness + "H", minfile[pos-5:pos-3])[0]
        return 0

    # NB : the three array handlers below are intentionally not factorized.

    def array_8(self) :
        """Handles arrays of 1 byte elements.

        Reads the data type tag and the number of elements which follow
        the array tag, and returns the size in bytes of the array's data.
        Raises PDLAnalyzerError if the number of elements isn't stored
        on 1, 2 or 4 bytes.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_16(self) :
        """Handles arrays of 2 bytes elements.

        Same as array_8(), but each element takes 2 bytes.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return 2 * unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return 2 * unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return 2 * unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def array_32(self) :
        """Handles arrays of 4 bytes elements.

        Same as array_8(), but each element takes 4 bytes.
        """
        pos = self.pos
        datatype = self.minfile[pos]
        pos += 1
        length = self.tags[ord(datatype)]
        if callable(length) :
            self.pos = pos
            length = length()
            pos = self.pos
        posl = pos + length
        self.pos = posl
        if length == 1 :
            return 4 * unpack("B", self.minfile[pos:posl])[0]
        elif length == 2 :
            return 4 * unpack(self.endianness + "H", self.minfile[pos:posl])[0]
        elif length == 4 :
            return 4 * unpack(self.endianness + "I", self.minfile[pos:posl])[0]
        else :
            raise PDLAnalyzerError("Error on array size at %s" % self.pos)

    def embeddedDataSmall(self) :
        """Handle small amounts of data.

        The length of the data is stored in the single next byte.
        Returns this length, which is the number of bytes to skip.
        """
        pos = self.pos
        length = ord(self.minfile[pos])
        self.pos = pos + 1
        return length

    def embeddedData(self) :
        """Handle normal amounts of data.

        The length of the data is stored in the next 4 bytes as an unsigned
        integer. Returns this length, which is the number of bytes to skip.
        """
        pos = self.pos
        pos4 = pos + 4
        self.pos = pos4
        return unpack(self.endianness + "I", self.minfile[pos:pos4])[0]

    def littleEndian(self) :
        """Toggles to little endianness."""
        self.endianness = "<" # little endian
        return 0

    def bigEndian(self) :
        """Toggles to big endianness."""
        self.endianness = ">" # big endian
        return 0

    def _pagesWithCopies(self) :
        """Returns the total number of pages once copies are accounted for.

        endPage() records copies under the 1-based page number, so
        pages are looked up from 1 to self.pagecount inclusive.
        """
        total = self.pagecount
        for pnum in range(1, self.pagecount + 1) :
            # if no number of copies defined, take 1, as explained
            # in PCLXL documentation.
            # NB : if number of copies is 0, the page won't be output
            # but the formula below is still correct : we want
            # to decrease the total number of pages in this case.
            total += (self.copies.get(pnum, 1) - 1)
        return total

    def getJobSize(self) :
        """Counts pages in a PCLXL (PCL6) document.

        Algorithm by Jerome Alet.

        The documentation used for this was :

        HP PCL XL Feature Reference
        Protocol Class 2.0
        http://www.hpdevelopersolutions.com/downloads/64/358/xl_ref20r22.pdf

        Returns the number of pages, copies included, as an integer.
        """
        infileno = self.infile.fileno()
        self.copies = {}
        self.minfile = minfile = mmap.mmap(infileno, os.fstat(infileno)[6], prot=mmap.PROT_READ, flags=mmap.MAP_SHARED)
        tags = self.tags
        self.pagecount = 0
        # parsing starts where __init__ stopped reading, just after the header
        self.pos = pos = self.infile.tell()
        try :
            try :
                # There's no explicit exit from this loop : indexing past
                # the end of the mapping raises IndexError, which ends it.
                while 1 :
                    char = minfile[pos]
                    pos += 1
                    length = tags[ord(char)]
                    if not length :
                        continue
                    if callable(length) :
                        # handlers work on self.pos and may move it
                        self.pos = pos
                        length = length()
                        pos = self.pos
                    pos += length
            except IndexError : # EOF ?
                pass # reached EOF
        finally :
            # release the mapping even when a handler raises
            minfile.close()

        # now handle number of copies for each page (may differ).
        self.pagecount = self._pagesWithCopies()
        return self.pagecount
[1487]	532
class PDLAnalyzer :
    """Generic PDL Analyzer class."""
    def __init__(self, filename) :
        """Initializes the PDL analyzer.

        filename is the name of the file or '-' for stdin.
        filename can also be a file-like object which
        supports read() and seek().
        """
        self.filename = filename
        try :
            import psyco
        except ImportError :
            pass # Psyco is not installed
        else :
            # Psyco is installed, tell it to compile
            # the CPU intensive methods : PCL and PCLXL
            # parsing will greatly benefit from this,
            # for PostScript and PDF the difference is
            # barely noticeable since they are already
            # almost optimal, and much more speedy anyway.
            psyco.bind(PostScriptAnalyzer.getJobSize)
            psyco.bind(PDFAnalyzer.getJobSize)
            psyco.bind(PCLAnalyzer.getJobSize)
            psyco.bind(PCLXLAnalyzer.getJobSize)

    def getJobSize(self) :
        """Returns the job's size.

        Raises PDLAnalyzerError if the file format is unknown; errors
        raised by the format specific handler are passed through.
        The data stream is released with closeFile() in all cases.
        """
        self.openFile()
        try :
            try :
                pdlhandler = self.detectPDLHandler()
            except PDLAnalyzerError :
                # sys.exc_info() gives the caught exception on any Python version
                raise PDLAnalyzerError("ERROR : Unknown file format for %s (%s)" % (self.filename, sys.exc_info()[1]))
            return pdlhandler(self.infile).getJobSize()
        finally :
            self.closeFile()

    def openFile(self) :
        """Opens the job's data stream for reading.

        A named file is opened directly. A file-like object or stdin
        is first copied to a temporary file, which is always seekable.
        """
        self.mustclose = 0 # by default we don't want to close the file when finished
        self.sourcefile = None # stream we copied the data from, if any
        if hasattr(self.filename, "read") and hasattr(self.filename, "seek") :
            # filename is in fact a file-like object
            infile = self.filename
        elif self.filename == "-" :
            # we must read from stdin
            infile = sys.stdin
        else :
            # normal file
            self.infile = open(self.filename, "rb")
            self.mustclose = 1
            return

        # Use a temporary file, always seekable contrary to standard input.
        self.sourcefile = infile
        self.infile = tempfile.TemporaryFile(mode="w+b")
        while 1 :
            data = infile.read(MEGABYTE)
            if not data :
                break
            self.infile.write(data)
        self.infile.flush()
        self.infile.seek(0)

    def closeFile(self) :
        """Closes the job's data stream if we can close it.

        A file we opened ourselves is closed. Otherwise both our
        temporary copy and the stream it was copied from are rewound,
        on a best effort basis.
        """
        if self.mustclose :
            self.infile.close()
        else :
            # if we don't have to close the file, then
            # ensure the file pointer is reset to the
            # start of the file in case the process wants
            # to read the file again. This has to be done on
            # the original stream too : rewinding only our
            # temporary copy would leave the caller's stream at EOF.
            for stream in (self.infile, getattr(self, "sourcefile", None)) :
                if stream is None :
                    continue
                try :
                    stream.seek(0)
                except (IOError, OSError, ValueError, AttributeError) :
                    pass # probably stdin, which is not seekable

    def isPostScript(self, data) :
        """Returns 1 if data is PostScript, else 0."""
        if data.startswith("%!") or \
           data.startswith("\004%!") or \
           data.startswith("\033%-12345X%!PS") or \
           ((data[:128].find("\033%-12345X") != -1) and \
             ((data.find("LANGUAGE=POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = POSTSCRIPT") != -1) or \
              (data.find("LANGUAGE = Postscript") != -1))) or \
           (data.find("%!PS-Adobe") != -1) :
            return 1
        else :
            return 0

    def isPDF(self, data) :
        """Returns 1 if data is PDF, else 0."""
        if data.startswith("%PDF-") or \
           data.startswith("\033%-12345X%PDF-") or \
           ((data[:128].find("\033%-12345X") != -1) and (data.upper().find("LANGUAGE=PDF") != -1)) or \
           (data.find("%PDF-") != -1) :
            return 1
        else :
            return 0

    def isPCL(self, data) :
        """Returns 1 if data is PCL, else 0."""
        if data.startswith("\033E\033") or \
           ((data[:128].find("\033%-12345X") != -1) and \
             ((data.find("LANGUAGE=PCL") != -1) or \
              (data.find("LANGUAGE = PCL") != -1) or \
              (data.find("LANGUAGE = Pcl") != -1))) :
            return 1
        else :
            return 0

    def isPCLXL(self, data) :
        """Returns 1 if data is PCLXL aka PCL6, else 0."""
        if ((data[:128].find("\033%-12345X") != -1) and \
             (data.find(" HP-PCL XL;") != -1) and \
             ((data.find("LANGUAGE=PCLXL") != -1) or \
              (data.find("LANGUAGE = PCLXL") != -1))) :
            return 1
        else :
            return 0

    def detectPDLHandler(self) :
        """Tries to autodetect the document format.

        Returns the correct PDL handler class.
        Raises PDLAnalyzerError if the format is unknown.
        """
        # Try to detect file type by reading first block of datas
        self.infile.seek(0)
        firstblock = self.infile.read(KILOBYTE)
        self.infile.seek(0)
        # PCLXL must be tested before PCL : a "LANGUAGE=PCLXL"
        # header also contains "LANGUAGE=PCL".
        if self.isPostScript(firstblock) :
            return PostScriptAnalyzer
        elif self.isPCLXL(firstblock) :
            return PCLXLAnalyzer
        elif self.isPCL(firstblock) :
            return PCLAnalyzer
        elif self.isPDF(firstblock) :
            return PDFAnalyzer
        else :
            raise PDLAnalyzerError("Analysis of first data block failed.")
	677
	678	def main() :
	679	"""Entry point for PDL Analyzer."""
	680	if (len(sys.argv) < 2) or ((not sys.stdin.isatty()) and ("-" not in sys.argv[1:])) :
	681	sys.argv.append("-")
	682
	683	totalsize = 0
	684	for arg in sys.argv[1:] :
	685	try :
	686	parser = PDLAnalyzer(arg)
	687	totalsize += parser.getJobSize()
	688	except PDLAnalyzerError, msg :
[1551]	689	sys.stderr.write("ERROR: %s\n" % msg)
[1487]	690	sys.stderr.flush()
	691	print "%s" % totalsize
	692
	693	if __name__ == "__main__" :
[1577]	694	main()

Note: See TracBrowser for help on using the browser.

Context Navigation

root / pykota / trunk / pykota / pdlanalyzer.py @ 1622

Download in other formats: