58 | | import os |
59 | | import mmap |
60 | | import struct |
61 | | import tempfile |
62 | | |
def debug(msg):
    """Write msg, followed by a newline, on stderr and flush it at once."""
    stream = sys.stderr
    stream.write("%s\n" % msg)
    stream.flush()
67 | | |
def ispostscript(data):
    """Tell whether data looks like the start of a PostScript job.

    data is the first block of the job. Returns 1 if it begins with one of
    the known PostScript signatures, or if a UEL sequence appears in its
    first 128 characters together with a PJL "LANGUAGE = POSTSCRIPT"
    statement anywhere in data. Returns 0 otherwise.
    """
    # Direct signatures found at the very beginning of the job.
    for signature in ("%!", "\004%!", "\033%-12345X%!PS"):
        if data.startswith(signature):
            return 1

    # Otherwise a UEL must show up early in the data...
    if data[:128].find("\033%-12345X") == -1:
        return 0

    # ...along with one of the accepted spellings of the language switch.
    for statement in ("LANGUAGE=POSTSCRIPT",
                      "LANGUAGE = POSTSCRIPT",
                      "LANGUAGE = Postscript"):
        if data.find(statement) != -1:
            return 1
    return 0
80 | | |
def ispcl(data):
    """Tell whether data looks like the start of a PCL job.

    data is the first block of the job. Returns 1 if it begins with
    <ESC>E<ESC>, or if a UEL sequence appears in its first 128 characters
    together with a PJL "LANGUAGE = PCL" statement anywhere in data.
    Returns 0 otherwise.
    """
    # A printer reset immediately followed by another escape sequence.
    if data.startswith("\033E\033"):
        return 1

    # Otherwise a UEL must show up early in the data...
    if data[:128].find("\033%-12345X") == -1:
        return 0

    # ...along with one of the accepted spellings of the language switch.
    for statement in ("LANGUAGE=PCL", "LANGUAGE = PCL", "LANGUAGE = Pcl"):
        if data.find(statement) != -1:
            return 1
    return 0
91 | | |
def ispclxl(data):
    """Tell whether data looks like the start of a PCLXL (aka PCL6) job.

    data is the first block of the job. Returns 1 only when all three of
    these hold: a UEL sequence appears in the first 128 characters, the
    " HP-PCL XL;" stream header is present, and a PJL "LANGUAGE = PCLXL"
    statement is present. Returns 0 otherwise.
    """
    if "\033%-12345X" not in data[:128]:
        return 0
    if " HP-PCL XL;" not in data:
        return 0
    if "LANGUAGE=PCLXL" in data or "LANGUAGE = PCLXL" in data:
        return 1
    return 0
101 | | |
def postscript(infile):
    """Count pages in a DSC compliant PostScript document.

    Reads infile line by line until it is exhausted and returns the number
    of lines which begin with the "%%Page: " DSC comment.
    """
    pages = 0
    line = infile.readline()
    while line:
        if line.startswith("%%Page: "):
            pages += 1
        line = infile.readline()
    return pages
113 | | |
def pcl(infile):
    """Count pages in a PCL5 document.

    infile must be backed by a real file descriptor, because its whole
    content is memory mapped for reading through infile.fileno().

    Returns the number of copies multiplied by the number of form feed
    characters found outside of the binary blocks which are skipped. If
    no form feed was found, the count is derived from the number of
    <ESC>E resets instead. An empty file yields 0, and a job which is
    truncated in the middle of an escape sequence is counted up to the
    point where the data stops.
    """
    #
    # Algorithm from pclcount
    # (c) 2003, by Eduardo Gielamo Oliveira & Rodolfo Broco Manin
    # published under the terms of the GNU General Public Licence v2.
    #
    # Backported from C to Python by Jerome Alet, then enhanced
    # with more PCL tags detected. I think all the necessary PCL tags
    # are recognized to correctly handle PCL5 files wrt their number
    # of pages. The documentation used for this was :
    #
    # HP PCL/PJL Reference Set
    # PCL5 Printer Language Technical Quick Reference Guide
    # http://h20000.www2.hp.com/bc/docs/support/SupportManual/bpl13205/bpl13205.pdf
    #
    infileno = infile.fileno()
    filesize = os.fstat(infileno).st_size
    if not filesize:
        # An empty file can't be memory mapped, and contains no page anyway.
        return 0

    # Two characters tag -> characters which may terminate its numeric
    # argument :
    #
    # <ESC>*b###W -> Start of a raster data row/block
    # <ESC>*b###V -> Start of a raster data plane
    # <ESC>*c###W -> Start of a user defined pattern
    # <ESC>*i###W -> Start of a viewing illuminant block
    # <ESC>*l###W -> Start of a color lookup table
    # <ESC>*m###W -> Start of a download dither matrix block
    # <ESC>*v###W -> Start of a configure image data block
    # <ESC>(s###W -> Start of a characters description block
    # <ESC>)s###W -> Start of a fonts description block
    # <ESC>(f###W -> Start of a symbol set block
    # <ESC>&b###W -> Start of configuration data block
    # <ESC>&l###X -> Number of copies
    # <ESC>&n###W -> Starts an alphanumeric string ID block
    # <ESC>&p###X -> Start of a non printable characters block
    #
    tagsends = {b"&n": b"W",
                b"&b": b"W",
                b"*i": b"W",
                b"*l": b"W",
                b"*m": b"W",
                b"*v": b"W",
                b"*c": b"W",
                b"(f": b"W",
                b"*b": b"VW",
                b"(s": b"W",
                b")s": b"W",
                b"&p": b"X",
                b"&l": b"X"}

    data = mmap.mmap(infileno, filesize, access=mmap.ACCESS_READ)
    try:
        copies = 1
        pagecount = resets = 0
        position = 0
        # One byte slices are used everywhere instead of indexing : a slice
        # past the end of the data is simply empty, so a truncated job can't
        # raise IndexError in the middle of an escape sequence.
        while position < filesize:
            char = data[position:position + 1]
            position += 1
            if char == b"\014":
                pagecount += 1
            elif char == b"\033":
                tagstart = data[position:position + 1]
                position += 1
                if not tagstart:
                    break       # the data stops right after <ESC>
                if tagstart in b"E9=YZ":    # one byte PCL tag
                    if tagstart == b"E":
                        resets += 1
                    continue    # skip to next tag
                tag = tagstart + data[position:position + 1]
                position += 1
                tagend = tagsends.get(tag)
                if tagend is None:
                    continue    # unsupported (or truncated) PCL tag

                # Now read the numeric argument. The character which ends
                # it is consumed too, whatever it is.
                size = 0
                while 1:
                    char = data[position:position + 1]
                    position += 1
                    if not char.isdigit():
                        break   # also taken at EOF, where char is empty
                    size = (size * 10) + int(char)

                if char and (char in tagend):
                    if tag == b"&l":
                        copies = size
                    else:
                        if tag == b"&n":
                            # we have to take care of the operation id byte
                            # which is before the string itself
                            size += 1
                        # we just ignore the block's content.
                        position += size

        # if pagecount is still 0, we will return the number
        # of resets instead of the number of form feed characters.
        # but the number of resets is always at least 2 with a valid
        # pcl file : one at the very start and one at the very end
        # of the job's data. So we subtract 2 from the number of
        # resets. And since on our test data we needed to subtract
        # 1 more, we finally subtract 3. With 3 resets or less there's
        # nothing left to count, so we return 0.
        if pagecount:
            return copies * pagecount
        if resets > 2:
            return copies * (resets - 3)
        return 0
    finally:
        # Don't keep the mapping alive once the count is known.
        data.close()
221 | | |
class PCLXLParser:
    """Minimal PCLXL (aka PCL6) stream parser, used to count pages.

    infile must be a file-like object opened in binary mode and positioned
    before the PCLXL stream header line. The parser only knows the tags it
    needs to skip operands and embedded data, and counts the BeginPage
    operators it meets.
    """

    def __init__(self, infile):
        """Initialize PCLXL parser.

        Lines are read from infile until the stream header is found, i.e.
        a line whose characters 1 to 11 are " HP-PCL XL;". The character
        just before it selects the byte order : ")" for little endian and
        "(" for big endian.

        Raises TypeError if no such line exists in infile.
        """
        self.infile = infile
        self.islittleendian = None
        # The counter lives in its own attribute : storing it under the
        # name "pagecount" would replace the pagecount() method on this
        # instance.
        self._pagecount = 0
        found = 0
        while not found:
            line = self.infile.readline()
            if not line:
                break
            if line[1:12] == b" HP-PCL XL;":
                found = 1
                # A slice, not an index, so that the comparison also works
                # when indexing the line yields an integer.
                binding = line[0:1]
                if binding == b")":
                    self.littleendian()
                elif binding == b"(":
                    self.bigendian()
        if not found:
            raise TypeError("This file doesn't seem to be PCLXL (aka PCL6)")

        # For each possible tag byte : None if the tag is ignored, an
        # integer if a fixed number of bytes must be skipped after it, or
        # a method to call, which may return a number of bytes to skip.
        self.tags = [None] * 256
        self.tags[0x28] = self.bigendian        # big endian
        self.tags[0x29] = self.littleendian     # little endian
        self.tags[0x43] = self.beginPage        # BeginPage
        self.tags[0x44] = self.endPage          # EndPage

        self.tags[0xc0] = 1     # ubyte
        self.tags[0xc1] = 2     # uint16
        self.tags[0xc2] = 4     # uint32
        self.tags[0xc3] = 2     # sint16
        self.tags[0xc4] = 4     # sint32
        self.tags[0xc5] = 4     # real32

        self.tags[0xc8] = self.array_8      # ubyte_array
        self.tags[0xc9] = self.array_16     # uint16_array
        self.tags[0xca] = self.array_32     # uint32_array
        self.tags[0xcb] = self.array_16     # sint16_array
        self.tags[0xcc] = self.array_32     # sint32_array
        self.tags[0xcd] = self.array_32     # real32_array

        self.tags[0xd0] = 2     # ubyte_xy
        self.tags[0xd1] = 4     # uint16_xy
        self.tags[0xd2] = 8     # uint32_xy
        self.tags[0xd3] = 4     # sint16_xy
        self.tags[0xd4] = 8     # sint32_xy
        self.tags[0xd5] = 8     # real32_xy

        # The box datatypes have their own tags, 0xe0 to 0xe5. They must
        # not be stored at 0xd0 to 0xd5, or the xy sizes above are lost.
        self.tags[0xe0] = 4     # ubyte_box
        self.tags[0xe1] = 8     # uint16_box
        self.tags[0xe2] = 16    # uint32_box
        self.tags[0xe3] = 8     # sint16_box
        self.tags[0xe4] = 16    # sint32_box
        self.tags[0xe5] = 16    # real32_box

        self.tags[0xf8] = 1     # attr_ubyte
        self.tags[0xf9] = 2     # attr_uint16

        self.tags[0xfa] = self.embeddedData         # dataLength
        self.tags[0xfb] = self.embeddedDataSmall    # dataLengthByte

    def beginPage(self):
        """Indicates the beginning of a new page."""
        self._pagecount += 1
        debug("Begin page %i at %s" % (self._pagecount, self.infile.tell()))

    def endPage(self):
        """Indicates the end of a page."""
        debug("End page %i at %s" % (self._pagecount, self.infile.tell()))

    def handleArray(self, itemsize):
        """Handles arrays.

        Reads the tag which gives the datatype of the array's length, then
        the length itself, and returns the size in bytes of the array's
        content, i.e. its length multiplied by itemsize.

        Raises TypeError if the length is not stored on 1, 2 or 4 bytes,
        or if the data stops before the datatype tag.
        """
        datatype = self.infile.read(1)
        if not datatype:
            raise TypeError("Error on array size at %s" % self.infile.tell())
        length = self.tags[ord(datatype)]
        code = {1: "B", 2: "H", 4: "I"}.get(length)
        if code is None:
            raise TypeError("Error on array size at %s" % self.infile.tell())
        if self.islittleendian:
            fmt = "<" + code
        else:
            fmt = ">" + code
        arraysize = struct.unpack(fmt, self.infile.read(length))[0]
        return arraysize * itemsize

    def array_8(self):
        """Handles arrays of 1 byte items."""
        return self.handleArray(1)

    def array_16(self):
        """Handles arrays of 2 bytes items."""
        return self.handleArray(2)

    def array_32(self):
        """Handles arrays of 4 bytes items."""
        return self.handleArray(4)

    def embeddedDataSmall(self):
        """Handle small amounts of data : the length is a single byte."""
        return ord(self.infile.read(1))

    def embeddedData(self):
        """Handle normal amounts of data : the length is a 4 bytes integer."""
        if self.islittleendian:
            fmt = "<I"
        else:
            fmt = ">I"
        return struct.unpack(fmt, self.infile.read(4))[0]

    def littleendian(self):
        """Toggles to little endianness."""
        self.islittleendian = 1     # little endian

    def bigendian(self):
        """Toggles to big endianness."""
        self.islittleendian = 0     # big endian

    def pagecount(self):
        """Counts pages in a PCLXL (PCL6) document.

        Reads infile one tag at a time until it is exhausted, and returns
        the number of BeginPage operators found. The counter is reset at
        each call.
        """
        self._pagecount = 0
        while 1:
            char = self.infile.read(1)
            if not char:
                break
            length = self.tags[ord(char)]
            if length is None:
                continue    # this tag is ignored
            if callable(length):
                # Handlers either return a number of bytes to skip, or
                # nothing at all.
                length = length()
            if length:
                self.infile.read(length)
        return self._pagecount
360 | | |
def pclxl(infile):
    """Count pages in a PCL6 aka PCLXL document.

    Builds a PCLXLParser on infile and returns the result of its
    pagecount() method.
    """
    return PCLXLParser(infile).pagecount()
365 | | |
def smartpagecounter(filename):
    """Autodetects file format and returns number of pages.

    filename is the path of the file to analyze, or "-" to analyze the
    data read from stdin.

    Returns the number of pages computed by the counter which matches the
    detected format, or 0 after writing an error message on stderr if the
    format is unknown.

    Raises TypeError if the data is PCLXL (aka PCL6), which is not
    supported yet. The file is closed in all cases, including when an
    exception is raised.
    """
    fromstdin = (filename == "-")
    if fromstdin:
        # we must read from stdin
        # but since stdin is not seekable, we have to use a temporary
        # file instead.
        infile = tempfile.TemporaryFile()
    else:
        # normal file
        infile = open(filename, "rb")

    try:
        if fromstdin:
            while 1:
                data = sys.stdin.read(256 * 1024)
                if not data:
                    break
                infile.write(data)
            infile.flush()
            infile.seek(0)

        # Try to detect file type by reading first block of datas
        firstblock = infile.read(1024)
        infile.seek(0)
        if ispostscript(firstblock):
            size = postscript(infile)
        elif ispclxl(firstblock):
            # PCLXL is deliberately rejected for now : it must be tested
            # before PCL, because ispcl() also matches LANGUAGE=PCLXL.
            raise TypeError("PCLXL (aka PCL6) is not supported yet.")
        elif ispcl(firstblock):
            size = pcl(infile)
        else:
            sys.stderr.write("ERROR : Unknown file format for %s\n" % filename)
            size = 0
    finally:
        infile.close()
    return size
| 61 | from pykota import pdlanalyzer |