Changeset 1552 for pykota/trunk/pykota/pdlanalyzer.py
- Timestamp: 06/19/04 00:21:27 (20 years ago)
- Files: 1 modified
Legend:
- Unmodified
- Added
- Removed
pykota/trunk/pykota/pdlanalyzer.py
r1551 r1552 22 22 # 23 23 # $Log$ 24 # Revision 1.9 2004/06/18 22:21:27 jalet 25 # Native PDF parser greatly improved. 26 # GhostScript based PDF parser completely removed because native code 27 # is now portable across Python versions. 28 # 24 29 # Revision 1.8 2004/06/18 20:49:46 jalet 25 30 # "ERROR:" prefix added … … 51 56 import sys 52 57 import os 58 import re 53 59 import struct 54 60 import tempfile … … 75 81 """Count pages in a DSC compliant PostScript document.""" 76 82 pagecount = 0 77 while 1 : 78 line = self.infile.readline() 79 if not line : 80 break 83 for line in self.infile.xreadlines() : 81 84 if line.startswith("%%Page: ") : 82 85 pagecount += 1 … … 87 90 """Initialize PDF Analyzer.""" 88 91 self.infile = infile 89 try :90 if float(sys.version[:3]) >= 2.3 :91 self.getJobSize = self.native_getJobSize92 else :93 self.getJobSize = self.gs_getJobSize94 except :95 self.getJobSize = self.gs_getJobSize96 92 97 def native_getJobSize(self) : 98 """Counts pages in a PDF document natively.""" 93 def getJobSize(self) : 94 """Counts pages in a PDF document.""" 95 regexp = re.compile(r"(/Type) ?(/Page)[/ \r\n]") 99 96 pagecount = 0 100 content = [] 101 while 1 : 102 line = self.infile.readline() 103 if not line : 104 break 105 line = line.strip() 106 content.append(line) 107 if line.endswith("endobj") : 108 pagecount += " /".join([x.strip() for x in " ".join(content).split("/")]).count(" /Type /Page ") 109 content = [] 110 return pagecount 111 112 def gs_getJobSize(self) : 113 """Counts pages in a PDF document using GhostScript to convert PDF to PS.""" 114 MEGABYTE = 1024*1024 115 child = popen2.Popen4("gs -q -dNOPAUSE -dBATCH -dSAFER -sDEVICE=pswrite -sOutputFile=- -c save pop -f - 2>/dev/null") 116 try : 117 data = self.infile.read(MEGABYTE) 118 while data : 119 child.tochild.write(data) 120 data = self.infile.read(MEGABYTE) 121 child.tochild.flush() 122 child.tochild.close() 123 except (IOError, OSError), msg : 124 raise PDLAnalyzerError, "Unable to convert 
PDF input to PS with GhostScript : %s" % msg 125 126 psanalyzer = PostScriptAnalyzer(child.fromchild) 127 pagecount = psanalyzer.getJobSize() 128 child.fromchild.close() 129 try : 130 retcode = child.wait() 131 except OSError, msg : 132 self.filter.logger.log_message(_("Problem while waiting for PDF to PS converter (GhostScript pid %s) to exit : %s") % (child.pid, msg)) 133 else : 134 if os.WIFEXITED(retcode) : 135 status = os.WEXITSTATUS(retcode) 136 else : 137 status = retcode 138 if status : 139 raise PDLAnalyzerError, "PDF to PS converter (GhostScript pid %s) exit code is %s" % (child.pid, repr(status)) 97 for line in self.infile.xreadlines() : 98 pagecount += len(regexp.findall(line)) 140 99 return pagecount 141 100 … … 458 417 else : 459 418 # normal file 460 self.infile = open(self.filename, "rb U") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2419 self.infile = open(self.filename, "rb") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 461 420 self.mustclose = 1 462 421 return … … 464 423 # Use a temporary file, always seekable contrary to standard input. 465 424 # This also has the benefit to let us use the "U" mode (new in Python 2.3) 466 self.infile = tempfile.TemporaryFile(mode="w+b U") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2425 self.infile = tempfile.TemporaryFile(mode="w+b") # TODO : "U" mode only works in 2.3, is ignored in 2.1 and 2.2 467 426 while 1 : 468 427 data = infile.read(MEGABYTE)