Changeset 240
- Timestamp:
- 07/20/05 14:45:04 (19 years ago)
- Location:
- pkpgcounter/trunk
- Files:
-
- 3 modified
Legend:
- Unmodified
- Added
- Removed
-
pkpgcounter/trunk/NEWS
r237 r240 22 22 pkpgcounter News : 23 23 24 * 1.58 : 25 26 - Fix for PDF files which contain several versions of the same PDF object. 27 28 - Doesn't break when python-psyco is not available. 29 24 30 * 1.57 : 25 31 -
pkpgcounter/trunk/pkpgpdls/pdf.py
r235 r240 27 27 import pdlparser 28 28 29 class PDFObject : 30 """A class for PDF objects.""" 31 def __init__(self, major, minor, description) : 32 """Initialize the PDF object.""" 33 self.major = major 34 self.minor = minor 35 self.description = description 36 self.comments = [] 37 self.content = [] 38 self.parent = None 39 self.kids = [] 40 29 41 class Parser(pdlparser.PDLParser) : 30 42 """A parser for PDF documents.""" … … 43 55 def getJobSize(self) : 44 56 """Counts pages in a PDF document.""" 57 # First we start with a generic PDF parser. 58 lastcomment = None 59 objects = {} 60 while 1 : 61 line = self.infile.readline() 62 if not line : 63 break 64 # now workaround the unavailability of "Universal New Line" 65 # under Python <2.3. 66 line = line.strip().replace("\r\n", " ").replace("\r", " ") 67 if line.startswith("% ") : 68 lastcomment = line[2:] 69 if line.endswith(" obj") : 70 # New object begins here 71 (n0, n1, dummy) = line.split() 72 (major, minor) = map(int, (n0, n1)) 73 obj = PDFObject(major, minor, lastcomment) 74 while 1 : 75 line = self.infile.readline() 76 if not line : 77 break 78 line = line.strip() 79 if line.startswith("% ") : 80 obj.comments.append(line) 81 elif line.startswith("endobj") : 82 break 83 else : 84 obj.content.append(line) 85 try : 86 # try to find a different version of this object 87 oldobject = objects[major] 88 except KeyError : 89 # not found, so we add it 90 objects[major] = obj 91 else : 92 # only overwrite older versions of this object 93 # same minor seems to be possible, so the latest one 94 # found in the file will be the one we keep. 95 # if we want the first one, just use > instead of >= 96 if minor >= oldobject.minor : 97 objects[major] = obj 98 99 # Now we check each PDF object we've just created. 
45 100 self.iscolor = None 46 101 newpageregexp = re.compile(r"(/Type) ?(/Page)[/ \t\r\n]", re.I) 47 102 colorregexp = re.compile(r"(/ColorSpace) ?(/DeviceRGB|/DeviceCMYK)[/ \t\r\n]", re.I) 48 103 pagecount = 0 49 for line in self.infile.xreadlines() : 50 pagecount += len(newpageregexp.findall(line)) 51 if colorregexp.match(line) : 104 for object in objects.values() : 105 content = "".join(object.content) 106 pagecount += len(newpageregexp.findall(content)) 107 if colorregexp.match(content) : 52 108 self.iscolor = 1 53 109 if self.debug : 54 sys.stderr.write("ColorSpace : %s\n" % line) 110 sys.stderr.write("ColorSpace : %s\n" % content) 55 111 return pagecount 56 112 … … 65 121 mustclose = 0 66 122 else : 67 infile = open(arg, "rb") 123 infile = open(arg, "rU") 68 124 mustclose = 1 69 125 try : -
pkpgcounter/trunk/pkpgpdls/version.py
r237 r240 20 20 # 21 21 22 __version__ = "1.57" 22 __version__ = "1.58" 23 23 24 24 __doc__ = """pkpgcounter : a generic Page Description Languages parser."""