Package translate :: Package storage :: Module dtd
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla""" 
 24   
 25  from translate.storage import base 
 26  from translate.misc import quote 
 27   
 28  import re 
 29  import sys 
 30  import warnings 
 31  try: 
 32      from lxml import etree 
 33      import StringIO 
 34  except ImportError: 
 35      etree = None 
 36   
def quotefordtd(source):
    """Return source quoted so it can be used as a DTD entity value.

    The delimiter is chosen from the quote characters present in source:

    - no double quote in source: delegated to quote.quotestr
    - double quotes but no single quote: delegated to quote.singlequotestr
    - both kinds present: the value is wrapped in single quotes and every
      embedded single quote is written as the XML entity ``&apos;`` so that
      it cannot terminate the value early

    :param source: the unquoted entity value
    :return: the quoted string, including its delimiters
    """
    if '"' not in source:
        return quote.quotestr(source)
    if "'" not in source:
        return quote.singlequotestr(source)
    # both quote characters occur, so escape the one used as the delimiter
    return "'" + source.replace("'", '&apos;') + "'"
45
def unquotefromdtd(source):
    """Unquote a quoted dtd definition and return the bare value.

    The first character of source is taken to be the quote character, and
    the text is extracted with quote.extractwithoutquotes.  For values
    delimited by single quotes, ``&apos;`` entities are turned back into
    literal single quotes (the inverse of what quotefordtd does).

    Quote characters occurring inside the string are not otherwise handled.

    :param source: the quoted definition; an empty value is treated as ``""``
    :return: the extracted, unquoted string
    """
    if not source:
        source = '""'
    # the quote characters should be the first and last characters in the string
    quotechar = source[0]
    extracted, dummy = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)
    if quotechar == "'":
        extracted = extracted.replace("&apos;", "'")
    return extracted
57
class dtdunit(base.TranslationUnit):
    """An entity definition from a dtd file (and possibly associated comments)."""

    def __init__(self, source=""):
        """Construct the dtdunit and prepare it for parsing.

        :param source: the initial unquoted value of the entity
        """
        super(dtdunit, self).__init__(source)
        # list of (commenttype, text) tuples collected while parsing
        self.comments = []
        # lines that are neither part of a comment nor of an entity
        self.unparsedlines = []
        # parser state flags: inside a <!-- --> comment / inside an <!ENTITY
        self.incomment = 0
        self.inentity = 0
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # assigned through the property so that self.definition holds the quoted value
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Set the definition to the quoted value of source."""
        self.definition = quotefordtd(source)

    def getsource(self):
        """Return the unquoted source string."""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Set the definition to the quoted value of target (None means empty)."""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)

    def gettarget(self):
        """Return the unquoted target string."""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """Return whether this dtdunit has no entity definition at all."""
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """Read the first dtd element from dtdsrc into this object.

        Comments are collected into self.comments as (type, text) tuples,
        lines outside comments and entities go to self.unparsedlines, and the
        first entity found fills self.entity / self.definition (plus
        self.entitytype, self.entityparameter and self.hashprefix where they
        apply).  Parsing stops as soon as the entity's quoted value is closed.

        :param dtdsrc: dtd source text, possibly spanning several lines
        :return: the number of lines consumed (0 for empty input)
        :raises ValueError: if the entity value does not start with ' or "
        """
        self.comments = []
        # every specialised comment list is an alias of self.comments, so all
        # comment kinds end up in one list in their original order
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            # split() dropped the newline; the rest of the loop expects it
            line += "\n"
            linesprocessed += 1
            # column cursor into the current line; reset for every line so a
            # continuation line never reuses an offset from the previous one
            e = 0
            if not self.incomment:
                if line.find('<!--') != -1:
                    self.incomment = 1
                    self.continuecomment = 0
                    # work out the type of comment and save it
                    # (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        pos = quote.findend(comment, 'LOCALIZATION NOTE')
                        while pos < len(comment) and comment[pos] == ' ':
                            pos += 1
                        if comment.find('FILE', pos) == pos:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', pos) == pos:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', pos) == pos:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # TODO: detect entity definitions that are commented out and
                # store them as obsolete messages
                # record the comment together with its type (worked out above)
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = 1
                    beforeentity = line[:entitypos].strip()
                    # keep a "#..." prefix found before the entity so it can be written back
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # drop everything up to and including '<!ENTITY'
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.entity = ''
                    # a leading '%' marks what this parser calls an "external" entity
                    if e < len(line) and line[e] == '%':
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while e < len(line) and line[e].isspace():
                            e += 1
                    while e < len(line) and not line[e].isspace():
                        self.entity += line[e]
                        e += 1
                    while e < len(line) and line[e].isspace():
                        e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # nothing else on this line; carry on with the next one
                            self.entityhelp = None
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = 0
                if self.entitypart == "parameter":
                    # on a continuation line the cursor starts at column 0, so
                    # skip indentation first (a no-op right after the name)
                    while e < len(line) and line[e].isspace():
                        e += 1
                    paramstart = e
                    while e < len(line) and line[e].isalnum():
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while e < len(line) and line[e].isspace():
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # the value starts on a later line than the name
                        e = 0
                        while e < len(line) and line[e].isspace():
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                    # self.instring carries the open-quote state from line to line
                    e = self.entityhelp[0]
                    if self.entityhelp[1] == "'":
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif self.entityhelp[1] == '"':
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the
                    # line, but remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # closing quote seen: this unit is complete
                        self.inentity = 0
                        break
        return linesprocessed

    def __str__(self):
        """Convert to a byte string, encoding unicode output.

        Uses self.encoding when the attribute exists, UTF-8 otherwise.
        """
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """Convert the dtd entity (with its comments) back to string form."""
        lines = [comment for commenttype, comment in self.comments]
        lines.extend(self.unparsedlines)
        if self.isnull():
            # no entity: only comments and unparsed lines, normalised to one trailing newline
            result = "".join(lines)
            return result.rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % ' + self.entity + ' ' + self.entityparameter + ' ' + self.definition + '>'
            else:
                entityline = '<!ENTITY ' + self.entity + ' ' + self.definition + '>'
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline + '\n')
        return "".join(lines)
292
class dtdfile(base.TranslationStore):
    """A .dtd file, made up of dtdunits."""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """Construct a dtdfile, optionally reading in from inputfile.

        :param inputfile: object with a read() method (and optionally a
            name attribute, stored as self.filename), or None for an empty store
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.units = []
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """Read dtd source code and store its contents as dtdunits in self.units.

        Any existing units are lost.  The source is cut into chunks that end
        on a line starting with a closing quote followed by '>' (seen after an
        '<!ENTITY' line); each chunk is handed to dtdunit.parse repeatedly
        until it consumes no more lines.  Parse errors are reported through
        warnings.warn and parsing resumes one line further on.

        :param dtdsrc: the complete text of the dtd file
        """
        self.units = []
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if start == end:
                end += 1
            foundentity = 0
            # extend the chunk up to (and including) the line closing an entity
            while end < len(lines):
                if lines[end].find('<!ENTITY') > -1:
                    foundentity = 1
                if foundentity and re.match(r"""["']\s*>""", lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1  # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                except Exception as e:
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                    # skip exactly one line and retry; reusing the count of an
                    # earlier successful parse could move start past the chunk
                    linesprocessed = 1
                start += linesprocessed

    def __str__(self):
        """Convert to a byte string, encoding unicode output.

        When lxml is available the output (with "#expand" removed) is first
        checked with etree.DTD; if that fails a warning is issued and None is
        returned.
        """
        source = self.getoutput()
        if etree is not None:
            try:
                etree.DTD(StringIO.StringIO(source.replace("#expand", "")))
            except etree.DTDParseError:
                warnings.warn("DTD file '%s' does not validate" % self.filename)
                # NOTE(review): returning None makes str(store) raise TypeError;
                # kept because callers may rely on it - confirm before changing
                return None
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """Convert the units back to source."""
        sources = [str(dtd) for dtd in self.units]
        return "".join(sources)

    def makeindex(self):
        """Make the self.index dictionary, keyed on entity names."""
        self.index = {}
        for dtd in self.units:
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def rewrap(self):
        """Join multi-line definitions of all units into single lines.

        Consecutive lines are concatenated directly when whitespace already
        separates them, and with a single space otherwise.
        """
        for dtd in self.units:
            lines = dtd.definition.split("\n")
            if len(lines) > 1:
                definition = lines[0]
                for line in lines[1:]:
                    if definition[-1:].isspace() or line[:1].isspace():
                        definition += line
                    else:
                        definition += " " + line
                dtd.definition = definition
# Command-line filter: read a dtd file from stdin, rewrap its multi-line
# definitions and write the result to stdout.
if __name__ == "__main__":
    import sys
    store = dtdfile(sys.stdin)
    store.rewrap()
    sys.stdout.write(str(store))