Package translate :: Package storage :: Module lisa
[hide private]
[frames | no frames]

Source Code for Module translate.storage.lisa

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """Parent class for LISA standards (TMX, TBX, XLIFF)""" 
 24   
 25  import re 
 26   
 27  from translate.storage import base 
 28  from translate.lang import data 
 29  try: 
 30      from lxml import etree 
 31  except ImportError, e: 
 32      raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.") 
 33   
def getText(node):
    """Return the text of node and of all its descendants as unicode.

    node is an lxml element, or None.  An empty unicode string is returned
    when node is None and when the element carries no text at all (the tag
    is there, so the answer is u"" rather than None).
    """
    if node is None:
        # Nothing to extract.  The old code fell through to node.text here
        # and raised AttributeError.
        return u""
    # node.xpath is very slow, so we only use it if there are children
    # TODO: consider rewriting by iterating over children
    if len(node):  # The etree way of testing for children
        # Only non-ASCII strings are returned as unicode, so we have to force
        # the ASCII-only ones to be unicode as well
        return unicode(node.xpath("string()"))  # specific to lxml.etree
    # No children: node.text is the complete text content.
    # If node.text is None, we want to return u"" since the tag is there.
    return data.forceunicode(node.text) or u""
def _findAllMatches(text, re_obj):
    """Generate match objects for all non-overlapping @re_obj matches in @text.

    text is the string to scan and re_obj a compiled regular expression.
    Matches are yielded left to right.

    The scan is delegated to re_obj.finditer(), which always advances past
    a zero-width match.  The previous hand-written loop restarted the search
    at m.end() and therefore never terminated on a pattern that can match
    the empty string.
    """
    for match in re_obj.finditer(text):
        yield match
# Regular expressions describing the inline placeholders that createPHnodes()
# wraps in <ph> elements.  Raw strings keep the backslashes literal.
placeholders = [
    r'(%[diouxXeEfFgGcrs])',  # printf style conversions such as %s or %d
    r'(\\+.?)',               # one or more backslashes and the following character
    r'(%[0-9]\$lx)',          # e.g. %1$lx; "$" is escaped, a bare "$" means end-of-string
    r'(%[0-9]\$[a-z])',       # positional conversions such as %1$s
    r'(<.+?>)',               # anything that looks like a markup tag
]
re_placeholders = [re.compile(ph) for ph in placeholders]


def _getPhMatches(text):
    """Return a list of regexp match objects for all placeholders in @text.

    The matches are ordered by start position and never overlap.  When two
    patterns match overlapping text (for example the tag <a href="%s"> and
    the %s inside it) only the match that starts first is kept, and for
    equal starts the longest one.  This lets callers walk the text from
    left to right without emitting any character twice.
    """
    matches = []
    for re_ph in re_placeholders:
        # finditer yields the successive non-overlapping matches of one pattern
        matches.extend(re_ph.finditer(text))

    # sort them so they come sequentially (longest first on equal start)
    matches.sort(key=lambda m: (m.start(), -m.end()))

    # drop every match that begins inside a match we have already accepted
    sequential = []
    consumed = 0
    for m in matches:
        if m.start() < consumed:
            continue
        sequential.append(m)
        consumed = m.end()
    return sequential
# Namespace URI used to build the Clark-notation names of the xml:lang and
# xml:space attributes.
XML_NS = "http://www.w3.org/XML/1998/namespace"
def getXMLlang(node):
    """Gets the xml:lang attribute on node.

    The attribute is looked up under its Clark-notation name; the result is
    whatever node.get() returns for it.
    """
    lang_attribute = "{%s}lang" % XML_NS
    return node.get(lang_attribute)
73
def setXMLlang(node, lang):
    """Sets the xml:lang attribute on node to lang."""
    lang_attribute = "{%s}lang" % XML_NS
    node.set(lang_attribute, lang)
77
def setXMLspace(node, value):
    """Sets the xml:space attribute on node to value."""
    space_attribute = "{%s}space" % XML_NS
    node.set(space_attribute, value)
81
def namespaced(namespace, name):
    """Returns name in Clark notation within the given namespace.

    For example namespaced("urn:oasis:names:tc:xliff:document:1.1", "source")
    returns
        {urn:oasis:names:tc:xliff:document:1.1}source
    This is needed throughout lxml.

    When namespace is empty or None the name is returned unchanged.
    """
    if not namespace:
        return name
    return "{%s}%s" % (namespace, name)
93
class LISAunit(base.TranslationUnit):
    """A single unit in the file.

    Provisional work is done to make several languages possible.

    The unit wraps one XML element (self.xmlelement).  Each language is
    stored in a child element named by languageNode; by convention in this
    class the first such child is the source and the second the target.
    Subclasses set rootNode, languageNode and textNode and override
    createlanguageNode().
    """

    #The name of the root element of this unit type:(termEntry, tu, trans-unit)
    rootNode = ""
    #The name of the per language element of this unit type:(termEntry, tu, trans-unit)
    languageNode = ""
    #The name of the innermost element of this unit type:(term, seg)
    textNode = ""

    # XML namespace of the document this unit belongs to (None: no namespace)
    namespace = None

    def __init__(self, source, empty=False):
        """Constructs a unit containing the given source string.

        With empty=True nothing is initialised at all; createfromxmlElement()
        uses this and attaches an existing element afterwards.
        """
        if empty:
            return
        self.xmlelement = etree.Element(self.rootNode)
        #add descrip, note, etc.

        super(LISAunit, self).__init__(source)

    def __eq__(self, other):
        """Compares two units by the text of their language nodes.

        Returns NotImplemented for objects that do not offer the LISA unit
        accessors (None, units of another kind, ...), so that Python falls
        back to its default comparison instead of raising AttributeError.
        """
        if not (hasattr(other, "getlanguageNodes") and hasattr(other, "getNodeText")):
            return NotImplemented
        languageNodes = self.getlanguageNodes()
        otherlanguageNodes = other.getlanguageNodes()
        if len(languageNodes) != len(otherlanguageNodes):
            return False
        for mynode, othernode in zip(languageNodes, otherlanguageNodes):
            if self.getNodeText(mynode) != other.getNodeText(othernode):
                #TODO:^ maybe we want to take children and notes into account
                return False
        return True

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def setsource(self, text, sourcelang='en'):
        """Sets the source string, replacing the existing source node if any."""
        text = data.forceunicode(text)
        languageNodes = self.getlanguageNodes()
        sourcelanguageNode = self.createlanguageNode(sourcelang, text, "source")
        if languageNodes:
            # NOTE: replaces the first child of the unit element, i.e. this
            # relies on the source language node being the first child.
            self.xmlelement[0] = sourcelanguageNode
        else:
            self.xmlelement.append(sourcelanguageNode)

    def getsource(self):
        """Returns the text of the first language node (None if there is none)."""
        return self.getNodeText(self.getlanguageNode(lang=None, index=0))
    source = property(getsource, setsource)

    def settarget(self, text, lang='xx', append=False):
        #XXX: we really need the language - can't really be optional
        """Sets the "target" string (second language), or alternatively appends to the list

        Passing text=None removes the current target.  With append=True the
        new language node is added after the existing ones and nothing is
        removed.
        """
        text = data.forceunicode(text)
        #Firstly deal with reinitialising to None or setting to identical string
        if self.gettarget() == text:
            return
        languageNodes = self.getlanguageNodes()
        assert len(languageNodes) > 0
        if text is not None:
            languageNode = self.createlanguageNode(lang, text, "target")
            if append or len(languageNodes) == 1:
                self.xmlelement.append(languageNode)
            else:
                self.xmlelement.insert(1, languageNode)
        if not append and len(languageNodes) > 1:
            # languageNodes was collected before the insert, so index 1 is
            # still the old target
            self.xmlelement.remove(languageNodes[1])

    def gettarget(self, lang=None):
        """retrieves the "target" text (second entry), or the entry in the
        specified language, if it exists"""
        if lang:
            node = self.getlanguageNode(lang=lang)
        else:
            node = self.getlanguageNode(lang=None, index=1)
        return self.getNodeText(node)
    target = property(gettarget, settarget)

    def createlanguageNode(self, lang, text, purpose=None):
        """Returns a xml Element setup with given parameters to represent a
        single language entry. Has to be overridden."""
        return None

    def createPHnodes(self, parent, text):
        """Create the text node in parent containing all the ph tags

        Every placeholder found by _getPhMatches() becomes a <ph> child of
        parent with ids numbered from 1; the text between placeholders is
        stored in parent.text and in the tail of the <ph> elements.
        """
        matches = _getPhMatches(text)
        if not matches:
            parent.text = text
            return

        # Now we know there will definitely be some ph tags
        start = matches[0].start()
        pretext = text[:start]
        if pretext:
            parent.text = pretext
        lasttag = parent
        phcount = 0
        for m in matches:
            if m.start() < start:
                # This match begins inside the previous placeholder (e.g. the
                # %s inside <a href="%s">).  Emitting it as well would
                # duplicate part of the text, so it is skipped.
                continue
            #pretext
            pretext = text[start:m.start()]
            # this will never happen with the first ph tag
            if pretext:
                lasttag.tail = pretext
            #ph node
            phcount += 1
            phnode = etree.SubElement(parent, "ph")
            phnode.set("id", str(phcount))
            phnode.text = m.group()
            lasttag = phnode
            start = m.end()
        #post text
        if text[start:]:
            lasttag.tail = text[start:]

    def getlanguageNodes(self):
        """Returns a list of all nodes that contain per language information."""
        return self.xmlelement.findall(self.namespaced(self.languageNode))

    def getlanguageNode(self, lang=None, index=None):
        """Retrieves a languageNode either by language or by index

        Returns None when no node matches.  Raises KeyError when neither
        lang nor index is given.
        """
        if lang is None and index is None:
            raise KeyError("No criterea for languageNode given")
        languageNodes = self.getlanguageNodes()
        if lang:
            for node in languageNodes:
                if getXMLlang(node) == lang:
                    return node
            return None
        #have to use index
        # (index is None here only when an empty lang was passed on its own)
        if index is None or index >= len(languageNodes):
            return None
        return languageNodes[index]

    def getNodeText(self, languageNode):
        """Retrieves the term from the given languageNode

        Returns None when languageNode is None, or when textNode is set and
        the language node holds no such element.
        """
        if languageNode is None:
            return None
        if self.textNode:
            terms = languageNode.findall('.//%s' % self.namespaced(self.textNode))
            if not terms:
                return None
            return getText(terms[0])
        else:
            return getText(languageNode)

    def __str__(self):
        """Returns the unit's XML, pretty printed and encoded as UTF-8."""
        return etree.tostring(self.xmlelement, pretty_print=True, encoding='utf-8')

    def createfromxmlElement(cls, element):
        """Alternate constructor: wraps an existing XML element in a unit."""
        term = cls(None, empty=True)
        term.xmlelement = element
        return term
    createfromxmlElement = classmethod(createfromxmlElement)
254
class LISAfile(base.TranslationStore):
    """A class representing a file store for one of the LISA file formats.

    Subclasses set rootNode, bodyNode, XMLskeleton and UnitClass.
    """
    UnitClass = LISAunit
    #The root node of the XML document:
    rootNode = ""
    #The root node of the content section:
    bodyNode = ""
    #The XML skeleton to use for empty construction:
    XMLskeleton = ""

    # Default namespace of the document; determined by initbody()
    namespace = None

    def __init__(self, inputfile=None, sourcelanguage='en', targetlanguage=None, unitclass=None):
        """Creates a store from inputfile, or an empty one from XMLskeleton.

        inputfile is passed on to parse() and may therefore be an XML string
        or an object with a read() method.
        """
        super(LISAfile, self).__init__(unitclass=unitclass)
        self.setsourcelanguage(sourcelanguage)
        self.settargetlanguage(targetlanguage)
        if inputfile is not None:
            self.parse(inputfile)
            assert self.document.getroot().tag == self.namespaced(self.rootNode)
        else:
            # We strip out newlines to ensure that spaces in the skeleton doesn't
            # interfere with the the pretty printing of lxml
            self.parse(self.XMLskeleton.replace("\n", ""))
            self.addheader()

    def addheader(self):
        """Method to be overridden to initialise headers, etc."""
        pass

    def namespaced(self, name):
        """Returns name in Clark notation.

        For example namespaced("source") in an XLIFF document might return
            {urn:oasis:names:tc:xliff:document:1.1}source
        This is needed throughout lxml.
        """
        return namespaced(self.namespace, name)

    def initbody(self):
        """Initialises self.body so it never needs to be retrieved from the XML again."""
        # The default (unprefixed) namespace of the root element, if any
        self.namespace = self.document.getroot().nsmap.get(None, None)
        self.body = self.document.find('//%s' % self.namespaced(self.bodyNode))

    def addsourceunit(self, source):
        #TODO: maybe this should rather be called addsourcestring or something?
        """Adds and returns a new unit with the given string as first entry."""
        newunit = self.UnitClass(source)
        self.addunit(newunit)
        return newunit

    def addunit(self, unit, new=True):
        """Adds unit to the store.

        With new=True the unit's XML element is also appended to the body;
        parse() passes new=False for elements that are already in the tree.
        """
        unit.namespace = self.namespace
        super(LISAfile, self).addunit(unit)
        if new:
            self.body.append(unit.xmlelement)

    def __str__(self):
        """Converts to a string containing the file's XML"""
        return etree.tostring(self.document, pretty_print=True, xml_declaration=True, encoding='utf-8')

    def parse(self, xml):
        """Populates this object from the given xml string

        xml may also be an object with a read() method; it is rewound when
        possible and read completely.
        """
        if not hasattr(self, 'filename'):
            self.filename = getattr(xml, 'name', '')
        if hasattr(xml, "read"):
            try:
                xml.seek(0)
            except (AttributeError, IOError):
                # Not every readable object can be rewound (pipes, network
                # responses); read from the current position instead.
                pass
            xml = xml.read()
        self.document = etree.fromstring(xml).getroottree()
        self.encoding = self.document.docinfo.encoding
        self.initbody()
        assert self.document.getroot().tag == self.namespaced(self.rootNode)
        # findall() returns a (possibly empty) list, never None
        termEntries = self.body.findall('.//%s' % self.namespaced(self.UnitClass.rootNode))
        for entry in termEntries:
            term = self.UnitClass.createfromxmlElement(entry)
            self.addunit(term, new=False)
333