Package translate :: Package storage :: Module wordfast
[hide private]
[frames | no frames]

Source Code for Module translate.storage.wordfast

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Manage the Wordfast Translation Memory format 
 23  """ 
 24   
 25  import csv 
 26  import time 
 27  from translate.storage import base 
 28   
 29  WF_TIMEFORMAT = "%Y%m%d~%H%M%S" 
 30  """Time format used by Wordfast""" 
 31   
 32  WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version", "target-lang", "license", "attr1list", "attr2list", "attr3list", "attr4list", "attr5list"] 
 33  """Field names for the Wordfast header""" 
 34   
 35  WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang", "target", "attr1", "attr2", "attr3", "attr4"] 
 36  """Field names for a Wordfast TU""" 
 37   
 38  WF_FIELDNAMES_HEADER_DEFAULTS = { 
 39  "date": "%19000101~121212",  
 40  "userlist": "%User ID,TT,TT Translate-Toolkit",  
 41  "tucount": "%TU=00000001",  
 42  "src-lang": "%EN-US",  
 43  "version": "%Wordfast TM v.5.51w9/00",  
 44  "target-lang": "",  
 45  "license": "%---00000001",  
 46  "attr1list": "",  
 47  "attr2list": "",  
 48  "attr3list": "",  
 49  "attr4list": "" } 
 50  """Default or minimum header entries for a Wordfast file""" 
 51   
 52  # TODO Needs validation.  The following need to be checked against a WF TM file to ensure  
 53  # that the correct Unicode values have been chosen for the characters. For now these look 
 54  # correct and have been taken from Windows CP1252 and Macintosh code points found for 
 55  # the respective character sets on Linux. 
 56  WF_ESCAPE_MAP = ( 
 57                ("&'26;", u"\u0026"), # & - Ampersand (must be first to prevent escaping of escapes) 
 58                ("&'82;", u"\u201A"), # ‚ - Single low-9 quotation mark 
 59                ("&'85;", u"\u2026"), # … - Elippsis 
 60                ("&'91;", u"\u2018"), # ‘ - left single quotation mark 
 61                ("&'92;", u"\u2019"), # ’ - right single quotation mark 
 62                ("&'93;", u"\u201C"), # “ - left double quotation mark 
 63                ("&'94;", u"\u201D"), # ” - right double quotation mark 
 64                ("&'96;", u"\u2013"), # – - en dash (validate) 
 65                ("&'97;", u"\u2014"), # — - em dash (validate) 
 66                ("&'99;", u"\u2122"), # ™ - Trade mark 
 67                # Windows only 
 68                ("&'A0;", u"\u00A0"), #   - Non breaking space 
 69                ("&'A9;", u"\u00A9"), # © - Copyright 
 70                ("&'AE;", u"\u00AE"), # ® - Registered 
 71                ("&'BC;", u"\u00BC"), # ¼ 
 72                ("&'BD;", u"\u00BD"), # ½ 
 73                ("&'BE;", u"\u00BE"), # ¾ 
 74                # Mac only 
 75                ("&'A8;", u"\u00AE"), # ® - Registered 
 76                ("&'AA;", u"\u2122"), # ™ - Trade mark 
 77                ("&'C7;", u"\u00AB"), # « - Left-pointing double angle quotation mark 
 78                ("&'C8;", u"\u00BB"), # » - Right-pointing double angle quotation mark 
 79                ("&'C9;", u"\u2026"), # … - Horizontal Elippsis 
 80                ("&'CA;", u"\u00A0"), #   - Non breaking space 
 81                ("&'D0;", u"\u2013"), # – - en dash (validate) 
 82                ("&'D1;", u"\u2014"), # — - em dash (validate) 
 83                ("&'D2;", u"\u201C"), # “ - left double quotation mark 
 84                ("&'D3;", u"\u201D"), # ” - right double quotation mark 
 85                ("&'D4;", u"\u2018"), # ‘ - left single quotation mark 
 86                ("&'D5;", u"\u2019"), # ’ - right single quotation mark 
 87                ("&'E2;", u"\u201A"), # ‚ - Single low-9 quotation mark 
 88                ("&'E3;", u"\u201E"), # „ - Double low-9 quotation mark 
 89                # Other markers 
 90                #("&'B;", u"\n"), # Soft-break - XXX creates a problem with roundtripping could also be represented by \u2028 
 91               ) 
 92  """Mapping of Wordfast &'XX; escapes to correct Unicode characters""" 
 93   
 94  TAB_UTF16 = "\x00\x09" 
 95   
def _char_to_wf(string):
    """Replace special characters with their Wordfast &'XX; escapes

    Every character listed in WF_ESCAPE_MAP is looked up in its UTF-8
    encoded form and replaced by the matching escape; afterwards NEWLINE
    and TAB are written as the two-character sequences backslash-n and
    backslash-t.  Empty or None input is returned untouched.

    @note: Full roundtripping is not possible because of the escaping of
    NEWLINE and TAB
    @param string: the text to escape (searched for UTF-8 encoded characters)
    @return: the escaped text
    """
    # FIXME there is no platform check to ensure that we use Mac encodings
    # when running on a Mac
    if not string:
        return string
    escaped = string
    for wf_code, unichar in WF_ESCAPE_MAP:
        escaped = escaped.replace(unichar.encode('utf-8'), wf_code)
    escaped = escaped.replace("\n", "\\n")
    escaped = escaped.replace("\t", "\\t")
    return escaped
106
def _wf_to_char(string):
    """Replace Wordfast &'XX; escapes with the characters they stand for

    The replacement characters are inserted in their UTF-8 encoded form.
    After the &'XX; escapes, the two-character sequences backslash-n and
    backslash-t are turned back into NEWLINE and TAB.  Empty or None input
    is returned untouched.

    @param string: text containing Wordfast escapes
    @return: the text with the escapes replaced
    """
    if string:
        # The ampersand escape is the first entry of WF_ESCAPE_MAP so that
        # escaping handles it first.  Unescaping has to handle it LAST:
        # otherwise an escaped literal such as "&'26;'82;" (the text "&'82;")
        # would first become "&'82;" and then be decoded a second time.
        decode_order = WF_ESCAPE_MAP[1:] + WF_ESCAPE_MAP[:1]
        for code, char in decode_order:
            string = string.replace(code, char.encode('utf-8'))
        string = string.replace("\\n", "\n").replace("\\t", "\t")
    return string
114
class WordfastDialect(csv.Dialect):
    """Describe the properties of a Wordfast generated TAB-delimited file."""
    delimiter = "\t"
    lineterminator = "\r\n"
    quoting = csv.QUOTE_NONE
    # The following 3 items are spelled out explicitly because csv in
    # Python < 2.5 needs them to be defined.
    doublequote = False
    skipinitialspace = False
    # None is the csv module's value for "no escape character"; an empty
    # string is not a valid one-character escape and is rejected by newer
    # versions of csv.  With QUOTE_NONE and no escape character the writer
    # raises csv.Error for a field that would need escaping.
    escapechar = None


# Make the dialect available to csv readers/writers as dialect="wordfast"
csv.register_dialect("wordfast", WordfastDialect)
class WordfastTime(object):
    """Holds a time stamp and converts it to and from the Wordfast
    representation YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a time stamp

        @param newtime: a Wordfast time string, a time.struct_time or
        nothing; any other value leaves the time stamp unset
        """
        self._time = None
        if not newtime:
            self.time = None
            return
        if isinstance(newtime, basestring):
            self.timestring = newtime
            return
        if isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Return the time formatted as a Wordfast time string, or None
        when no time has been set"""
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the time from a Wordfast formatted time string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed

    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Return the stored time.struct_time (None when unset)"""
        return self._time

    def set_time(self, newtime):
        """Store a new time.struct_time

        Anything that is not a time.struct_time clears the stored time.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        accepted = newtime and isinstance(newtime, time.struct_time)
        self._time = newtime if accepted else None

    time = property(get_time, set_time)

    def __str__(self):
        """Return the Wordfast time string, or "" when no time is set"""
        stamp = self.timestring
        if stamp:
            return stamp
        return ""
174
class WordfastHeader(object):
    """A wordfast translation memory header

    The header is kept as a dictionary keyed by header column name.
    """

    def __init__(self, header=None):
        """Create a header

        @param header: a dictionary of header values; when empty or None a
        default header is created.  Any other value is ignored and leaves
        the header empty.
        """
        # Must be a dict (not a list) so that the targetlang/tucount setters
        # work even when no usable header was supplied.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current time"""
        # Work on a copy: the module-level defaults are shared, and changing
        # them in place would leak one header's values into every other one.
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (the object is stored, not copied)"""
        self._header_dict = newheader

    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language in the header, prefixed with '%'"""
        self._header_dict['target-lang'] = '%%%s' % newlang

    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the number of units as '%TU=' plus the zero-padded 8 digit count"""
        self._header_dict['tucount'] = '%%TU=%08d' % count

    tucount = property(None, settucount)
205
class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit

    The values of the unit are kept in a dictionary holding the fields of
    one Wordfast line; source and target are stored there in escaped form.
    """

    def __init__(self, source=None):
        """Create a unit, optionally with an initial source text"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        self._dict['date'] = WordfastTime(time.localtime()).timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict

    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped, decoded text stored under key

        @param key: the field to read ('source' or 'target')
        @return: None when the field is None or was never set, "" when it
        is empty, otherwise the unescaped text decoded from UTF-8
        """
        # .get() so that a field that was never set reads as None instead of
        # raising KeyError
        value = self._dict.get(key)
        if value is None:
            return None
        if value:
            return _wf_to_char(value).decode('utf-8')
        return ""

    def _set_source_or_target(self, key, newvalue):
        """Escape newvalue and store it under key

        The timestamp of the unit is refreshed only when the stored value
        actually changes.  Setting None stores None and does not touch the
        timestamp.

        @param key: the field to write ('source' or 'target')
        @param newvalue: the new text; unicode is encoded as UTF-8 first
        """
        if newvalue is None:
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        newvalue = _char_to_wf(newvalue)
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue
            self._update_timestamp()

    def getsource(self):
        """Get the source text"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text"""
        return self._set_source_or_target('source', newsource)

    source = property(getsource, setsource)

    def gettarget(self):
        """Get the target text"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text"""
        return self._set_source_or_target('target', newtarget)

    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language of the unit"""
        self._dict['target-lang'] = newlang

    targetlang = property(None, settargetlang)

    def __str__(self):
        """Return the string form of the underlying dictionary"""
        return str(self._dict)

    def istranslated(self):
        """Return True when both the source and the target field are non-empty"""
        if not self._dict.get('source', None):
            return False
        return bool(self._dict.get('target', None))
275 276
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    Name = "Wordfast TM file"
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile."""
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'utf-16'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        The first line is read as the header, every following line becomes
        a unit.  The encoding is guessed from the first line: UTF-16 when it
        contains TAB_UTF16, ISO-8859-1 otherwise.

        @param input: an object with a read() method (it is closed after
        reading) or the raw contents of a Wordfast file
        @raise ValueError: when the contents cannot be decoded with the
        detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            # The csv module is fed UTF-8 encoded text
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            # Only encoding problems are translated; a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast"):
            self.header = WordfastHeader(header)
        lines = csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES, dialect="wordfast")
        for line in lines:
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the header and all translated units

        Untranslated units are skipped and the unit count in the header is
        updated to the number of units written.  The result is encoded with
        the encoding of the store, falling back to UTF-16 when the text
        cannot be represented in it.

        @return: the encoded file contents, or "" when there is no
        translated unit
        """
        output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        self.header.tucount = unit_count
        header_output = csv.StringIO()
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast")
        outheader.writerow(self.header.header)
        # getvalue() returns the whole buffer on both StringIO implementations;
        # reset() only exists on cStringIO objects.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')
339