Package translate :: Package tools :: Module poterminology
[hide private]
[frames] | [no frames]

Source Code for Module translate.tools.poterminology

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # This file is part of translate. 
  5  # 
  6  # translate is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or 
  9  # (at your option) any later version. 
 10  #  
 11  # translate is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details. 
 15  # 
 16  # You should have received a copy of the GNU General Public License 
 17  # along with translate; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 19   
 20  """reads a set of .po or .pot files to produce a pootle-terminology.pot 
 21   
 22  See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and 
 23  usage instructions 
 24  """ 
 25   
 26  from translate.lang import factory as lang_factory 
 27  from translate.misc import optrecurse 
 28  from translate.storage import po 
 29  from translate.storage import factory 
 30  import os 
 31  import re 
 32  import sys 
 33   
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # handles c-format and python-format placeholders, e.g. %s, %(name)s, %1$d
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
    # BUGFIX: the original trailing class was [\w.-:], where ".-:" forms a
    # character *range* (0x2E-0x3A) that accidentally also matched "/".
    # The hyphen is moved to the end of the class so it is taken literally.
    xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                           flags=re.UNICODE | re.IGNORECASE)

    # recognised values for the --sort option, in default priority order
    sortorders = ["frequency", "dictionary", "length"]

    # running totals used for the final summary report
    files = 0
    units = 0
50 - def parse_args(self, args=None, values=None):
51 """parses the command line options, handling implicit input/output args""" 52 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values) 53 # some intelligence as to what reasonable people might give on the command line 54 if args and not options.input: 55 if not options.output and len(args) > 1: 56 options.input = args[:-1] 57 args = args[-1:] 58 else: 59 options.input = args 60 args = [] 61 if args and not options.output: 62 options.output = args[-1] 63 args = args[:-1] 64 if not options.output: 65 options.output = "pootle-terminology.pot" 66 if args: 67 self.error("You have used an invalid combination of --input, --output and freestanding args") 68 if isinstance(options.input, list) and len(options.input) == 1: 69 options.input = options.input[0] 70 if options.inputmin == None: 71 options.inputmin = 1 72 elif options.inputmin == None: 73 options.inputmin = 2 74 return (options, args)
75
76 - def set_usage(self, usage=None):
77 """sets the usage string - if usage not given, uses getusagestring for each option""" 78 if usage is None: 79 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \ 80 "\n input directory is searched for PO files, terminology PO file is output file" 81 else: 82 super(TerminologyOptionParser, self).set_usage(usage)
83
    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        # pass the supported formats and output options through to the
        # recursive processing machinery inherited from RecursiveOptionParser
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        # enables the psyco JIT if available (legacy Python 2 optimisation)
        self.usepsyco(options)
        self.recursiveprocess(options)
92 - def recursiveprocess(self, options):
93 """recurse through directories and process files""" 94 if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True): 95 if isinstance(options.input, list): 96 inputfiles = self.recurseinputfilelist(options) 97 else: 98 inputfiles = self.recurseinputfiles(options) 99 else: 100 if options.input: 101 inputfiles = [os.path.basename(options.input)] 102 options.input = os.path.dirname(options.input) 103 else: 104 inputfiles = [options.input] 105 if os.path.isdir(options.output): 106 options.output = os.path.join(options.output,"pootle-terminology.pot") 107 self.stopwords = {} 108 self.stoprelist = [] 109 actions = { '+': frozenset(), ':': frozenset(['skip']), 110 '<': frozenset(['phrase']), '=': frozenset(['word']), 111 '>': frozenset(['word','skip']), 112 '@': frozenset(['word','phrase']) } 113 if options.stopwordfile != None: 114 stopfile = open(options.stopwordfile, "r") 115 try: 116 for stopline in stopfile: 117 stoptype = stopline[0] 118 if stoptype == '#' or stoptype == "\n": 119 continue 120 elif stoptype == '/': 121 self.stoprelist.append(re.compile(stopline[1:-1]+'$')) 122 else: 123 self.stopwords[stopline[1:-1]] = actions[stoptype] 124 except KeyError, character: 125 self.warning("Bad line in stopword list %s starts with" % (options.stopwordfile), options, sys.exc_info()) 126 stopfile.close() 127 self.glossary = {} 128 self.initprogressbar(inputfiles, options) 129 for inputpath in inputfiles: 130 self.files += 1 131 fullinputpath = self.getfullinputpath(options, inputpath) 132 success = True 133 try: 134 self.processfile(None, options, fullinputpath) 135 except Exception, error: 136 if isinstance(error, KeyboardInterrupt): 137 raise 138 self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info()) 139 success = False 140 self.reportprogress(inputpath, success) 141 del self.progressbar 142 self.outputterminology(options)
143
144 - def clean(self, string, options):
145 """returns the cleaned string that contains the text to be matched""" 146 for accelerator in options.accelchars: 147 string = string.replace(accelerator, "") 148 string = self.formatpat.sub(" ", string) 149 string = self.xmlelpat.sub(" ", string) 150 string = self.xmlentpat.sub(" ", string) 151 string = string.strip() 152 return string
153
154 - def addphrases(self, words, skips, translation, partials=True):
155 """adds (sub)phrases with non-skipwords and more than one word""" 156 if (len(words) > skips + 1 and 157 'skip' not in self.stopwords.get(words[0], frozenset()) and 158 'skip' not in self.stopwords.get(words[-1], frozenset())): 159 self.glossary.setdefault(' '.join(words), []).append(translation) 160 if partials: 161 part = list(words) 162 while len(part) > 2: 163 if 'skip' in self.stopwords.get(part.pop(), frozenset()): 164 skips -= 1 165 if (len(part) > skips + 1 and 166 'skip' not in self.stopwords.get(part[0], frozenset()) and 167 'skip' not in self.stopwords.get(part[-1], frozenset())): 168 self.glossary.setdefault(' '.join(part), []).append(translation)
169
    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file

        Extracts candidate terms (single words and multi-word phrases up to
        options.termlength words) from every translatable unit and appends
        (source, target, unit, path) tuples into self.glossary.
        """
        # NOTE(review): fileprocessor appears unused; recursiveprocess passes
        # None - presumably kept for interface compatibility.  TODO confirm.
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        sourcelang = lang_factory.getlanguage(options.sourcelanguage)
        rematchignore = frozenset(('word','phrase'))
        defaultignore = frozenset()
        for unit in inputfile.units:
            self.units += 1
            if unit.isheader():
                continue
            if unit.hasplural():
                continue
            if not options.invert:
                source = self.clean(unit.source, options)
                target = self.clean(unit.target, options)
            else:
                # --invert: treat the translation as the term source
                target = self.clean(unit.source, options)
                source = self.clean(unit.target, options)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                # sliding window of recent words, plus a count of how many of
                # them are skipwords (skipwords don't count toward length)
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    if options.ignorecase or (options.foldtitle and word.istitle()):
                        word = word.lower()
                    # look the word up in the stopword table first, then the
                    # stopword regex list; a regex match ignores word+phrase
                    ignore = defaultignore
                    if word in self.stopwords:
                        ignore = self.stopwords[word]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(word) != None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals: fold "words"/"word" into one entry
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if options.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words
                            while len(words) > 2:
                                if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            # a phrase-breaking word resets the window
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > options.termlength + skips:
                                # shrink the window from the left until it
                                # fits, then record the phrase once
                                while len(words) > options.termlength + skips:
                                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                if options.termlength > 1:
                    # add trailing phrases in sentence after reaching end
                    while options.termlength > 1 and len(words) > 2:
                        if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                            skips -= 1
                        self.addphrases(words, skips, translation)
    def outputterminology(self, options):
        """saves the generated terminology glossary

        Applies the --inputs-needed/--locs-needed/--fullmsg-needed/
        --substr-needed thresholds, folds redundant sub-phrases, sorts the
        surviving terms and writes them to options.output as a PO file.
        """
        termfile = po.pofile()
        terms = {}
        # strips a trailing ":<lineno>" so locations compare per source file
        locre = re.compile(r":[0-9]+$")
        print >> sys.stderr, ("%d terms from %d units in %d files" %
                              (len(self.glossary), self.units, self.files))
        for term, translations in self.glossary.iteritems():
            # a term seen only once is never interesting
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            # dicts used as ordered-ish sets (keys matter, values are dummies)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            # fullmsg: the term is a complete message, not just a substring
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    # only harvest notes when the raw source matches exactly
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None
                        transnotes[unit.getnotes("translator")] = None
                else:
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            # threshold filters: drop terms that are not widespread enough
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                # several competing translations: annotate each with the
                # files it came from and mark the unit fuzzy
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # if annotated multiple terms already present, keep as-is
                    termunit.addnote(txt, "translator")
            # cap the number of emitted locations at twice --locs-needed
            locmax = 2 * options.locmin
            if numlocs > locmax:
                for location in locations.keys()[0:locmax]:
                    termunit.addlocation(location)
                termunit.addlocation("(poterminology) %d more locations"
                                     % (numlocs - locmax))
            else:
                for location in locations.keys():
                    termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for filename, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
            # score favours terms found in many files, then many messages
            terms[term] = (((10 * numfiles) + numsources, termunit))
        # reduce subphrase: drop any shorter phrase whose score equals that
        # of a longer phrase containing it (it adds no information)
        termlist = terms.keys()
        print >> sys.stderr, "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            # check every leading and trailing sub-phrase of this term
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders == None:
            options.sortorders = self.sortorders
        # apply sort orders lowest-priority first so the first order listed
        # ends up dominating (stable sorts preserve earlier ordering)
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        open(options.output, "w").write(str(termfile))
def find_installed_file(filename):
    """Locate *filename* in the 'share' directory next to this package.

    Returns the full path when the file exists, otherwise None.
    """
    module_path = __file__
    # resolve a symlinked module so the share dir is found beside the
    # real installation, not beside the link
    if os.path.islink(module_path):
        module_path = os.path.realpath(module_path)
    base_dir = os.path.dirname(os.path.abspath(module_path))
    candidate = os.path.join(base_dir, os.path.pardir, 'share', filename)
    if not os.path.exists(candidate):
        return None
    return candidate
def main():
    """Command-line entry point: build the option parser and run it."""
    # maps input extension -> (output extension, template); None key is the
    # fallback used when the extension is unrecognised
    formats = {"po":("po", None), "pot": ("pot", None), None:("po", None)}
    parser = TerminologyOptionParser(formats)
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="make all terms lowercase")
    parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
                      action="store_true", default=False, help="fold \"Title Case\" to lowercase")
    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
    # NOTE: string defaults like "3" on type="int" options are converted to
    # int by optparse.get_default_values() via check_value
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    # no default here: parse_args picks 1 or 2 depending on the input count
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in less than MIN input files (default 1 - 2 if multiple input files)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
                      help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
                      help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
                      help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=parser.sortorders, metavar="ORDER",
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
    parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
                      help="name of file containing stopword list", metavar="FILENAME", default=find_installed_file('stoplist-en'))
    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()