1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 """reads a set of .po or .pot files to produce a pootle-terminology.pot
21
22 See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and
23 usage instructions
24 """
25
26 from translate.lang import factory as lang_factory
27 from translate.misc import optrecurse
28 from translate.storage import po
29 from translate.storage import factory
30 import os
31 import re
32 import sys
33
35 """a specialized Option Parser for the terminology tool..."""
36
37
38 formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
39
40 xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
41
42 xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.-:]*);",
43 flags=re.UNICODE|re.IGNORECASE)
44
45 sortorders = [ "frequency", "dictionary", "length" ]
46
47 files = 0
48 units = 0
49
51 """parses the command line options, handling implicit input/output args"""
52 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
53
54 if args and not options.input:
55 if not options.output and len(args) > 1:
56 options.input = args[:-1]
57 args = args[-1:]
58 else:
59 options.input = args
60 args = []
61 if args and not options.output:
62 options.output = args[-1]
63 args = args[:-1]
64 if not options.output:
65 options.output = "pootle-terminology.pot"
66 if args:
67 self.error("You have used an invalid combination of --input, --output and freestanding args")
68 if isinstance(options.input, list) and len(options.input) == 1:
69 options.input = options.input[0]
70 if options.inputmin == None:
71 options.inputmin = 1
72 elif options.inputmin == None:
73 options.inputmin = 2
74 return (options, args)
75
77 """sets the usage string - if usage not given, uses getusagestring for each option"""
78 if usage is None:
79 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
80 "\n input directory is searched for PO files, terminology PO file is output file"
81 else:
82 super(TerminologyOptionParser, self).set_usage(usage)
83
91
93 """recurse through directories and process files"""
94 if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
95 if isinstance(options.input, list):
96 inputfiles = self.recurseinputfilelist(options)
97 else:
98 inputfiles = self.recurseinputfiles(options)
99 else:
100 if options.input:
101 inputfiles = [os.path.basename(options.input)]
102 options.input = os.path.dirname(options.input)
103 else:
104 inputfiles = [options.input]
105 if os.path.isdir(options.output):
106 options.output = os.path.join(options.output,"pootle-terminology.pot")
107 self.stopwords = {}
108 self.stoprelist = []
109 actions = { '+': frozenset(), ':': frozenset(['skip']),
110 '<': frozenset(['phrase']), '=': frozenset(['word']),
111 '>': frozenset(['word','skip']),
112 '@': frozenset(['word','phrase']) }
113 if options.stopwordfile != None:
114 stopfile = open(options.stopwordfile, "r")
115 try:
116 for stopline in stopfile:
117 stoptype = stopline[0]
118 if stoptype == '#' or stoptype == "\n":
119 continue
120 elif stoptype == '/':
121 self.stoprelist.append(re.compile(stopline[1:-1]+'$'))
122 else:
123 self.stopwords[stopline[1:-1]] = actions[stoptype]
124 except KeyError, character:
125 self.warning("Bad line in stopword list %s starts with" % (options.stopwordfile), options, sys.exc_info())
126 stopfile.close()
127 self.glossary = {}
128 self.initprogressbar(inputfiles, options)
129 for inputpath in inputfiles:
130 self.files += 1
131 fullinputpath = self.getfullinputpath(options, inputpath)
132 success = True
133 try:
134 self.processfile(None, options, fullinputpath)
135 except Exception, error:
136 if isinstance(error, KeyboardInterrupt):
137 raise
138 self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
139 success = False
140 self.reportprogress(inputpath, success)
141 del self.progressbar
142 self.outputterminology(options)
143
def clean(self, string, options):
    """returns the cleaned string that contains the text to be matched"""
    text = string
    # drop accelerator markers (e.g. "&" or "_") character by character
    for accelerator in options.accelchars:
        text = text.replace(accelerator, "")
    # blank out printf-style format specifiers, XML elements and entities
    for pattern in (self.formatpat, self.xmlelpat, self.xmlentpat):
        text = pattern.sub(" ", text)
    return text.strip()
153
def addphrases(self, words, skips, translation, partials=True):
    """adds (sub)phrases with non-skipwords and more than one word"""
    def qualifies(seq, nskips):
        # a phrase is recorded when it has more than one non-skip word
        # and neither its first nor its last word is a skip word
        return (len(seq) > nskips + 1
                and 'skip' not in self.stopwords.get(seq[0], frozenset())
                and 'skip' not in self.stopwords.get(seq[-1], frozenset()))

    if qualifies(words, skips):
        self.glossary.setdefault(' '.join(words), []).append(translation)
    if not partials:
        return
    # also record every leading sub-phrase of at least two words
    prefix = list(words)
    while len(prefix) > 2:
        dropped = prefix.pop()
        if 'skip' in self.stopwords.get(dropped, frozenset()):
            skips -= 1
        if qualifies(prefix, skips):
            self.glossary.setdefault(' '.join(prefix), []).append(translation)
169
def processfile(self, fileprocessor, options, fullinputpath):
    """process an individual file

    Extracts single-word terms and sliding-window phrases (up to
    options.termlength words) from every unit's cleaned source text
    into self.glossary.  `fileprocessor` is unused here (interface
    artifact; recursiveprocess passes None).
    """
    inputfile = self.openinputfile(options, fullinputpath)
    inputfile = factory.getobject(inputfile)
    sourcelang = lang_factory.getlanguage(options.sourcelanguage)
    # words matched by a stopword regex are excluded from both roles
    rematchignore = frozenset(('word','phrase'))
    defaultignore = frozenset()
    for unit in inputfile.units:
        self.units += 1
        if unit.isheader():
            continue
        if unit.hasplural():
            continue
        if not options.invert:
            source = self.clean(unit.source, options)
            target = self.clean(unit.target, options)
        else:
            # --invert swaps the roles of source and target text
            target = self.clean(unit.source, options)
            source = self.clean(unit.target, options)
        if len(source) <= 1:
            # nothing useful to extract from empty or one-char sources
            continue
        for sentence in sourcelang.sentences(source):
            words = []
            skips = 0
            for word in sourcelang.words(sentence):
                if options.ignorecase or (options.foldtitle and word.istitle()):
                    word = word.lower()
                ignore = defaultignore
                if word in self.stopwords:
                    ignore = self.stopwords[word]
                else:
                    for stopre in self.stoprelist:
                        if stopre.match(word) != None:
                            ignore = rematchignore
                            break
                translation = (source, target, unit, fullinputpath)
                if 'word' not in ignore:
                    # naive plural folding: merge "word"/"words" onto one key
                    root = word
                    if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                        root = word[0:-1]
                    elif len(root) > 2 and root + 's' in self.glossary:
                        self.glossary[root] = self.glossary.pop(root + 's')
                    self.glossary.setdefault(root, []).append(translation)
                if options.termlength > 1:
                    if 'phrase' in ignore:
                        # phrase-breaking word: flush trailing sub-phrases
                        # of the current window, then restart it
                        while len(words) > 2:
                            if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                skips -= 1
                            self.addphrases(words, skips, translation)
                        words = []
                        skips = 0
                    else:
                        words.append(word)
                        if 'skip' in ignore:
                            skips += 1
                        if len(words) > options.termlength + skips:
                            # window overflow: drop leading words until the
                            # window fits, then record the resulting phrases
                            while len(words) > options.termlength + skips:
                                if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                                    skips -= 1
                            self.addphrases(words, skips, translation)
                        else:
                            self.addphrases(words, skips, translation, partials=False)
            if options.termlength > 1:
                # end of sentence: flush the remaining trailing phrases
                while options.termlength > 1 and len(words) > 2:
                    if 'skip' in self.stopwords.get(words.pop(0), defaultignore):
                        skips -= 1
                    self.addphrases(words, skips, translation)
240
242 """saves the generated terminology glossary"""
243 termfile = po.pofile()
244 terms = {}
245 locre = re.compile(r":[0-9]+$")
246 print >> sys.stderr, ("%d terms from %d units in %d files" %
247 (len(self.glossary), self.units, self.files))
248 for term, translations in self.glossary.iteritems():
249 if len(translations) <= 1:
250 continue
251 filecounts = {}
252 sources = {}
253 termunit = po.pounit(term)
254 locations = {}
255 sourcenotes = {}
256 transnotes = {}
257 targets = {}
258 fullmsg = False
259 for source, target, unit, filename in translations:
260 sources[source] = 1
261 filecounts[filename] = filecounts.setdefault(filename, 0) + 1
262 if term.lower() == self.clean(unit.source, options).lower():
263 fullmsg = True
264 target = self.clean(unit.target, options)
265 if options.ignorecase or (options.foldtitle and target.istitle()):
266 target = target.lower()
267 unit.settarget(target)
268 if target != "":
269 targets.setdefault(target, []).append(filename)
270 if term.lower() == unit.source.strip().lower():
271 sourcenotes[unit.getnotes("source code")] = None
272 transnotes[unit.getnotes("translator")] = None
273 else:
274 unit.settarget("")
275 unit.setsource(term)
276 termunit.merge(unit, overwrite=False, comments=False)
277 for loc in unit.getlocations():
278 locations.setdefault(locre.sub("", loc))
279 numsources = len(sources)
280 numfiles = len(filecounts)
281 numlocs = len(locations)
282 if numfiles < options.inputmin or numlocs < options.locmin:
283 continue
284 if fullmsg:
285 if numsources < options.fullmsgmin:
286 continue
287 elif numsources < options.substrmin:
288 continue
289 if len(targets.keys()) > 1:
290 txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
291 for target, files in targets.iteritems()])
292 if termunit.gettarget().find('};') < 0:
293 termunit.settarget(txt)
294 termunit.markfuzzy()
295 else:
296
297 termunit.addnote(txt, "translator")
298 locmax = 2 * options.locmin
299 if numlocs > locmax:
300 for location in locations.keys()[0:locmax]:
301 termunit.addlocation(location)
302 termunit.addlocation("(poterminology) %d more locations"
303 % (numlocs - locmax))
304 else:
305 for location in locations.keys():
306 termunit.addlocation(location)
307 for sourcenote in sourcenotes.keys():
308 termunit.addnote(sourcenote, "source code")
309 for transnote in transnotes.keys():
310 termunit.addnote(transnote, "translator")
311 for filename, count in filecounts.iteritems():
312 termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
313 terms[term] = (((10 * numfiles) + numsources, termunit))
314
315 termlist = terms.keys()
316 print >> sys.stderr, "%d terms after thresholding" % len(termlist)
317 termlist.sort(lambda x, y: cmp(len(x), len(y)))
318 for term in termlist:
319 words = term.split()
320 if len(words) <= 2:
321 continue
322 while len(words) > 2:
323 words.pop()
324 if terms[term][0] == terms.get(' '.join(words), [0])[0]:
325 del terms[' '.join(words)]
326 words = term.split()
327 while len(words) > 2:
328 words.pop(0)
329 if terms[term][0] == terms.get(' '.join(words), [0])[0]:
330 del terms[' '.join(words)]
331 print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
332 termitems = terms.values()
333 if options.sortorders == None:
334 options.sortorders = self.sortorders
335 while len(options.sortorders) > 0:
336 order = options.sortorders.pop()
337 if order == "frequency":
338 termitems.sort(lambda x, y: cmp(y[0], x[0]))
339 elif order == "dictionary":
340 termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
341 elif order == "length":
342 termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
343 else:
344 self.warning("unknown sort order %s" % order, options)
345 for count, unit in termitems:
346 termfile.units.append(unit)
347 open(options.output, "w").write(str(termfile))
348
# locate an installed data file, following a symlinked entry point
# to its real location first
root = __file__
if os.path.islink(root):
    root = os.path.realpath(root)
# the data file is expected in ../share relative to the script directory
filepath = os.path.join( os.path.dirname(os.path.abspath(root)), os.path.pardir, 'share', filename )

if not os.path.exists(filepath):
    return None
return filepath
358
360 formats = {"po":("po", None), "pot": ("pot", None), None:("po", None)}
361 parser = TerminologyOptionParser(formats)
362 parser.add_option("-I", "--ignore-case", dest="ignorecase",
363 action="store_true", default=False, help="make all terms lowercase")
364 parser.add_option("-F", "--fold-titlecase", dest="foldtitle",
365 action="store_true", default=False, help="fold \"Title Case\" to lowercase")
366 parser.add_option("", "--accelerator", dest="accelchars", default="",
367 metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
368 parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
369 help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
370 parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
371 help="omit terms appearing in less than MIN input files (default 1 - 2 if multiple input files)", metavar="MIN")
372 parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
373 help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
374 parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
375 help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
376 parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
377 help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
378 parser.add_option("", "--sort", dest="sortorders", action="append",
379 type="choice", choices=parser.sortorders, metavar="ORDER",
380 help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
381 parser.add_option("-S", "--stopword-list", type="string", dest="stopwordfile",
382 help="name of file containing stopword list", metavar="FILENAME", default=find_installed_file('stoplist-en'))
383 parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
384 help="the source language code (default 'en')", metavar="LANG")
385 parser.add_option("-v", "--invert", dest="invert",
386 action="store_true", default=False, help="invert the source and target languages for terminology")
387 parser.set_usage()
388 parser.description = __doc__
389 parser.run()
390
391
392 if __name__ == '__main__':
393 main()
394