1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Class to perform translation memory matching from a store of translation units"""
23
24 from translate.search import lshtein
25 from translate.search import terminology
26 from translate.storage import base
27 from translate.storage import po
28 from translate.misc.multistring import multistring
29 import heapq
30
def sourcelen(unit):
    """Return the character length of the unit's source string."""
    return len(unit.source)
def sourcelencmp(x, y):
    """Compare two units by the length of their source strings.

    Returns a negative, zero or positive integer, exactly like the old
    cmp()-based version did.  cmp() was removed in Python 3, so the result
    is computed with an equivalent sign expression instead.
    """
    xlen = len(x.source)
    ylen = len(y.source)
    return (xlen > ylen) - (xlen < ylen)
43 """A class that will do matching and store configuration for the matching process"""
44 - def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
45 """max_candidates is the maximum number of candidates that should be assembled,
46 min_similarity is the minimum similarity that must be attained to be included in
47 the result, comparer is an optional Comparer with similarity() function"""
48 if comparer is None:
49 comparer = lshtein.LevenshteinComparer(max_length)
50 self.comparer = comparer
51 self.setparameters(max_candidates, min_similarity, max_length)
52 self.usefuzzy = usefuzzy
53 self.inittm(store)
54 self.addpercentage = True
55
68
70 """Initialises the memory for later use. We use simple base units for
71 speedup."""
72 self.existingunits = {}
73 self.candidates = base.TranslationStore()
74
75 if not isinstance(stores, list):
76 stores = [stores]
77 for store in stores:
78 self.extendtm(store.units, store=store, sort=False)
79 self.candidates.units.sort(sourcelencmp)
80
81
82
83 - def extendtm(self, units, store=None, sort=True):
84 """Extends the memory with extra unit(s).
85
86 @param units: The units to add to the TM.
87 @param store: Optional store from where some metadata can be retrieved
88 and associated with each unit.
89 @param sort: Optional parameter that can be set to False to supress
90 sorting of the candidates list. This should probably only be used in
91 inittm().
92 """
93 if not isinstance(units, list):
94 units = [units]
95 candidates = filter(self.usable, units)
96 for candidate in candidates:
97 simpleunit = base.TranslationUnit("")
98
99
100 if isinstance(candidate.source, multistring):
101 if len(candidate.source.strings) > 1:
102 simpleunit.orig_source = candidate.source
103 simpleunit.orig_target = candidate.target
104 simpleunit.source = unicode(candidate.source)
105 simpleunit.target = unicode(candidate.target)
106 else:
107 simpleunit.source = candidate.source
108 simpleunit.target = candidate.target
109
110
111
112
113 simpleunit.addnote(candidate.getnotes(origin="translator"))
114 simpleunit.fuzzy = candidate.isfuzzy()
115 if store:
116 simpleunit.filepath = store.filepath
117 simpleunit.translator = store.translator
118 simpleunit.date = store.date
119 self.candidates.units.append(simpleunit)
120 if sort:
121 self.candidates.units.sort(sourcelencmp)
122
123 - def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
124 """Sets the parameters without reinitialising the tm. If a parameter
125 is not specified, it is set to the default, not ignored"""
126 self.MAX_CANDIDATES = max_candidates
127 self.MIN_SIMILARITY = min_similarity
128 self.MAX_LENGTH = max_length
129
131 """Calculates a length beyond which we are not interested.
132 The extra fat is because we don't use plain character distance only."""
133 return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
134
136 """Calculates the minimum length we are interested in.
137 The extra fat is because we don't use plain character distance only."""
138 return max(len(text) * (min_similarity/100.0), 1)
139
    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        # Min-heap of (score, unit) pairs: the current worst kept candidate
        # sits at index 0, so heapreplace below evicts it in O(log n).
        bestcandidates = [(0.0, None)] * self.MAX_CANDIDATES

        min_similarity = self.MIN_SIMILARITY

        # Candidates are kept sorted by source length (see inittm()), so
        # binary-search for the first candidate long enough to possibly
        # reach min_similarity.
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # Candidates longer than stoplength cannot reach min_similarity, so
        # the linear scan below stops at the first one (list is sorted).
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # Evict the current worst candidate from the heap.
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
            if lowestscore >= 100:
                # Heap is full of perfect matches; nothing can improve it.
                break
            if min_similarity < lowestscore:
                # Tighten the threshold so later comparisons bail out
                # earlier, and shrink the length window accordingly.
                min_similarity = lowestscore
                stoplength = self.getstoplength(min_similarity, text)

        # Drop the unfilled (score 0.0) placeholder slots.
        def notzero(item):
            score = item[0]
            return score != 0
        bestcandidates = filter(notzero, bestcandidates)
        # Sort ascending by score, then reverse so the best match is first.
        bestcandidates.sort()
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)
201
203 """Builds a list of units conforming to base API, with the score in the comment"""
204 units = []
205 for score, candidate in candidates:
206 if hasattr(candidate, "orig_source"):
207 candidate.source = candidate.orig_source
208 candidate.target = candidate.orig_target
209 newunit = po.pounit(candidate.source)
210 newunit.target = candidate.target
211 newunit.markfuzzy(candidate.fuzzy)
212 newunit.filepath = candidate.filepath
213 newunit.translator = candidate.translator
214 newunit.date = candidate.date
215 candidatenotes = candidate.getnotes().strip()
216 if candidatenotes:
217 newunit.addnote(candidatenotes)
218 if self.addpercentage:
219 newunit.addnote("%d%%" % score)
220 units.append(newunit)
221 return units
222
224 """A matcher with settings specifically for terminology matching"""
225 - def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
226 if comparer is None:
227 comparer = terminology.TerminologyComparer(max_length)
228 matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
229 self.addpercentage = False
230
232 """Normal initialisation, but convert all source strings to lower case"""
233 matcher.inittm(self, store)
234 for unit in self.candidates.units:
235 unit.source = unit.source.lower()
236
241
246
248 """Normal matching after converting text to lower case. Then replace
249 with the original unit to retain comments, etc."""
250 text = text.lower()
251 matches = matcher.matches(self, text)
252 return matches
253