Package translate :: Package search :: Module match
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.match

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2006-2007 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22  """Class to perform translation memory matching from a store of translation units""" 
 23   
 24  from translate.search import lshtein 
 25  from translate.search import terminology 
 26  from translate.storage import base 
 27  from translate.storage import po 
 28  from translate.misc.multistring import multistring 
 29  import heapq 
 30   
def sourcelen(unit):
    """Returns the length of the source string"""
    return len(unit.source)


def sourcelencmp(x, y):
    """Compares two units using sourcelen.

    Returns a negative, zero or positive number when x's source is shorter
    than, equal to, or longer than y's source (cmp-style contract, as
    expected by list.sort on Python 2.3+).
    """
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    # The cmp() builtin was removed in Python 3; this expression is the
    # portable equivalent and returns exactly -1, 0 or 1 like cmp() did
    # for integers.
    return (xlen > ylen) - (xlen < ylen)
41
class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        # If True, fuzzy units are also considered usable TM candidates.
        self.usefuzzy = usefuzzy
        self.inittm(store)
        # When True (default), matches() adds the match quality as a
        # percentage note on each returned unit.
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            # existingunits deduplicates: an identical source/target pair
            # already seen is rejected; a new pair is recorded and accepted.
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if not isinstance(stores, list):
            stores = [stores]
        for store in stores:
            # sort=False: sort once at the end instead of after every store.
            self.extendtm(store.units, store=store, sort=False)
        # Keep candidates ordered by source length so matches() can binary
        # search for a starting point and stop early on too-long sources.
        self.candidates.units.sort(sourcelencmp)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if not isinstance(units, list):
            units = [units]
        # NOTE: relies on Python 2 filter() returning a list.
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    # Keep the full multistring so buildunits() can restore
                    # the plural forms in the final suggestion.
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            if store:
                simpleunit.filepath = store.filepath
                simpleunit.translator = store.translator
                simpleunit.date = store.date
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(sourcelencmp)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        # Min-heap of (score, candidate) pairs; the worst kept score is
        # always at index 0 so it can be replaced cheaply.
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            # Candidates are sorted by source length, so everything from here
            # on is too long to match well enough.
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # Push out the current worst candidate and re-read the new
                # worst score from the heap root.
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    break
                if min_similarity < lowestscore:
                    # All heap slots now beat this score, so raise the bar
                    # and tighten the length cutoff accordingly.
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        # NOTE: relies on Python 2 filter() returning a list (sortable below).
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort()
        # We reverse as separate step for compatibility with Python 2.3
        bestcandidates.reverse()
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            # Restore the original multistring source/target saved by
            # extendtm() so plural forms survive into the suggestion.
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            newunit.filepath = candidate.filepath
            newunit.translator = candidate.translator
            newunit.date = candidate.date
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
222
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        # Terminology needs a very permissive threshold, so the base class is
        # always initialised with min_similarity=10; percentage notes make no
        # sense for term hits, so they are switched off.
        matcher.__init__(self, store, max_candidates, min_similarity=10,
                         max_length=max_length, comparer=comparer)
        self.addpercentage = False

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        for candidate in self.candidates.units:
            candidate.source = candidate.source.lower()

    def getstartlength(self, min_similarity, text):
        # Skip terms of two characters or less to limit false matches
        return 3

    def getstoplength(self, min_similarity, text):
        # Skip terms longer than 30 characters; such entries were probably
        # ordinary (long) translations rather than terminology
        return 30

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        return matcher.matches(self, text.lower())
253