Package translate :: Package search :: Module terminology
[hide private]
[frames] | no frames]

Source Code for Module translate.search.terminology

 1  # -*- coding: utf-8 -*- 
 2  #  
 3  # Copyright 2006 Zuza Software Foundation 
 4  #  
 5  # This file is part of translate. 
 6  # 
 7  # translate is free software; you can redistribute it and/or modify 
 8  # it under the terms of the GNU General Public License as published by 
 9  # the Free Software Foundation; either version 2 of the License, or 
10  # (at your option) any later version. 
11  #  
12  # translate is distributed in the hope that it will be useful, 
13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
15  # GNU General Public License for more details. 
16  # 
17  # You should have received a copy of the GNU General Public License 
18  # along with translate; if not, write to the Free Software 
19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20   
21  """A class that does terminology matching""" 
22   
23  import re 
24   
25  # We don't want to miss certain forms of words that only change a little 
26  # at the end. Now we are tying this code to English, but it should serve 
27  # us well. For example "category" should be found in "categories",  
28  # "copy" should be found in "copied" 
29  # 
 30  # Each tuple defines a regular expression to search for, and what 
 31  # it should be replaced with. 
# Each entry is a (regular expression, replacement) pair.  The expression is
# substituted in a term to build an alternative spelling of that term, which
# is then looked for in the text.  The patterns are raw strings so that the
# regex escapes (``\s``) are not treated as (invalid) string escapes.
ignorepatterns = [
    # category/categories, identify/identifies, apply/applied
    (r"y\s*$", "ie"),
    # down time / downtime, pre-order / preorder
    (r"[\s-]*", ""),
    # pre-order / pre order
    ("-", " "),
    # pre order / pre-order
    (" ", "-"),
]
37   
38  #TODO: compile regexes 
39   
class TerminologyComparer:
    """Scores how well a terminology entry (a term) matches a piece of text.

    Matching is a plain substring search, optionally retried with slightly
    altered spellings of the term taken from the module-level
    ``ignorepatterns`` list.
    """

    def __init__(self, max_len=500):
        """Create a comparer.

        :param max_len: number of leading characters of the text that
            :meth:`similarity` examines; the rest of the text is ignored.
        """
        self.MAX_LEN = max_len

    def similarity(self, a, b, stoppercentage=40):
        """Returns the match quality of term b in the text a.

        :param a: the text to search in; only its first ``self.MAX_LEN``
            characters are considered.
        :param b: the term to look for.  A trailing bracketed part such as
            ``" (noun)"`` is removed before searching.
        :param stoppercentage: accepted but not used by this method.
        :return: an integer score:
            ``0`` if the term (without its bracket) has two characters or
            fewer, or if nothing matches;
            ``100`` minus a penalty of up to 10 points, growing with how far
            into the text the term first occurs, for a literal match;
            ``80`` if only a variant built from ``ignorepatterns`` occurs.
        """
        # We could segment the words, but mostly it will give less ideal
        # results, since we'll miss plurals, etc. Then we also can't search
        # for multiword terms, such as "Free Software". Ideally we should use
        # a stemmer, like the Porter stemmer.

        # So we just see if the word occurs anywhere. This is not perfect
        # since we might get more than we bargained for. The term "form" will
        # be found in the word "format", for example. A word like "at" will
        # trigger too many false positives.

        # First remove a possible disambiguating bracket at the end
        b = re.sub(r"\s+\(.*\)\s*$", "", b)

        # Very short terms are rejected outright.
        if len(b) <= 2:
            return 0

        # Slice once; every check below works on the same truncated text.
        text = a[:self.MAX_LEN]

        pos = text.find(b)
        if pos >= 0:
            # text cannot be empty here because it contains b (len(b) >= 3).
            # Floor division keeps the score an integer on every Python
            # version.
            return 100 - pos * 10 // len(text)

        for pattern, replacement in ignorepatterns:
            newb = re.sub(pattern, replacement, b)
            # A variant that collapsed to "" would be "found" in any text,
            # so it must not count as a match.
            if newb and newb in text:
                return 80
        return 0
71