Package translate :: Package search :: Module segment
[hide private]
[frames | no frames]

Source Code for Module translate.search.segment

 1  # -*- coding: utf-8 -*- 
 2  # 
 3  # Copyright 2006 Zuza Software Foundation 
 4  #  
 5  # This file is part of translate. 
 6  # 
 7  # translate is free software; you can redistribute it and/or modify 
 8  # it under the terms of the GNU General Public License as published by 
 9  # the Free Software Foundation; either version 2 of the License, or 
10  # (at your option) any later version. 
11  #  
12  # translate is distributed in the hope that it will be useful, 
13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
15  # GNU General Public License for more details. 
16  # 
17  # You should have received a copy of the GNU General Public License 
18  # along with translate; if not, write to the Free Software 
19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
20  # 
21   
"""Module to deal with different types and uses of segmentation"""

# XXX: This module is now deprecated: Use language specific segmenters in the
# lang package (character_iter, word_iter, sentence_iter, etc.).

# Characters treated as punctuation by the segmenters in this module:
# character_iter() drops them entirely and word_iter() strips them from both
# ends of each word.
punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"
28   
def character_iter(text):
    """Returns an iterator over the characters in text.

    Characters found in the module-level ``punctuation`` string are skipped,
    runs of whitespace are collapsed so that no two consecutive whitespace
    characters are yielded, and every yielded character is lowercased.
    """
    # We don't return more than one consecutive whitespace character.
    # prev holds the last character actually yielded (not merely seen), so
    # that whitespace on both sides of dropped punctuation ("a , b") still
    # collapses to a single whitespace character in the output.
    prev = 'A'  # any non-whitespace seed, so leading whitespace is kept once
    for c in text:
        if c in punctuation:
            continue
        if c.isspace() and prev.isspace():
            continue
        prev = c
        yield c.lower()
39
def characters(text):
    """Returns a list of characters in text.

    This is the output of character_iter(text) collected into a list.
    """
    return list(character_iter(text))
43
def word_iter(text):
    """Returns an iterator over the words in text.

    The text is split on whitespace; each token has leading and trailing
    characters from the module-level ``punctuation`` string stripped and is
    lowercased.  Tokens that consist solely of punctuation (e.g. a
    free-standing dash) are skipped rather than yielded as empty words.
    """
    #TODO: Consider replacing punctuation with space before split()
    for token in text.split():
        word = token.strip(punctuation).lower()
        # A token made up only of punctuation strips down to "", which is
        # not a word.
        if word:
            yield word
49
def words(text):
    """Returns a list of words in text.

    This is the output of word_iter(text) collected into a list.
    """
    return list(word_iter(text))
53
def sentence_iter(text):
    """Returns an iterator over the sentences in text.

    The text is split on the literal separator ". " (full stop followed by a
    space) and each piece is stripped of surrounding whitespace.  The
    separator itself is not part of the yielded sentences.  Pieces that are
    empty after stripping (empty input, or text ending in ". ") are skipped.
    """
    #TODO: This is very naïve. We really should consider all punctuation,
    #and return the punctuation with the sentence.
    #TODO: Search for capital letter start with next sentence to avoid
    #confusion with abbreviations. And remember Afrikaans "'n" :-)
    for segment in text.split(". "):
        sentence = segment.strip()
        if sentence:
            yield sentence
62
def sentences(text):
    """Returns a list of sentences in text.

    This is the output of sentence_iter(text) collected into a list.
    """
    return list(sentence_iter(text))
66