Package translate :: Package misc :: Module textwrap
[hide private]
[frames | no frames]

Source Code for Module translate.misc.textwrap

  1  # -*- coding: utf-8 -*- 
  2  """Text wrapping and filling. 
  3  """ 
  4   
  5  # Copyright (C) 1999-2001 Gregory P. Ward. 
  6  # Copyright (C) 2002, 2003 Python Software Foundation. 
  7  # Written by Greg Ward <gward@python.net> 
  8   
  9  __revision__ = "$Id: textwrap.py 4103 2006-10-20 07:35:02Z dwaynebailey $" 
 10   
 11  import string, re 
 12   
 13  # Do the right thing with boolean values for all known Python versions 
 14  # (so this module can be copied to projects that don't depend on Python 
 15  # 2.3, e.g. Optik and Docutils). 
 16  try: 
 17      True, False 
 18  except NameError: 
 19      (True, False) = (1, 0) 
 20   
 21  __all__ = ['TextWrapper', 'wrap', 'fill'] 
 22   
 23  # Hardcode the recognized whitespace characters to the US-ASCII 
 24  # whitespace characters.  The main reason for doing this is that in 
 25  # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales 
 26  # that character winds up in string.whitespace.  Respecting 
 27  # string.whitespace in those cases would 1) make textwrap treat 0xa0 the 
 28  # same as any other whitespace char, which is clearly wrong (it's a 
 29  # *non-breaking* space), 2) possibly cause problems with Unicode, 
 30  # since 0xa0 is not in range(128). 
 31  _whitespace = '\t\n\x0b\x0c\r ' 
 32   
33 -class TextWrapper:
34 """ 35 Object for wrapping/filling text. The public interface consists of 36 the wrap() and fill() methods; the other methods are just there for 37 subclasses to override in order to tweak the default behaviour. 38 If you want to completely replace the main wrapping algorithm, 39 you'll probably have to override _wrap_chunks(). 40 41 Several instance attributes control various aspects of wrapping: 42 width (default: 70) 43 the maximum width of wrapped lines (unless break_long_words 44 is false) 45 initial_indent (default: "") 46 string that will be prepended to the first line of wrapped 47 output. Counts towards the line's width. 48 subsequent_indent (default: "") 49 string that will be prepended to all lines save the first 50 of wrapped output; also counts towards each line's width. 51 expand_tabs (default: true) 52 Expand tabs in input text to spaces before further processing. 53 Each tab will become 1 .. 8 spaces, depending on its position in 54 its line. If false, each tab is treated as a single character. 55 drop_whitespace (default: true) 56 Drop leading and trailing whitespace from lines. 57 replace_whitespace (default: true) 58 Replace all whitespace characters in the input text by spaces 59 after tab expansion. Note that if expand_tabs is false and 60 replace_whitespace is true, every tab will be converted to a 61 single space! 62 fix_sentence_endings (default: false) 63 Ensure that sentence-ending punctuation is always followed 64 by two spaces. Off by default because the algorithm is 65 (unavoidably) imperfect. 66 break_long_words (default: true) 67 Break words longer than 'width'. If false, those words will not 68 be broken, and some lines might be longer than 'width'. 
69 """ 70 71 whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace)) 72 73 unicode_whitespace_trans = {} 74 uspace = ord(u' ') 75 for x in map(ord, _whitespace): 76 unicode_whitespace_trans[x] = uspace 77 78 # This funky little regex is just the trick for splitting 79 # text up into word-wrappable chunks. E.g. 80 # "Hello there -- you goof-ball, use the -b option!" 81 # splits into 82 # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! 83 # (after stripping out empty strings). 84 wordsep_re = re.compile( 85 r'(\s+|' # any whitespace 86 r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words 87 r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash 88 89 # XXX this is not locale- or charset-aware -- string.lowercase 90 # is US-ASCII only (and therefore English-only) 91 sentence_end_re = re.compile(r'[%s]' # lowercase letter 92 r'[\.\!\?]' # sentence-ending punct. 93 r'[\"\']?' # optional end-of-quote 94 % string.lowercase) 95 96
97 - def __init__(self, 98 width=70, 99 initial_indent="", 100 subsequent_indent="", 101 expand_tabs=True, 102 drop_whitespace=True, 103 replace_whitespace=True, 104 fix_sentence_endings=False, 105 break_long_words=True):
106 self.width = width 107 self.initial_indent = initial_indent 108 self.subsequent_indent = subsequent_indent 109 self.expand_tabs = expand_tabs 110 self.drop_whitespace = drop_whitespace 111 self.replace_whitespace = replace_whitespace 112 self.fix_sentence_endings = fix_sentence_endings 113 self.break_long_words = break_long_words
114 115 116 # -- Private methods ----------------------------------------------- 117 # (possibly useful for subclasses to override) 118
119 - def _munge_whitespace(self, text):
120 """_munge_whitespace(text : string) -> string 121 122 Munge whitespace in text: expand tabs and convert all other 123 whitespace characters to spaces. Eg. " foo\tbar\n\nbaz" 124 becomes " foo bar baz". 125 """ 126 if self.expand_tabs: 127 text = text.expandtabs() 128 if self.replace_whitespace: 129 if isinstance(text, str): 130 text = text.translate(self.whitespace_trans) 131 elif isinstance(text, unicode): 132 text = text.translate(self.unicode_whitespace_trans) 133 return text
134 135
136 - def _split(self, text):
137 """_split(text : string) -> [string] 138 139 Split the text to wrap into indivisible chunks. Chunks are 140 not quite the same as words; see wrap_chunks() for full 141 details. As an example, the text 142 Look, goof-ball -- use the -b option! 143 breaks into the following chunks: 144 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ', 145 'use', ' ', 'the', ' ', '-b', ' ', 'option!' 146 """ 147 chunks = self.wordsep_re.split(text) 148 chunks = filter(None, chunks) 149 return chunks
150
151 - def _fix_sentence_endings(self, chunks):
152 """_fix_sentence_endings(chunks : [string]) 153 154 Correct for sentence endings buried in 'chunks'. Eg. when the 155 original text contains "... foo.\nBar ...", munge_whitespace() 156 and split() will convert that to [..., "foo.", " ", "Bar", ...] 157 which has one too few spaces; this method simply changes the one 158 space to two. 159 """ 160 i = 0 161 pat = self.sentence_end_re 162 while i < len(chunks)-1: 163 if chunks[i+1] == " " and pat.search(chunks[i]): 164 chunks[i+1] = " " 165 i += 2 166 else: 167 i += 1
168
169 - def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
170 """_handle_long_word(chunks : [string], 171 cur_line : [string], 172 cur_len : int, width : int) 173 174 Handle a chunk of text (most likely a word, not whitespace) that 175 is too long to fit in any line. 176 """ 177 space_left = max(width - cur_len, 1) 178 179 # If we're allowed to break long words, then do so: put as much 180 # of the next chunk onto the current line as will fit. 181 if self.break_long_words: 182 cur_line.append(reversed_chunks[-1][:space_left]) 183 reversed_chunks[-1] = reversed_chunks[-1][space_left:] 184 185 # Otherwise, we have to preserve the long word intact. Only add 186 # it to the current line if there's nothing already there -- 187 # that minimizes how much we violate the width constraint. 188 elif not cur_line: 189 cur_line.append(reversed_chunks.pop())
190 191 # If we're not allowed to break long words, and there's already 192 # text on the current line, do nothing. Next time through the 193 # main loop of _wrap_chunks(), we'll wind up here again, but 194 # cur_len will be zero, so the next line will be entirely 195 # devoted to the long word that we can't handle right now. 196
197 - def _wrap_chunks(self, chunks):
198 """_wrap_chunks(chunks : [string]) -> [string] 199 200 Wrap a sequence of text chunks and return a list of lines of 201 length 'self.width' or less. (If 'break_long_words' is false, 202 some lines may be longer than this.) Chunks correspond roughly 203 to words and the whitespace between them: each chunk is 204 indivisible (modulo 'break_long_words'), but a line break can 205 come between any two chunks. Chunks should not have internal 206 whitespace; ie. a chunk is either all whitespace or a "word". 207 Whitespace chunks will be removed from the beginning and end of 208 lines, but apart from that whitespace is preserved. 209 """ 210 lines = [] 211 if self.width <= 0: 212 raise ValueError("invalid width %r (must be > 0)" % self.width) 213 214 # Arrange in reverse order so items can be efficiently popped 215 # from a stack of chucks. 216 chunks.reverse() 217 218 while chunks: 219 220 # Start the list of chunks that will make up the current line. 221 # cur_len is just the length of all the chunks in cur_line. 222 cur_line = [] 223 cur_len = 0 224 225 # Figure out which static string will prefix this line. 226 if lines: 227 indent = self.subsequent_indent 228 else: 229 indent = self.initial_indent 230 231 # Maximum width for this line. 232 width = self.width - len(indent) 233 234 # First chunk on line is whitespace -- drop it, unless this 235 # is the very beginning of the text (ie. no lines started yet). 236 if self.drop_whitespace and chunks[-1].strip() == '' and lines: 237 del chunks[-1] 238 239 while chunks: 240 l = len(chunks[-1]) 241 242 # Can at least squeeze this chunk onto the current line. 243 if cur_len + l <= width: 244 cur_line.append(chunks.pop()) 245 cur_len += l 246 247 # Nope, this line is full. 248 else: 249 break 250 251 # The current line is full, and the next chunk is too big to 252 # fit on *any* line (not just this one). 
253 if chunks and len(chunks[-1]) > width: 254 self._handle_long_word(chunks, cur_line, cur_len, width) 255 256 # If the last chunk on this line is all whitespace, drop it. 257 if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': 258 del cur_line[-1] 259 260 # Convert current line back to a string and store it in list 261 # of all lines (return value). 262 if cur_line: 263 lines.append(indent + ''.join(cur_line)) 264 265 return lines
266 267 268 # -- Public interface ---------------------------------------------- 269
270 - def wrap(self, text):
271 """wrap(text : string) -> [string] 272 273 Reformat the single paragraph in 'text' so it fits in lines of 274 no more than 'self.width' columns, and return a list of wrapped 275 lines. Tabs in 'text' are expanded with string.expandtabs(), 276 and all other whitespace characters (including newline) are 277 converted to space. 278 """ 279 text = self._munge_whitespace(text) 280 chunks = self._split(text) 281 if self.fix_sentence_endings: 282 self._fix_sentence_endings(chunks) 283 return self._wrap_chunks(chunks)
284
285 - def fill(self, text):
286 """fill(text : string) -> string 287 288 Reformat the single paragraph in 'text' to fit in lines of no 289 more than 'self.width' columns, and return a new string 290 containing the entire wrapped paragraph. 291 """ 292 return "\n".join(self.wrap(text))
293 294 295 # -- Convenience interface --------------------------------------------- 296
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    # A fresh wrapper per call keeps this helper free of shared state.
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)
309
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    # A fresh wrapper per call keeps this helper free of shared state.
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)
# -- Loosely related functionality -------------------------------------

# A line consisting of nothing but spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# The run of spaces/tabs in front of the first visible character of a
# line; lines with no visible character do not match at all.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)

    Lines that contain only spaces and tabs are emptied and do not take
    part in the computation of the common margin.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Current line and previous winner diverge somewhere: only the
        # part they share can still be common to every line.
        else:
            shared = 0
            for left, right in zip(margin, indent):
                if left != right:
                    break
                shared += 1
            margin = margin[:shared]
            # Nothing in common at all: no later line can change that.
            if not margin:
                break

    if margin:
        # margin holds only spaces and tabs, so it is safe to splice it
        # into the pattern without escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
if __name__ == "__main__":
    # Manual smoke test.  print() with a single argument produces the
    # same output whether print is a statement or a function.
    # Other samples that can be swapped in:
    #   dedent("\tfoo\n\tbar")
    #   dedent("  \thello there\n  \t  how are you?")
    print(dedent("Hello there.\n  This is indented."))