1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """string processing utilities for extracting strings with various kinds of delimiters"""
23
24 import logging
25 import htmlentitydefs
26
def find_all(searchin, substr):
    """returns a list of the start positions where substr occurs in searchin

    The scan runs left to right and resumes directly after each match, so
    occurrences are not allowed to overlap: find_all("aaaa", "aa") gives
    [0, 2], not [0, 1, 2].

    searchin is the string to search in and substr the string to look for.
    The result is in ascending order, and is empty when substr does not
    occur in searchin or when substr is the empty string."""
    step = len(substr)
    if step == 0:
        # An empty needle matches at every index without ever moving the
        # scan forward; the old loop never terminated and kept appending.
        return []
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        # Resume after the whole match so that hits cannot overlap.
        location = searchin.find(substr, location + step)
    return locations
38
40 """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
41 returns tuple of (quoted string with quotes, still in string at end)"""
42
43 instring = startinstring
44 enteredonce = False
45 lenstart = len(startdelim)
46 lenend = len(enddelim)
47 startdelim_places = find_all(source, startdelim)
48 if startdelim == enddelim:
49 enddelim_places = startdelim_places[:]
50 else:
51 enddelim_places = find_all(source, enddelim)
52 if escape is not None:
53 lenescape = len(escape)
54 escape_places = find_all(source, escape)
55 last_escape_pos = -1
56
57 true_escape = False
58 true_escape_places = []
59 for escape_pos in escape_places:
60 if escape_pos - lenescape in escape_places:
61 true_escape = not true_escape
62 else:
63 true_escape = True
64 if true_escape:
65 true_escape_places.append(escape_pos)
66 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
67 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
68 else:
69 enddelim_places = [pos + lenend for pos in enddelim_places]
70
71 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
72 significant_places.sort()
73 extracted = ""
74 lastpos = None
75 for pos in significant_places:
76 if instring and pos in enddelim_places:
77
78 if lastpos == pos - lenstart and lastpos in startdelim_places:
79 continue
80 extracted += source[lastpos:pos]
81 instring = False
82 lastpos = pos
83 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
84 instring = True
85 enteredonce = True
86 lastpos = pos
87 if instring:
88 extracted += source[lastpos:]
89 return (extracted, instring)
90
92 """Calls extract over multiple lines, remembering whether in the string or not"""
93 result = ""
94 instring = 0
95 for line in lines:
96 (string, instring) = extract(line, startdelim, enddelim, escape, instring)
97 result += string
98 if not instring: break
99 return result
100
102 "Extracts a doublequote-delimited string from a string, allowing for backslash-escaping"
103 (string, instring) = extract(source, '"', '"', '\\')
104 return string
105
109
111 """Extracts a doublequote-delimited string from a string, allowing for backslash-escaping
112 includeescapes can also be a function that takes the whole escaped string and returns the replaced version"""
113 instring = startinstring
114 enteredonce = False
115 lenstart = len(startdelim)
116 lenend = len(enddelim)
117 startdelim_places = find_all(source, startdelim)
118 if startdelim == enddelim:
119 enddelim_places = startdelim_places[:]
120 else:
121 enddelim_places = find_all(source, enddelim)
122 if escape is not None:
123 lenescape = len(escape)
124 escape_places = find_all(source, escape)
125 last_escape_pos = -1
126
127 true_escape = False
128 true_escape_places = []
129 for escape_pos in escape_places:
130 if escape_pos - lenescape in escape_places:
131 true_escape = not true_escape
132 else:
133 true_escape = True
134 if true_escape:
135 true_escape_places.append(escape_pos)
136 startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
137 enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
138 else:
139 enddelim_places = [pos + lenend for pos in enddelim_places]
140
141 significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
142 significant_places.sort()
143 extracted = ""
144 lastpos = 0
145 callable_includeescapes = callable(includeescapes)
146 checkescapes = callable_includeescapes or not includeescapes
147 for pos in significant_places:
148 if instring and pos in enddelim_places and lastpos != pos - lenstart:
149 section_start, section_end = lastpos + len(startdelim), pos - len(enddelim)
150 section = source[section_start:section_end]
151 if escape is not None and checkescapes:
152 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
153 new_section = ""
154 last_epos = 0
155 for epos in escape_list:
156 new_section += section[last_epos:epos]
157 if callable_includeescapes:
158 replace_escape = includeescapes(section[epos:epos+lenescape+1])
159
160 if not isinstance(replace_escape, basestring):
161 if replace_escape:
162 replace_escape = section[epos:epos+lenescape+1]
163 else:
164 replace_escape = section[epos+lenescape:epos+lenescape+1]
165 new_section += replace_escape
166 last_epos = epos + lenescape + 1
167 else:
168 last_epos = epos + lenescape
169 section = new_section + section[last_epos:]
170 extracted += section
171 instring = False
172 lastpos = pos
173 if (not instring) and pos in startdelim_places and not (enteredonce and not allowreentry):
174 instring = True
175 enteredonce = True
176 lastpos = pos
177 if instring:
178 section_start = lastpos + len(startdelim)
179 section = source[section_start:]
180 if escape is not None and not includeescapes:
181 escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
182 new_section = ""
183 last_epos = 0
184 for epos in escape_list:
185 new_section += section[last_epos:epos]
186 if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):
187 last_epos = epos
188 else:
189 last_epos = epos + lenescape
190 section = new_section + section[last_epos:]
191 extracted += section
192 return (extracted, instring)
193
195 "Returns the same string, with double quotes escaped with backslash"
196 if escapeescapes:
197 return source.replace('\\', '\\\\').replace('"', '\\"')
198 else:
199 return source.replace('"','\\"')
200
def escapesinglequotes(source):
    """Returns source with every single quote doubled, i.e. ' becomes ''"""
    doubled_quote = "''"
    # Splitting on the quote and rejoining with the doubled form replaces
    # each occurrence exactly once.
    pieces = source.split("'")
    return doubled_quote.join(pieces)
204
206 """encodes source using HTML entities e.g. © -> ©"""
207 output = ""
208 for char in source:
209 charnum = ord(char)
210 if charnum in htmlentitydefs.codepoint2name:
211 output += "&%s;" % htmlentitydefs.codepoint2name[charnum]
212 else:
213 output += str(char)
214 return output
215
217 """decodes source using HTML entities e.g. © -> ©"""
218 output = u""
219 inentity = False
220 for char in source:
221 if char == "&":
222 inentity = True
223 possibleentity = ""
224 continue
225 if inentity:
226 if char == ";":
227 if len(possibleentity) > 0 and possibleentity in htmlentitydefs.name2codepoint:
228 output += unichr(htmlentitydefs.name2codepoint[possibleentity])
229 inentity = False
230 else:
231 output += "&" + possibleentity + ";"
232 inentity = False
233 elif char == " ":
234 output += "&" + possibleentity + char
235 inentity = False
236 else:
237 possibleentity += char
238 else:
239 output += char
240 return output
241
243 """encodes source in the escaped-unicode encoding used by Java .properties files"""
244 output = ""
245 for char in source:
246 charnum = ord(char)
247 if char in controlchars:
248 output += controlchars[char]
249 elif 0 <= charnum < 128:
250 output += str(char)
251 else:
252 output += "\\u%04X" % charnum
253 return output
254
256 """encodes source in the escaped-unicode encoding used by Mozilla .properties files"""
257 output = ""
258 for char in source:
259 charnum = ord(char)
260 if char in controlchars:
261 output += controlchars[char]
262 else:
263 output += char
264 return output
265
# Lookup table keyed by the character that follows a backslash; the value is
# the character that the two-character escape sequence stands for.
propertyescapes = {
    # escapes that simply yield the escaped character itself
    "\\": "\\",
    "'": "'",
    '"': '"',
    # single-letter escapes for control characters
    "b": "\b",
    "f": "\f",
    "t": "\t",
    "n": "\n",
    "v": "\v",
    "a": "\a",
}
272
# Lookup table from a raw control character to the backslash escape sequence
# that is written in its place when encoding.
controlchars = {
    "\b": "\\b",
    "\f": "\\f",
    "\t": "\\t",
    "\n": "\\n",
    "\v": "\\v",
}
277
283
285 """decodes source from the escaped-unicode encoding used by mozilla .properties files"""
286
287
288
289 output = u""
290 s = 0
291 if isinstance(source, str):
292 source = source.decode("utf-8")
293 def unichr2(i):
294 """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character"""
295 if 32 <= i:
296 return unichr(i)
297 elif unichr(i) in controlchars:
298
299
300 return unichr(i)
301 else:
302 return "\\u%04x" % i
303 while s < len(source):
304 c = source[s]
305 if c != '\\':
306 output += c
307 s += 1
308 continue
309 s += 1
310 if s >= len(source):
311
312
313 output += c
314 continue
315 c = source[s]
316 s += 1
317 if c == '\n': pass
318
319 elif c in propertyescapes: output += propertyescapes[c]
320
321
322 elif c in "uU":
323 digits = 4
324 x = 0
325 for digit in range(digits):
326 x <<= 4
327 if s + digit >= len(source):
328 digits = digit
329 break
330 c = source[s+digit].lower()
331 if c.isdigit():
332 x += ord(c) - ord('0')
333 elif c in "abcdef":
334 x += ord(c) - ord('a') + 10
335 else:
336 break
337 s += digits
338 output += unichr2(x)
339 elif c == "N":
340 if source[s] != "{":
341 logging.warn("Invalid named unicode escape: no { after \\N")
342 output += "\\" + c
343 continue
344 s += 1
345 e = source.find("}", s)
346 if e == -1:
347 logging.warn("Invalid named unicode escape: no } after \\N{")
348 output += "\\" + c
349 continue
350 import unicodedata
351 name = source[s:e]
352 output += unicodedata.lookup(name)
353 s = e + 1
354 else:
355 output += "\\" + c
356 return output
357
359 "Returns a doublequote-delimited quoted string, escaping double quotes with backslash"
360 if isinstance(source, list):
361 firstline = True
362 for line in source:
363 if firstline:
364 newsource = '"' + escapequotes(line, escapeescapes) + '"'
365 firstline = False
366 else:
367 newsource = newsource + '\n' + '"' + escapequotes(line, escapeescapes) + '"'
368 return newsource
369 else:
370 return '"' + escapequotes(source, escapeescapes) + '"'
371
373 "Returns a doublequote-delimited quoted string, escaping single quotes with themselves"
374 return "'" + escapesinglequotes(source) + "'"
375
382
384 s = string.find(substring)
385 if s != -1:
386 s += len(substring)
387 return s
388
390 return string.rstrip("\r\n")
391
400
403
405 """encodes certain characters in the string using an encode dictionary"""
406 encoded = unencoded
407 for key, value in encodedict.iteritems():
408 if key in encoded:
409 encoded = encoded.replace(key, value)
410 return encoded
411
413 """convert numbers to utf8 codes in the values of a dictionary"""
414 for key, value in d.items():
415 if type(value) == int:
416 d[key] = unichr(value).encode('utf8')
417 return d
418
def testcase():
    """Prints what extract and extractwithoutquotes return for a sample string

    Each function is run on the sample first without an escape and then with
    '!' as the escape; extractwithoutquotes is finally run once more with '!'
    as the escape and includeescapes=False."""
    sample = ' "this" " is " "a" " test!" '
    for extractor in (extract, extractwithoutquotes):
        for escape in (None, '!'):
            print(extractor(sample, '"', '"', escape))
    print(extractwithoutquotes(sample, '"', '"', '!', includeescapes=False))
426
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    testcase()
429