Package translate :: Package storage :: Module html
[hide private]
[frames | no frames]

Source Code for Module translate.storage.html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2004-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from translate.storage import base 
 27  from HTMLParser import HTMLParser 
 28   
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is kept internally in ``self.text`` with ``&`` and ``<``
    escaped as ``&amp;`` and ``&lt;``. The ``source`` property converts
    between that stored form and the plain form seen by callers.
    """

    def __init__(self, source=None):
        """Create a unit holding ``source`` with an empty location list.

        ``source`` may be omitted (or ``None``), in which case the unit
        starts out with empty text.
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the unit text with escaping undone and newlines flattened.

        ``&lt;`` and ``&amp;`` are converted back to ``<`` and ``&``, and
        every line break (``\\r\\n``, ``\\n`` or ``\\r``) becomes one space.
        """
        #TODO: Rethink how clever we should try to be with html entities.
        # Undo the escaping in the reverse order of setsource(): "&lt;" first
        # and "&amp;" last. Doing it the other way round decodes a stored
        # "&amp;lt;" twice, turning a literal "&lt;" in the source into "<".
        text = self.text.replace("&lt;", "<").replace("&amp;", "&")
        for linebreak in ("\r\n", "\n", "\r"):
            text = text.replace(linebreak, " ")
        return text

    def setsource(self, source):
        """Store ``source`` with ``&`` and ``<`` escaped.

        ``None`` is treated as the empty string so that the constructor's
        default argument is usable.
        """
        if source is None:
            source = ""
        self.text = source.replace("&", "&amp;").replace("<", "&lt;")
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append ``location`` to this unit's list of locations."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations
48 49
class htmlfile(HTMLParser, base.TranslationStore):
    """A translation store that extracts translatable blocks from HTML.

    The HTML is fed through ``HTMLParser``. Text inside any tag listed in
    ``markingtags`` (or carrying an attribute listed in ``markingattrs``) is
    collected into a block, and the values of attributes listed in
    ``includeattrs`` are extracted as blocks of their own. Each block that
    has translatable content becomes one unit.
    """
    UnitClass = htmlunit
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    markingattrs = []
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    # Matches text that is fully enclosed by one opening and one closing tag
    # and captures what lies between them. Compiled once for strip_html().
    _enclosingtag_re = re.compile(r'''
        ^               # Match start of the text
        <[^?>]          # Match start of tag and the first character (not ? or >)
        (?:
          (?:
            [^>]            # Anything that's not a > is valid tag material
              |
            (?:<\?.*?\?>)   # Matches <? foo ?> lazily; PHP is valid
          )*            # Repeat over valid tag material
          [^?>]         # If we have > 1 char, the last char can't be ? or >
        )?              # The repeated chars are optional, so that <a>, <p> work
        >               # Match ending > of opening tag

        (.*)            # Match actual contents of tag

        </.*[^?]>       # Match ending tag; can't end with ?> and must be >=1 char
        $               # Match end of the text
        ''', re.VERBOSE | re.DOTALL)

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and, if given, parse ``inputfile``.

        ``includeuntaggeddata``: when true, data found outside of any
        marking tag is collected into blocks as well.
        ``inputfile``: an optional file-like object. It is read completely,
        closed, and its contents are parsed. Its ``name`` attribute (if any)
        is used as the file name in unit locations.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        # Always present, so that reintrophp() works even when the parser is
        # fed directly without going through parse()/phprep().
        self.phpdict = {}
        HTMLParser.__init__(self)

        if inputfile is not None:
            # Close the file even if reading it fails.
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this. Returns None
        when no such declaration is found.
        """
        pattern = r'''(?i)<meta.*content.*=.*charset.*=\s*([^\s]*)\s*["']'''
        found = re.findall(pattern, htmlsrc)
        if found:
            return found[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset.

        The text is decoded with the charset found by guess_encoding(); if
        none is found it is returned unchanged.
        """
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        return htmlsrc

    def phprep(self, text):
        """Replaces all instances of PHP with placeholder tags, and returns
        the new text. The current implementation replaces <?foo?> with
        <?md5(foo)?>. The hash => code conversions are stored in
        self.phpdict for later use in restoring the real PHP.

        The purpose of this is to remove all potential "tag-like" code from
        inside PHP. The hash looks nothing like an HTML tag, but the following
        PHP:
          $a < $b ? $c : ($d > $e ? $f : $g)
        looks like it contains an HTML tag:
          < $b ? $c : ($d >
        to nearly any regex. Hence, we replace all contents of PHP with simple
        strings to help our regexes out.

        """
        try:
            from hashlib import md5 as newmd5
        except ImportError:
            # Python < 2.5 has no hashlib; fall back to the old md5 module.
            from md5 import new as newmd5

        self.phpdict = {}

        def hidephp(match):
            """Record one PHP block and return its placeholder tag."""
            cmd = match.group(1)
            data = cmd
            # Hash functions work on bytes; unicode text (as produced by
            # do_encoding) has to be encoded first.
            if isinstance(data, type(u"")):
                data = data.encode("utf-8")
            digest = newmd5(data).hexdigest()
            self.phpdict[digest] = cmd
            return "<?%s?>" % digest

        # Substitute only between the <? ?> delimiters. A plain
        # text.replace(cmd, digest) would also rewrite identical text outside
        # of PHP, and an empty <??> would insert the hash between every
        # character of the document.
        return re.sub(r'(?s)<\?(.*?)\?>', hidephp, text)

    def reintrophp(self, text):
        """Replaces the PHP placeholders in text with the real code"""
        for digest, code in self.phpdict.items():
            text = text.replace(digest, code)
        return text

    def parse(self, htmlsrc):
        """Decode ``htmlsrc``, hide its PHP and feed it to the HTML parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add ``text`` as a new unit if it has translatable content.

        Enclosing HTML is stripped and the PHP placeholders are replaced by
        the real PHP first. Each unit added gets a location of the form
        "filename:blocknumber".
        """
        if text is None:
            # HTMLParser reports attributes without a value (e.g. <img alt>)
            # with a value of None; there is nothing to translate in those.
            return
        text = self.strip_html(text)
        text = self.reintrophp(text) #Before adding anything, restore PHP
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if it fully encloses the translatable
        text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occurs within the normal flow of text will not be removed,
        eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # If all that is left is PHP, return ""
        if len(re.findall(r'(?s)^<\?.*?\?>$', text)) == 1:
            return ""

        enclosed = self._enclosingtag_re.findall(text)
        if len(enclosed) == 1:
            # Peel off further layers of enclosing tags recursively.
            text = self.strip_html(enclosed[0])
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""

        text = text.strip()
        if len(re.findall(r'(?i).*(charset.*=.*)', text)) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Lazily strip all PHP. DOTALL is needed so that PHP spanning several
        # lines is removed as a whole, as in phprep() and strip_html();
        # otherwise a '>' inside such PHP ends the "tag" below too early and
        # the rest of the PHP code is mistaken for translatable text.
        remainder = re.sub(r'(?s)<\?.*?\?>', '', text).strip()
        #Strip all HTML tags
        remainder = re.sub(r'<[^>]*>', '', remainder).strip()
        return bool(remainder)

    #From here on below, follows the methods of the HTMLParser

    def startblock(self, tag):
        """Flush the current block and start a new one for ``tag``."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the current block and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Start a new block for marking tags; otherwise keep the tag inline.

        Values of attributes listed in ``includeattrs`` are added as blocks
        of their own.
        """
        newblock = tag in self.markingtags
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Extract included attributes and keep the empty tag inline."""
        for attrname, attrvalue in attrs:
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """End the current block, or keep the closing tag inline."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Append ``data`` to the current block.

        Data outside of any block is only collected (in a block without a
        tag) when ``includeuntaggeddata`` is set.
        """
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Pass a character reference through unchanged as data."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass an entity reference through unchanged as data."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Ignore HTML comments."""
        # we don't do anything with comments
        pass

    def handle_pi(self, data):
        """Pass a processing instruction (such as PHP) through as data."""
        self.handle_data("<?%s>" % data)
258
class POHTMLParser(htmlfile):
    """An htmlfile under another name; it adds and overrides nothing."""
261