Package translate :: Package convert :: Module test_html2po
[hide private]
[frames] | no frames]

Source Code for Module translate.convert.test_html2po

  1  #!/usr/bin/env python 
  2   
  3  from translate.convert import html2po 
  4  from translate.convert import po2html 
  5  from translate.convert import test_convert 
  6  from translate.misc import wStringIO 
  7   
class TestHTML2PO:
    """Tests for extracting translatable units from HTML with html2po.

    The helper methods convert markup held in in-memory file objects, and the
    ``test_*`` methods assert on the units found in the resulting PO store.
    Methods prefixed ``wtest_`` do not match the ``test_`` naming pattern.
    """

    def html2po(self, markup):
        """Helper to convert html to po without a file.

        Wraps ``markup`` in an in-memory file object, feeds it to an
        ``html2po.html2po`` convertor and returns whatever ``convertfile``
        produces (the tests treat it as a PO store with a ``units`` list).
        """
        inputfile = wStringIO.StringIO(markup)
        convertor = html2po.html2po()
        outputpo = convertor.convertfile(inputfile, "test", False, False)
        return outputpo

    def po2html(self, posource, htmltemplate):
        """Helper to convert po to html without a file.

        ``posource`` is the serialised PO text and ``htmltemplate`` the HTML
        used as the template. Asserts that the conversion reports success and
        returns the generated HTML.
        """
        inputfile = wStringIO.StringIO(posource)
        outputfile = wStringIO.StringIO()
        templatefile = wStringIO.StringIO(htmltemplate)
        assert po2html.converthtml(inputfile, outputfile, templatefile)
        return outputfile.getvalue()

    def countunits(self, pofile, expected):
        """Helper to check that we got the expected number of messages.

        A header unit in first position, if present, is not counted.
        """
        actual = len(pofile.units)
        if actual > 0:
            if pofile.units[0].isheader():
                actual = actual - 1
        # Dump the store so that a failing assertion shows what was extracted.
        print(pofile)
        assert actual == expected

    def compareunit(self, pofile, unitnumber, expected):
        """Helper to validate a PO message.

        ``unitnumber`` is 1-based and counts translatable units only: when the
        store has no header in first position the index is shifted down by
        one, otherwise the header occupies index 0 and the number is used
        as-is.
        """
        if not pofile.units[0].isheader():
            unitnumber = unitnumber - 1
        # Trailing '|' makes stray trailing whitespace visible in the output.
        print('unit source: ' + str(pofile.units[unitnumber].source) + '|')
        print('expected: ' + expected.encode('utf-8') + '|')
        assert unicode(pofile.units[unitnumber].source) == unicode(expected)

    def check_single(self, markup, itemtext):
        """Check that converting this markup produces a single element with value itemtext."""
        pofile = self.html2po(markup)
        self.countunits(pofile, 1)
        self.compareunit(pofile, 1, itemtext)

    def check_null(self, markup):
        """Check that converting this markup produces no elements."""
        pofile = self.html2po(markup)
        self.countunits(pofile, 0)

    def check_phpsnippet(self, php):
        """Given a snippet of php, put it into an HTML shell and see
        if the results are as expected."""
        # PHP inside an attribute of a link that is the paragraph's only content.
        self.check_single('<html><head></head><body><p><a href="'+php+'/site.html">Body text</a></p></body></html>', "Body text")
        # PHP inside an attribute of a link embedded in running text.
        self.check_single('<html><head></head><body><p>More things in <a href="'+php+'/site.html">Body text</a></p></body></html>', 'More things in <a href="'+php+'/site.html">Body text</a>')
        # A paragraph consisting solely of PHP has nothing to translate.
        self.check_null('<html><head></head><body><p>'+php+'</p></body></html>')

    def test_htmllang(self):
        """Test to ensure that we no longer use the lang attribute."""
        markup = '''<html lang="en"><head><title>My title</title></head><body></body></html>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 1)
        # Check that the first item is the <title> not <head>
        self.compareunit(pofile, 1, "My title")

    def test_title(self):
        """Test that we can extract the <title> tag."""
        self.check_single("<html><head><title>My title</title></head><body></body></html>", "My title")

    # NOTE(review): this test's ``def`` line is missing from the listing; the
    # name is reconstructed from the docstring -- confirm against the original.
    def test_title_with_linebreak(self):
        """Test a linebreak in the <title> tag"""
        htmltext = '''<html>
<head>
<title>My
title</title>
</head>
<body>
</body>
</html>
'''
        self.check_single(htmltext, "My title")

    def test_meta(self):
        """Test that we can extract certain <meta> info from <head>."""
        self.check_single('''<html><head><meta name="keywords" content="these are keywords"></head><body></body></html>''', "these are keywords")

    def test_tag_p(self):
        """Test that we can extract the <p> tag."""
        self.check_single("<html><head></head><body><p>A paragraph.</p></body></html>", "A paragraph.")
        markup = "<p>First line.<br>Second line.</p>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    # NOTE(review): this test's ``def`` line is missing from the listing; the
    # name is reconstructed from the docstring -- confirm against the original.
    def test_tag_p_with_linebreak(self):
        """Test newlines within the <p> tag."""
        htmltext = '''<html>
<head>
</head>
<body>
<p>
A paragraph is a section in a piece of writing, usually highlighting a
particular point or topic. It always begins on a new line and usually
with indentation, and it consists of at least one sentence.
</p>
</body>
</html>
'''
        self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
        markup = "<p>First\nline.<br>Second\nline.</p>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_div(self):
        """Test that we can extract the <div> tag."""
        self.check_single("<html><head></head><body><div>A paragraph.</div></body></html>", "A paragraph.")
        markup = "<div>First line.<br>Second line.</div>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    # NOTE(review): this test's ``def`` line is missing from the listing; the
    # name is reconstructed from the docstring -- confirm against the original.
    def test_tag_div_with_linebreaks(self):
        """Test linebreaks within a <div> tag."""
        htmltext = '''<html>
<head>
</head>
<body>
<div>
A paragraph is a section in a piece of writing, usually highlighting a
particular point or topic. It always begins on a new line and usually
with indentation, and it consists of at least one sentence.
</div>
</body>
</html>
'''
        self.check_single(htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.")
        markup = "<div>First\nline.<br>Second\nline.</div>"
        pofile = self.html2po(markup)
        self.compareunit(pofile, 1, "First line.<br>Second line.")

    def test_tag_a(self):
        """Test that we can extract the <a> tag."""
        self.check_single('<html><head></head><body><p>A paragraph with <a href="http://translate.org.za/">hyperlink</a>.</p></body></html>', 'A paragraph with <a href="http://translate.org.za/">hyperlink</a>.')

    # NOTE(review): this test's ``def`` line is missing from the listing; the
    # name is reconstructed from the docstring -- confirm against the original.
    def test_tag_a_with_linebreak(self):
        """Test that we can extract the <a> tag with newlines in it."""
        htmltext = '''<html>
<head>
</head>
<body>
<p>A
paragraph
with <a
href="http://translate.org.za/">hyperlink</a>
and
newlines.</p></body></html>
'''
        self.check_single(htmltext, 'A paragraph with <a href="http://translate.org.za/">hyperlink</a> and newlines.')

    def test_tag_img(self):
        """Test that we can extract the alt attribute from the <img> tag."""
        self.check_single('''<html><head></head><body><img src="picture.png" alt="A picture"></body></html>''', "A picture")

    def test_img_empty(self):
        """Test that an <img> tag without an alt attribute produces no units."""
        htmlsource = '''<html><head></head><body><img src="images/topbar.jpg" width="750" height="80"></body></html>'''
        self.check_null(htmlsource)

    def test_tag_table_summary(self):
        """Test that we can extract the summary attribute."""
        self.check_single('''<html><head></head><body><table summary="Table summary"></table></body></html>''', "Table summary")

    def test_table_simple(self):
        """Test that we can fully extract a simple table."""
        markup = '''<html><head></head><body><table><tr><th>Heading One</th><th>Heading Two</th><tr><td>One</td><td>Two</td></tr></table></body></html>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 4)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "One")
        self.compareunit(pofile, 4, "Two")

    def test_table_complex(self):
        """Test extraction from a table with summary, caption, abbr, thead, tfoot and tbody."""
        markup = '''<table summary="This is the summary"><caption>A caption</caption><thead><tr><th abbr="Head 1">Heading One</th><th>Heading Two</th></thead><tfoot><tr><td>Foot One</td><td>Foot Two</td></tr></tfoot><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>'''
        pofile = self.html2po(markup)
        self.countunits(pofile, 9)
        self.compareunit(pofile, 1, "This is the summary")
        self.compareunit(pofile, 2, "A caption")
        self.compareunit(pofile, 3, "Head 1")
        self.compareunit(pofile, 4, "Heading One")
        self.compareunit(pofile, 5, "Heading Two")
        self.compareunit(pofile, 6, "Foot One")
        self.compareunit(pofile, 7, "Foot Two")
        self.compareunit(pofile, 8, "One")
        self.compareunit(pofile, 9, "Two")

    def test_table_empty(self):
        """Test that we ignore tables that are empty.

        A table is deemed empty if it has no translatable content.
        """

        self.check_null('''<html><head></head><body><table><tr><td><img src="bob.png"></td></tr></table></body></html>''')
        self.check_null('''<html><head></head><body><table><tr><td>&nbsp;</td></tr></table></body></html>''')
        self.check_null('''<html><head></head><body><table><tr><td><strong></strong></td></tr></table></body></html>''')

    def test_address(self):
        """Test to see if the address element is extracted"""
        self.check_single("<body><address>My address</address></body>", "My address")

    def test_headings(self):
        """Test to see if the h* elements are extracted"""
        markup = "<html><head></head><body><h1>Heading One</h1><h2>Heading Two</h2><h3>Heading Three</h3><h4>Heading Four</h4><h5>Heading Five</h5><h6>Heading Six</h6></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 6)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "Heading Three")
        self.compareunit(pofile, 4, "Heading Four")
        self.compareunit(pofile, 5, "Heading Five")
        self.compareunit(pofile, 6, "Heading Six")

    # NOTE(review): this test's ``def`` line is missing from the listing; the
    # name is reconstructed from the docstring -- confirm against the original.
    def test_headings_with_linebreaks(self):
        """Test to see if h* elements with newlines can be extracted"""
        markup = "<html><head></head><body><h1>Heading\nOne</h1><h2>Heading\nTwo</h2><h3>Heading\nThree</h3><h4>Heading\nFour</h4><h5>Heading\nFive</h5><h6>Heading\nSix</h6></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 6)
        self.compareunit(pofile, 1, "Heading One")
        self.compareunit(pofile, 2, "Heading Two")
        self.compareunit(pofile, 3, "Heading Three")
        self.compareunit(pofile, 4, "Heading Four")
        self.compareunit(pofile, 5, "Heading Five")
        self.compareunit(pofile, 6, "Heading Six")

    def test_dt(self):
        """Test to see if the definition list title (dt) element is extracted"""
        self.check_single("<html><head></head><body><dl><dt>Definition List Item Title</dt></dl></body></html>", "Definition List Item Title")

    def test_dd(self):
        """Test to see if the definition list description (dd) element is extracted"""
        self.check_single("<html><head></head><body><dl><dd>Definition List Item Description</dd></dl></body></html>", "Definition List Item Description")

    def test_span(self):
        """Test to check that we don't double extract a span item."""
        self.check_single("<html><head></head><body><p>You are a <span>Spanish</span> sentence.</p></body></html>", "You are a <span>Spanish</span> sentence.")

    def test_ul(self):
        """Test to see if the list item <li> is extracted"""
        markup = "<html><head></head><body><ul><li>Unordered One</li><li>Unordered Two</li></ul><ol><li>Ordered One</li><li>Ordered Two</li></ol></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 4)
        self.compareunit(pofile, 1, "Unordered One")
        self.compareunit(pofile, 2, "Unordered Two")
        self.compareunit(pofile, 3, "Ordered One")
        self.compareunit(pofile, 4, "Ordered Two")

    def test_duplicates(self):
        """Check that we use the default style of msgid_comments to disambiguate duplicate messages."""
        markup = "<html><head></head><body><p>Duplicate</p><p>Duplicate</p></body></html>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 2)
        # FIXME change this so that we check that the KDE comment is correctly added
        self.compareunit(pofile, 1, "Duplicate")
        self.compareunit(pofile, 2, "Duplicate")

    def wtest_multiline_reflow(self):
        """Check that we reflow multiline content to make it more readable for translators."""
        self.check_single('''<td valign="middle" width="96%"><font class="headingwhite">South
Africa</font></td>''', '''<font class="headingwhite">South Africa</font>''')

    def wtest_nested_tags(self):
        """Check that we can extract items within nested tags."""
        markup = "<div><p>Extract this</p>And this</div>"
        pofile = self.html2po(markup)
        self.countunits(pofile, 2)
        self.compareunit(pofile, 1, "Extract this")
        self.compareunit(pofile, 2, "And this")

    def test_carriage_return(self):
        """Remove carriage returns from files in dos format."""
        # Every line ends in an explicit "\r" escape followed by the literal
        # newline, which gives the fixture CRLF line endings.
        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\r
<html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->\r
<head>\r
<!-- InstanceBeginEditable name="doctitle" -->\r
<link href="fmfi.css" rel="stylesheet" type="text/css">\r
</head>\r
\r
<body>\r
<p>The rapid expansion of telecommunications infrastructure in recent\r
years has helped to bridge the digital divide to a limited extent.</p> \r
</body>\r
<!-- InstanceEnd --></html>\r
'''

        self.check_single(htmlsource, 'The rapid expansion of telecommunications infrastructure in recent years has helped to bridge the digital divide to a limited extent.')

    def test_encoding_latin1(self):
        """Convert HTML input in iso-8859-1 correctly to unicode."""
        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><!-- InstanceBegin template="/Templates/masterpage.dwt" codeOutsideHTMLIsLocked="false" -->
<head>
<!-- InstanceBeginEditable name="doctitle" -->
<title>FMFI - South Africa - CSIR Openphone - Overview</title>
<!-- InstanceEndEditable -->
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="keywords" content="fmfi, first mile, first inch, wireless, rural development, access devices, mobile devices, wifi, connectivity, rural connectivty, ict, low cost, cheap, digital divide, csir, idrc, community">

<!-- InstanceBeginEditable name="head" -->
<!-- InstanceEndEditable -->
<link href="../../../fmfi.css" rel="stylesheet" type="text/css">
</head>

<body>
<p>We aim to please \x96 will you aim too, please?</p>
<p>South Africa\x92s language diversity can be challenging.</p>
</body>
</html>
'''
        pofile = self.html2po(htmlsource)

        # Units 1 and 2 are the <title> and the keywords <meta>; the two
        # paragraphs holding the high-bit characters follow as units 3 and 4.
        self.countunits(pofile, 4)
        self.compareunit(pofile, 3, u'We aim to please \x96 will you aim too, please?')
        self.compareunit(pofile, 4, u'South Africa\x92s language diversity can be challenging.')

    def test_strip_html(self):
        """Ensure that unnecessary html is stripped from the resulting unit."""

        htmlsource = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>FMFI - Contact</title>
</head>
<body>
<table width="100%" border="0" cellpadding="0" cellspacing="0">
<tr align="left" valign="top">
<td width="150" height="556">
<table width="157" height="100%" border="0" cellspacing="0" id="leftmenubg-color">
<tr>
<td align="left" valign="top" height="555">
<table width="100%" border="0" cellspacing="0" cellpadding="2">
<tr align="left" valign="top" bgcolor="#660000">
<td width="4%"><strong></strong></td>
<td width="96%"><strong><font class="headingwhite">Projects</font></strong></td>
</tr>
<tr align="left" valign="top">
<td valign="middle" width="4%"><img src="images/arrow.gif" width="8" height="8"></td>
<td width="96%"><a href="index.html">Home Page</a></td>
</tr>
</table>
</td>
</tr>
</table></td>
</table>
</body>
</html>
'''
        pofile = self.html2po(htmlsource)
        self.countunits(pofile, 3)
        self.compareunit(pofile, 2, u'Projects')
        self.compareunit(pofile, 3, u'Home Page')

        # Translate and convert back:
        pofile.units[1].target = 'Projekte'
        pofile.units[2].target = 'Tuisblad'
        # Normalise layout differences before searching for the snippets.
        htmlresult = self.po2html(str(pofile), htmlsource).replace('\n', ' ').replace('= "', '="').replace('> <', '><')
        snippet = '<td width="96%"><strong><font class="headingwhite">Projekte</font></strong></td>'
        assert snippet in htmlresult
        snippet = '<td width="96%"><a href="index.html">Tuisblad</a></td>'
        assert snippet in htmlresult

    def test_php(self):
        """Test that PHP snippets don't interfere"""

        # A simple string
        self.check_phpsnippet('''<?=$phpvariable?>''')

        # Contains HTML tag characters (< and >)
        self.check_phpsnippet('''<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>''')

        # Make sure basically any symbol can be handled
        self.check_phpsnippet(''' <? asdfghjkl qwertyuiop 1234567890!@#$%^&*()-=_+[]\{}|;':",./<>? ?> ''')

    def test_multiple_php(self):
        """Test multiple PHP snippets in a string to make sure they get restored properly"""
        php1 = '''<?=$phpvariable?>'''
        php2 = '''<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>'''
        php3 = '''<? asdfghjklqwertyuiop1234567890!@#$%^&*()-=_+[]\{}|;':",./<>? ?>'''

        # Put 3 different strings into an html string
        innertext = '<a href="'+php1+'/site.html">Body text</a> and some '+php2+' more text '+php2+php3
        htmlsource = '<html><head></head><body><p>'+innertext+'</p></body></html>'
        self.check_single(htmlsource, innertext)

    def test_php_multiline(self):
        """Test that multi-line PHP snippets survive a round trip through translation."""

        # A multi-line php string to test
        php1 = '''<? abc
def
ghi ?>'''

        # Scatter the php strings throughout the file, and show what the translation should be
        innertext = '<a href="'+php1+'/site.html">Body text</a> and some '+php1+' more text '+php1+php1
        innertrans = '<a href="'+php1+'/site.html">Texte de corps</a> et encore de '+php1+' plus de texte '+php1+php1

        htmlsource = '<html><head></head><body><p>'+innertext+'</p></body></html>' # Current html file
        transsource = '<html><head></head><body><p>'+innertrans+'</p></body></html>' # Expected translation

        pofile = self.html2po(htmlsource)
        pofile.units[0].target = innertrans # Register the translation in the PO file
        # Serialise the store first: the po2html helper takes PO source text,
        # exactly as test_strip_html supplies it.
        htmlresult = self.po2html(str(pofile), htmlsource)
        assert htmlresult == transsource
411
class TestHTML2POCommand(test_convert.TestConvertCommand, TestHTML2PO):
    """Tests running actual html2po commands on files"""
    # Class-level settings; presumably read by the TestConvertCommand base
    # class, which is not visible in this file -- TODO confirm.
    convertmodule = html2po
    defaultoptions = {"progress": "none"}

    def test_help(self):
        """Tests getting help.

        Starts from the options text returned by the base class's
        ``test_help`` and passes it through ``help_check`` once per
        html2po-specific option; the final call is flagged ``last=True``.
        """
        options = test_convert.TestConvertCommand.test_help(self)
        options = self.help_check(options, "-P, --pot")
        options = self.help_check(options, "--duplicates=DUPLICATESTYLE")
        options = self.help_check(options, "-u, --untagged", last=True)
423