import os from io import BytesIO from translate.convert import html2po, po2html, test_convert class TestHTML2PO: def html2po( self, markup, duplicatestyle="msgctxt", keepcomments=False, ): """Helper to convert html to po without a file.""" inputfile = BytesIO(markup.encode() if isinstance(markup, str) else markup) convertor = html2po.html2po() return convertor.convertfile(inputfile, "test", duplicatestyle, keepcomments) def po2html(self, posource, htmltemplate): """Helper to convert po to html without a file.""" # Convert pofile object to bytes inputfile = BytesIO(bytes(posource)) outputfile = BytesIO() templatefile = BytesIO(htmltemplate.encode()) assert po2html.converthtml(inputfile, outputfile, templatefile) return outputfile.getvalue().decode("utf-8") def countunits(self, pofile, expected): """helper to check that we got the expected number of messages""" actual = len(pofile.units) if actual > 0: if pofile.units[0].isheader(): actual = actual - 1 print(pofile) assert actual == expected def compareunit(self, pofile, unitnumber, expected): """helper to validate a PO message""" if not pofile.units[0].isheader(): unitnumber = unitnumber - 1 print("unit source: " + pofile.units[unitnumber].source + "|") print("expected: " + expected + "|") assert str(pofile.units[unitnumber].source) == str(expected) def check_single(self, markup, itemtext): """checks that converting this markup produces a single element with value itemtext""" pofile = self.html2po(markup) self.countunits(pofile, 1) self.compareunit(pofile, 1, itemtext) def check_null(self, markup): """checks that converting this markup produces no elements""" pofile = self.html2po(markup) self.countunits(pofile, 0) def check_phpsnippet(self, php): """Given a snippet of php, put it into an HTML shell and see if the results are as expected""" self.check_single( '
', "Body text", ) self.check_single( 'More things in Body text
', 'More things in Body text', ) self.check_single( "" + php + "
", php ) def test_extract_lang_attribute_from_html_tag(self): """Test that the lang attribute is extracted from the html tag, issue #3884""" markup = """Français
', "Français") def test_title(self): """test that we can extract thetag""" self.check_single( "
A paragraph.
", "A paragraph." ) def test_tag_p_with_br(self): """test that we can extract the tag with an embedded
element"""
markup = "
First line.
Second line.
tag.""" htmltext = """
A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.
""" self.check_single( htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.", ) def test_tag_p_with_linebreak_and_embedded_br(self): """Test newlines within the tag when there is an embedded
element."""
markup = "
First\nline.
Second\nline.
A paragraph.
", "A paragraph." ) def test_tag_div(self): """test that we can extract theA paragraph with hyperlink.
', 'A paragraph with hyperlink.', ) def test_tag_a_with_linebreak(self): """Test that we can extract the tag with newlines in it.""" htmltext = """A paragraph with hyperlink and newlines.
""" self.check_single( htmltext, 'A paragraph with hyperlink and newlines.', ) def test_sequence_of_anchor_elements(self): """test that we can extract a sequence of anchor elements without mixing up start/end tags, issue #3768""" self.check_single( 'This is a link but this is not. However this is too
', 'This is a link but this is not. However this is too', ) def test_tag_img(self): """Test that we can extract the alt attribute from the tag.""" self.check_single( """""", "A picture", ) def test_img_empty(self): """Test that we can extract the alt attribute from the tag.""" htmlsource = """""" self.check_null(htmlsource) def test_tag_img_inside_a(self): """Test that we can extract the alt attribute from the tag when the img is embedded in a link.""" self.check_single( """""", "A picture", ) def test_tag_table_summary(self): """Test that we can extract the summary attribute.""" self.check_single( """Heading One | Heading Two |
---|---|
One | Two |
Heading One | Heading Two |
---|---|
Foot One | Foot Two |
One | Two |
You are a Spanish sentence.
", "You are a Spanish sentence.", ) def test_ul(self): """Test to see if the list itemDuplicate
Duplicate
" ) pofile = self.html2po(markup) self.countunits(pofile, 2) # FIXME change this so that we check that the msgctxt is correctly added self.compareunit(pofile, 1, "Duplicate") assert pofile.units[1].getlocations() == ["None+html.body.p:1-26"] self.compareunit(pofile, 2, "Duplicate") assert pofile.units[2].getlocations() == ["None+html.body.p:1-42"] def test_multiline_reflow(self): """check that we reflow multiline content to make it more readable for translators""" self.check_single( """Extract this
And thisThe rapid expansion of telecommunications infrastructure in recent\r years has helped to bridge the digital divide to a limited extent.
\r \r \r """ self.check_single( htmlsource, "The rapid expansion of telecommunications infrastructure in recent years has helped to bridge the digital divide to a limited extent.", ) def test_encoding_latin1(self): """Convert HTML input in iso-8859-1 correctly to unicode.""" """Also verifies that the charset declaration isn't extracted as a translation unit.""" htmlsource = b"""We aim to please \x96 will you aim too, please?
South Africa\x92s language diversity can be challenging.
""" pofile = self.html2po(htmlsource) self.countunits(pofile, 4) self.compareunit(pofile, 1, "FMFI - South Africa - CSIR Openphone - Overview") self.compareunit( pofile, 2, "fmfi, first mile, first inch, wireless, rural development, access devices, mobile devices, wifi, connectivity, rural connectivty, ict, low cost, cheap, digital divide, csir, idrc, community", ) self.compareunit(pofile, 3, "We aim to please \x96 will you aim too, please?") self.compareunit( pofile, 4, "South Africa\x92s language diversity can be challenging." ) def test_strip_html(self): """Ensure that unnecessary html is stripped from the resulting unit.""" htmlsource = """
|
<not an element> & " ' ’
", "<not an element> & \" ' \u2019", ) def test_entityrefs_in_attributes(self): """Should convert html entityrefs in attribute values""" # it would be even nicer if " and ' could be preserved, but the automatic unescaping of # attributes is deep inside html.HTMLParser. self.check_single( '', "’ ’
", "\u2019 \u2019", ) def test_php(self): """Test that PHP snippets don't interfere""" # A simple string self.check_phpsnippet("""=$phpvariable?>""") # Contains HTML tag characters (< and >) self.check_phpsnippet("""=($a < $b ? $foo : ($b > c ? $bar : $cat))?>""") # Make sure basically any symbol can be handled # NOTE quotation mark removed since it violates the HTML format when placed in an attribute self.check_phpsnippet( """ asdfghjkl qwertyuiop 1234567890!@#$%^&*()-=_+[]\\{}|;':,./<>? ?>""" ) def test_multiple_php(self): """Test multiple PHP snippets in a string to make sure they get restored properly""" php1 = """=$phpvariable?>""" php2 = """=($a < $b ? $foo : ($b > c ? $bar : $cat))?>""" php3 = """ asdfghjklqwertyuiop1234567890!@#$%^&*()-=_+[]\\{}|;':",./<>? ?>""" # Put 3 different strings into an html string innertext = ( 'Body text and some ' + php2 + " more text " + php2 + php3 ) htmlsource = "" + innertext + "
" self.check_single(htmlsource, innertext) def test_php_multiline(self): # A multi-line php string to test php1 = """ abc def ghi ?>""" # Scatter the php strings throughout the file, and show what the translation should be innertext = ( 'Body text and some ' + php1 + " more text " + php1 + php1 ) innertrans = ( 'Texte de corps et encore de ' + php1 + " plus de texte " + php1 + php1 ) htmlsource = ( "" + innertext + "
" ) # Current html file transsource = ( "" + innertrans + "
" ) # Expected translation pofile = self.html2po(htmlsource) pofile.units[1].target = innertrans # Register the translation in the PO file htmlresult = self.po2html(pofile, htmlsource) assert htmlresult == transsource def test_php_with_embedded_html(self): """Should not consume HTML within processing instructions""" self.check_single( "a
b
?> c", "ab
?> c", ) def test_comments(self): """Test that HTML comments are converted to translator notes in output""" pofile = self.html2po( "A paragraph.
", keepcomments=True, ) self.compareunit(pofile, 1, "A paragraph.") notes = pofile.getunits()[-1].getnotes() assert str(notes) == " a comment \n with another comment " def test_attribute_without_value(self): htmlsource = """ """ pofile = self.html2po(htmlsource) self.compareunit(pofile, 1, "EPS färg") class TestHTML2POCommand(test_convert.TestConvertCommand, TestHTML2PO): """Tests running actual html2po commands on files""" convertmodule = html2po defaultoptions = {"progress": "none"} def test_multifile_single(self): """Test the --multifile=single option and make sure it produces one pot file per input file.""" self.create_testfile( "file1.html", "