import os from io import BytesIO from translate.convert import html2po, po2html, test_convert class TestHTML2PO: def html2po( self, markup, duplicatestyle="msgctxt", keepcomments=False, ): """Helper to convert html to po without a file.""" inputfile = BytesIO(markup.encode() if isinstance(markup, str) else markup) convertor = html2po.html2po() return convertor.convertfile(inputfile, "test", duplicatestyle, keepcomments) def po2html(self, posource, htmltemplate): """Helper to convert po to html without a file.""" # Convert pofile object to bytes inputfile = BytesIO(bytes(posource)) outputfile = BytesIO() templatefile = BytesIO(htmltemplate.encode()) assert po2html.converthtml(inputfile, outputfile, templatefile) return outputfile.getvalue().decode("utf-8") def countunits(self, pofile, expected): """helper to check that we got the expected number of messages""" actual = len(pofile.units) if actual > 0: if pofile.units[0].isheader(): actual = actual - 1 print(pofile) assert actual == expected def compareunit(self, pofile, unitnumber, expected): """helper to validate a PO message""" if not pofile.units[0].isheader(): unitnumber = unitnumber - 1 print("unit source: " + pofile.units[unitnumber].source + "|") print("expected: " + expected + "|") assert str(pofile.units[unitnumber].source) == str(expected) def check_single(self, markup, itemtext): """checks that converting this markup produces a single element with value itemtext""" pofile = self.html2po(markup) self.countunits(pofile, 1) self.compareunit(pofile, 1, itemtext) def check_null(self, markup): """checks that converting this markup produces no elements""" pofile = self.html2po(markup) self.countunits(pofile, 0) def check_phpsnippet(self, php): """Given a snippet of php, put it into an HTML shell and see if the results are as expected""" self.check_single( '

Body text

', "Body text", ) self.check_single( '

More things in Body text

', 'More things in Body text', ) self.check_single( "

" + php + "

", php ) def test_extract_lang_attribute_from_html_tag(self): """Test that the lang attribute is extracted from the html tag, issue #3884""" markup = """ translate lang attribute """ pofile = self.html2po(markup) self.countunits(pofile, 2) self.compareunit(pofile, 1, "en") self.compareunit(pofile, 2, "translate lang attribute") def test_do_not_extract_lang_attribute_from_tags_other_than_html(self): """Test that the lang attribute is extracted from the html tag""" self.check_single('

Français

', "Français") def test_title(self): """test that we can extract the tag""" self.check_single( "<html><head><title>My title", "My title" ) def test_title_with_linebreak(self): """Test a linebreak in the tag""" htmltext = """<html> <head> <title>My title """ self.check_single(htmltext, "My title") def test_meta(self): """Test that we can extract certain info from .""" self.check_single( """""", "these are keywords", ) def test_tag_p(self): """test that we can extract the

tag""" self.check_single( "

A paragraph.

", "A paragraph." ) def test_tag_p_with_br(self): """test that we can extract the

tag with an embedded
element""" markup = "

First line.
Second line.

" pofile = self.html2po(markup) self.compareunit(pofile, 1, "First line.
Second line.") def test_tag_p_with_linebreak(self): """Test newlines within the

tag.""" htmltext = """

A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.

""" self.check_single( htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.", ) def test_tag_p_with_linebreak_and_embedded_br(self): """Test newlines within the

tag when there is an embedded
element.""" markup = "

First\nline.
Second\nline.

" pofile = self.html2po(markup) self.compareunit(pofile, 1, "First line.
Second line.") def test_uppercase_html(self): """Should ignore the casing of the html tags.""" self.check_single( "

A paragraph.

", "A paragraph." ) def test_tag_div(self): """test that we can extract the
tag""" self.check_single( "
A paragraph.
", "A paragraph.", ) markup = "
First line.
Second line.
" pofile = self.html2po(markup) self.compareunit(pofile, 1, "First line.
Second line.") def test_tag_div_with_linebreaks(self): """Test linebreaks within a
tag.""" htmltext = """
A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.
""" self.check_single( htmltext, "A paragraph is a section in a piece of writing, usually highlighting a particular point or topic. It always begins on a new line and usually with indentation, and it consists of at least one sentence.", ) markup = "
First\nline.
Second\nline.
" pofile = self.html2po(markup) self.compareunit(pofile, 1, "First line.
Second line.") def test_tag_a(self): """test that we can extract the tag""" self.check_single( '

A paragraph with hyperlink.

', 'A paragraph with hyperlink.', ) def test_tag_a_with_linebreak(self): """Test that we can extract the tag with newlines in it.""" htmltext = """

A paragraph with hyperlink and newlines.

""" self.check_single( htmltext, 'A paragraph with hyperlink and newlines.', ) def test_sequence_of_anchor_elements(self): """test that we can extract a sequence of anchor elements without mixing up start/end tags, issue #3768""" self.check_single( '

This is a link but this is not. However this is too

', 'This is a link but this is not. However this is too', ) def test_tag_img(self): """Test that we can extract the alt attribute from the tag.""" self.check_single( """A picture""", "A picture", ) def test_img_empty(self): """Test that we can extract the alt attribute from the tag.""" htmlsource = """""" self.check_null(htmlsource) def test_tag_img_inside_a(self): """Test that we can extract the alt attribute from the tag when the img is embedded in a link.""" self.check_single( """

A picture

""", "A picture", ) def test_tag_table_summary(self): """Test that we can extract the summary attribute.""" self.check_single( """
""", "Table summary", ) def test_table_simple(self): """Test that we can fully extract a simple table.""" markup = """
Heading OneHeading Two
OneTwo
""" pofile = self.html2po(markup) self.countunits(pofile, 4) self.compareunit(pofile, 1, "Heading One") self.compareunit(pofile, 2, "Heading Two") self.compareunit(pofile, 3, "One") self.compareunit(pofile, 4, "Two") def test_table_complex(self): markup = """
A caption
Heading OneHeading Two
Foot OneFoot Two
OneTwo
""" pofile = self.html2po(markup) self.countunits(pofile, 9) self.compareunit(pofile, 1, "This is the summary") self.compareunit(pofile, 2, "A caption") self.compareunit(pofile, 3, "Head 1") self.compareunit(pofile, 4, "Heading One") self.compareunit(pofile, 5, "Heading Two") self.compareunit(pofile, 6, "Foot One") self.compareunit(pofile, 7, "Foot Two") self.compareunit(pofile, 8, "One") self.compareunit(pofile, 9, "Two") def test_table_empty(self): """Test that we ignore tables that are empty. A table is deemed empty if it has no translatable content. """ self.check_null( """
""" ) self.check_null( """
 
""" ) self.check_null( """
""" ) def test_address(self): """Test to see if the address element is extracted""" self.check_single("
My address
", "My address") def test_headings(self): """Test to see if the h* elements are extracted""" markup = "

Heading One

Heading Two

Heading Three

Heading Four

Heading Five
Heading Six
" pofile = self.html2po(markup) self.countunits(pofile, 6) self.compareunit(pofile, 1, "Heading One") self.compareunit(pofile, 2, "Heading Two") self.compareunit(pofile, 3, "Heading Three") self.compareunit(pofile, 4, "Heading Four") self.compareunit(pofile, 5, "Heading Five") self.compareunit(pofile, 6, "Heading Six") def test_headings_with_linebreaks(self): """Test to see if h* elements with newlines can be extracted""" markup = "

Heading\nOne

Heading\nTwo

Heading\nThree

Heading\nFour

Heading\nFive
Heading\nSix
" pofile = self.html2po(markup) self.countunits(pofile, 6) self.compareunit(pofile, 1, "Heading One") self.compareunit(pofile, 2, "Heading Two") self.compareunit(pofile, 3, "Heading Three") self.compareunit(pofile, 4, "Heading Four") self.compareunit(pofile, 5, "Heading Five") self.compareunit(pofile, 6, "Heading Six") def test_dt(self): """Test to see if the definition list title (dt) element is extracted""" self.check_single( "
Definition List Item Title
", "Definition List Item Title", ) def test_dd(self): """Test to see if the definition list description (dd) element is extracted""" self.check_single( "
Definition List Item Description
", "Definition List Item Description", ) def test_span(self): """test to check that we don't double extract a span item""" self.check_single( "

You are a Spanish sentence.

", "You are a Spanish sentence.", ) def test_ul(self): """Test to see if the list item
  • is extracted""" markup = "
    • Unordered One
    • Unordered Two
    1. Ordered One
    2. Ordered Two
    " pofile = self.html2po(markup) self.countunits(pofile, 4) self.compareunit(pofile, 1, "Unordered One") self.compareunit(pofile, 2, "Unordered Two") self.compareunit(pofile, 3, "Ordered One") self.compareunit(pofile, 4, "Ordered Two") def test_nested_lists(self): """Nested lists should be extracted correctly""" markup = """Nested lists
    • Vegetables
    • Fruit
      • Bananas
      • Apples
      • Pears
      yeah, that should be enough
    • Meat
    """ pofile = self.html2po(markup) self.countunits(pofile, 8) self.compareunit(pofile, 1, "Nested lists") self.compareunit(pofile, 2, "Vegetables") self.compareunit(pofile, 3, "Fruit") self.compareunit(pofile, 4, "Bananas") self.compareunit(pofile, 5, "Apples") self.compareunit(pofile, 6, "Pears") self.compareunit(pofile, 7, "yeah, that should be enough") self.compareunit(pofile, 8, "Meat") def test_duplicates(self): """check that we use the default style of msgctxt to disambiguate duplicate messages""" markup = ( "

    Duplicate

    Duplicate

    " ) pofile = self.html2po(markup) self.countunits(pofile, 2) # FIXME change this so that we check that the msgctxt is correctly added self.compareunit(pofile, 1, "Duplicate") assert pofile.units[1].getlocations() == ["None+html.body.p:1-26"] self.compareunit(pofile, 2, "Duplicate") assert pofile.units[2].getlocations() == ["None+html.body.p:1-42"] def test_multiline_reflow(self): """check that we reflow multiline content to make it more readable for translators""" self.check_single( """South Africa""", """South Africa""", ) def test_nested_tags(self): """check that we can extract items within nested tags""" markup = "

    Extract this

    And this
    " pofile = self.html2po(markup) self.countunits(pofile, 2) self.compareunit(pofile, 1, "Extract this") self.compareunit(pofile, 2, "And this") def test_carriage_return(self): """Remove carriage returns from files in dos format.""" htmlsource = """\r \r \r \r \r \r \r \r

    The rapid expansion of telecommunications infrastructure in recent\r years has helped to bridge the digital divide to a limited extent.

    \r \r \r """ self.check_single( htmlsource, "The rapid expansion of telecommunications infrastructure in recent years has helped to bridge the digital divide to a limited extent.", ) def test_encoding_latin1(self): """Convert HTML input in iso-8859-1 correctly to unicode.""" """Also verifies that the charset declaration isn't extracted as a translation unit.""" htmlsource = b""" FMFI - South Africa - CSIR Openphone - Overview

    We aim to please \x96 will you aim too, please?

    South Africa\x92s language diversity can be challenging.

    """ pofile = self.html2po(htmlsource) self.countunits(pofile, 4) self.compareunit(pofile, 1, "FMFI - South Africa - CSIR Openphone - Overview") self.compareunit( pofile, 2, "fmfi, first mile, first inch, wireless, rural development, access devices, mobile devices, wifi, connectivity, rural connectivty, ict, low cost, cheap, digital divide, csir, idrc, community", ) self.compareunit(pofile, 3, "We aim to please \x96 will you aim too, please?") self.compareunit( pofile, 4, "South Africa\x92s language diversity can be challenging." ) def test_strip_html(self): """Ensure that unnecessary html is stripped from the resulting unit.""" htmlsource = """ FMFI - Contact
    Projects
    Home Page
    """ pofile = self.html2po(htmlsource) self.countunits(pofile, 3) self.compareunit(pofile, 2, "Projects") self.compareunit(pofile, 3, "Home Page") # Translate and convert back: pofile.units[2].target = "Projekte" pofile.units[3].target = "Tuisblad" htmlresult = ( self.po2html(bytes(pofile), htmlsource) .replace("\n", " ") .replace('= "', '="') .replace("> <", "><") ) snippet = 'Projekte' assert snippet in htmlresult snippet = 'Tuisblad' assert snippet in htmlresult def test_entityrefs_in_text(self): """Should extract html entityrefs, preserving the ones representing reserved characters""" """`See `.""" self.check_single( "

    <not an element> & " ' ’

    ", "<not an element> & \" ' \u2019", ) def test_entityrefs_in_attributes(self): """Should convert html entityrefs in attribute values""" # it would be even nicer if " and ' could be preserved, but the automatic unescaping of # attributes is deep inside html.HTMLParser. self.check_single( '<not an element> & " ' ’', " & \" ' \u2019", ) def test_charrefs(self): """Should extract html charrefs""" self.check_single( "

    ’ ’

    ", "\u2019 \u2019", ) def test_php(self): """Test that PHP snippets don't interfere""" # A simple string self.check_phpsnippet("""""") # Contains HTML tag characters (< and >) self.check_phpsnippet(""" c ? $bar : $cat))?>""") # Make sure basically any symbol can be handled # NOTE quotation mark removed since it violates the HTML format when placed in an attribute self.check_phpsnippet( """? ?>""" ) def test_multiple_php(self): """Test multiple PHP snippets in a string to make sure they get restored properly""" php1 = """""" php2 = """ c ? $bar : $cat))?>""" php3 = """? ?>""" # Put 3 different strings into an html string innertext = ( 'Body text and some ' + php2 + " more text " + php2 + php3 ) htmlsource = "

    " + innertext + "

    " self.check_single(htmlsource, innertext) def test_php_multiline(self): # A multi-line php string to test php1 = """""" # Scatter the php strings throughout the file, and show what the translation should be innertext = ( 'Body text and some ' + php1 + " more text " + php1 + php1 ) innertrans = ( 'Texte de corps et encore de ' + php1 + " plus de texte " + php1 + php1 ) htmlsource = ( "

    " + innertext + "

    " ) # Current html file transsource = ( "

    " + innertrans + "

    " ) # Expected translation pofile = self.html2po(htmlsource) pofile.units[1].target = innertrans # Register the translation in the PO file htmlresult = self.po2html(pofile, htmlsource) assert htmlresult == transsource def test_php_with_embedded_html(self): """Should not consume HTML within processing instructions""" self.check_single( "

    a b

    ?> c

    ", "a b

    ?> c", ) def test_comments(self): """Test that HTML comments are converted to translator notes in output""" pofile = self.html2po( "

    A paragraph.

    ", keepcomments=True, ) self.compareunit(pofile, 1, "A paragraph.") notes = pofile.getunits()[-1].getnotes() assert str(notes) == " a comment \n with another comment " def test_attribute_without_value(self): htmlsource = """ """ pofile = self.html2po(htmlsource) self.compareunit(pofile, 1, "EPS färg") class TestHTML2POCommand(test_convert.TestConvertCommand, TestHTML2PO): """Tests running actual html2po commands on files""" convertmodule = html2po defaultoptions = {"progress": "none"} def test_multifile_single(self): """Test the --multifile=single option and make sure it produces one pot file per input file.""" self.create_testfile( "file1.html", "
    You are only coming through in waves
    " ) self.create_testfile( "file2.html", "
    Your lips move but I cannot hear what you say
    " ) self.run_command("./", "pots", pot=True, multifile="single") assert os.path.isfile(self.get_testfilename("pots/file1.pot")) assert os.path.isfile(self.get_testfilename("pots/file2.pot")) content = str(self.read_testfile("pots/file1.pot")) assert "coming through" in content assert "cannot hear" not in content def test_multifile_onefile(self): """Test the --multifile=onefile option and make sure it produces a file, not a directory.""" self.create_testfile( "file1.html", "
    You are only coming through in waves
    " ) self.create_testfile( "file2.html", "
    Your lips move but I cannot hear what you say
    " ) self.run_command("./", "one.pot", pot=True, multifile="onefile") assert os.path.isfile(self.get_testfilename("one.pot")) content = str(self.read_testfile("one.pot")) assert "coming through" in content assert "cannot hear" in content def test_multifile_onefile_to_stdout(self, capsys): """Test the --multifile=onefile option without specifying an output file. Default is stdout.""" self.create_testfile( "file1.html", "
    You are only coming through in waves
    " ) self.create_testfile( "file2.html", "
    Your lips move but I cannot hear what you say
    " ) self.run_command("./", pot=True, multifile="onefile") content, err = capsys.readouterr() assert "coming through" in content assert "cannot hear" in content assert err == "" def test_help(self, capsys): """Test getting help.""" options = super().test_help(capsys) options = self.help_check(options, "-P, --pot") options = self.help_check(options, "--duplicates=DUPLICATESTYLE") options = self.help_check(options, "--keepcomments") options = self.help_check(options, "--multifile=MULTIFILESTYLE", last=True)