# helpcontent2/to-wiki/wikiconv2.py

#!/usr/local/bin/python3.8
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

import os, sys, thread, threading, time, re, copy
import xml.parsers.expat
import codecs
from threading import Thread

# NOTE(review): presumably the directory holding the help sources; the code
# that reads it is outside this part of the file - confirm
root="source/"
# NOTE(review): presumably the limit of concurrently running worker threads;
# the code that reads it is outside this part of the file - confirm
max_threads = 25

# list of title records; get_link_filename() matches a link against item [0]
# of each record and returns item [1] as the page name
titles = []

# map of id -> localized text
# (the id has the form '<filename>#<paragraph id>', see get_localized_text())
localization_data = {}

# to collect a list of pages that will be redirections to the pages with nice
# names
# (Bookmark.get_all() appends [app, redirect, 'target#id', authoritative])
redirects = []

# to collect images that we will up-load later
# (Image.get_all() adds the 'src' of every image it renders)
images = set()

# various types of paragraphs
# One row per paragraph role, so that the three lookups derived below cannot
# drift apart:
#   (role, markup emitted before the text, markup emitted after the text,
#    True when the start markup opens a {{...}} wiki template)
_paragraph_role_rows = (
    ('bascode', '', '\n', False),
    ('code', '<code>', '</code>\n\n', False),
    ('codeintip', '<code>', '</code>\n\n', False),
    ('emph', '', '', False), # must be empty to be able to strip empty <emph/>
    ('example', '<code>', '</code>\n\n', False),
    ('heading1', '= ', ' =\n\n', False),
    ('heading2', '== ', ' ==\n\n', False),
    ('heading3', '=== ', ' ===\n\n', False),
    ('heading4', '==== ', ' ====\n\n', False),
    ('heading5', '===== ', ' =====\n\n', False),
    ('heading6', '====== ', ' ======\n\n', False),
    ('head1', '= ', ' =\n\n', False), # used only in one file, probably in error?
    ('head2', '== ', ' ==\n\n', False), # used only in one file, probably in error?
    ('listitem', '', '', False),
    ('logocode', '<code>', '</code>\n\n', False),
    ('note', '{{Note|1=', '}}\n\n', True),
    ('null', '', '', False), # special paragraph for Variable, CaseInline, etc.
    ('ol_item', '', '', False),
    ('paragraph', '', '\n\n', False),
    ('related', '', '\n\n', False), # used only in one file, probably in error?
    ('relatedtopics', '', '\n\n', False), # used only in one file, probably in error?
    ('sup', '', '', False),
    ('tablecontent', '| | ', '\n', False),
    ('tablecontentcode', '| | <code>', '</code>\n', False),
    ('tablecontentnote', '| |{{Note|1=', '}}\n\n', True),
    ('tablecontenttip', '| |{{Tip|1=', '}}\n\n', True),
    ('tablecontentwarning', '| |{{Warning|1=', '}}\n\n', True),
    ('tablehead', '! scope="col" | ', '\n', False),
    ('tablenextnote', '\n{{Note|1=', '}}\n\n', True),
    ('tablenextpara', '\n', '\n', False),
    ('tablenextparacode', '\n<code>', '</code>\n', False),
    ('tablenexttip', '\n{{Tip|1=', '}}\n\n', True),
    ('tablenextwarning', '\n{{Warning|1=', '}}\n\n', True),
    ('tip', '{{Tip|1=', '}}\n\n', True),
    ('ul_item', '', '', False),
    ('variable', '', '', False),
    ('warning', '{{Warning|1=', '}}\n\n', True),
)

# role -> markup / flag, grouped by what is being looked up
replace_paragraph_role = {
    'start': dict((row[0], row[1]) for row in _paragraph_role_rows),
    'end': dict((row[0], row[2]) for row in _paragraph_role_rows),
    'templ': dict((row[0], row[3]) for row in _paragraph_role_rows),
}

# section ids that are rendered as a wiki template instead of as plain
# content: maps the section id to the template name (see Section.get_all())
section_id_mapping = \
    {'relatedtopics': 'RelatedTopics'}

# text snippets that we need to convert
# each entry is [text to find, replacement]; replace_text() applies them in
# this order
replace_text_list = \
    [["$[officename]", "{{ProductName}}"],
     ["%PRODUCTNAME", "{{ProductName}}"],
     ["$PRODUCTNAME", "{{ProductName}}"],
     # prefixes the phrase with U+200D
     # NOTE(review): the reason is not visible here - confirm
     ["font size", u"\u200dfont size"],
     # two apostrophes are wiki markup, so they are wrapped in <nowiki>
     ["''","<nowiki>''</nowiki>"]
    ]

def get_link_filename(link, name):
    """Find the wiki page a help link should point to.

    link - the href of the link; for links that do not start with 'http',
           the part from the first '#' on is split off as the fragment
    name - the text to look up instead when the link starts with 'http'

    Returns a (page, fragment) tuple. The page is the stripped item [1] of
    the first record in the global 'titles' whose item [0] contains the
    looked-up text; the fragment includes the leading '#', or is ''. When no
    record matches, the unmodified link and an empty fragment are returned.
    """
    text = link.strip()
    fragment = ''
    if text.startswith('http'):
        text = name
    else:
        text, separator, anchor = text.partition('#')
        fragment = separator + anchor

    for title in titles:
        try:
            if title[0].find(text) >= 0:
                return (title[1].strip(), fragment)
        except Exception:
            # best effort: a record that cannot be compared is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            pass
    return (link, '')

def replace_text(text):
    """Apply all the conversions from replace_text_list to the text.

    The entries are applied in list order; the result is returned.
    """
    for needle, replacement in replace_text_list:
        # only touch the text when the snippet really occurs in it
        if needle in text:
            text = text.replace(needle, replacement)
    return text

# modify the text so that in templates like {{Name|something}}, the 'something'
# does not look like template params
def escape_equals_sign(text):
    """Replace every '=' that is outside of brackets with '&#61;'.

    Each of '{', '[' and '<' opens a nesting level and each of '}', ']' and
    '>' closes one; a closing bracket without an opening one is ignored. An
    '=' inside at least one level is kept as it is.
    """
    opening = '{[<'
    closing = '}]>'
    nesting = 0
    pieces = []
    for char in text:
        if char == '=':
            if nesting == 0:
                pieces.append('&#61;')
            else:
                pieces.append('=')
            continue

        pieces.append(char)
        if char in opening:
            nesting += 1
        elif char in closing:
            # never go below the top level
            nesting = max(nesting - 1, 0)

    return ''.join(pieces)

def xopen(path, mode, encoding):
    """Wrapper around open() to support both python2 and python3.

    The encoding is only handed over to open() on python3; on python2 the
    file is opened without it.
    """
    if sys.version_info < (3,):
        return open(path, mode)
    return open(path, mode, encoding=encoding)

# used by escape_help_text
#
# Matches one tag: '<', an optional '/', a name of lowercase letters, '_' and
# '-', optionally an attribute part of the form name="..." (the quote may be
# preceded by a backslash), and an optional '/' before the closing '>'.
# Written as a raw string, so that the backslashes reach the regex engine
# without relying on invalid string escapes like '\-'.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-zA-Z]+?=[\\]??".*?") *[/]??>''')

def escape_help_text(text):
    """Escapes the help text as it would be in an SDF file.

    Every tag found by helptagre that belongs to one of the listed elements
    gets its '<' and '>' prefixed with a backslash; other tags stay as they
    are.
    """
    escaped_elements = ("ahelp", "link", "item", "emph", "defaultinline",
                        "switchinline", "caseinline", "variable",
                        "bookmark_value", "image", "embedvar", "alt")
    opening_prefixes = tuple("<%s" % element for element in escaped_elements)
    closing_tags = tuple("</%s>" % element for element in escaped_elements)
    standalone_tags = ("<br/>", "<help-id-missing/>")

    for tag in helptagre.findall(text):
        wanted = tag.startswith(opening_prefixes) \
                 or tag in closing_tags \
                 or tag in standalone_tags
        if wanted:
            text = text.replace(tag, "\\<" + tag[1:-1] + "\\>")
    return text


def load_localization_data(po_root):
    """Fill the global localization_data from the .po files below po_root.

    Every entry that starts with 'msgctxt ""' followed by two quoted lines
    (file name and paragraph id) is stored under the key

        <path of the .po file without '.po'>/<file name>#<paragraph id>

    where everything up to 'helpcontent2/source/' in the path is replaced
    with 'source/'. The stored value is the msgstr (single- or multi-line),
    passed through escape_help_text().

    Any previous content of localization_data is discarded. Returns True.
    """
    global localization_data
    localization_data = {}
    for dirpath, dirs, files in os.walk(po_root):
        for po_name in files:
            if re.search(r'\.po$', po_name) is None:
                continue
            path = "%s/%s" % (dirpath, po_name)
            sock = xopen(path, "r", encoding='utf-8')
            try:
                hashKey = None
                transCollecting = False
                trans = ""
                it = iter(sock)
                line = next(it, None)
                while line is not None:
                    # python2 reads byte strings, python3 reads decoded text
                    if isinstance(line, bytes):
                        line = line.decode("utf-8")
                    if line.startswith('msgctxt ""'): # constructing the hashKey
                        key = []
                        while len(key) < 2:
                            msgctxt_line = next(it, None)
                            if msgctxt_line is None or \
                               not msgctxt_line.strip().startswith('"'):
                                break
                            key.append( msgctxt_line[1:-4] ) #-4 cuts \\n"\n from the end of the line
                        if len(key) == 2: #hash key is allowed to be constructed
                            hashKey = '#'.join( (re.sub(r'^.*helpcontent2/source/', r'source/', path[:-3]) + '/' + key[0] , key[1]) )
                        else:
                            hashKey = None
                    elif hashKey is not None: # constructing trans value for hashKey
                        if transCollecting:
                            if line.startswith('"'):
                                trans = trans + line.strip()[1:-1]
                            else:
                                transCollecting = False
                                localization_data[hashKey] = escape_help_text(trans)
                                hashKey = None
                        elif line.startswith('msgstr '):
                            trans = line.strip()[8:-1]
                            if trans == '': # possibly multiline
                                transCollecting = True
                            else:
                                localization_data[hashKey] = escape_help_text(trans)
                                hashKey = None
                    line = next(it, None)

                # a multi-line msgstr that runs up to the very end of the
                # file has no following line that would store it
                if transCollecting and hashKey is not None:
                    localization_data[hashKey] = escape_help_text(trans)
            finally:
                sock.close()
    return True

def unescape(str):
    """Undo the backslash escaping of a localized string.

    A backslash marks the following character as escaped and is dropped; two
    backslashes in a row yield one backslash. An escaped '<', '>' or '&' is
    emitted as it is, an unescaped one as the matching HTML entity. All the
    other characters are copied (a backslash in front of them is dropped, as
    is a backslash at the very end).
    """
    # character -> {True: text when escaped, False: text when not escaped}
    unescape_map = {'<': {True:'<', False:'&lt;'},
                    '>': {True:'>', False:'&gt;'},
                    '&': {True:'&', False:'&amp;'},
                    '"': {True:'"', False:'"'}}
    pieces = []
    escape = False
    for c in str:
        if c == '\\':
            if escape:
                # escaped backslash
                pieces.append('\\')
            escape = not escape
            continue

        replace = unescape_map.get(c)
        if replace is None:
            pieces.append(c)
        else:
            pieces.append(replace[escape])
        escape = False

    return ''.join(pieces)

def get_localized_text(filename, id):
    """Return the unescaped localized text of a paragraph.

    The text is looked up in localization_data under '<filename>#<id>'; an
    empty string is returned when there is no such entry.
    """
    try:
        localized = localization_data['%s#%s'% (filename, id)]
    except KeyError:
        return ''

    return unescape(localized)

def href_to_fname_id(href):
    """Split a 'file#id' reference into [file name, id].

    Double quotes are removed from the reference first. The split is done at
    the first '#'; when there is none, a warning is written to stderr and the
    id is ''.
    """
    link = href.replace('"', '')
    fname, separator, ref_id = link.partition('#')
    if separator == '':
        sys.stderr.write('Reference without a "#" in "%s".\n'% link)

    return [fname, ref_id]

# Exception classes
class UnhandledItemType(Exception):
    """Exception of the converter.

    NOTE(review): presumably raised for an item type that has no conversion;
    the raising code is outside this part of the file - confirm.
    """
# Base class for all the elements
#
# self.name - name of the element, to drop the self.child_parsing flag
# self.objects - collects the child objects that are constructed during
#                parsing of the child elements
# self.child_parsing - flag whether we are parsing a child, or the object
#                      itself
# self.parent - parent object
class ElementBase:
    def __init__(self, name, parent):
        self.name = name
        self.objects = []
        self.child_parsing = False
        self.parent = parent

    # called for the start of an element inside this one; the base class
    # ignores everything
    def start_element(self, parser, name, attrs):
        pass

    # the end of our own element hands the parsing back to the parent
    def end_element(self, parser, name):
        if name == self.name:
            self.parent.child_parsing = False

    # called for the text inside this element; the base class ignores it
    def char_data(self, parser, data):
        pass

    # the innermost object that is being parsed right now: follows the most
    # recently added child for as long as child_parsing is set
    def get_curobj(self):
        if self.child_parsing:
            return self.objects[-1].get_curobj()
        return self

    # start parsing a child element
    def parse_child(self, child):
        self.child_parsing = True
        self.objects.append(child)

    # construct the wiki representation of this object, including the objects
    # held in self.objects (here only the text of the objects)
    def get_all(self):
        return u''.join(child.get_all() for child in self.objects)

    # for handling variables, and embedding in general
    # id - the variable name we want to get
    # returns the first match found in the children, or None
    def get_variable(self, id):
        for child in self.objects:
            if child is not None:
                var = child.get_variable(id)
                if var is not None:
                    return var
        return None

    # embed part of another file into current structure
    # parent_parser - the parser of the embedding file; its current_app,
    #                 wiki_page_name and lang are handed over to the new one
    # fname, id - the file (relative to 'source/') and the id to embed
    def embed_href(self, parent_parser, fname, id):
        # parse another xhp
        parser = XhpParser('source/' + fname, False, \
                parent_parser.current_app, parent_parser.wiki_page_name, \
                parent_parser.lang)
        var = parser.get_variable(id)

        if var is not None:
            # an embedded variable is rendered as a normal paragraph; objects
            # without a role (not every embeddable object has one) are taken
            # as they are
            if getattr(var, 'role', None) == 'variable':
                var.role = 'paragraph'
            self.objects.append(var)
        elif parser.follow_embed:
            sys.stderr.write('Cannot find reference "#%s" in "%s".\n'% \
                    (id, fname))

    # report an element that this object does not know how to handle
    def unhandled_element(self, parser, name):
        sys.stderr.write('Warning: Unhandled element "%s" in "%s" (%s)\n'% \
                        (name, self.name, parser.filename))

# Base class for trivial elements that operate on char_data
#
# Like <comment>, or <title>
class TextElementBase(ElementBase):
    # element_name - name of the element
    # start, end - wiki text emitted before / after the collected text
    # templ - True when the text needs its '=' escaped (see
    #         escape_equals_sign)
    def __init__(self, attrs, parent, element_name, start, end, templ):
        ElementBase.__init__(self, element_name, parent)
        self.start, self.end, self.templ = start, end, templ
        self.text = u''

    # collect the text of the element
    def char_data(self, parser, data):
        self.text += data

    # the converted text between the start and the end markup
    def get_all(self):
        body = replace_text(self.text)
        if self.templ:
            body = escape_equals_sign(body)
        return self.start + body + self.end

class XhpFile(ElementBase):
    """The top of the object tree: handles the outermost elements of a file."""

    def __init__(self):
        ElementBase.__init__(self, None, None)

    def start_element(self, parser, name, attrs):
        if name in ('body', 'helpdocument'):
            # ignored, we flatten the structure
            return

        if name in ('embed', 'embedvar'):
            if parser.follow_embed:
                (fname, id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, id)
            return

        if name == 'paragraph':
            # the parser decides what kind of paragraph object to create
            parser.parse_paragraph(attrs, self)
            return

        # all the remaining known elements become a child object
        if name == 'bookmark':
            child = Bookmark(attrs, self, 'div', parser)
        elif name == 'comment':
            child = Comment(attrs, self)
        elif name == 'list':
            child = List(attrs, self, False)
        elif name == 'meta':
            child = Meta(attrs, self)
        elif name == 'section':
            child = Section(attrs, self)
        elif name == 'sort':
            child = Sort(attrs, self)
        elif name == 'switch':
            child = Switch(attrs, self, parser.embedding_app)
        elif name == 'table':
            child = Table(attrs, self)
        elif name == 'bascode':
            child = BasicCode(attrs, self)
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

class Bookmark(ElementBase):
    """A <bookmark>: an anchor inside the page and, for help ids, a redirect.

    type is 'div' or 'span' and selects the element used for the anchor.
    """

    def __init__(self, attrs, parent, type, parser):
        ElementBase.__init__(self, 'bookmark', parent)

        self.type = type

        self.id = attrs['id']
        self.app = ''
        self.redirect = ''
        self.target = ''
        self.authoritative = False

        # let's construct the name of the redirect, so that we can point
        # to the wikihelp directly from the LO code; wiki then takes care of
        # the correct redirect
        branch = attrs['branch']
        if branch.startswith('hid/') and \
           (parser.current_app_raw != '' or parser.follow_embed):
            self.app = parser.current_app_raw
            self.target = parser.wiki_page_name
            self.authoritative = parser.follow_embed
            # the name is what follows the 'hid/' prefix, with the remaining
            # slashes percent-encoded
            self.redirect = branch[len('hid/'):].replace("/", "%2F")

    # Returns the anchor; as a side effect, every call that has a redirect
    # and a target appends one record to the global 'redirects'.
    def get_all(self):
        # first of all, we need to create a redirect page for this one
        if self.redirect != '' and self.target != '':
            redirects.append([self.app, self.redirect, \
                '%s#%s'% (self.target, self.id), \
                self.authoritative])

        # then we also have to setup ID inside the page
        if self.type == 'div':
            return '<div id="%s"></div>\n'% self.id
        elif self.type == 'span':
            return '<span id="%s"></span>'% self.id

        sys.stderr.write('Unknown bookmark type "%s"\n'% self.type)
        return ''

class Image(ElementBase):
    """An <image>, rendered as a wiki image link with the <alt> text."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'image', parent)
        self.src     = attrs['src']
        self.align   = 'left'
        # True while we are inside of the <alt> element
        self.alt     = False
        self.alttext = ""

    def start_element(self, parser, name, attrs):
        if name != 'alt':
            self.unhandled_element(parser, name)
            return
        self.alt = True

    def end_element(self, parser, name):
        ElementBase.end_element(self, parser, name)

        if name == 'alt':
            self.alt = False

    def char_data(self, parser, data):
        # only the text of <alt> is of interest
        if self.alt:
            self.alttext += data

    def get_all(self):
        # remember the image for the later upload
        images.add(self.src)

        # the wiki knows the image by its file name only
        name = self.src.rsplit('/', 1)[-1]
        return "[[Image:" + name + "|border|" + self.align + "|" + \
            self.alttext + "]]"

    # the image handles its <alt> itself, it never hands over to a child
    def get_curobj(self):
        return self

class Br(TextElementBase):
    """A <br>, rendered as '<br/>' followed by its text."""

    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent,
                                 element_name='br',
                                 start='<br/>', end='', templ=False)

class Comment(TextElementBase):
    """A <comment>, rendered as '<!-- text -->'."""

    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent,
                                 element_name='comment',
                                 start='<!-- ', end=' -->', templ=False)

class HelpIdMissing(TextElementBase):
    """A <help-id-missing>, rendered as the {{MissingHelpId}} template."""

    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent,
                                 element_name='help-id-missing',
                                 start='{{MissingHelpId}}', end='',
                                 templ=False)

class Text:
    """A piece of plain text; the conversions are applied on construction."""

    def __init__(self, text):
        converted = replace_text(text)
        self.wikitext = converted

    # the converted text
    def get_all(self):
        return self.wikitext

    # plain text never holds a variable
    def get_variable(self, id):
        return None

class TableCell(ElementBase):
    """A <tablecell>; a cell without any child element renders as empty."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'tablecell', parent)
        # set as soon as any element starts inside of the cell
        self.cellHasChildElement = False

    def start_element(self, parser, name, attrs):
        self.cellHasChildElement = True

        if name == 'bascode':
            # ignored, do not syntax highlight in table cells
            return

        if name in ('embed', 'embedvar'):
            # the reference is split (and checked) even when it is not
            # followed
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
            return

        if name == 'paragraph':
            parser.parse_localized_paragraph(TableContentParagraph, attrs, self)
            return

        if name == 'bookmark':
            child = Bookmark(attrs, self, 'div', parser)
        elif name == 'comment':
            child = Comment(attrs, self)
        elif name == 'section':
            child = Section(attrs, self)
        elif name == 'list':
            child = List(attrs, self, True)
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

    def get_all(self):
        prefix = ''
        if not self.cellHasChildElement: # an empty element
            # get from TableRow Element
            role = 'tablehead' if self.parent.isTableHeader else 'tablecontent'
            prefix = replace_paragraph_role['start'][role] + \
                replace_paragraph_role['end'][role]
        return prefix + ElementBase.get_all(self)

class TableRow(ElementBase):
    """A <tablerow>, rendered as '|-' followed by its cells."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'tablerow', parent)

    def start_element(self, parser, name, attrs):
        # NOTE(review): the flag is created here (not in __init__) and is
        # reset on the start of every child element; nothing in the visible
        # part of the file sets it to True - confirm against the rest
        self.isTableHeader = False
        if name != 'tablecell':
            self.unhandled_element(parser, name)
            return
        self.parse_child(TableCell(attrs, self))

    def get_all(self):
        return '|-\n' + ElementBase.get_all(self)

class BasicCode(ElementBase):
    """A <bascode> block, rendered inside <source lang="oobas">."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'bascode', parent)

    def start_element(self, parser, name, attrs):
        if name != 'paragraph':
            self.unhandled_element(parser, name)
            return
        parser.parse_localized_paragraph(BasicCodeParagraph, attrs, self)

    def get_all(self):
        code = ElementBase.get_all(self)
        return '<source lang="oobas">\n' + code + '</source>\n\n'

class Table(ElementBase):
    """A <table>, rendered as a wiki table of the class 'wikitable'."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'table', parent)

    def start_element(self, parser, name, attrs):
        if name == 'tablerow':
            child = TableRow(attrs, self)
        elif name == 'comment':
            child = Comment(attrs, self)
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

    def get_all(self):
        # + ' align="left"' etc.?
        rows = ElementBase.get_all(self)
        return '{| class="wikitable"\n' + rows + '|}\n\n'

class ListItem(ElementBase):
    """A <listitem>; the kind of the marker depends on the parent list."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'listitem', parent)

    def start_element(self, parser, name, attrs):
        if name in ('embed', 'embedvar'):
            # the reference is split (and checked) even when it is not
            # followed
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
            return

        if name == 'paragraph':
            parser.parse_localized_paragraph(ListItemParagraph, attrs, self)
            return

        if name == 'bookmark':
            child = Bookmark(attrs, self, 'span', parser)
        elif name == 'list':
            child = List(attrs, self, False)
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

    def get_all(self):
        # a list with an explicit start is written as html, the others use
        # the wiki markers
        if self.parent.startwith > 0:
            prefix, postfix = '<li>', '</li>'
        elif self.parent.type == 'ordered':
            prefix, postfix = '#', '\n'
        else:
            prefix, postfix = '*', '\n'

        # add the text itself; the objects are separated by line breaks
        parts = [prefix]
        for position, child in enumerate(self.objects):
            if position > 0:
                parts.append('<br/>')
            # when the object is another list (i.e. nested lists), only the
            # first item gets the '#' sign in the front by the code above;
            # the re.sub inserts the extra '#' for all additional items of
            # the list
            parts.append(re.sub(r'\n\s*#', '\n##', child.get_all()))
        parts.append(postfix)

        return ''.join(parts)

class List(ElementBase):
    """A <list>.

    isInTable - True when the list is inside of a table cell; the output
                then starts with the cell markup
    """

    def __init__(self, attrs, parent, isInTable):
        ElementBase.__init__(self, 'list', parent)

        self.isInTable = isInTable
        self.type = attrs['type']
        # number to start the list with; 0 when it is missing or not a number
        try:
            self.startwith = int(attrs['startwith'])
        except (KeyError, ValueError, TypeError):
            self.startwith = 0

    def start_element(self, parser, name, attrs):
        if name == 'listitem':
            self.parse_child(ListItem(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        text = ""
        if self.isInTable:
            text = '| |\n'
        # a list with an explicit start is written as html <ol>
        if self.startwith > 0:
            text = text + '<ol start="%d">\n'% self.startwith

        text = text + ElementBase.get_all(self)

        if self.startwith > 0:
            text = text + '\n</ol>\n'
        else:
            text = text + '\n'
        return text

# To handle elements that should be completely ignored
class Ignore(ElementBase):
    """Swallows an element: nothing is collected, so nothing is rendered."""

    def __init__(self, attrs, parent, element_name):
        ElementBase.__init__(self, name=element_name, parent=parent)

class OrigTitle(TextElementBase):
    """The <title>, rendered as the {{OrigLang|...}} template."""

    def __init__(self, attrs, parent):
        TextElementBase.__init__(self, attrs, parent,
                                 element_name='title',
                                 start='{{OrigLang|', end='}}\n', templ=True)

class Title(TextElementBase):
    """The <title>, rendered as the {{Lang|...}} template.

    localized_title - replaces the text of the element, unless it is empty
    """

    def __init__(self, attrs, parent, localized_title):
        TextElementBase.__init__(self, attrs, parent,
                                 element_name='title',
                                 start='{{Lang|', end='}}\n', templ=True)
        self.localized_title = localized_title

    def get_all(self):
        localized = self.localized_title
        if localized != '':
            # the localized title wins over the collected text
            self.text = localized
        return TextElementBase.get_all(self)

class Topic(ElementBase):
    """The <topic>: holds the title of the page."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'topic', parent)

    def start_element(self, parser, name, attrs):
        if name == 'filename':
            child = Ignore(attrs, self, name)
        elif name == 'title':
            if parser.lang == '':
                # no language given, use the title as it is in the file
                child = OrigTitle(attrs, self)
            else:
                child = Title(attrs, self,
                              get_localized_text(parser.filename, 'tit'))
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

class Meta(ElementBase):
    """The <meta>: only the <topic> is used, the <history> is ignored."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'meta', parent)

    def start_element(self, parser, name, attrs):
        if name == 'history':
            child = Ignore(attrs, self, name)
        elif name == 'topic':
            child = Topic(attrs, self)
        else:
            self.unhandled_element(parser, name)
            return
        self.parse_child(child)

class Section(ElementBase):
    """A <section>; it can be embedded elsewhere by its id."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'section', parent)
        self.id = attrs[ 'id' ]

    def start_element(self, parser, name, attrs):
        if name == 'bookmark':
            self.parse_child(Bookmark(attrs, self, 'div', parser))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'embed' or name == 'embedvar':
            # the reference is split (and checked) even when it is not
            # followed
            (fname, id) = href_to_fname_id(attrs['href'])
            if parser.follow_embed:
                self.embed_href(parser, fname, id)
        elif name == 'list':
            self.parse_child(List(attrs, self, False))
        elif name == 'paragraph':
            parser.parse_paragraph(attrs, self)
        elif name == 'section':
            # sections can be nested
            self.parse_child(Section(attrs, self))
        elif name == 'switch':
            self.parse_child(Switch(attrs, self, parser.embedding_app))
        elif name == 'table':
            self.parse_child(Table(attrs, self))
        elif name == 'bascode':
            self.parse_child(BasicCode(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        # some of the section ids are used as real id's, some of them have
        # function (like relatedtopics), and have to be templatized
        mapping = section_id_mapping.get(self.id, '')
        content = ElementBase.get_all(self)
        if mapping != '':
            return '{{%s|%s}}\n\n'% (mapping, escape_equals_sign(content))
        return content

    # a variable inside of the section wins; otherwise the section itself is
    # returned when it has the wanted id
    def get_variable(self, id):
        var = ElementBase.get_variable(self, id)
        if var is not None:
            return var
        if id == self.id:
            return self
        return None

class Sort(ElementBase):
    """A <sort>: renders its sections sorted by their id."""

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'sort', parent)

        try:
            self.order = attrs['order']
        except KeyError:
            # no order given
            self.order = 'asc'

    def start_element(self, parser, name, attrs):
        if name == 'section':
            self.parse_child(Section(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def get_all(self):
        # NOTE(review): the order 'asc' sorts with reverse=True, i.e. the
        # highest id comes first - this looks inverted, but it is kept as it
        # was; confirm what is intended
        rev = (self.order == 'asc')
        self.objects = sorted(self.objects, key=lambda obj: obj.id, reverse=rev)

        return ElementBase.get_all(self)

class Link(ElementBase):
    """A <link>, rendered as an external or an internal wiki link.

    lang - language of the page; when not empty, internal links point to the
           '<page>/<lang>' sub-page
    """

    def __init__(self, attrs, parent, lang):
        ElementBase.__init__(self, 'link', parent)

        self.link = attrs['href']
        try:
            self.lname = attrs['name']
        except KeyError:
            # no name given, use the last part of the href
            self.lname = self.link[self.link.rfind("/")+1:]
        # Override lname
        self.default_name = self.lname
        (self.lname, self.fragment) = get_link_filename(self.link, self.lname)
        self.wikitext = ""
        self.lang = lang

    # the text inside of the element is the label of the link
    def char_data(self, parser, data):
        self.wikitext = self.wikitext + data

    # NOTE(review): the converted label is stored back to self.wikitext, so
    # the conversions are applied again on every further call
    def get_all(self):
        if self.wikitext == "":
            self.wikitext = self.default_name

        self.wikitext = replace_text(self.wikitext)
        if self.link.startswith("http"):
            text = '[%s %s]'% (self.link, self.wikitext)
        elif self.lang != '':
            text = '[[%s/%s%s|%s]]'% (self.lname, self.lang, self.fragment, self.wikitext)
        else:
            text = '[[%s%s|%s]]'% (self.lname, self.fragment, self.wikitext)
        return text

class SwitchInline(ElementBase):
    """A <switchinline>: content that depends on the system or application.

    app - the application the content is embedded into, or '' when all the
          cases should be rendered
    """

    def __init__(self, attrs, parent, app):
        ElementBase.__init__(self, 'switchinline', parent)
        self.switch = attrs['select']
        self.embedding_app = app

    def start_element(self, parser, name, attrs):
        if name == 'caseinline':
            self.parse_child(CaseInline(attrs, self, False))
        elif name == 'defaultinline':
            self.parse_child(CaseInline(attrs, self, True))
        else:
            self.unhandled_element(parser, name)

    # 'sys' switches become one {{System|...}} template, 'appl' switches
    # either a sequence of {{WhenIn<App>|...}} templates, or directly the
    # text of the case that matches the embedding application; everything
    # else renders as ''
    def get_all(self):
        if len(self.objects) == 0:
            return ''
        elif self.switch == 'sys':
            system = {'MAC':'', 'UNIX':'', 'WIN':'', 'default':''}
            for i in self.objects:
                if i.case in system:
                    system[i.case] = i.get_all()
                elif i.case == 'OS2':
                    # ignore, there is only one mention of OS2, which is a
                    # 'note to translators', and no meat
                    pass
                elif i.case == 'HIDE_HERE':
                    # do what the name suggest ;-)
                    pass
                else:
                    sys.stderr.write('Unhandled "%s" case in "sys" switchinline.\n'% \
                            i.case )
            text = '{{System'
            # (case name, name of the template parameter), in output order
            for (case, param) in (('default', 'default'), ('MAC', 'mac'), \
                                  ('UNIX', 'unx'), ('WIN', 'win')):
                if system[case] != '':
                    text = '%s|%s=%s'% (text, param, system[case])
            return text + '}}'
        elif self.switch == 'appl':
            # we want directly use the right text, when inlining something
            # 'shared' into an 'app'
            if self.embedding_app == '':
                # case name -> suffix of the WhenIn template; '' marks the
                # cases that are used as the default text
                appls = {'BASIC':'Basic', 'CALC':'Calc', \
                         'CHART':'Chart', 'DRAW':'Draw', \
                         'IMAGE':'Draw', 'IMPRESS': 'Impress', \
                         'MATH':'Math', 'WRITER':'Writer', \
                         'OFFICE':'', 'default':''}
                text = ''
                default = ''
                for i in self.objects:
                    # only the lookup decides whether the case is known;
                    # errors from rendering the content are not hidden
                    # behind the 'Unhandled case' warning
                    if i.case not in appls:
                        sys.stderr.write('Unhandled "%s" case in "appl" switchinline.\n'% \
                                i.case)
                        continue

                    app = appls[i.case]
                    content = i.get_all()
                    if content == '':
                        continue
                    if app == '':
                        default = content
                    else:
                        text = text + '{{WhenIn%s|%s}}'% (app, escape_equals_sign(content))

                if text == '':
                    text = default
                elif default != '':
                    text = text + '{{WhenDefault|%s}}'% escape_equals_sign(default)

                return text
            else:
                for i in self.objects:
                    if i.case == self.embedding_app:
                        return i.get_all()

        return ''

class Case(ElementBase):
    """One branch (<case select="..."> or <default>) of a block-level switch.

    The selector of the branch is kept in self.case; the default branch
    uses the fixed selector 'default' and is also renamed to 'default'.
    """
    def __init__(self, attrs, parent, is_default):
        ElementBase.__init__(self, 'case', parent)

        if not is_default:
            self.case = attrs['select']
        else:
            self.name = 'default'
            self.case = 'default'

    def start_element(self, parser, name, attrs):
        """Create the child object that matches the opening tag 'name'."""
        if name in ('embed', 'embedvar'):
            # embedded content is resolved only when the parser follows embeds
            if parser.follow_embed:
                (target_file, target_id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, target_file, target_id)
            return

        if name == 'paragraph':
            # paragraphs are created by the parser itself
            parser.parse_paragraph(attrs, self)
            return

        if name == 'bookmark':
            child = Bookmark(attrs, self, 'div', parser)
        elif name == 'comment':
            child = Comment(attrs, self)
        elif name == 'list':
            child = List(attrs, self, False)
        elif name == 'section':
            child = Section(attrs, self)
        elif name == 'table':
            child = Table(attrs, self)
        elif name == 'bascode':
            child = BasicCode(attrs, self)
        else:
            self.unhandled_element(parser, name)
            return

        self.parse_child(child)

class Switch(SwitchInline):
    """Block-level switch; its branches are Case objects."""
    def __init__(self, attrs, parent, app):
        SwitchInline.__init__(self, attrs, parent, app)
        self.name = 'switch'

    def start_element(self, parser, name, attrs):
        """Open a Case for <case>/<default>; anything else is unhandled."""
        # refreshed from the parser on every opening tag
        self.embedding_app = parser.embedding_app
        if name not in ('case', 'default'):
            self.unhandled_element(parser, name)
            return
        self.parse_child(Case(attrs, self, name == 'default'))

class Item(ElementBase):
    """An <item type="..."> element: a short text wrapped in wiki markup.

    replace_type maps the item type to the opening markup ('start'), the
    closing markup ('end') and to a flag ('templ') telling whether the text
    ends up inside a wiki template, in which case equals signs are escaped.
    """
    # (type, opening markup, closing markup, text is a template argument)
    # 'tasto', 'litera' and 'mwnuitem' look like localized or misspelled
    # variants of 'keycode', 'literal' and 'menuitem' found in the sources.
    _markup = (
        ('acronym', "''", "''", False),
        ('code', '<code>', '</code>', False),
        ('input', '<code>', '</code>', False),
        ('keycode', '{{KeyCode|', '}}', True),
        ('tasto', '{{KeyCode|', '}}', True),
        ('litera', '<code>', '</code>', False),
        ('literal', '<code>', '</code>', False),
        ('menuitem', '{{MenuItem|', '}}', True),
        ('mwnuitem', '{{MenuItem|', '}}', True),
        ('OpenOffice.org', '', '', False),
        ('productname', '', '', False),
        ('unknown', '<code>', '</code>', False),
    )

    replace_type = {
        'start': dict((row[0], row[1]) for row in _markup),
        'end': dict((row[0], row[2]) for row in _markup),
        'templ': dict((row[0], row[3]) for row in _markup),
    }

    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'item', parent)

        self.text = ''
        # items without a usable type attribute are rendered as 'unknown'
        try:
            self.type = attrs['type']
        except:
            self.type = 'unknown'

    def char_data(self, parser, data):
        """Accumulate the character data of the item."""
        self.text += data

    def get_all(self):
        """Return the item text wrapped in the markup of its type.

        Any failure (typically a type missing from replace_type) is reported
        on stderr and turned into UnhandledItemType.
        """
        markup = self.replace_type
        try:
            needs_escaping = markup['templ'][self.type]
            body = replace_text(self.text)
            if needs_escaping:
                body = escape_equals_sign(body)
            return markup['start'][self.type] + body + markup['end'][self.type]
        except:
            try:
                sys.stderr.write('Unhandled item type "%s".\n'% self.type)
            except:
                # the type itself could not be written out
                sys.stderr.write('Unhandled item type. Possibly type has been localized.\n')
            finally:
                raise UnhandledItemType

class Paragraph(ElementBase):
    """A <paragraph> element; also the base of the paragraph-like elements.

    Attributes set from the XML attributes:
      role     -- 'role' attribute, 'paragraph' when missing
      id       -- 'id' attribute, "" when missing
      level    -- 'level' attribute as int, 0 when missing or not a number
      is_first -- True when the parent had no objects yet at creation time
    """
    def __init__(self, attrs, parent):
        ElementBase.__init__(self, 'paragraph', parent)

        self.role = attrs.get('role', 'paragraph')
        self.id = attrs.get('id', "")

        try:
            self.level = int(attrs['level'])
        except (KeyError, TypeError, ValueError):
            self.level = 0

        self.is_first = (len(self.parent.objects) == 0)

    def start_element(self, parser, name, attrs):
        """Create the child object that matches the opening tag 'name'."""
        if name == 'ahelp':
            # only hidden ahelp is ignored; otherwise its content is
            # handled as part of this paragraph
            if attrs.get('visibility') == 'hidden':
                self.parse_child(Ignore(attrs, self, name))
        elif name == 'br':
            self.parse_child(Br(attrs, self))
        elif name == 'comment':
            self.parse_child(Comment(attrs, self))
        elif name == 'emph':
            self.parse_child(Emph(attrs, self))
        elif name == 'sup':
            self.parse_child(Sup(attrs, self))
        elif name == 'embedvar':
            if parser.follow_embed:
                (fname, var_id) = href_to_fname_id(attrs['href'])
                self.embed_href(parser, fname, var_id)
        elif name == 'help-id-missing':
            self.parse_child(HelpIdMissing(attrs, self))
        elif name == 'image':
            self.parse_child(Image(attrs, self))
        elif name == 'item':
            self.parse_child(Item(attrs, self))
        elif name == 'link':
            self.parse_child(Link(attrs, self, parser.lang))
        elif name == 'localized':
            # we ignore this tag, it is added arbitrary for the paragraphs
            # that come from .sdf files
            pass
        elif name == 'switchinline':
            self.parse_child(SwitchInline(attrs, self, parser.embedding_app))
        elif name == 'variable':
            self.parse_child(Variable(attrs, self))
        else:
            self.unhandled_element(parser, name)

    def char_data(self, parser, data):
        """Store character data as a Text child.

        For the flowing-text roles, leading whitespace that starts with a
        space is reduced to one space and newlines become spaces.
        """
        if self.role in ('paragraph', 'heading', 'listitem', 'variable'):
            if data != '' and data[0] == ' ':
                data = ' ' + data.lstrip()
            data = data.replace('\n', ' ')

        if len(data):
            self.objects.append(Text(data))

    def get_all(self):
        """Return the wiki text of the paragraph, '' when it has no content.

        Raises UnhandledItemType (with the paragraph id in the message) when
        one of the children does.
        """
        role = self.role
        if role == 'heading':
            if self.level <= 0:
                sys.stderr.write('Heading, but the level is %d.\n'% self.level)
            elif self.level < 6:
                role = 'heading%d'% self.level
            else:
                role = 'heading6'

        # if we are not the first para in the table, we need special handling
        if not self.is_first and role.startswith('table'):
            role = {'tablecontentcode': 'tablenextparacode',
                    'tablecontentnote': 'tablenextnote',
                    'tablecontenttip': 'tablenexttip',
                    'tablecontentwarning': 'tablenextwarning'
                   }.get(role, 'tablenextpara')

        # the text itself
        try:
            children = ElementBase.get_all(self)
        except UnhandledItemType:
            raise UnhandledItemType('Paragraph id: '+str(self.id))
        # whitespace is significant for these roles, keep it
        if self.role not in ('emph', 'bascode', 'logocode'):
            children = children.strip()

        if len(children) == 0:
            return ''

        # prepend the markup according to the role
        text = ''
        try:
            text = text + replace_paragraph_role['start'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role start: " + role + "\n" )

        # An unknown role has already been reported above; treat it as a
        # non-template role instead of failing with KeyError here, so that
        # the paragraph still makes it into the output as plain text.
        if replace_paragraph_role['templ'].get(role, False):
            text = text + escape_equals_sign(children)
        else:
            text = text + children

        # append the markup according to the role
        try:
            text = text + replace_paragraph_role['end'][role]
        except KeyError:
            sys.stderr.write( "Unknown paragraph role end: " + role + "\n" )

        return text

class Variable(Paragraph):
    """A <variable id="..."> element; the id attribute is mandatory."""
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = self.role = 'variable'
        # unlike Paragraph, a missing id is an error here
        self.id = attrs['id']

    def get_variable(self, id):
        """Return this object when 'id' is its id, None otherwise."""
        return self if id == self.id else None

class CaseInline(Paragraph):
    """One branch (<caseinline select="..."> or <defaultinline>) of an
    inline switch; the selector is kept in self.case."""
    def __init__(self, attrs, parent, is_default):
        Paragraph.__init__(self, attrs, parent)

        # the 'null' role adds no markup around the content
        self.role = 'null'
        if not is_default:
            self.name = 'caseinline'
            self.case = attrs['select']
        else:
            self.name = 'defaultinline'
            self.case = 'default'

class Emph(Paragraph):
    """An <emph> element, rendered between ''' markers."""
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = self.role = 'emph'

    def get_all(self):
        """Return the content wrapped in ''' or '' when there is none."""
        content = Paragraph.get_all(self)
        if not len(content):
            return ''
        return "'''" + content + "'''"

class Sup(Paragraph):
    """A <sup> element, rendered as <sup>...</sup>."""
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        self.name = self.role = 'sup'

    def get_all(self):
        """Return the content wrapped in <sup> tags or '' when there is none."""
        content = Paragraph.get_all(self)
        if not len(content):
            return ''
        return "<sup>" + content + "</sup>"

class ListItemParagraph(Paragraph):
    """Paragraph whose role is always 'listitem', whatever the XML says."""
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        # override the role taken from the attributes
        self.role = 'listitem'

class BasicCodeParagraph(Paragraph):
    """Paragraph whose role is always 'bascode', whatever the XML says."""
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)
        # override the role taken from the attributes
        self.role = 'bascode'

class TableContentParagraph(Paragraph):
    """Paragraph inside a table cell.

    The role from the XML is translated to one of the table roles, and the
    object two levels up (the table row) is told whether it is a header.
    """
    def __init__(self, attrs, parent):
        Paragraph.__init__(self, attrs, parent)

        # roles that have a dedicated table variant; the rest is plain content
        table_roles = {'code': 'tablecontentcode',
                       'bascode': 'tablecontentcode',
                       'logocode': 'tablecontentcode',
                       'note': 'tablecontentnote',
                       'tip': 'tablecontenttip',
                       'warning': 'tablecontentwarning'}
        if self.role not in ('tablehead', 'tablecontent'):
            self.role = table_roles.get(self.role, 'tablecontent')

        # self.parent.parent is TableRow Element
        self.parent.parent.isTableHeader = (self.role == 'tablehead')

class ParserBase:
    """Feeds an XML buffer through expat into a tree of element objects.

    The whole buffer is parsed in the constructor.  Every expat event is
    forwarded to the object returned by head_object.get_curobj().
    """
    def __init__(self, filename, follow_embed, embedding_app, current_app, wiki_page_name, lang, head_object, buffer):
        self.head_obj = head_object
        self.filename = filename
        self.wiki_page_name = wiki_page_name
        self.lang = lang
        self.follow_embed = follow_embed
        self.embedding_app = embedding_app
        self.current_app = current_app

        expat_parser = xml.parsers.expat.ParserCreate()
        expat_parser.StartElementHandler = self.start_element
        expat_parser.EndElementHandler = self.end_element
        expat_parser.CharacterDataHandler = self.char_data

        expat_parser.Parse(buffer)

    def start_element(self, name, attrs):
        """expat callback: opening tag."""
        current = self.head_obj.get_curobj()
        current.start_element(self, name, attrs)

    def end_element(self, name):
        """expat callback: closing tag."""
        current = self.head_obj.get_curobj()
        current.end_element(self, name)

    def char_data(self, data):
        """expat callback: character data."""
        current = self.head_obj.get_curobj()
        current.char_data(self, data)

    def get_all(self):
        """Return whatever the head object renders."""
        return self.head_obj.get_all()

    def get_variable(self, id):
        """Look up a variable through the head object."""
        return self.head_obj.get_variable(id)

    def parse_localized_paragraph(self, Paragraph_type, attrs, obj):
        """Add a Paragraph_type child to obj, using the translation if any.

        When a translation exists and is well-formed XML, it is parsed into
        the new paragraph and the original content is ignored; otherwise the
        original content is parsed.
        """
        try:
            translation = get_localized_text(self.filename, attrs['id'])
        except:
            # no id, or no translation available
            translation = ''

        paragraph = Paragraph_type(attrs, obj)
        if translation == '':
            obj.parse_child(paragraph)
            return

        # parse the localized text
        document = u'<?xml version="1.0" encoding="UTF-8"?><localized>' + translation + '</localized>'
        try:
            ParserBase(self.filename, self.follow_embed, self.embedding_app,
                       self.current_app, self.wiki_page_name, self.lang,
                       paragraph, document.encode('utf-8'))
        except xml.parsers.expat.ExpatError:
            sys.stderr.write( 'Invalid XML in translated text. Using the original text. Error location:\n'
                              + 'Current xhp: ' + self.filename + '\nParagraph id: ' + attrs['id'] + '\n')
            # new paragraph must be created because "paragraph" is corrupted by "ParserBase"
            obj.parse_child(Paragraph_type(attrs, obj))
            return

        # add it to the overall structure
        obj.objects.append(paragraph)
        # and ignore the original text
        obj.parse_child(Ignore(attrs, obj, 'paragraph'))

    def parse_paragraph(self, attrs, obj):
        """Parse a <paragraph>, dropping the first level 1 heading when
        embeds are followed and self.ignore_heading is set."""
        skip = False
        try:
            if attrs['role'] == 'heading' and int(attrs['level']) == 1 \
                    and self.ignore_heading and self.follow_embed:
                self.ignore_heading = False
                skip = True
        except:
            # missing attributes, or no ignore_heading on this parser
            pass

        if not skip:
            self.parse_localized_paragraph(Paragraph, attrs, obj)
            return
        obj.parse_child(Ignore(attrs, obj, 'paragraph'))

class XhpParser(ParserBase):
    """Parser of one .xhp file read from disk.

    The application the file belongs to is derived from the directory names
    in 'filename'; it also becomes the embedding application when
    'embedding_app' is empty.
    """
    def __init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang):
        # we want to ignore the 1st level="1" heading, because in most of the
        # cases, it is the only level="1" heading in the file, and it is the
        # same as the page title
        self.ignore_heading = True

        current_app = ''
        self.current_app_raw = ''
        for module_dir, app_name in (('sbasic', 'BASIC'), ('scalc', 'CALC'),
                                     ('sdatabase', 'DATABASE'), ('sdraw', 'DRAW'),
                                     ('schart', 'CHART'), ('simpress', 'IMPRESS'),
                                     ('smath', 'MATH'), ('swriter', 'WRITER')):
            if '/%s/'% module_dir in filename:
                self.current_app_raw = module_dir
                current_app = app_name
                break

        if embedding_app == '':
            embedding_app = current_app

        # the with block closes the file even when reading/decoding fails
        with codecs.open(filename, "r", "utf-8") as xhp:
            buf = xhp.read()

        ParserBase.__init__(self, filename, follow_embed, embedding_app,
                current_app, wiki_page_name, lang, XhpFile(), buf.encode('utf-8'))

class WikiConverter(Thread):
    """Thread that converts one .xhp file into one wiki text file.

    inputfile      -- path of the .xhp file to parse
    wiki_page_name -- name of the wiki page, handed over to the parser
    lang           -- language code, '' for the untranslated pages
    outputfile     -- path of the file the wiki text is written to (UTF-8)
    """
    def __init__(self, inputfile, wiki_page_name, lang, outputfile):
        Thread.__init__(self)
        self.inputfile = inputfile
        self.wiki_page_name = wiki_page_name
        self.lang = lang
        self.outputfile = outputfile

    def run(self):
        parser = XhpParser(self.inputfile, True, '', self.wiki_page_name, self.lang)
        # Render before the output file is created: a failure while rendering
        # must not leave an empty file behind, because an existing output
        # file makes convert() skip the page on the next run.
        text = parser.get_all()
        with codecs.open(self.outputfile, "wb", "utf-8") as out:
            out.write(text)

def write_link(r, target):
    """Write the redirect page 'wiki/<r>' that points to 'target'.

    Failures are best-effort: they are reported on stderr and otherwise
    ignored.
    """
    fname = 'wiki/%s'% r
    try:
        with open(fname, "w") as link:
            link.write('#REDIRECT [[%s]]\n'% target)
    except (EnvironmentError, UnicodeError):
        # fname already contains the 'wiki/' prefix
        sys.stderr.write('Unable to write "%s".\n'% fname)

def write_redirects():
    """Write a redirect page for every entry of the global 'redirects'.

    Each entry is indexed as (app, redirect, target, authoritative).
    """
    print('Generating the redirects...')

    # "app/redirect" -> True when an authoritative link has been written
    already_written = {}

    # in the first pass, immediately write the links that are embedded, so that
    # we can always point to that source versions
    for entry in redirects:
        app = entry[0]
        redirect = entry[1]
        target = entry[2]
        authoritative = entry[3]

        if app == '':
            continue

        key = '%s/%s'% (app, redirect)
        if authoritative:
            write_link(key, target)
            already_written[key] = True
        else:
            already_written.setdefault(key, False)

    # in the second pass, output the wiki links
    all_apps = ('swriter', 'scalc', 'simpress', 'sdraw', 'smath',
                'schart', 'sbasic', 'sdatabase')
    for entry in redirects:
        app = entry[0]
        redirect = entry[1]
        target = entry[2]

        if app != '':
            key = '%s/%s'% (app, redirect)
            if not already_written[key]:
                write_link(key, target)
            continue

        # no application given: the redirect is written for every module
        for module in all_apps:
            write_link('%s/%s'% (module, redirect), target)

# Main Function
def convert(title_data, generate_redirects, lang, po_root):
    """Convert all help files listed in 'title_data' into wiki text files.

    title_data         -- iterable of entries; entry[0] is the input file,
                          entry[1] the wiki page name
    generate_redirects -- when true (and lang is ''), write the redirects
    lang               -- language code, '' for the untranslated pages
    po_root            -- location of the localizations, used when lang != ''

    Output goes to 'wiki/<page>/MAIN' or 'wiki/<page>/<lang>'; pages whose
    output file already exists are skipped.  For lang == '' the list of
    used images is written to 'images.txt'.
    """
    global titles, redirects, images

    if lang == '':
        print('Generating the main wiki pages...')
    else:
        print('Generating the wiki pages for language %s...'% lang)

    titles = list(title_data)
    redirects = []
    images = set()

    if lang != '':
        sys.stderr.write('Using localizations from "%s"\n'% po_root)
        if not load_localization_data(po_root):
            return

    for title in titles:
        # do not run more than max_threads converters at once
        while threading.active_count() > max_threads:
            time.sleep(0.001)

        infile = title[0].strip()
        wikiname = title[1].strip()
        articledir = 'wiki/' + wikiname
        try:
            os.mkdir(articledir)
        except OSError:
            # best effort; a real problem shows up when the output file is
            # opened
            pass

        if lang != '':
            wikiname = '%s/%s'% (wikiname, lang)
            outfile = '%s/%s'% (articledir, lang)
        else:
            outfile = '%s/MAIN'% articledir

        # an output file that can be opened means the page has been converted
        try:
            existing = open(outfile, 'r')
        except (IOError, OSError):
            existing = None

        if existing is not None:
            existing.close()
            sys.stderr.write('Warning: Skipping: %s > %s\n'% (infile, outfile))
            continue

        try:
            WikiConverter(infile, wikiname, lang, outfile).start()
        except Exception:
            # there is no file handle to close on this path, just report
            print('Failed to convert "%s" into "%s".\n'% \
                    (infile, outfile))

    # wait for everyone to finish
    while threading.active_count() > 1:
        time.sleep(0.001)

    if lang == '':
        # set of the images used here
        print('Generating "images.txt", the list of used images...')
        with open('images.txt', "w") as image_list:
            for image in images:
                image_list.write('%s\n'% image)

        # generate the redirects
        if generate_redirects:
            write_redirects()

# vim:set shiftwidth=4 softtabstop=4 expandtab: