examples/trans/trans.py

#!/neo/opt/bin/python

import sys, string, os, getopt, pwd, signal, time, re
import fcntl

import tstart

import db_trans
from log import *
import neo_cgi, neo_util
import odb

class eTransError(Exception):
    """Error raised by the translator, e.g. by Translator.findString when
    the database holds the same string more than once.

    This is a real exception class rather than a plain string because
    string exceptions cannot be raised on Python 2.6 and later.  Both
    ``raise eTransError, "message"`` and ``except eTransError:`` keep
    working unchanged.
    """

# Termination flag; handleSignal() sets it to 1.
DONE = 0
# Debug switch: when non-zero, Translator.parseHTML prints its input and
# match positions, and the DEV_UPDATE logging toggle below is skipped.
DEBUG = 0

# Divisors for the two numeric grouping levels of "tiered" HDF string keys,
# which have the form <prefix>.<id / TIER1_DIV>.<id / TIER2_DIV>.<id>
# (see Translator.stringsHDF and Translator.fetchString).
TIER2_DIV = 11
TIER1_DIV = 11 * TIER2_DIV

# Outside debug mode, zero the DEV_UPDATE entry of LOGGING_STATUS.
# NOTE(review): both names presumably come from `from log import *` -- confirm.
if not DEBUG: LOGGING_STATUS[DEV_UPDATE] = 0

def handleSignal(*_signal_args):
  """Signal-handler callback: mark the run as finished.

  Whatever arguments the caller supplies are accepted and ignored; the
  only effect is setting the module-level DONE flag to 1.
  """
  global DONE
  DONE = 1

def usage(progname=None):
  """Print command-line help to stdout.

  progname -- program name shown in the synopsis; defaults to sys.argv[0].
              The parameter is optional so that both usage() and
              usage(argv[0]) (the form main() uses) are valid calls.
  """
  if progname is None:
    progname = sys.argv[0]
  sys.stdout.write(
    "usage: %s [--help] [-f file] [-v level] [--lang lang] [--load mapfile]\n"
    "  --help          show this message\n"
    "  -f file         extract strings from this one file instead of\n"
    "                  walking the whole tmpl tree\n"
    "  -v level        verbosity (integer)\n"
    "  --load mapfile  load translations from an HDF map file into the\n"
    "                  database instead of extracting strings\n"
    "  --lang lang     language of the map given to --load (default: en)\n"
    % progname)

def exceptionString():
  """Return the traceback of the exception currently being handled.

  The result is the same text traceback.print_exc() would print, as one
  string.  Meant to be called from inside an ``except`` block.
  """
  import traceback

  # format_exc() is the library equivalent of printing the traceback into
  # a temporary string buffer and reading the buffer back.
  return traceback.format_exc()

class TransLoc:
    """One occurrence of a translatable string.

    string_id -- id of the string (as returned by Translator.findString)
    filename  -- template file the string was found in
    location  -- where in that file; Translator.loadStrings builds it as
                 "ofs:<offset>:<length>" or "hdf:<name>"
    """
    def __init__ (self, string_id, filename, location):
        (self.string_id,
         self.filename,
         self.location) = (string_id, filename, location)

class Translator:
    """Extracts translatable strings from template files and writes them
    back out.

    Strings are pulled from .cs/.cst, .html/.htm and .hdf files below
    self.root, recorded in the translation database (self.tdb), and then
    dumped as HDF string tables and as generated copies of the templates.
    """
    # Compiled patterns are shared by all instances; __init__ compiles them
    # the first time an instance is created.  Of the three, only _CS_TAG_RE
    # is used by the methods of this class.
    _HTML_TAG_RE = None
    # any tag that does not start with '<!'
    _HTML_TAG_REGEX = '<[^!][^>]*?>'
    _HTML_CMT_RE = None
    # an HTML comment
    _HTML_CMT_REGEX = '<!--.*?-->'
    _CS_TAG_RE = None
    # a complete '<? ... ?>' tag
    _CS_TAG_REGEX = '<\\?.+?\\?>'

    def __init__ (self):
        """Connect to the translation database and set up configuration.

        Raises RuntimeError when no installation root is configured.
        """
        self.tdb = db_trans.trans_connect()

        # configuration data ......
        #  - we should stop hardcoding this... - jeske

        self.root = "testroot"
        self.languages = ['es', 'en']

        self.ignore_paths = ['tmpl/m']  # common place for mockups
        self.ignore_files = ['blah_ignore.cs'] # ignore clearsilver file

        # ignore clearsilver javascript files
        self.ignore_patterns = ['tmpl/[^ ]*_js.cs']

        # ............................


        if self.root is None:
            # A real exception instance: raising a bare string is not
            # possible on Python 2.6 and later.
            raise RuntimeError("Unable to determine installation root")


        # Compile the shared patterns once; later instances reuse them.
        re_flags = re.MULTILINE | re.DOTALL
        if Translator._HTML_TAG_RE is None:
            Translator._HTML_TAG_RE = re.compile(Translator._HTML_TAG_REGEX, re_flags)
        if Translator._HTML_CMT_RE is None:
            Translator._HTML_CMT_RE = re.compile(Translator._HTML_CMT_REGEX, re_flags)
        if Translator._CS_TAG_RE is None:
            Translator._CS_TAG_RE = re.compile(Translator._CS_TAG_REGEX, re_flags)

        # parseHTML scanner state carried between chunks:
        # 0 = plain text, 1 = inside an unfinished tag, 2 = inside an
        # unfinished comment
        self._html_state = 0


    def parseHTMLTag(self, data):
        """Pull translatable attribute values out of a single tag.

        data is the text between '<' and '>' of one complete tag.  The only
        value extracted is the ``value`` attribute of an <input> whose type
        is "submit" or "button".

        Returns a list of (string, offset, 1) tuples; offset is the position
        of the string inside data.
        """
        # this is only called if we see a full tag in one parse...
        if len(data) == 0 or data[0] in '/?':
            return []

        # the tag name runs up to the first whitespace or '>'
        name_end = 0
        while name_end < len(data) and data[name_end] not in ' \n\r\t>':
            name_end = name_end + 1
        if name_end == len(data):
            # nothing follows the tag name, so there are no attributes
            return []
        tag = data[:name_end].lower()

        attr_re = re.compile(
            r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*'
            r'(\'[^\']*\'|"[^"]*"|[^ \t\n<>]*))?')
        values = {}        # attribute name -> value
        value_starts = {}  # attribute name -> offset of its value in data
        pos = name_end
        while pos < len(data):
            m = attr_re.match(data, pos)
            if not m:
                break
            name, assignment, value = m.group(1, 2, 3)
            if not assignment:
                # attribute without '=': the name doubles as the value
                value = name
            elif value[:1] == value[-1:] and value[:1] in ('\'', '"'):
                # strip the matching pair of quotes
                value = value[1:-1]
            name = name.lower()
            if name in values:
                log("Can't handle duplicate attrs: %s" % name)
            values[name] = value
            value_starts[name] = m.start(3)
            pos = m.end(0)

        candidates = []
        if tag == "input" and values.get('type', "").lower() in ["submit", "button"]:
            candidates.append((values.get('value', ''), value_starts.get('value', 0)))

        results = []
        for text, start in candidates:
            if not text:
                continue
            # start may point at the opening quote, so locate the text itself
            rel = data[start:].find(text)
            if rel != -1:
                results.append((text, rel + start, 1))
        return results

    def parseHTML(self, data, reset=1):
        """Split a piece of HTML into its translatable text runs.

        data  -- HTML text (a whole file, or one chunk of a file)
        reset -- when true, forget any tag/comment left open by a previous
                 call; pass 0 when feeding consecutive chunks of one file

        Returns a list of (string, offset, 1) tuples.  Each text run between
        tags has its surrounding whitespace trimmed and offset is the
        position of the trimmed text in data.  Runs may be empty or a single
        whitespace character.  Strings found inside tags by parseHTMLTag are
        included with their offsets translated into data.

        self._html_state records what was left open at the end of data:
        0 nothing, 1 a tag, 2 a comment.
        """
        if reset: self._html_state = 0
        if DEBUG: print("- %d ---------\n%s\n- E ---------" % (self._html_state, data))

        def text_run(start, end):
            # Result tuple for data[start:end], trimmed of whitespace at
            # both ends (at least one character of a non-empty run is kept).
            x = start
            y = end - 1
            while x < y and data[x] in string.whitespace: x = x + 1
            while y > x and data[y] in string.whitespace: y = y - 1
            return (data[x:y+1], x, 1)

        results = []
        i = 0
        n = len(data)
        # if we had state from the last parse... find where it ends
        if self._html_state:
            if self._html_state == 2:
                closer = '-->'
            else:
                closer = '>'
            x = data.find(closer)
            if x == -1: return results
            i = x + len(closer)
            self._html_state = 0
        while i < n:
            if DEBUG: print("MATCHING>%s<MATCHING" % data[i:])
            tag_b = data.find('<', i)
            if DEBUG: print("B> %d <B" % tag_b)
            if tag_b == -1:
                # no more markup: the rest is text
                results.append(text_run(i, n))
                break
            results.append(text_run(i, tag_b))
            # The closing marker is searched for from the opening marker on,
            # never from i: a stray '>' or '-->' in the preceding text must
            # not be taken for the end of this tag or comment.
            if data.startswith('<!--', tag_b):
                # starting at +2 keeps accepting the degenerate '<!-->'
                end = data.find('-->', tag_b + 2)
                if end == -1: # partial comment
                    self._html_state = 2
                    break
                i = end + 3
            else:
                end = data.find('>', tag_b + 1)
                if end == -1: # partial tag
                    self._html_state = 1
                    break
                inner = tag_b + 1
                for (s, ofs, ishtml) in self.parseHTMLTag(data[inner:end]):
                    results.append((s, ofs + inner, ishtml))
                i = end + 1
        return results

    def parseCS(self, data):
        """Extract translatable strings from a ClearSilver template.

        Complete '<? ... ?>' tags are cut out of data and the text between
        them is fed, chunk by chunk, to parseHTML.  HTML tag/comment state
        is carried from one chunk to the next, so an HTML tag interrupted
        by a '<? ?>' tag is still skipped as one tag.

        Returns a list of (string, offset, 1) tuples with offsets relative
        to the start of data.
        """
        chunks = []
        i = 0
        n = len(data)
        while i < n:
            m = Translator._CS_TAG_RE.search(data, i)
            if not m:
                # No further complete tag.  If an unterminated '<?' follows,
                # keep only the text in front of it.  The search position is
                # absolute, so it can be used directly as the slice end.
                x = data.find('<?', i)
                if x == -1:
                    chunks.append((data[i:], i))
                else:
                    chunks.append((data[i:x], i))
                break
            (b, e) = m.span()
            if i != b: chunks.append((data[i:b], i))
            i = e

        # Start every file with a clean HTML scanner; the chunks below are
        # parsed with reset=0 and would otherwise inherit an open tag or
        # comment from the previously parsed file.
        self._html_state = 0
        # Not read anywhere in this file; still set in case other code does.
        self._html_in = 0

        t_results = []
        for (s, ofs) in chunks:
            for (text, rel, ishtml) in self.parseHTML(s, reset=0):
                t_results.append((text, rel + ofs, ishtml))
        return t_results

    def descendHDF(self, obj, prefix):
        results = []
        while obj is not None:
            if obj.value():
                attrs = obj.attrs()
                attrs = map(lambda x: x[0], attrs)
                if "Lang" in attrs:
                    if prefix:
                        results.append((obj.value(), "%s.%s" % (prefix, obj.name()), 0))
                    else:
                        results.append((obj.value(), "%s" % (obj.name()), 0))
            if obj.child():
                if prefix:
                    results = results + self.descendHDF(obj.child(), "%s.%s" % (prefix, obj.name()))
                else:
                    results = results + self.descendHDF(obj.child(), (obj.name()))
            obj = obj.next()
        return results

    def parseHDF(self, data):
        """Extract translatable strings from the text of an HDF file.

        HDF files are handled specially: only whole elements that carry the
        attribute Lang are extracted (see descendHDF).  Returns descendHDF's
        list of (value, dotted_name, 0) tuples.
        """
        tree = neo_util.HDF()
        tree.readString(data, 1)
        return self.descendHDF(tree, "")

    def handleFile(self, file):
        if file in self.ignore_files: return []
        for a_re in self.ignore_patterns:
            if re.match(a_re,file):
                return []
        fpath = self.root + '/' + file
        x = string.rfind(file, '.')
        if x == -1: return []
        data = open(fpath, 'r').read()
        ext = file[x:]
        strings = []
        if ext in ['.cst', '.cs']:
            strings = self.parseCS(data)
        elif ext in ['.html', '.htm']:
            strings = self.parseHTML(data)
        elif ext in ['.hdf']:
            strings = self.parseHDF(data)
        if len(strings):
            print "Found %d strings in %s" % (len(strings), file)
            return strings
        return []

    def walkDirectory(self, path):
        if path in self.ignore_paths: return []
        fpath = self.root + '/' + path
        files = os.listdir(fpath)
        dirs = []
        results = []
        for file in files:
            if file[0] == '.': continue
            fname = fpath + '/' + file
            if os.path.isdir(fname):
                dirs.append(file)
            else:
                strings = self.handleFile(path + '/' + file)
                if len(strings):
                    results.append((path + '/' + file, strings))
        for dir in dirs:
            if dir not in ["release"]:
                results = results + self.walkDirectory(path + '/' + dir)
        return results

    def cleanHtmlString(self, s):
        """Return s with each run of whitespace collapsed to a single space
        and with leading and trailing whitespace removed."""
        collapsed = re.sub(r"\s+", " ", s)
        return collapsed.strip()

    def containsWords(self, s, ishtml):
        """Return 1 if s contains at least one ASCII letter or digit, else 0.

        For HTML strings (ishtml true) a handful of entities are replaced
        first, so that e.g. "&nbsp;" alone does not count as a word.
        """
        if ishtml:
            # applied in this order; '&amp;' is handled last
            entities = (('&nbsp;', ' '), ('&quot;', '"'), ('&copy;', ''),
                        ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&'))
            for entity, plain in entities:
                s = s.replace(entity, plain)
        for ch in s:
            if '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                return 1
        return 0

    def findString(self, s):
        rows = self.tdb.strings.fetchRows( ('string', s) )
        if len(rows) == 0:
            row = self.tdb.strings.newRow()
            row.string = s
            row.save()
            return row.string_id
        elif len(rows) > 1:
            raise eTransError, "String %s exists multiple times!" % s
        else:
            return rows[0].string_id

    def loadStrings(self, one_file=None, verbose=0):
        """Extract all translatable strings and register them in the database.

        one_file -- when given, only this file (relative to self.root) is
                    processed; otherwise the whole 'tmpl' tree is walked
        verbose  -- accepted for the command line's sake; currently unused

        Besides filling the database this writes a file named "map" in the
        current directory with one "#: locations" / "msgid=" entry per
        unique string.

        Returns a list of TransLoc objects, one per occurrence of a string.
        """
        if one_file is not None:
            results = [(one_file, self.handleFile(one_file))]
        else:
            results = self.walkDirectory('tmpl')

        uniq = {}      # cleaned string -> [(fname, ofs, original length)]
        cnt = 0
        seen_hdf = {}  # HDF name -> (string, fname) of its first occurrence
        for fname, strings in results:
            for (s, ofs, ishtml) in strings:
                if not (s and s.strip()):
                    continue
                # length of the text as it stands in the file, taken before
                # any whitespace clean-up; dumpFiles replaces exactly that span
                l = len(s)
                if ishtml:
                    s = self.cleanHtmlString(s)
                if not self.containsWords(s, ishtml):
                    continue
                if isinstance(ofs, str): # HDF: ofs is the element name
                    if ofs in seen_hdf:
                        if seen_hdf[ofs][0] != s:
                            log("Duplicate HDF Name %s:\n\t file %s = %s\n\t file %s = %s" % (ofs, seen_hdf[ofs][1], seen_hdf[ofs][0], fname, s))
                    else:
                        seen_hdf[ofs] = (s, fname)
                uniq.setdefault(s, []).append((fname, ofs, l))
                cnt = cnt + 1
        print("%d strings, %d unique" % (cnt, len(uniq)))

        fp = open("map", 'w')
        try:
            for (s, locs) in uniq.items():
                loc_strs = ["%s:%s:%d" % loc for loc in locs]
                fp.write('#: %s\n' % (','.join(loc_strs)))
                fp.write('msgid=%s\n\n' % repr(s))
        finally:
            # close explicitly so the file is complete as soon as we return
            fp.close()

        log("Loading strings/locations into database")
        locations = []
        for (s, locs) in uniq.items():
            s_id = self.findString(s)
            for (fname, ofs, l) in locs:
                if isinstance(ofs, str): # ie, its HDF
                    location = "hdf:%s" % ofs
                else:
                    location = "ofs:%d:%d" % (ofs, l)
                locations.append(TransLoc(s_id, fname, location))
        return locations

    def stringsHDF(self, prefix, locations, lang='en', exist=0, tiered=0):
        """Build an HDF tree holding the strings used at the given locations.

        prefix    -- HDF name under which the strings are stored
        locations -- list of TransLoc objects; sorted in place
        lang      -- language whose translations are preferred; strings
                     without a translation fall back to the original text
        exist     -- when true, store only the strings that have NO
                     translation for lang (always empty for 'en') and log
                     how many there are
        tiered    -- when true, keys are
                     <prefix>.<id // TIER1_DIV>.<id // TIER2_DIV>.<id>
                     instead of <prefix>.<id>

        Returns the neo_util.HDF object.
        """
        hdf = neo_util.HDF()
        if exist and lang == 'en': return hdf
        done = {}
        # Explicit key: TransLoc defines no ordering of its own, so a plain
        # sort() would put the locations in an arbitrary order.
        locations.sort(key=lambda loc: (int(loc.string_id), loc.filename, loc.location))

        maps_d = {}     # string_id -> translation row for lang
        for map_r in self.tdb.maps.fetchRows( ('lang', lang) ):
            maps_d[int(map_r.string_id)] = map_r
        strings_d = {}  # string_id -> original string row
        for string_r in self.tdb.strings.fetchRows():
            strings_d[int(string_r.string_id)] = string_r

        count = 0
        for loc in locations:
            s_id = int(loc.string_id)
            if s_id in done: continue
            s_row = maps_d.get(s_id)
            if s_row is not None:
                # translated: nothing to report when listing missing strings
                if exist: continue
            else:
                s_row = strings_d.get(s_id)
                if s_row is None:
                    log("Missing string_id %d, skipping" % s_id)
                    continue
            count = count + 1
            if tiered:
                hdf.setValue("%s.%d.%d.%s" % (prefix, s_id // TIER1_DIV, s_id // TIER2_DIV, s_id), s_row.string)
            else:
                hdf.setValue("%s.%s" % (prefix, s_id), s_row.string)
            done[s_id] = 1
        if exist == 1: log("Missing %d strings for lang %s" % (count, lang))
        return hdf

    def dumpStrings(self, locations, lang=None):
        """Write the string tables to HDF files in the current directory.

        locations -- list of TransLoc objects
        lang      -- a single language to dump; by default 'en' plus every
                     language present in the nt_trans_maps table

        For each language "strings_<lang>.hdf" is written, and
        "strings_missing_<lang>.hdf" as well if any string lacks a
        translation.
        """
        log("Dumping strings to HDF")
        if lang is not None:
            langs = [lang]
        else:
            cursor = self.tdb.defaultCursor()
            cursor.execute("select lang from nt_trans_maps group by lang")
            langs = ['en'] + [row[0] for row in cursor.fetchall()]

        for a_lang in langs:
            full = self.stringsHDF('S', locations, a_lang)
            full.writeFile("strings_%s.hdf" % a_lang)

        for a_lang in langs:
            missing = self.stringsHDF('S', locations, a_lang, exist=1)
            # only write the file when there is something in it
            if missing.child():
                missing.writeFile("strings_missing_%s.hdf" % a_lang)

    def fetchString(self, s_id, lang):
        """Return the text to substitute for string s_id in language lang.

        For the pseudo-language "hdf" the result is a ClearSilver tag that
        references the tiered Lang.Extracted key of the string.  Otherwise
        it is the translation for lang, or the original string when no
        translation exists (logged unless lang is 'en').
        """
        if lang == "hdf":
            tier1 = int(s_id) / TIER1_DIV
            tier2 = int(s_id) / TIER2_DIV
            return "<?cs var:Lang.Extracted.%d.%d.%s ?>" % (tier1, tier2, s_id)

        rows = self.tdb.maps.fetchRows( [('string_id', s_id), ('lang', lang)] )
        if len(rows) != 0:
            return rows[0].string

        # no translation: fall back to the original string
        try:
            row = self.tdb.strings.fetchRow( ('string_id', s_id) )
        except odb.eNoMatchingRows:
            log("Unable to find string id %s" % s_id)
            # NOTE(review): eNoString is not defined in this file; unless
            # `from log import *` provides it this raises NameError -- confirm.
            raise eNoString
        if lang != 'en':
            log("Untranslated string for id %s" % s_id)
        return row.string

    def dumpFiles(self, locations, lang):
        """Write generated templates and language HDF files below <root>/gen.

        locations -- list of TransLoc objects
        lang      -- language to generate for.  With the pseudo-language
                     "hdf" every string is replaced by a ClearSilver
                     variable reference (see fetchString) and language files
                     are written for all of self.languages.

        <root>/gen/tmpl is removed first.  Every non-HDF file named in
        locations is copied to <root>/gen/<file> with its strings replaced.
        For each language gen/tmpl/lang_<lang>.hdf (the extracted strings)
        and a lang_map.hdf (the strings that came from HDF files, under
        their original names) are written.
        """
        import errno
        import shutil

        def ensure_dir(path):
            # Create path and its parents; an already existing directory is fine.
            try:
                os.makedirs(path)
            except OSError:
                if sys.exc_info()[1].errno != errno.EEXIST:
                    raise

        def read_text(path):
            # Return the whole content of path; the file is always closed.
            fp = open(path)
            try:
                return fp.read()
            finally:
                fp.close()

        def write_text(path, text):
            # Replace the content of path with text; the file is always closed.
            fp = open(path, 'w')
            try:
                fp.write(text)
            finally:
                fp.close()

        log("Dumping files for %s" % lang)
        files = {}  # filename -> [TransLoc, ...]
        for row in locations:
            files.setdefault(row.filename, []).append(row)

        hdf_map = []  # (HDF name, string_id) for strings that came from HDF files

        # no shell involved, so odd characters in the root path are harmless
        shutil.rmtree("%s/gen/tmpl" % (self.root), ignore_errors=True)
        for tmpl_name, tmpl_locs in files.items():
            fname = "%s/gen/%s" % (self.root, tmpl_name)
            ensure_dir(os.path.dirname(fname))
            do_hdf = tmpl_name.endswith('.hdf')
            ofs = []  # (offset, length, string_id)
            for loc in tmpl_locs:
                parts = loc.location.split(':')
                if len(parts) == 3 and parts[0] == 'ofs' and not do_hdf:
                    ofs.append((int(parts[1]), int(parts[2]), loc.string_id))
                elif len(parts) == 2 and parts[0] == 'hdf' and do_hdf:
                    hdf_map.append((parts[1], loc.string_id))
                else:
                    log("Invalid location %s for string_id %s in %s" % (loc.location, loc.string_id, tmpl_name))
            if do_hdf:
                # HDF files are not copied; their strings go into lang_map.hdf
                continue

            ofs.sort()
            data = read_text(self.root + '/' + tmpl_name)
            # ok, now we split up the original data into sections
            x = 0  # end of the part of data already copied or replaced
            out = []
            for (start, length, s_id) in ofs:
                if start < x:
                    # overlaps the previous string; give up on the rest
                    log("How did we get here? %s x=%d ofs=%d sid=%s" % (tmpl_name, x, start, s_id))
                    log("Data[x:x+20]: %s" % data[x:x+20])
                    log("Data[ofs:ofs+20]: %s" % data[start:start+20])
                    break
                out.append(data[x:start])
                out.append(self.fetchString(s_id, lang))
                x = start + length
            out.append(data[x:])
            write_text(fname, ''.join(out))

        if lang == "hdf":
            langs = self.languages
        else:
            langs = [lang]

        for d_lang in langs:
            # dumping the extracted strings
            hdf = self.stringsHDF('Lang.Extracted', locations, d_lang, tiered=1)
            fname = "%s/gen/tmpl/lang_%s.hdf" % (self.root, d_lang)
            # gen/tmpl only exists by now if some template lives below tmpl/
            ensure_dir(os.path.dirname(fname))
            hdf.writeFile(fname)
            # re-write the file with a header in front and an include behind
            body = read_text(fname)
            write_text(fname,
                       '## AUTOMATICALLY GENERATED -- DO NOT EDIT\n\n'
                       + body
                       + '\n#include "lang_map.hdf"\n')

            # dumping the hdf strings file
            if d_lang == "en":
                map_file = "%s/gen/tmpl/lang_map.hdf" % (self.root)
            else:
                map_file = "%s/gen/tmpl/%s/lang_map.hdf" % (self.root, d_lang)
            ensure_dir(os.path.dirname(map_file))
            map_hdf = neo_util.HDF()
            for (name, s_id) in hdf_map:
                n = int(s_id)
                value = hdf.getValue('Lang.Extracted.%d.%d.%s' % (n // TIER1_DIV, n // TIER2_DIV, s_id), '')
                map_hdf.setValue(name, value)
            map_hdf.writeFile(map_file)

    def loadMap(self, file, prefix, lang):
        """Load translations for one language from an HDF file into the database.

        file   -- HDF file to read
        prefix -- HDF node whose children are the translations; each child
                  is named after a string id and holds the translated text
        lang   -- language the translations belong to

        Rows of the maps table are created or updated as needed and the
        number of new and updated rows is logged.
        """
        log("Loading map for language %s" % lang)
        hdf = neo_util.HDF()
        hdf.readFile(file)

        created = 0
        saved = 0  # rows written, new ones included
        node = hdf.getChild(prefix)
        while node is not None:
            s_id = node.name()
            text = node.value()

            try:
                map_r = self.tdb.maps.fetchRow( [('string_id', s_id), ('lang', lang)])
            except odb.eNoMatchingRows:
                map_r = self.tdb.maps.newRow()
                map_r.string_id = s_id
                map_r.lang = lang
                created = created + 1

            # only touch the database when the text really changed
            if map_r.string != text:
                saved = saved + 1
                map_r.string = text
                map_r.save()

            node = node.next()
        log("New maps: %d  Updates: %d" % (created, saved - created))


def main(argv):
  """Command-line entry point.

  argv -- full argument vector, program name first

  Options:
    --help        print usage and return -1
    -f FILE       process only FILE instead of walking the tmpl tree
    -v LEVEL      integer verbosity, passed on to Translator.loadStrings
    --load FILE   load translations from the HDF file FILE (prefix 'S')
                  into the database instead of extracting strings
    --lang LANG   language used with --load (default 'en')

  Returns -1 for --help or an invalid command line, otherwise None.
  """
  try:
    alist, args = getopt.getopt(argv[1:], "f:v:", ["help", "load=", "lang="])
  except getopt.error:
    # unknown option or missing argument: explain instead of a traceback
    sys.stderr.write("%s\n" % sys.exc_info()[1])
    usage()
    return -1

  one_file = None
  verbose = 0
  load_file = None
  lang = 'en'
  for (field, val) in alist:
    if field == "--help":
      # called without arguments so it works whether or not usage()
      # accepts a program name
      usage()
      return -1
    elif field == "-f":
      one_file = val
    elif field == "-v":
      try:
        verbose = int(val)
      except ValueError:
        sys.stderr.write("-v expects an integer, got %r\n" % val)
        usage()
        return -1
    elif field == "--load":
      load_file = val
    elif field == "--lang":
      lang = val

  # signal handlers are currently not installed:
  #signal.signal(signal.SIGTERM, handleSignal)
  #signal.signal(signal.SIGINT, handleSignal)

  log("trans: start")

  try:
    t = Translator()
    if load_file:
      t.loadMap(load_file, 'S', lang)
    else:
      locations = t.loadStrings(one_file, verbose=verbose)
      t.dumpStrings(locations)
      t.dumpFiles(locations, 'hdf')
  except KeyboardInterrupt:
    # Ctrl-C ends the run quietly
    pass
  except:
    # top-level boundary: everything else is handed to the error reporter
    import handle_error
    handle_error.handleException("Translation Error")

# Run the tool when executed as a script.  main()'s return value is not
# used as the process exit status.
if __name__ == "__main__":
  main(sys.argv)