1# -*- coding: utf-8 -*-
2# Copyright 2010-2018, Google Inc.
3# All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31__author__ = "taku"
32
33import optparse
34
35
def IsPrefix(str, key):
  """Returns True if |str| begins with |key|.

  Works for text as well as bytes, provided both arguments are of the same
  kind.  An empty |key| is a prefix of everything.

  Args:
    str: The string (or bytes) to examine.  The name shadows the builtin but
      is kept so that existing keyword callers keep working.
    key: The candidate prefix.

  Returns:
    True if |str| starts with |key|, False otherwise.  Incompatible or
    unsupported argument types yield False instead of raising.
  """
  try:
    # startswith() only looks at the leading len(key) items, whereas
    # index() searches the whole string before the position can be checked.
    return bool(str.startswith(key))
  except (TypeError, AttributeError):
    # Swallow only the errors caused by incompatible arguments; a bare
    # "except:" would also hide KeyboardInterrupt and SystemExit.
    return False
46
47
def LoadRewriteMapRule(filename):
  """Loads the rewrite rules stored in |filename|.

  The file is read in binary mode, one rule per line.  A rule consists of at
  least two whitespace separated fields; only the first two are used.  Lines
  starting with '#' and lines that contain nothing but whitespace (including
  the '\\r' left over from CRLF line endings) are ignored.

  Args:
    filename: Path of the rule file.

  Returns:
    A list of [first_field, second_field] pairs (bytes) in file order.

  Raises:
    IndexError: If a non-comment line holds only a single field.
  """
  rule = []
  # "with" guarantees the handle is closed even when a malformed line raises.
  with open(filename, 'rb') as fh:
    for line in fh:
      if line.startswith(b'#'):
        continue
      fields = line.split()
      if not fields:
        # Blank or whitespace-only line: nothing to parse.
        continue
      rule.append([fields[0], fields[1]])
  return rule
59
60
def ReadPOSID(id_file, special_pos_file):
  """Builds the list of POS names from the two definition files.

  Both files are read in binary mode.  From |id_file| the second whitespace
  separated field of every line is taken; from |special_pos_file| the first
  field of every line that does not start with '#'.  Lines containing only
  whitespace are ignored in both files.

  Args:
    id_file: Path of the POS ID definition file.
    special_pos_file: Path of the special POS definition file.

  Returns:
    A list of bytes: the names taken from |id_file| followed by the names
    taken from |special_pos_file|, each group in file order.

  Raises:
    IndexError: If a non-blank line of |id_file| holds only a single field.
  """
  pos_list = []

  # "with" closes each handle even if a malformed line raises mid-way.
  with open(id_file, 'rb') as fh:
    for line in fh:
      fields = line.split()
      if not fields:
        continue
      pos_list.append(fields[1])

  with open(special_pos_file, 'rb') as fh:
    for line in fh:
      if line.startswith(b'#'):
        continue
      fields = line.split()
      # Testing the parsed fields (rather than the raw line length) skips
      # whitespace-only lines and still keeps a one-byte final entry that
      # has no trailing newline.
      if not fields:
        continue
      pos_list.append(fields[0])

  return pos_list
79
80
def ParseOptions():
  """Parses the command line flags of this script.

  Returns:
    The optparse options object with the attributes id_def, special_pos,
    pos_group_def and output.  Positional arguments are discarded.
  """
  # (flag, destination attribute, help text), in registration order.
  option_specs = (
      ('--id_def', 'id_def', 'POS ID definition file'),
      ('--special_pos', 'special_pos', 'Special POS definition file'),
      ('--pos_group_def', 'pos_group_def',
       'Left POS ID group definition file'),
      ('--output', 'output', 'Output file for binary mode'),
  )

  parser = optparse.OptionParser()
  for flag, dest, help_text in option_specs:
    parser.add_option(flag, dest=dest, help=help_text)

  options, _ = parser.parse_args()
  return options
92
93
def main():
  """Generates the binary POS group table.

  Reads the POS list and the group rules named on the command line, assigns
  an integer group id to every POS and writes the ids to --output, one byte
  per POS, in the order returned by ReadPOSID().

  Raises:
    ValueError: If a group id does not fit in a single byte (more than 255
      distinct groups).
  """
  opts = ParseOptions()

  # read lid file
  pos_list = ReadPOSID(opts.id_def, opts.special_pos)

  # read rule file
  rules = LoadRewriteMapRule(opts.pos_group_def)

  next_id = 1  # 0 is kept for POS that match no rule.
  id_map = {}
  ids = []

  for target in pos_list:
    group_id = 0
    # There is intentionally no "break": every matching rule is visited, so
    # each matching group gets an id on first sight and the last matching
    # rule decides the id stored for this POS.
    for prefix, group in rules:
      if IsPrefix(target, prefix):
        if group not in id_map:
          id_map[group] = next_id
          next_id += 1
        group_id = id_map[group]
    ids.append(group_id)

  # Emit exactly one byte per POS.  Going through chr() and a UTF-8 encode
  # produced two bytes for every id >= 128, which shifts all later entries.
  # NOTE(review): assumes the reader indexes this file one byte per POS --
  # confirm against the consumer of --output.
  # Built before the file is opened so that an out-of-range id raises
  # without leaving a truncated output file behind.
  payload = bytearray(ids)

  with open(opts.output, 'wb') as f:
    f.write(payload)
121
122
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()
125