1# -*- coding: utf-8 -*-
2# Copyright 2010-2018, Google Inc.
3# All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""A tool to generate boundary data.
32
The boundary data binary image is an array of uint16 whose length is 2N, where N is
34the number of POS IDs including special POS.  The array has the following
35structure:
36
37-------------------------------------
38prefix penalty of POS ID 0 (2 bytes)
39-------------------------------------
40suffix penalty of POS ID 0 (2 bytes)
41-------------------------------------
42prefix penalty of POS ID 1 (2 bytes)
43-------------------------------------
44suffix penalty of POS ID 1 (2 bytes)
45-------------------------------------
46  .
47  .
48  .
49-------------------------------------
prefix penalty of POS ID N-1 (2 bytes)
-------------------------------------
suffix penalty of POS ID N-1 (2 bytes)
53-------------------------------------
54
55See converter/segmenter.cc for how it's used.
56"""
57
# Module-level author metadata.
__author__ = "taku"
59
60import optparse
61import re
62import struct
63import sys
64
65
def PatternToRegexp(pattern):
  """Converts a boundary feature pattern into a regular expression string.

  Every '*' in the pattern becomes '[^,]+' (one or more characters other than
  a comma) and the result is anchored at the start with '^'.  No end anchor is
  added and no other character is escaped.

  Args:
    pattern: Feature pattern string, possibly containing '*' wildcards.

  Returns:
    The regular expression source string.
  """
  literal_parts = pattern.split('*')
  return '^' + '[^,]+'.join(literal_parts)
68
69
def LoadPatterns(file):
  """Loads prefix/suffix boundary penalty patterns from a definition file.

  Blank lines and lines starting with '#' are skipped.  Every other line must
  hold at least three whitespace-separated fields: a label ('PREFIX' or
  'SUFFIX'), a feature pattern (converted with PatternToRegexp) and an integer
  cost in [0, 0xffff].

  Args:
    file: Path of the boundary definition file.

  Returns:
    A (prefix, suffix) tuple.  Each element is a list of
    [compiled regexp, cost] pairs in file order.

  Raises:
    SystemExit: With a non-zero status if a line has too few fields, its cost
      is not an integer in [0, 0xffff], or its label is unknown.
  """
  prefix = []
  suffix = []
  # 'with' closes the file even when a bad line aborts the load.
  with open(file, 'r') as fh:
    for line in fh:
      line = line.rstrip()
      if not line or line[0] == '#':
        continue
      fields = line.split()
      if len(fields) < 3:
        sys.exit('format error %s' % line)
      label = fields[0]
      feature = fields[1]
      try:
        cost = int(fields[2])
      except ValueError:
        sys.exit('format error %s' % line)
      # Costs are stored as uint16 in the output image.
      if cost < 0 or cost > 0xffff:
        sys.exit('cost out of range %s' % line)
      if label == 'PREFIX':
        prefix.append([re.compile(PatternToRegexp(feature)), cost])
      elif label == 'SUFFIX':
        suffix.append([re.compile(PatternToRegexp(feature)), cost])
      else:
        # Exiting with a message yields a non-zero status; a zero status would
        # report success for a broken definition file.
        sys.exit('format error %s' % line)
  return (prefix, suffix)
92
93
def GetCost(patterns, feature):
  """Returns the cost of the first pattern that matches the feature.

  Args:
    patterns: Sequence of [compiled regexp, cost] pairs, tried in order.
    feature: Feature string to match against each regexp.

  Returns:
    The cost paired with the first matching regexp, or 0 if none matches.
  """
  matching_costs = (entry[1] for entry in patterns if entry[0].match(feature))
  return next(matching_costs, 0)
101
102
def LoadFeatures(filename):
  """Loads the feature string of every entry in an ID definition file.

  Each non-blank line is split on whitespace and its second field is taken as
  the feature.  The first field is not inspected, so the returned list simply
  follows file order.

  Args:
    filename: Path of the ID definition file.

  Returns:
    A list of feature strings, one per non-blank line.

  Raises:
    IndexError: If a non-blank line has fewer than two fields.
  """
  features = []
  # 'with' closes the file even if a malformed line raises.
  with open(filename, 'r') as fh:
    for line in fh:
      fields = line.split()
      if not fields:
        # Blank line (e.g. a trailing newline at end of file): nothing to load.
        continue
      features.append(fields[1])
  return features
111
112
def CountSpecialPos(filename):
  """Counts the entries in a special POS definition file.

  A line counts as an entry unless it is empty after stripping trailing
  whitespace or its first character is '#'.

  Args:
    filename: Path of the special POS definition file.

  Returns:
    The number of entry lines.
  """
  with open(filename, 'r') as stream:
    stripped_lines = (raw.rstrip() for raw in stream)
    return sum(1 for text in stripped_lines
               if text and not text.startswith('#'))
123
124
def ParseOptions(argv=None):
  """Parses the command line flags of this tool.

  Args:
    argv: Optional list of argument strings to parse.  When None, optparse
      falls back to sys.argv[1:].

  Returns:
    The optparse.Values object with the attributes boundary_def, id_def,
    special_pos and output (each None when its flag is not given).
  """
  parser = optparse.OptionParser()
  parser.add_option('--boundary_def', dest='boundary_def',
                    help='Boundary definition file')
  # This flag names the file read by LoadFeatures, not the boundary file.
  parser.add_option('--id_def', dest='id_def',
                    help='POS ID definition file')
  parser.add_option('--special_pos', dest='special_pos',
                    help='Special POS definition file')
  parser.add_option('--output', dest='output',
                    help='Output binary file')
  return parser.parse_args(argv)[0]
136
137
def main():
  """Generates the boundary data image named by the command line flags.

  For every feature loaded from the ID definition file, writes its prefix
  penalty followed by its suffix penalty, each as a little-endian uint16.
  After that, one zero prefix/suffix pair is written per special POS entry.
  """
  opts = ParseOptions()

  prefix_patterns, suffix_patterns = LoadPatterns(opts.boundary_def)
  feature_list = LoadFeatures(opts.id_def)
  special_pos_count = CountSpecialPos(opts.special_pos)

  uint16 = struct.Struct('<H')
  with open(opts.output, 'wb') as out:
    for feature in feature_list:
      prefix_bytes = uint16.pack(GetCost(prefix_patterns, feature))
      suffix_bytes = uint16.pack(GetCost(suffix_patterns, feature))
      out.write(prefix_bytes + suffix_bytes)

    # Special POS entries carry no boundary penalty: a zero pair for each.
    zero_pair = uint16.pack(0) * 2
    out.write(zero_pair * special_pos_count)
153
154
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
157