1# -*- coding: utf-8 -*- 2# Copyright 2010-2018, Google Inc. 3# All rights reserved. 4# 5# Redistribution and use in source and binary forms, with or without 6# modification, are permitted provided that the following conditions are 7# met: 8# 9# * Redistributions of source code must retain the above copyright 10# notice, this list of conditions and the following disclaimer. 11# * Redistributions in binary form must reproduce the above 12# copyright notice, this list of conditions and the following disclaimer 13# in the documentation and/or other materials provided with the 14# distribution. 15# * Neither the name of Google Inc. nor the names of its 16# contributors may be used to endorse or promote products derived from 17# this software without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
__author__ = "taku"

import optparse


def IsPrefix(str, key):
  """Returns True if `str` begins with `key`.

  Args:
    str: The sequence to test (bytes in this script). The parameter name
      shadows the builtin but is kept so the signature stays unchanged.
    key: The candidate prefix, of the same type as `str`.

  Returns:
    True when `key` is a prefix of `str` (an empty `key` always matches),
    False otherwise. Mismatched or unsupported argument types yield False
    rather than raising, as the earlier implementation did.
  """
  try:
    return bool(str.startswith(key))
  except (TypeError, AttributeError):
    # Keep the lenient contract: arguments that cannot be compared are
    # simply "not a prefix".
    return False


def LoadRewriteMapRule(filename):
  """Loads the (prefix, group) rewrite rules from `filename`.

  The file is read in binary mode. Empty lines and lines starting with '#'
  are skipped; every other line must contain at least two
  whitespace-separated fields, of which the first two are kept.

  Args:
    filename: Path of the rule file.

  Returns:
    A list of [prefix, group] pairs (bytes), in file order.

  Raises:
    IndexError: If a non-comment line has fewer than two fields.
  """
  rule = []
  with open(filename, 'rb') as fh:
    for line in fh:
      line = line.rstrip(b'\n')
      if not line or line.startswith(b'#'):
        continue
      fields = line.split()
      rule.append([fields[0], fields[1]])
  return rule


def ReadPOSID(id_file, special_pos_file):
  """Builds the list of POS names from the two definition files.

  Args:
    id_file: Path of the POS ID definition file. The second
      whitespace-separated field of every line is taken.
    special_pos_file: Path of the special POS file. Lines of at most one
      byte and lines starting with '#' are skipped; the first field of every
      other line is taken.

  Returns:
    A list of bytes objects: the entries of `id_file` in file order,
    followed by the entries of `special_pos_file`.

  Raises:
    IndexError: If a line that is not skipped lacks the expected field.
  """
  pos_list = []

  with open(id_file, 'rb') as fh:
    for line in fh:
      fields = line.split()
      pos_list.append(fields[1])

  with open(special_pos_file, 'rb') as fh:
    for line in fh:
      if len(line) <= 1 or line[0:1] == b'#':
        continue
      fields = line.split()
      pos_list.append(fields[0])

  return pos_list


def ParseOptions():
  """Parses the command line and returns the optparse options object."""
  parser = optparse.OptionParser()
  parser.add_option('--id_def', dest='id_def',
                    help='POS ID definition file')
  parser.add_option('--special_pos', dest='special_pos',
                    help='Special POS definition file')
  parser.add_option('--pos_group_def', dest='pos_group_def',
                    help='Left POS ID group definition file')
  parser.add_option('--output', dest='output',
                    help='Output file for binary mode')
  return parser.parse_args()[0]


def main():
  """Writes one group-id byte per POS entry to the --output file.

  Each POS name is matched against every rule prefix. Group ids are handed
  out starting at 1, in the order groups are first matched; a POS that
  matches no rule gets 0. When several rules match the same POS, the last
  matching rule in the file decides its id.

  Raises:
    ValueError: If more than 255 distinct groups are matched, because an id
      would no longer fit into a single byte.
  """
  opts = ParseOptions()

  # read lid file
  pos_list = ReadPOSID(opts.id_def, opts.special_pos)

  # read rule file
  rules = LoadRewriteMapRule(opts.pos_group_def)

  next_id = 1
  id_map = {}
  ids = []

  for target in pos_list:
    group_id = 0
    for prefix, group in rules:
      if not IsPrefix(target, prefix):
        continue
      # Deliberately no early exit: later matching rules still allocate
      # their group id and override the result for this target.
      if group not in id_map:
        id_map[group] = next_id
        next_id += 1
      group_id = id_map[group]
    ids.append(group_id)

  # Exactly one byte per entry. Encoding chr(id) as UTF-8 would emit two
  # bytes for ids >= 128 and break the one-byte-per-entry layout. Build the
  # buffer before opening the file so an out-of-range id does not truncate
  # an existing output.
  payload = bytearray(ids)
  with open(opts.output, 'wb') as f:
    f.write(payload)


if __name__ == '__main__':
  main()