1# -*- coding: utf-8 -*- 2# Copyright 2010-2018, Google Inc. 3# All rights reserved. 4# 5# Redistribution and use in source and binary forms, with or without 6# modification, are permitted provided that the following conditions are 7# met: 8# 9# * Redistributions of source code must retain the above copyright 10# notice, this list of conditions and the following disclaimer. 11# * Redistributions in binary form must reproduce the above 12# copyright notice, this list of conditions and the following disclaimer 13# in the documentation and/or other materials provided with the 14# distribution. 15# * Neither the name of Google Inc. nor the names of its 16# contributors may be used to endorse or promote products derived from 17# this software without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31"""Generator script for connection data.""" 32 33__author__ = "hidehiko" 34 35import io 36import logging 37import optparse 38import os 39import struct 40import sys 41 42from build_tools import code_generator_util 43 44INVALID_COST = 30000 45INVALID_1BYTE_COST = 255 46RESOLUTION_FOR_1BYTE = 64 47FILE_MAGIC = b'\xAB\xCD' 48 49FALSE_VALUES = ['f', 'false', '0'] 50TRUE_VALUES = ['t', 'true', '1'] 51 52 53def ParseBoolFlag(value): 54 if value is None: 55 return False 56 57 value = value.lower() 58 if value in TRUE_VALUES: 59 return True 60 if value in FALSE_VALUES: 61 return False 62 63 # Unknown value. 64 logging.critical('Unknown boolean flag: %s', value) 65 sys.exit(1) 66 67 68def GetPosSize(filepath): 69 # The pos-size should be equal to the number of lines. 70 # TODO(hidehiko): Merge this method with pos_util in dictionary. 71 with open(filepath, 'r') as stream: 72 stream = code_generator_util.SkipLineComment(stream) 73 # Count the number of lines. 74 return sum(1 for _ in stream) 75 76 77def ParseConnectionFile(text_connection_file, pos_size, special_pos_size): 78 # The result is a square matrix. 79 mat_size = pos_size + special_pos_size 80 81 matrix = [[0] * mat_size for _ in range(mat_size)] 82 with open(text_connection_file) as stream: 83 stream = code_generator_util.SkipLineComment(stream) 84 # The first line contains the matrix column/row size. 85 size = next(stream).rstrip() 86 assert (int(size) == pos_size), '%s != %d' % (size, pos_size) 87 88 for array_index, cost in enumerate(stream): 89 cost = int(cost.rstrip()) 90 rid = array_index // pos_size 91 lid = array_index % pos_size 92 if rid == 0 and lid == 0: 93 cost = 0 94 matrix[rid][lid] = cost 95 96 # Fill INVALID_COST in matrix elements for special POS. 97 for rid in range(pos_size, mat_size): 98 for lid in range(1, mat_size): # Skip EOS 99 matrix[rid][lid] = INVALID_COST 100 101 for lid in range(pos_size, mat_size): 102 for rid in range(1, mat_size): # Skip BOS 103 matrix[rid][lid] = INVALID_COST 104 105 return matrix 106 107 108def CreateModeValueList(matrix): 109 """Create a list of modes of each row.""" 110 result = [] 111 for row in matrix: 112 m = {} 113 for cost in row: 114 if cost == INVALID_COST: 115 # Heuristically, we do not compress INVALID_COST. 116 continue 117 m[cost] = m.get(cost, 0) + 1 118 mode_value = max(m.items(), key=lambda x: x[1])[0] 119 result.append(mode_value) 120 return result 121 122 123def CompressMatrixByModeValue(matrix, mode_value_list): 124 # To compress the data size, we hold mode values for each row in a separate 125 # list, and fill None into the matrix if it equals to the corresponding 126 # mode value. 127 assert len(matrix) == len(mode_value_list) 128 for row, mode_value in zip(matrix, mode_value_list): 129 for index in range(len(row)): 130 if row[index] == mode_value: 131 row[index] = None 132 133 134def OutputBitList(bit_list, stream): 135 # Make sure that the bit list is aligned to the byte boundary. 136 assert len(bit_list) % 8 == 0 137 for bits in code_generator_util.SplitChunk(bit_list, 8): 138 byte = 0 139 for bit_index, bit in enumerate(bits): 140 if bit: 141 # Fill in LSB to MSB order. 142 byte |= (1 << bit_index) 143 stream.write(struct.pack('B', byte)) 144 145 146def BuildBinaryData(matrix, mode_value_list, use_1byte_cost): 147 # To compress the connection data, we use two-level succinct bit vector. 148 # 149 # The basic idea to compress the rid-lid matrix is compressing each row as 150 # follows: 151 # find the mode value of the row, and set the cells containins the value 152 # empty, thus we get a sparse array. 153 # We can compress sparse array by using succinct bit vector. 154 # (Please see also storage/louds/simple_succinct_bit_vector_index and 155 # storage/louds/bit_vector_based_array.) 156 # In addition, we compress the bit vector, too. Fortunately the distribution 157 # of bits is biased, so we group consecutive 8-bits and create another 158 # bit vector, named chunk-bits; 159 # - if no bits are 1, the corresponding bit is 0, otherwise 1. 160 # By using the bit vector, we can compact the original bit vector by skipping 161 # consecutive eight 0-bits. We can calculate the actual bit position in 162 # the compact bit vector by using Rank1 operation on chunk-bits. 163 # 164 # The file format is as follows: 165 # FILE_MAGIC (\xAB\xCD): 2bytes 166 # Resolution: 2bytes 167 # Num rids: 2bytes 168 # Num lids: 2bytes 169 # A list of mode values: 2bytes * rids (aligned to 32bits) 170 # A list of row data. 171 # 172 # The row data format is as follows: 173 # The size of compact bits in bytes: 2bytes 174 # The size of values in bytes: 2bytes 175 # chunk_bits, compact_bits, followed by values. 176 177 if use_1byte_cost: 178 resolution = RESOLUTION_FOR_1BYTE 179 else: 180 resolution = 1 181 stream = io.BytesIO() 182 183 # Output header. 184 stream.write(FILE_MAGIC) 185 matrix_size = len(matrix) 186 assert 0 <= matrix_size <= 65535 187 stream.write(struct.pack('<HHH', resolution, matrix_size, matrix_size)) 188 189 # Output mode value list. 190 for value in mode_value_list: 191 assert 0 <= value <= 65536 192 stream.write(struct.pack('<H', value)) 193 194 # 4 bytes alignment. 195 if len(mode_value_list) % 2: 196 stream.write(b'\x00\x00') 197 198 # Process each row: 199 for row in matrix: 200 chunk_bits = [] 201 compact_bits = [] 202 values = [] 203 204 for chunk in code_generator_util.SplitChunk(row, 8): 205 if all(cost is None for cost in chunk): 206 # All bits are 0, so output 0-chunk bit. 207 chunk_bits.append(False) 208 continue 209 210 chunk_bits.append(True) 211 for cost in chunk: 212 if cost is None: 213 compact_bits.append(False) 214 else: 215 compact_bits.append(True) 216 if use_1byte_cost: 217 if cost == INVALID_COST: 218 cost = INVALID_1BYTE_COST 219 else: 220 cost //= resolution 221 assert cost != INVALID_1BYTE_COST 222 values.append(cost) 223 224 # 4 bytes alignment. 225 while len(chunk_bits) % 32: 226 chunk_bits.append(False) 227 while len(compact_bits) % 32: 228 compact_bits.append(False) 229 if use_1byte_cost: 230 while len(values) % 4: 231 values.append(0) 232 values_size = len(values) 233 else: 234 while len(values) % 2: 235 values.append(0) 236 values_size = len(values) * 2 237 238 # Output the bits for a row. 239 stream.write(struct.pack('<HH', len(compact_bits) // 8, values_size)) 240 OutputBitList(chunk_bits, stream) 241 OutputBitList(compact_bits, stream) 242 if use_1byte_cost: 243 for value in values: 244 assert 0 <= value <= 255 245 stream.write(struct.pack('<B', value)) 246 else: 247 for value in values: 248 assert 0 <= value <= 65535 249 stream.write(struct.pack('<H', value)) 250 251 return stream.getvalue() 252 253 254def ParseOptions(): 255 parser = optparse.OptionParser() 256 parser.add_option('--text_connection_file', dest='text_connection_file') 257 parser.add_option('--id_file', dest='id_file') 258 parser.add_option('--special_pos_file', dest='special_pos_file') 259 parser.add_option('--target_compiler', dest='target_compiler') 260 parser.add_option('--use_1byte_cost', dest='use_1byte_cost') 261 parser.add_option('--binary_output_file', dest='binary_output_file') 262 parser.add_option('--header_output_file', dest='header_output_file') 263 return parser.parse_args()[0] 264 265 266def main(): 267 options = ParseOptions() 268 269 pos_size = GetPosSize(options.id_file) 270 special_pos_size = GetPosSize(options.special_pos_file) 271 matrix = ParseConnectionFile( 272 options.text_connection_file, pos_size, special_pos_size) 273 mode_value_list = CreateModeValueList(matrix) 274 CompressMatrixByModeValue(matrix, mode_value_list) 275 binary = BuildBinaryData( 276 matrix, mode_value_list, ParseBoolFlag(options.use_1byte_cost)) 277 278 if options.binary_output_file: 279 dirpath = os.path.dirname(options.binary_output_file) 280 if not os.path.exists(dirpath): 281 os.makedirs(dirpath) 282 with open(options.binary_output_file, 'wb') as stream: 283 stream.write(binary) 284 285 if options.header_output_file: 286 dirpath = os.path.dirname(options.header_output_file) 287 if not os.path.exists(dirpath): 288 os.makedirs(dirpath) 289 with open(options.header_output_file, 'wb') as stream: 290 code_generator_util.WriteCppDataArray( 291 binary, 'ConnectionData', options.target_compiler, stream) 292 293 294if __name__ == '__main__': 295 main() 296