1# -*- coding: utf-8 -*-
2# Copyright 2010-2018, Google Inc.
3# All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Generator script for connection data."""
32
33__author__ = "hidehiko"
34
35import io
36import logging
37import optparse
38import os
39import struct
40import sys
41
42from build_tools import code_generator_util
43
44INVALID_COST = 30000
45INVALID_1BYTE_COST = 255
46RESOLUTION_FOR_1BYTE = 64
47FILE_MAGIC = b'\xAB\xCD'
48
49FALSE_VALUES = ['f', 'false', '0']
50TRUE_VALUES = ['t', 'true', '1']
51
52
53def ParseBoolFlag(value):
54  if value is None:
55    return False
56
57  value = value.lower()
58  if value in TRUE_VALUES:
59    return True
60  if value in FALSE_VALUES:
61    return False
62
63  # Unknown value.
64  logging.critical('Unknown boolean flag: %s', value)
65  sys.exit(1)
66
67
68def GetPosSize(filepath):
69  # The pos-size should be equal to the number of lines.
70  # TODO(hidehiko): Merge this method with pos_util in dictionary.
71  with open(filepath, 'r') as stream:
72    stream = code_generator_util.SkipLineComment(stream)
73    # Count the number of lines.
74    return sum(1 for _ in stream)
75
76
77def ParseConnectionFile(text_connection_file, pos_size, special_pos_size):
78  # The result is a square matrix.
79  mat_size = pos_size + special_pos_size
80
81  matrix = [[0] * mat_size for _ in range(mat_size)]
82  with open(text_connection_file) as stream:
83    stream = code_generator_util.SkipLineComment(stream)
84    # The first line contains the matrix column/row size.
85    size = next(stream).rstrip()
86    assert (int(size) == pos_size), '%s != %d' % (size, pos_size)
87
88    for array_index, cost in enumerate(stream):
89      cost = int(cost.rstrip())
90      rid = array_index // pos_size
91      lid = array_index % pos_size
92      if rid == 0 and lid == 0:
93        cost = 0
94      matrix[rid][lid] = cost
95
96  # Fill INVALID_COST in matrix elements for special POS.
97  for rid in range(pos_size, mat_size):
98    for lid in range(1, mat_size):  # Skip EOS
99      matrix[rid][lid] = INVALID_COST
100
101  for lid in range(pos_size, mat_size):
102    for rid in range(1, mat_size):  # Skip BOS
103      matrix[rid][lid] = INVALID_COST
104
105  return matrix
106
107
108def CreateModeValueList(matrix):
109  """Create a list of modes of each row."""
110  result = []
111  for row in matrix:
112    m = {}
113    for cost in row:
114      if cost == INVALID_COST:
115        # Heuristically, we do not compress INVALID_COST.
116        continue
117      m[cost] = m.get(cost, 0) + 1
118    mode_value = max(m.items(), key=lambda x: x[1])[0]
119    result.append(mode_value)
120  return result
121
122
123def CompressMatrixByModeValue(matrix, mode_value_list):
124  # To compress the data size, we hold mode values for each row in a separate
125  # list, and fill None into the matrix if it equals to the corresponding
126  # mode value.
127  assert len(matrix) == len(mode_value_list)
128  for row, mode_value in zip(matrix, mode_value_list):
129    for index in range(len(row)):
130      if row[index] == mode_value:
131        row[index] = None
132
133
134def OutputBitList(bit_list, stream):
135  # Make sure that the bit list is aligned to the byte boundary.
136  assert len(bit_list) % 8 == 0
137  for bits in code_generator_util.SplitChunk(bit_list, 8):
138    byte = 0
139    for bit_index, bit in enumerate(bits):
140      if bit:
141        # Fill in LSB to MSB order.
142        byte |= (1 << bit_index)
143    stream.write(struct.pack('B', byte))
144
145
146def BuildBinaryData(matrix, mode_value_list, use_1byte_cost):
147  # To compress the connection data, we use two-level succinct bit vector.
148  #
149  # The basic idea to compress the rid-lid matrix is compressing each row as
150  # follows:
151  # find the mode value of the row, and set the cells containins the value
152  # empty, thus we get a sparse array.
153  # We can compress sparse array by using succinct bit vector.
154  # (Please see also storage/louds/simple_succinct_bit_vector_index and
155  # storage/louds/bit_vector_based_array.)
156  # In addition, we compress the bit vector, too. Fortunately the distribution
157  # of bits is biased, so we group consecutive 8-bits and create another
158  # bit vector, named chunk-bits;
159  # - if no bits are 1, the corresponding bit is 0, otherwise 1.
160  # By using the bit vector, we can compact the original bit vector by skipping
161  # consecutive eight 0-bits. We can calculate the actual bit position in
162  # the compact bit vector by using Rank1 operation on chunk-bits.
163  #
164  # The file format is as follows:
165  # FILE_MAGIC (\xAB\xCD): 2bytes
166  # Resolution: 2bytes
167  # Num rids: 2bytes
168  # Num lids: 2bytes
169  # A list of mode values: 2bytes * rids (aligned to 32bits)
170  # A list of row data.
171  #
172  # The row data format is as follows:
173  # The size of compact bits in bytes: 2bytes
174  # The size of values in bytes: 2bytes
175  # chunk_bits, compact_bits, followed by values.
176
177  if use_1byte_cost:
178    resolution = RESOLUTION_FOR_1BYTE
179  else:
180    resolution = 1
181  stream = io.BytesIO()
182
183  # Output header.
184  stream.write(FILE_MAGIC)
185  matrix_size = len(matrix)
186  assert 0 <= matrix_size <= 65535
187  stream.write(struct.pack('<HHH', resolution, matrix_size, matrix_size))
188
189  # Output mode value list.
190  for value in mode_value_list:
191    assert 0 <= value <= 65536
192    stream.write(struct.pack('<H', value))
193
194  # 4 bytes alignment.
195  if len(mode_value_list) % 2:
196    stream.write(b'\x00\x00')
197
198  # Process each row:
199  for row in matrix:
200    chunk_bits = []
201    compact_bits = []
202    values = []
203
204    for chunk in code_generator_util.SplitChunk(row, 8):
205      if all(cost is None for cost in chunk):
206        # All bits are 0, so output 0-chunk bit.
207        chunk_bits.append(False)
208        continue
209
210      chunk_bits.append(True)
211      for cost in chunk:
212        if cost is None:
213          compact_bits.append(False)
214        else:
215          compact_bits.append(True)
216          if use_1byte_cost:
217            if cost == INVALID_COST:
218              cost = INVALID_1BYTE_COST
219            else:
220              cost //= resolution
221              assert cost != INVALID_1BYTE_COST
222          values.append(cost)
223
224    # 4 bytes alignment.
225    while len(chunk_bits) % 32:
226      chunk_bits.append(False)
227    while len(compact_bits) % 32:
228      compact_bits.append(False)
229    if use_1byte_cost:
230      while len(values) % 4:
231        values.append(0)
232      values_size = len(values)
233    else:
234      while len(values) % 2:
235        values.append(0)
236      values_size = len(values) * 2
237
238    # Output the bits for a row.
239    stream.write(struct.pack('<HH', len(compact_bits) // 8, values_size))
240    OutputBitList(chunk_bits, stream)
241    OutputBitList(compact_bits, stream)
242    if use_1byte_cost:
243      for value in values:
244        assert 0 <= value <= 255
245        stream.write(struct.pack('<B', value))
246    else:
247      for value in values:
248        assert 0 <= value <= 65535
249        stream.write(struct.pack('<H', value))
250
251  return stream.getvalue()
252
253
254def ParseOptions():
255  parser = optparse.OptionParser()
256  parser.add_option('--text_connection_file', dest='text_connection_file')
257  parser.add_option('--id_file', dest='id_file')
258  parser.add_option('--special_pos_file', dest='special_pos_file')
259  parser.add_option('--target_compiler', dest='target_compiler')
260  parser.add_option('--use_1byte_cost', dest='use_1byte_cost')
261  parser.add_option('--binary_output_file', dest='binary_output_file')
262  parser.add_option('--header_output_file', dest='header_output_file')
263  return parser.parse_args()[0]
264
265
266def main():
267  options = ParseOptions()
268
269  pos_size = GetPosSize(options.id_file)
270  special_pos_size = GetPosSize(options.special_pos_file)
271  matrix = ParseConnectionFile(
272      options.text_connection_file, pos_size, special_pos_size)
273  mode_value_list = CreateModeValueList(matrix)
274  CompressMatrixByModeValue(matrix, mode_value_list)
275  binary = BuildBinaryData(
276      matrix, mode_value_list, ParseBoolFlag(options.use_1byte_cost))
277
278  if options.binary_output_file:
279    dirpath = os.path.dirname(options.binary_output_file)
280    if not os.path.exists(dirpath):
281      os.makedirs(dirpath)
282    with open(options.binary_output_file, 'wb') as stream:
283      stream.write(binary)
284
285  if options.header_output_file:
286    dirpath = os.path.dirname(options.header_output_file)
287    if not os.path.exists(dirpath):
288      os.makedirs(dirpath)
289    with open(options.header_output_file, 'wb') as stream:
290      code_generator_util.WriteCppDataArray(
291          binary, 'ConnectionData', options.target_compiler, stream)
292
293
294if __name__ == '__main__':
295  main()
296