1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #include "converter/segmenter.h"
31 
32 #include "base/bitarray.h"
33 #include "base/logging.h"
34 #include "base/port.h"
35 #include "converter/node.h"
36 #include "data_manager/data_manager_interface.h"
37 
38 namespace mozc {
39 
CreateFromDataManager(const DataManagerInterface & data_manager)40 Segmenter *Segmenter::CreateFromDataManager(
41     const DataManagerInterface &data_manager) {
42   size_t l_num_elements = 0;
43   size_t r_num_elements = 0;
44   const uint16 *l_table = nullptr;
45   const uint16 *r_table = nullptr;
46   size_t bitarray_num_bytes = 0;
47   const char *bitarray_data = nullptr;
48   const uint16 *boundary_data = nullptr;
49   data_manager.GetSegmenterData(&l_num_elements, &r_num_elements,
50                                 &l_table, &r_table,
51                                 &bitarray_num_bytes, &bitarray_data,
52                                 &boundary_data);
53   return new Segmenter(l_num_elements, r_num_elements,
54                        l_table, r_table,
55                        bitarray_num_bytes, bitarray_data,
56                        boundary_data);
57 }
58 
Segmenter(size_t l_num_elements,size_t r_num_elements,const uint16 * l_table,const uint16 * r_table,size_t bitarray_num_bytes,const char * bitarray_data,const uint16 * boundary_data)59 Segmenter::Segmenter(
60     size_t l_num_elements, size_t r_num_elements, const uint16 *l_table,
61     const uint16 *r_table, size_t bitarray_num_bytes,
62     const char *bitarray_data, const uint16 *boundary_data)
63     : l_num_elements_(l_num_elements), r_num_elements_(r_num_elements),
64       l_table_(l_table), r_table_(r_table),
65       bitarray_num_bytes_(bitarray_num_bytes),
66       bitarray_data_(bitarray_data), boundary_data_(boundary_data) {
67   DCHECK(l_table_);
68   DCHECK(r_table_);
69   DCHECK(bitarray_data_);
70   DCHECK(boundary_data_);
71   CHECK_LE(l_num_elements_ * r_num_elements_, bitarray_num_bytes_ * 8);
72 }
73 
~Segmenter()74 Segmenter::~Segmenter() {}
75 
IsBoundary(const Node & lnode,const Node & rnode,bool is_single_segment) const76 bool Segmenter::IsBoundary(const Node &lnode, const Node &rnode,
77                            bool is_single_segment) const {
78   if (lnode.node_type == Node::BOS_NODE ||
79       rnode.node_type == Node::EOS_NODE) {
80     return true;
81   }
82 
83   // Always return false in prediction mode.
84   // This implies that converter always returns single-segment-result
85   // in prediction mode.
86   if (is_single_segment) {
87     return false;
88   }
89 
90   // Concatenate particle and content word into one segment,
91   // if lnode locates at the beginning of user input.
92   // This hack is for handling ambiguous bunsetsu segmentation.
93   // e.g. "かみ|にかく" => "紙|に書く" or "紙二角".
94   // If we segment "に書く" into two segments, "二角" is never be shown.
95   // There exits some implicit assumpution that user expects that his/her input
96   // becomes one bunsetu. So, it would be better to keep "二角" even after "紙".
97   if (lnode.attributes & Node::STARTS_WITH_PARTICLE) {
98     return false;
99   }
100 
101   return IsBoundary(lnode.rid, rnode.lid);
102 }
103 
IsBoundary(uint16 rid,uint16 lid) const104 bool Segmenter::IsBoundary(uint16 rid, uint16 lid) const {
105   const uint32 bitarray_index = l_table_[rid] + l_num_elements_ * r_table_[lid];
106   return BitArray::GetValue(reinterpret_cast<const char*>(bitarray_data_),
107                             bitarray_index);
108 }
109 
GetPrefixPenalty(uint16 lid) const110 int32 Segmenter::GetPrefixPenalty(uint16 lid) const {
111   return boundary_data_[2 * lid];
112 }
113 
GetSuffixPenalty(uint16 rid) const114 int32 Segmenter::GetSuffixPenalty(uint16 rid) const {
115   return boundary_data_[2 * rid + 1];
116 }
117 
118 }  // namespace mozc
119