1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 #include "converter/segmenter.h"
31
32 #include "base/bitarray.h"
33 #include "base/logging.h"
34 #include "base/port.h"
35 #include "converter/node.h"
36 #include "data_manager/data_manager_interface.h"
37
38 namespace mozc {
39
CreateFromDataManager(const DataManagerInterface & data_manager)40 Segmenter *Segmenter::CreateFromDataManager(
41 const DataManagerInterface &data_manager) {
42 size_t l_num_elements = 0;
43 size_t r_num_elements = 0;
44 const uint16 *l_table = nullptr;
45 const uint16 *r_table = nullptr;
46 size_t bitarray_num_bytes = 0;
47 const char *bitarray_data = nullptr;
48 const uint16 *boundary_data = nullptr;
49 data_manager.GetSegmenterData(&l_num_elements, &r_num_elements,
50 &l_table, &r_table,
51 &bitarray_num_bytes, &bitarray_data,
52 &boundary_data);
53 return new Segmenter(l_num_elements, r_num_elements,
54 l_table, r_table,
55 bitarray_num_bytes, bitarray_data,
56 boundary_data);
57 }
58
Segmenter(size_t l_num_elements,size_t r_num_elements,const uint16 * l_table,const uint16 * r_table,size_t bitarray_num_bytes,const char * bitarray_data,const uint16 * boundary_data)59 Segmenter::Segmenter(
60 size_t l_num_elements, size_t r_num_elements, const uint16 *l_table,
61 const uint16 *r_table, size_t bitarray_num_bytes,
62 const char *bitarray_data, const uint16 *boundary_data)
63 : l_num_elements_(l_num_elements), r_num_elements_(r_num_elements),
64 l_table_(l_table), r_table_(r_table),
65 bitarray_num_bytes_(bitarray_num_bytes),
66 bitarray_data_(bitarray_data), boundary_data_(boundary_data) {
67 DCHECK(l_table_);
68 DCHECK(r_table_);
69 DCHECK(bitarray_data_);
70 DCHECK(boundary_data_);
71 CHECK_LE(l_num_elements_ * r_num_elements_, bitarray_num_bytes_ * 8);
72 }
73
~Segmenter()74 Segmenter::~Segmenter() {}
75
IsBoundary(const Node & lnode,const Node & rnode,bool is_single_segment) const76 bool Segmenter::IsBoundary(const Node &lnode, const Node &rnode,
77 bool is_single_segment) const {
78 if (lnode.node_type == Node::BOS_NODE ||
79 rnode.node_type == Node::EOS_NODE) {
80 return true;
81 }
82
83 // Always return false in prediction mode.
84 // This implies that converter always returns single-segment-result
85 // in prediction mode.
86 if (is_single_segment) {
87 return false;
88 }
89
90 // Concatenate particle and content word into one segment,
91 // if lnode locates at the beginning of user input.
92 // This hack is for handling ambiguous bunsetsu segmentation.
93 // e.g. "かみ|にかく" => "紙|に書く" or "紙二角".
94 // If we segment "に書く" into two segments, "二角" is never be shown.
95 // There exits some implicit assumpution that user expects that his/her input
96 // becomes one bunsetu. So, it would be better to keep "二角" even after "紙".
97 if (lnode.attributes & Node::STARTS_WITH_PARTICLE) {
98 return false;
99 }
100
101 return IsBoundary(lnode.rid, rnode.lid);
102 }
103
IsBoundary(uint16 rid,uint16 lid) const104 bool Segmenter::IsBoundary(uint16 rid, uint16 lid) const {
105 const uint32 bitarray_index = l_table_[rid] + l_num_elements_ * r_table_[lid];
106 return BitArray::GetValue(reinterpret_cast<const char*>(bitarray_data_),
107 bitarray_index);
108 }
109
GetPrefixPenalty(uint16 lid) const110 int32 Segmenter::GetPrefixPenalty(uint16 lid) const {
111 return boundary_data_[2 * lid];
112 }
113
GetSuffixPenalty(uint16 rid) const114 int32 Segmenter::GetSuffixPenalty(uint16 rid) const {
115 return boundary_data_[2 * rid + 1];
116 }
117
118 } // namespace mozc
119