1 /* 2 * libpinyin 3 * Library to deal with pinyin. 4 * 5 * Copyright (C) 2015 Peng Wu <alexepico@gmail.com> 6 * 7 * This program is free software: you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation, either version 3 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program. If not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #ifndef FLEXIBLE_NGRAM_BDB_H 22 #define FLEXIBLE_NGRAM_BDB_H 23 24 #ifdef HAVE_BERKELEY_DB 25 #include <errno.h> 26 #include <db.h> 27 #endif 28 29 namespace pinyin{ 30 31 /** 32 * FlexibleBigram: 33 * @MagicHeader: the struct type of the magic header. 34 * @ArrayHeader: the struct type of the array header. 35 * @ArrayItem: the struct type of the array item. 36 * 37 * The flexible bi-gram is mainly used for training purpose. 38 * 39 */ 40 template<typename MagicHeader, typename ArrayHeader, 41 typename ArrayItem> 42 class FlexibleBigram{ 43 /* Note: some flexible bi-gram file format check should be here. */ 44 private: 45 DB * m_db; 46 47 phrase_token_t m_magic_header_index[2]; 48 49 char m_magic_number[4]; 50 reset()51 void reset(){ 52 if ( m_db ){ 53 m_db->sync(m_db, 0); 54 m_db->close(m_db, 0); 55 m_db = NULL; 56 } 57 } 58 59 public: 60 /** 61 * FlexibleBigram::FlexibleBigram: 62 * @magic_number: the 4 bytes magic number of the flexible bi-gram. 63 * 64 * The constructor of the FlexibleBigram. 65 * 66 */ FlexibleBigram(const char * magic_number)67 FlexibleBigram(const char * magic_number){ 68 m_db = NULL; 69 m_magic_header_index[0] = null_token; 70 m_magic_header_index[1] = null_token; 71 72 memcpy(m_magic_number, magic_number, sizeof(m_magic_number)); 73 } 74 75 /** 76 * FlexibleBigram::~FlexibleBigram: 77 * 78 * The destructor of the FlexibleBigram. 79 * 80 */ ~FlexibleBigram()81 ~FlexibleBigram(){ 82 reset(); 83 } 84 85 /** 86 * FlexibleBigram::attach: 87 * @dbfile: the path name of the flexible bi-gram. 88 * @flags: the attach flags for the Berkeley DB. 89 * @returns: whether the attach operation is successful. 90 * 91 * Attach Berkeley DB on filesystem for training purpose. 92 * 93 */ attach(const char * dbfile,guint32 flags)94 bool attach(const char * dbfile, guint32 flags){ 95 reset(); 96 u_int32_t db_flags = 0; 97 98 if ( flags & ATTACH_READONLY ) 99 db_flags |= DB_RDONLY; 100 if ( flags & ATTACH_READWRITE ) 101 assert( !(flags & ATTACH_READONLY ) ); 102 103 if ( !dbfile ) 104 return false; 105 106 int ret = db_create(&m_db, NULL, 0); 107 if ( ret != 0 ) 108 assert(false); 109 110 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644); 111 if ( ret != 0 && (flags & ATTACH_CREATE) ) { 112 db_flags |= DB_CREATE; 113 /* Create database file here, and write the signature. */ 114 ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644); 115 if ( ret != 0 ) 116 return false; 117 118 DBT db_key; 119 memset(&db_key, 0, sizeof(DBT)); 120 db_key.data = m_magic_header_index; 121 db_key.size = sizeof(m_magic_header_index); 122 DBT db_data; 123 memset(&db_data, 0, sizeof(DBT)); 124 db_data.data = m_magic_number; 125 db_data.size = sizeof(m_magic_number); 126 db_data.flags = DB_DBT_PARTIAL; 127 db_data.doff = 0; 128 db_data.dlen = sizeof(m_magic_number); 129 130 ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); 131 return ret == 0; 132 } 133 134 /* check the signature. */ 135 DBT db_key; 136 memset(&db_key, 0, sizeof(DBT)); 137 db_key.data = m_magic_header_index; 138 db_key.size = sizeof(m_magic_header_index); 139 DBT db_data; 140 memset(&db_data, 0, sizeof(DBT)); 141 db_data.flags = DB_DBT_PARTIAL; 142 db_data.doff = 0; 143 db_data.dlen = sizeof(m_magic_number); 144 ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); 145 if ( ret != 0 ) 146 return false; 147 if ( sizeof(m_magic_number) != db_data.size ) 148 return false; 149 if ( memcmp(db_data.data, m_magic_number, 150 sizeof(m_magic_number)) == 0 ) 151 return true; 152 return false; 153 } 154 155 /** 156 * FlexibleBigram::load: 157 * @index: the previous token in the flexible bi-gram. 158 * @single_gram: the single gram of the previous token. 159 * @copy: whether copy content to the single gram. 160 * @returns: whether the load operation is successful. 161 * 162 * Load the single gram of the previous token. 163 * 164 */ 165 bool load(phrase_token_t index, 166 FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram, 167 bool copy=false){ 168 single_gram = NULL; 169 if ( !m_db ) 170 return false; 171 172 DBT db_key; 173 memset(&db_key, 0, sizeof(DBT)); 174 db_key.data = &index; 175 db_key.size = sizeof(phrase_token_t); 176 177 DBT db_data; 178 memset(&db_data, 0, sizeof(DBT)); 179 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); 180 if ( ret != 0) 181 return false; 182 183 single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem> 184 (db_data.data, db_data.size, copy); 185 186 return true; 187 } 188 189 /** 190 * FlexibleBigram::store: 191 * @index: the previous token in the flexible bi-gram. 192 * @single_gram: the single gram of the previous token. 193 * @returns: whether the store operation is successful. 194 * 195 * Store the single gram of the previous token. 196 * 197 */ store(phrase_token_t index,FlexibleSingleGram<ArrayHeader,ArrayItem> * single_gram)198 bool store(phrase_token_t index, 199 FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){ 200 if ( !m_db ) 201 return false; 202 203 DBT db_key; 204 memset(&db_key, 0, sizeof(DBT)); 205 db_key.data = &index; 206 db_key.size = sizeof(phrase_token_t); 207 DBT db_data; 208 memset(&db_data, 0, sizeof(DBT)); 209 db_data.data = single_gram->m_chunk.begin(); 210 db_data.size = single_gram->m_chunk.size(); 211 212 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); 213 return ret == 0; 214 } 215 216 /** 217 * FlexibleBigram::remove: 218 * @index: the previous token in the flexible bi-gram. 219 * @returns: whether the remove operation is successful. 220 * 221 * Remove the single gram of the previous token. 222 * 223 */ remove(phrase_token_t index)224 bool remove(phrase_token_t index){ 225 if ( !m_db ) 226 return false; 227 228 DBT db_key; 229 memset(&db_key, 0, sizeof(DBT)); 230 db_key.data = &index; 231 db_key.size = sizeof(phrase_token_t); 232 233 int ret = m_db->del(m_db, NULL, &db_key, 0); 234 return ret == 0; 235 } 236 237 /** 238 * FlexibleBigram::get_all_items: 239 * @items: the GArray to store all previous tokens. 240 * @returns: whether the get operation is successful. 241 * 242 * Get the array of all previous tokens for parameter estimation. 243 * 244 */ get_all_items(GArray * items)245 bool get_all_items(GArray * items){ 246 g_array_set_size(items, 0); 247 248 if ( !m_db ) 249 return false; 250 251 DBC * cursorp; 252 DBT key, data; 253 int ret; 254 255 /* Get a cursor */ 256 m_db->cursor(m_db, NULL, &cursorp, 0); 257 258 if (NULL == cursorp) 259 return false; 260 261 /* Initialize our DBTs. */ 262 memset(&key, 0, sizeof(DBT)); 263 memset(&data, 0, sizeof(DBT)); 264 265 /* Iterate over the database, retrieving each record in turn. */ 266 while ((ret = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){ 267 if (key.size != sizeof(phrase_token_t)){ 268 /* skip magic header. */ 269 continue; 270 } 271 phrase_token_t * token = (phrase_token_t *) key.data; 272 g_array_append_val(items, *token); 273 } 274 275 if ( ret != DB_NOTFOUND ){ 276 fprintf(stderr, "training db error, exit!"); 277 278 if (cursorp != NULL) 279 cursorp->c_close(cursorp); 280 281 exit(EIO); 282 } 283 284 /* Cursors must be closed */ 285 if (cursorp != NULL) 286 cursorp->c_close(cursorp); 287 return true; 288 } 289 290 /** 291 * FlexibleBigram::get_magic_header: 292 * @header: the magic header. 293 * @returns: whether the get operation is successful. 294 * 295 * Get the magic header of the flexible bi-gram. 296 * 297 */ get_magic_header(MagicHeader & header)298 bool get_magic_header(MagicHeader & header){ 299 /* clear retval */ 300 memset(&header, 0, sizeof(MagicHeader)); 301 302 if ( !m_db ) 303 return false; 304 305 DBT db_key; 306 memset(&db_key, 0, sizeof(DBT)); 307 db_key.data = m_magic_header_index; 308 db_key.size = sizeof(m_magic_header_index); 309 DBT db_data; 310 memset(&db_data, 0, sizeof(DBT)); 311 db_data.flags = DB_DBT_PARTIAL; 312 db_data.doff = sizeof(m_magic_number); 313 db_data.dlen = sizeof(MagicHeader); 314 315 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); 316 if ( ret != 0 ) 317 return false; 318 319 if ( sizeof(MagicHeader) != db_data.size ) 320 return false; 321 322 memcpy(&header, db_data.data, sizeof(MagicHeader)); 323 return true; 324 } 325 326 /** 327 * FlexibleBigram::set_magic_header: 328 * @header: the magic header. 329 * @returns: whether the set operation is successful. 330 * 331 * Set the magic header of the flexible bi-gram. 332 * 333 */ set_magic_header(const MagicHeader & header)334 bool set_magic_header(const MagicHeader & header){ 335 if ( !m_db ) 336 return false; 337 338 DBT db_key; 339 memset(&db_key, 0, sizeof(DBT)); 340 db_key.data = m_magic_header_index; 341 db_key.size = sizeof(m_magic_header_index); 342 DBT db_data; 343 memset(&db_data, 0, sizeof(DBT)); 344 db_data.data = (void *) &header; 345 db_data.size = sizeof(MagicHeader); 346 db_data.flags = DB_DBT_PARTIAL; 347 db_data.doff = sizeof(m_magic_number); 348 db_data.dlen = sizeof(MagicHeader); 349 350 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); 351 return ret == 0; 352 } 353 354 /** 355 * FlexibleBigram::get_array_header: 356 * @index: the previous token in the flexible bi-gram. 357 * @header: the array header in the single gram of the previous token. 358 * @returns: whether the get operation is successful. 359 * 360 * Get the array header in the single gram of the previous token. 361 * 362 */ get_array_header(phrase_token_t index,ArrayHeader & header)363 bool get_array_header(phrase_token_t index, ArrayHeader & header){ 364 /* clear retval */ 365 memset(&header, 0, sizeof(ArrayHeader)); 366 367 if ( !m_db ) 368 return false; 369 370 DBT db_key; 371 memset(&db_key, 0, sizeof(DBT)); 372 db_key.data = &index; 373 db_key.size = sizeof(phrase_token_t); 374 375 DBT db_data; 376 memset(&db_data, 0, sizeof(DBT)); 377 db_data.flags = DB_DBT_PARTIAL; 378 db_data.doff = 0; 379 db_data.dlen = sizeof(ArrayHeader); 380 int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0); 381 if ( ret != 0 ) 382 return false; 383 384 assert(db_data.size == sizeof(ArrayHeader)); 385 memcpy(&header, db_data.data, sizeof(ArrayHeader)); 386 return true; 387 } 388 389 /** 390 * FlexibleBigram::set_array_header: 391 * @index: the previous token of the flexible bi-gram. 392 * @header: the array header in the single gram of the previous token. 393 * @returns: whether the set operation is successful. 394 * 395 * Set the array header in the single gram of the previous token. 396 * 397 */ set_array_header(phrase_token_t index,const ArrayHeader & header)398 bool set_array_header(phrase_token_t index, const ArrayHeader & header){ 399 if ( !m_db ) 400 return false; 401 402 DBT db_key; 403 memset(&db_key, 0, sizeof(DBT)); 404 db_key.data = &index; 405 db_key.size = sizeof(phrase_token_t); 406 DBT db_data; 407 memset(&db_data, 0, sizeof(DBT)); 408 db_data.data = (void *)&header; 409 db_data.size = sizeof(ArrayHeader); 410 db_data.flags = DB_DBT_PARTIAL; 411 db_data.doff = 0; 412 db_data.dlen = sizeof(ArrayHeader); 413 414 int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0); 415 return ret == 0; 416 } 417 418 }; 419 420 }; 421 422 #endif 423