1 /*
2  *  libpinyin
3  *  Library to deal with pinyin.
4  *
5  *  Copyright (C) 2015 Peng Wu <alexepico@gmail.com>
6  *
7  *  This program is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #ifndef FLEXIBLE_NGRAM_BDB_H
22 #define FLEXIBLE_NGRAM_BDB_H
23 
24 #ifdef HAVE_BERKELEY_DB
25 #include <errno.h>
26 #include <db.h>
27 #endif
28 
29 namespace pinyin{
30 
31 /**
32  * FlexibleBigram:
33  * @MagicHeader: the struct type of the magic header.
34  * @ArrayHeader: the struct type of the array header.
35  * @ArrayItem: the struct type of the array item.
36  *
37  * The flexible bi-gram is mainly used for training purpose.
38  *
39  */
40 template<typename MagicHeader, typename ArrayHeader,
41          typename ArrayItem>
42 class FlexibleBigram{
43     /* Note: some flexible bi-gram file format check should be here. */
44 private:
45     DB * m_db;
46 
47     phrase_token_t m_magic_header_index[2];
48 
49     char m_magic_number[4];
50 
reset()51     void reset(){
52         if ( m_db ){
53             m_db->sync(m_db, 0);
54             m_db->close(m_db, 0);
55             m_db = NULL;
56         }
57     }
58 
59 public:
60     /**
61      * FlexibleBigram::FlexibleBigram:
62      * @magic_number: the 4 bytes magic number of the flexible bi-gram.
63      *
64      * The constructor of the FlexibleBigram.
65      *
66      */
FlexibleBigram(const char * magic_number)67     FlexibleBigram(const char * magic_number){
68         m_db = NULL;
69         m_magic_header_index[0] = null_token;
70         m_magic_header_index[1] = null_token;
71 
72         memcpy(m_magic_number, magic_number, sizeof(m_magic_number));
73     }
74 
75     /**
76      * FlexibleBigram::~FlexibleBigram:
77      *
78      * The destructor of the FlexibleBigram.
79      *
80      */
~FlexibleBigram()81     ~FlexibleBigram(){
82         reset();
83     }
84 
85     /**
86      * FlexibleBigram::attach:
87      * @dbfile: the path name of the flexible bi-gram.
88      * @flags: the attach flags for the Berkeley DB.
89      * @returns: whether the attach operation is successful.
90      *
91      * Attach Berkeley DB on filesystem for training purpose.
92      *
93      */
attach(const char * dbfile,guint32 flags)94     bool attach(const char * dbfile, guint32 flags){
95         reset();
96         u_int32_t db_flags = 0;
97 
98         if ( flags & ATTACH_READONLY )
99             db_flags |= DB_RDONLY;
100         if ( flags & ATTACH_READWRITE )
101             assert( !(flags & ATTACH_READONLY ) );
102 
103         if ( !dbfile )
104             return false;
105 
106         int ret = db_create(&m_db, NULL, 0);
107         if ( ret != 0 )
108             assert(false);
109 
110         ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
111         if ( ret != 0 && (flags & ATTACH_CREATE) ) {
112             db_flags |= DB_CREATE;
113             /* Create database file here, and write the signature. */
114             ret = m_db->open(m_db, NULL, dbfile, NULL, DB_HASH, db_flags, 0644);
115             if ( ret != 0 )
116                 return false;
117 
118             DBT db_key;
119             memset(&db_key, 0, sizeof(DBT));
120             db_key.data = m_magic_header_index;
121             db_key.size = sizeof(m_magic_header_index);
122             DBT db_data;
123             memset(&db_data, 0, sizeof(DBT));
124             db_data.data = m_magic_number;
125             db_data.size = sizeof(m_magic_number);
126             db_data.flags = DB_DBT_PARTIAL;
127             db_data.doff = 0;
128             db_data.dlen = sizeof(m_magic_number);
129 
130             ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
131             return ret == 0;
132         }
133 
134         /* check the signature. */
135         DBT db_key;
136         memset(&db_key, 0, sizeof(DBT));
137         db_key.data = m_magic_header_index;
138         db_key.size = sizeof(m_magic_header_index);
139         DBT db_data;
140         memset(&db_data, 0, sizeof(DBT));
141         db_data.flags = DB_DBT_PARTIAL;
142         db_data.doff = 0;
143         db_data.dlen = sizeof(m_magic_number);
144         ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
145         if ( ret != 0 )
146             return false;
147         if ( sizeof(m_magic_number) != db_data.size )
148             return false;
149         if ( memcmp(db_data.data, m_magic_number,
150                     sizeof(m_magic_number)) == 0 )
151             return true;
152         return false;
153     }
154 
155     /**
156      * FlexibleBigram::load:
157      * @index: the previous token in the flexible bi-gram.
158      * @single_gram: the single gram of the previous token.
159      * @copy: whether copy content to the single gram.
160      * @returns: whether the load operation is successful.
161      *
162      * Load the single gram of the previous token.
163      *
164      */
165     bool load(phrase_token_t index,
166               FlexibleSingleGram<ArrayHeader, ArrayItem> * & single_gram,
167               bool copy=false){
168         single_gram = NULL;
169         if ( !m_db )
170             return false;
171 
172         DBT db_key;
173         memset(&db_key, 0, sizeof(DBT));
174         db_key.data = &index;
175         db_key.size = sizeof(phrase_token_t);
176 
177         DBT db_data;
178         memset(&db_data, 0, sizeof(DBT));
179         int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
180         if ( ret != 0)
181             return false;
182 
183         single_gram = new FlexibleSingleGram<ArrayHeader, ArrayItem>
184             (db_data.data, db_data.size, copy);
185 
186         return true;
187     }
188 
189     /**
190      * FlexibleBigram::store:
191      * @index: the previous token in the flexible bi-gram.
192      * @single_gram: the single gram of the previous token.
193      * @returns: whether the store operation is successful.
194      *
195      * Store the single gram of the previous token.
196      *
197      */
store(phrase_token_t index,FlexibleSingleGram<ArrayHeader,ArrayItem> * single_gram)198     bool store(phrase_token_t index,
199                FlexibleSingleGram<ArrayHeader, ArrayItem> * single_gram){
200         if ( !m_db )
201             return false;
202 
203         DBT db_key;
204         memset(&db_key, 0, sizeof(DBT));
205         db_key.data = &index;
206         db_key.size = sizeof(phrase_token_t);
207         DBT db_data;
208         memset(&db_data, 0, sizeof(DBT));
209         db_data.data = single_gram->m_chunk.begin();
210         db_data.size = single_gram->m_chunk.size();
211 
212         int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
213         return ret == 0;
214     }
215 
216     /**
217      * FlexibleBigram::remove:
218      * @index: the previous token in the flexible bi-gram.
219      * @returns: whether the remove operation is successful.
220      *
221      * Remove the single gram of the previous token.
222      *
223      */
remove(phrase_token_t index)224     bool remove(phrase_token_t index){
225         if ( !m_db )
226             return false;
227 
228         DBT db_key;
229         memset(&db_key, 0, sizeof(DBT));
230         db_key.data = &index;
231         db_key.size = sizeof(phrase_token_t);
232 
233         int ret = m_db->del(m_db, NULL, &db_key, 0);
234         return ret == 0;
235     }
236 
237     /**
238      * FlexibleBigram::get_all_items:
239      * @items: the GArray to store all previous tokens.
240      * @returns: whether the get operation is successful.
241      *
242      * Get the array of all previous tokens for parameter estimation.
243      *
244      */
get_all_items(GArray * items)245     bool get_all_items(GArray * items){
246         g_array_set_size(items, 0);
247 
248         if ( !m_db )
249             return false;
250 
251         DBC * cursorp;
252         DBT key, data;
253         int ret;
254 
255         /* Get a cursor */
256         m_db->cursor(m_db, NULL, &cursorp, 0);
257 
258         if (NULL == cursorp)
259             return false;
260 
261         /* Initialize our DBTs. */
262         memset(&key, 0, sizeof(DBT));
263         memset(&data, 0, sizeof(DBT));
264 
265         /* Iterate over the database, retrieving each record in turn. */
266         while ((ret =  cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0 ){
267             if (key.size != sizeof(phrase_token_t)){
268                 /* skip magic header. */
269                 continue;
270             }
271             phrase_token_t * token = (phrase_token_t *) key.data;
272             g_array_append_val(items, *token);
273         }
274 
275         if ( ret != DB_NOTFOUND ){
276             fprintf(stderr, "training db error, exit!");
277 
278             if (cursorp != NULL)
279                 cursorp->c_close(cursorp);
280 
281             exit(EIO);
282         }
283 
284         /* Cursors must be closed */
285         if (cursorp != NULL)
286             cursorp->c_close(cursorp);
287         return true;
288     }
289 
290     /**
291      * FlexibleBigram::get_magic_header:
292      * @header: the magic header.
293      * @returns: whether the get operation is successful.
294      *
295      * Get the magic header of the flexible bi-gram.
296      *
297      */
get_magic_header(MagicHeader & header)298     bool get_magic_header(MagicHeader & header){
299         /* clear retval */
300         memset(&header, 0, sizeof(MagicHeader));
301 
302         if ( !m_db )
303             return false;
304 
305         DBT db_key;
306         memset(&db_key, 0, sizeof(DBT));
307         db_key.data = m_magic_header_index;
308         db_key.size = sizeof(m_magic_header_index);
309         DBT db_data;
310         memset(&db_data, 0, sizeof(DBT));
311         db_data.flags = DB_DBT_PARTIAL;
312         db_data.doff = sizeof(m_magic_number);
313         db_data.dlen = sizeof(MagicHeader);
314 
315         int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
316         if ( ret != 0 )
317             return false;
318 
319         if ( sizeof(MagicHeader) != db_data.size )
320             return false;
321 
322         memcpy(&header, db_data.data, sizeof(MagicHeader));
323         return true;
324     }
325 
326     /**
327      * FlexibleBigram::set_magic_header:
328      * @header: the magic header.
329      * @returns: whether the set operation is successful.
330      *
331      * Set the magic header of the flexible bi-gram.
332      *
333      */
set_magic_header(const MagicHeader & header)334     bool set_magic_header(const MagicHeader & header){
335         if ( !m_db )
336             return false;
337 
338         DBT db_key;
339         memset(&db_key, 0, sizeof(DBT));
340         db_key.data = m_magic_header_index;
341         db_key.size = sizeof(m_magic_header_index);
342         DBT db_data;
343         memset(&db_data, 0, sizeof(DBT));
344         db_data.data = (void *) &header;
345         db_data.size = sizeof(MagicHeader);
346         db_data.flags = DB_DBT_PARTIAL;
347         db_data.doff = sizeof(m_magic_number);
348         db_data.dlen = sizeof(MagicHeader);
349 
350         int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
351         return ret == 0;
352     }
353 
354     /**
355      * FlexibleBigram::get_array_header:
356      * @index: the previous token in the flexible bi-gram.
357      * @header: the array header in the single gram of the previous token.
358      * @returns: whether the get operation is successful.
359      *
360      * Get the array header in the single gram of the previous token.
361      *
362      */
get_array_header(phrase_token_t index,ArrayHeader & header)363     bool get_array_header(phrase_token_t index, ArrayHeader & header){
364         /* clear retval */
365         memset(&header, 0, sizeof(ArrayHeader));
366 
367         if ( !m_db )
368             return false;
369 
370         DBT db_key;
371         memset(&db_key, 0, sizeof(DBT));
372         db_key.data = &index;
373         db_key.size = sizeof(phrase_token_t);
374 
375         DBT db_data;
376         memset(&db_data, 0, sizeof(DBT));
377         db_data.flags = DB_DBT_PARTIAL;
378         db_data.doff = 0;
379         db_data.dlen = sizeof(ArrayHeader);
380         int ret = m_db->get(m_db, NULL, &db_key, &db_data, 0);
381         if ( ret != 0 )
382             return false;
383 
384         assert(db_data.size == sizeof(ArrayHeader));
385         memcpy(&header, db_data.data, sizeof(ArrayHeader));
386         return true;
387     }
388 
389     /**
390      * FlexibleBigram::set_array_header:
391      * @index: the previous token of the flexible bi-gram.
392      * @header: the array header in the single gram of the previous token.
393      * @returns: whether the set operation is successful.
394      *
395      * Set the array header in the single gram of the previous token.
396      *
397      */
set_array_header(phrase_token_t index,const ArrayHeader & header)398     bool set_array_header(phrase_token_t index, const ArrayHeader & header){
399         if ( !m_db )
400             return false;
401 
402         DBT db_key;
403         memset(&db_key, 0, sizeof(DBT));
404         db_key.data = &index;
405         db_key.size = sizeof(phrase_token_t);
406         DBT db_data;
407         memset(&db_data, 0, sizeof(DBT));
408         db_data.data = (void *)&header;
409         db_data.size = sizeof(ArrayHeader);
410         db_data.flags = DB_DBT_PARTIAL;
411         db_data.doff = 0;
412         db_data.dlen = sizeof(ArrayHeader);
413 
414         int ret = m_db->put(m_db, NULL, &db_key, &db_data, 0);
415         return ret == 0;
416     }
417 
418 };
419 
420 };
421 
422 #endif
423