1 /**
2  * @file   ngram_write_bin.c
3  *
4  * <JA>
5  * @brief  N-gram��Х��ʥ�����ǥե�����˽��Ф�
6  *
7  * rev.3.5 ��ꡤ�ɤ߹��ߤι�®�����θ���ƽ��Ф��ΥХ��ȥ���������
8  * Big endian ���꤫��ޥ����¸���ѹ����줿���ޤ�����ǥå����� 24bit ��
9  * ����� 2-gram �ΥХå����եǡ����ΰ��̤�Ԥ��ʤɡ��ե����������
10  * �������ͤ������ѹ����줿������ˤ�ꡤ3.5 �ʹߤ� mkbingram ��
11  * �����������Х��ʥ�N-gram��, 3.4.2������ Julius �ǤϻȤ��ʤ���
12  * (�إå������å��ǥ��顼�Ȥʤ�)
13  *
14  * �ʤ� 3.5 �ʹߤ� Julius �ǤϽ���Υ�ǥ������ʤ��ɤ�롥���ξ��,
15  * ����ǥå����� 24bit ���ȥХå����դΰ��̤ϥ�ǥ��ɤ߹��߻���
16  * �������ٹԤ��롥
17  *
18  * �Х��ȥ��������˴ؤ��ƥإå��˵��Ҥ��뤳�Ȥǡ��ɤ߹��߻���Ƚ�ꤷ��
19  * �ɤ߹��ࡥ����ˤ�ꡤ�ۤʤ�Х��ȥ��������Υޥ������������
20  * �Х��ʥ�N-gram�Ǥ�����ʤ��ɤ�롥��������Υ�ǥ�⤽�Τޤ�
21  * �ɤ߹���롥
22  * </JA>
23  *
24  * <EN>
25  * @brief  Write a whole N-gram data to a file in binary format
26  *
 * From 3.5, the internal format of binary N-gram has changed to use the
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit indexes and 2-gram backoff compression.  So, a binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions.
32  *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files can still be read directly, in which case the conversion
 * to 24bit index will be performed just after the model has been read.
 * The byte order is also determined from the header information, so a
 * binary N-gram can still be used across different machines.
38  * </EN>
39  *
40  * @author Akinobu LEE
41  * @date   Wed Feb 16 17:23:16 2005
42  *
43  * $Revision: 1.4 $
44  *
45  */
46 /*
47  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
48  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
49  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
50  * All rights reserved
51  */
52 
53 #include <sent/stddefs.h>
54 #include <sent/ngram2.h>
55 
56 static boolean need_swap; ///< TRUE if need byte swap
57 
58 #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE
59 
/** Running total of bytes written since the last call to reset_wrt_counter(). */
static unsigned int count;

/**
 * Reset the written-byte counter to zero.
 *
 * Note: this function is not declared static, so it has external linkage.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the current value of the written-byte counter.
 *
 * @return the counter value.
 */
static unsigned int
get_wrt_counter(void)
{
  return count;
}
71 
72 
73 /**
74  * Binary write function, with byte swapping if needed.
75  *
76  * @param fp [in] file pointer
77  * @param buf [in] data buffer to write
78  * @param unitbyte [in] unit size in bytes
79  * @param unitnum [in] number of unit to write
80  */
81 static boolean
wrtfunc(FILE * fp,void * buf,size_t unitbyte,size_t unitnum)82 wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
83 {
84   if (need_swap == TRUE && unitbyte != 1) {
85     swap_bytes((char *)buf, unitbyte, unitnum);
86   }
87   if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
88     jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
89     return FALSE;
90   }
91   if (need_swap == TRUE && unitbyte != 1) {
92     swap_bytes((char *)buf, unitbyte, unitnum);
93   }
94   count += unitbyte * unitnum;
95   return TRUE;
96 }
97 
98 /**
99  * Write header information, with identifier string.
100  *
101  * @param fp [in] file pointer
102  * @param str [in] user header string (any string within BINGRAM_HDSIZE
103  * bytes is allowed)
104  * @param version [in] file format version id
105  */
106 static boolean
write_header(FILE * fp,char * str)107 write_header(FILE *fp, char *str)
108 {
109   char buf[BINGRAM_HDSIZE];
110   int i, totallen;
111 
112   for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
113   totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
114   if (totallen >= BINGRAM_HDSIZE) {
115     jlog("Warning: write_bingram: header too long, last will be truncated\n");
116     i = strlen(str) - (totallen - BINGRAM_HDSIZE);
117     str[i] = '\0';
118   }
119   sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
120   wrt(fp, buf, 1, BINGRAM_HDSIZE);
121 
122   return TRUE;
123 }
124 
125 /**
126  * Write a whole N-gram data in binary format.
127  *
128  * @param fp [in] file pointer
129  * @param ndata [in] N-gram data to write
130  * @param headerstr [in] user header string
131  *
132  * @return TRUE on success, FALSE on failure
133  */
134 boolean
ngram_write_bin(FILE * fp,NGRAM_INFO * ndata,char * headerstr)135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
136 {
137   int i,n;
138   unsigned int len;
139   int wlen;
140   NGRAM_TUPLE_INFO *t;
141 
142   reset_wrt_counter();
143 
144   /* write initial header */
145   if (write_header(fp, headerstr) == FALSE) return FALSE;
146 
147   /* swap not needed any more */
148   need_swap = FALSE;
149 
150   /* write some header info */
151   wrt(fp, &(ndata->n), sizeof(int), 1);
152   wrt(fp, &(ndata->dir), sizeof(int), 1);
153   wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
154 
155   /* write total info */
156   for(n=0;n<ndata->n;n++) {
157     wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
158     /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
159   }
160 
161   /* unk_*, isopen, max_word_num are set after read, so need not save */
162 
163   /* write wname */
164   wlen = 0;
165   for(i=0;i<ndata->max_word_num;i++) {
166     wlen += strlen(ndata->wname[i]) + 1;
167   }
168   wrt(fp, &wlen, sizeof(int), 1);
169   for(i=0;i<ndata->max_word_num;i++) {
170     wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
171   }
172 
173   /* write N-gram */
174   for(n=0;n<ndata->n;n++) {
175     t = &(ndata->d[n]);
176 
177     wrt(fp, &(t->is24bit), sizeof(boolean), 1);
178     wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
179     wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
180     wrt(fp, &(t->context_num), sizeof(NNID), 1);
181     if (n > 0) {
182       if (t->is24bit) {
183 	wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
184 	wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
185       } else {
186 	wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
187       }
188       wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
189       wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
190     }
191     wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
192     if (t->bo_wt) {
193       i = 1;
194       wrt(fp, &i, sizeof(int), 1);
195       wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
196     } else {
197       i = 0;
198       wrt(fp, &i, sizeof(int), 1);
199     }
200     if (t->nnid2ctid_upper) {
201       i = 1;
202       wrt(fp, &i, sizeof(int), 1);
203       wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
204       wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
205     } else {
206       i = 0;
207       wrt(fp, &i, sizeof(int), 1);
208     }
209 
210   }
211 
212   /* write additional LR 2-gram */
213   if (ndata->bo_wt_1) {
214     i = 1;
215     wrt(fp, &i, sizeof(int), 1);
216     wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
217   } else {
218     i = 0;
219     wrt(fp, &i, sizeof(int), 1);
220   }
221   if (ndata->p_2) {
222     i = 1;
223     wrt(fp, &i, sizeof(int), 1);
224     wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
225   } else {
226     i = 0;
227     wrt(fp, &i, sizeof(int), 1);
228   }
229 
230   len = get_wrt_counter();
231   jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0);
232   return TRUE;
233 }
234