1 /**
2 * @file ngram_write_bin.c
3 *
4 * <JA>
5 * @brief N-gram��Х��ʥ�����ǥե�����˽Ф�
6 *
7 * rev.3.5 ��ꡤ�ɤ߹��ߤι�®�����θ���ƽФ��ΥХ��ȥ���������
8 * Big endian ���꤫��ޥ����¸���ѹ����줿���ޤ�����ǥå����� 24bit ��
9 * ����� 2-gram �ΥХå����եǡ����ΰ��̤�Ԥ��ʤɡ��ե����������
10 * �������ͤ������ѹ����줿������ˤ�ꡤ3.5 �ʹߤ� mkbingram ��
11 * �����������Х��ʥ�N-gram��, 3.4.2������ Julius �ǤϻȤ��ʤ���
12 * (�إå������å��ǥ��顼�Ȥʤ�)
13 *
14 * �ʤ� 3.5 �ʹߤ� Julius �ǤϽ���Υ�ǥ������ʤ��ɤ�롥���ξ��,
15 * ����ǥå����� 24bit ���ȥХå����դΰ��̤ϥ�ǥ��ɤ߹�����
16 * �������ٹԤ��롥
17 *
18 * �Х��ȥ��������˴ؤ��ƥإå��˵��Ҥ��뤳�Ȥǡ��ɤ߹�����Ƚ�ꤷ��
19 * �ɤ߹��ࡥ����ˤ�ꡤ�ۤʤ�Х��ȥ��������Υޥ������������
20 * �Х��ʥ�N-gram�Ǥ�����ʤ��ɤ�롥��������Υ�ǥ�⤽�Τޤ�
21 * �ɤ߹���롥
22 * </JA>
23 *
24 * <EN>
25 * @brief Write a whole N-gram data to a file in binary format
26 *
 * Since rev. 3.5, the internal format of the binary N-gram has changed: it
 * now uses the machine-dependent natural byte order (previously fixed to
 * big endian), 24-bit indexes, and 2-gram backoff compression.  As a result,
 * a binary N-gram generated by mkbingram of 3.5 or later will not work on
 * 3.4.2 and earlier versions.
 *
 * Version 3.5 keeps full upward and cross-machine compatibility.  Old
 * binary N-gram files can still be read directly, in which case the
 * conversion to 24-bit indexes is performed just after the model has been
 * read.  The byte order is recorded in the header and taken into account
 * when reading, so a binary N-gram can still be shared among different machines.
38 * </EN>
39 *
40 * @author Akinobu LEE
41 * @date Wed Feb 16 17:23:16 2005
42 *
43 * $Revision: 1.4 $
44 *
45 */
46 /*
47 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
48 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
49 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
50 * All rights reserved
51 */
52
53 #include <sent/stddefs.h>
54 #include <sent/ngram2.h>
55
/**
 * Byte-swap flag consulted by wrtfunc(): when TRUE, units wider than one
 * byte are byte-swapped for the write.  ngram_write_bin() sets it to FALSE.
 */
static boolean need_swap;
57
/**
 * Write helper for functions returning boolean: calls wrtfunc() and makes
 * the enclosing function return FALSE when the write fails.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one statement
 * and cannot capture an `else` that follows it in an unbraced `if`.
 */
#define wrt(A,B,C,D) do { if (wrtfunc(A,B,C,D) == FALSE) return FALSE; } while (0)
59
/** Running total of bytes written through wrtfunc(). */
static unsigned int count;

/**
 * Reset the written-byte counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}
66 static unsigned int
get_wrt_counter()67 get_wrt_counter()
68 {
69 return count;
70 }
71
72
73 /**
74 * Binary write function, with byte swapping if needed.
75 *
76 * @param fp [in] file pointer
77 * @param buf [in] data buffer to write
78 * @param unitbyte [in] unit size in bytes
79 * @param unitnum [in] number of unit to write
80 */
81 static boolean
wrtfunc(FILE * fp,void * buf,size_t unitbyte,size_t unitnum)82 wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
83 {
84 if (need_swap == TRUE && unitbyte != 1) {
85 swap_bytes((char *)buf, unitbyte, unitnum);
86 }
87 if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
88 jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
89 return FALSE;
90 }
91 if (need_swap == TRUE && unitbyte != 1) {
92 swap_bytes((char *)buf, unitbyte, unitnum);
93 }
94 count += unitbyte * unitnum;
95 return TRUE;
96 }
97
98 /**
99 * Write header information, with identifier string.
100 *
101 * @param fp [in] file pointer
102 * @param str [in] user header string (any string within BINGRAM_HDSIZE
103 * bytes is allowed)
104 * @param version [in] file format version id
105 */
106 static boolean
write_header(FILE * fp,char * str)107 write_header(FILE *fp, char *str)
108 {
109 char buf[BINGRAM_HDSIZE];
110 int i, totallen;
111
112 for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
113 totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
114 if (totallen >= BINGRAM_HDSIZE) {
115 jlog("Warning: write_bingram: header too long, last will be truncated\n");
116 i = strlen(str) - (totallen - BINGRAM_HDSIZE);
117 str[i] = '\0';
118 }
119 sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
120 wrt(fp, buf, 1, BINGRAM_HDSIZE);
121
122 return TRUE;
123 }
124
125 /**
126 * Write a whole N-gram data in binary format.
127 *
128 * @param fp [in] file pointer
129 * @param ndata [in] N-gram data to write
130 * @param headerstr [in] user header string
131 *
132 * @return TRUE on success, FALSE on failure
133 */
134 boolean
ngram_write_bin(FILE * fp,NGRAM_INFO * ndata,char * headerstr)135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
136 {
137 int i,n;
138 unsigned int len;
139 int wlen;
140 NGRAM_TUPLE_INFO *t;
141
142 reset_wrt_counter();
143
144 /* write initial header */
145 if (write_header(fp, headerstr) == FALSE) return FALSE;
146
147 /* swap not needed any more */
148 need_swap = FALSE;
149
150 /* write some header info */
151 wrt(fp, &(ndata->n), sizeof(int), 1);
152 wrt(fp, &(ndata->dir), sizeof(int), 1);
153 wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
154
155 /* write total info */
156 for(n=0;n<ndata->n;n++) {
157 wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
158 /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
159 }
160
161 /* unk_*, isopen, max_word_num are set after read, so need not save */
162
163 /* write wname */
164 wlen = 0;
165 for(i=0;i<ndata->max_word_num;i++) {
166 wlen += strlen(ndata->wname[i]) + 1;
167 }
168 wrt(fp, &wlen, sizeof(int), 1);
169 for(i=0;i<ndata->max_word_num;i++) {
170 wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
171 }
172
173 /* write N-gram */
174 for(n=0;n<ndata->n;n++) {
175 t = &(ndata->d[n]);
176
177 wrt(fp, &(t->is24bit), sizeof(boolean), 1);
178 wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
179 wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
180 wrt(fp, &(t->context_num), sizeof(NNID), 1);
181 if (n > 0) {
182 if (t->is24bit) {
183 wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
184 wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
185 } else {
186 wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
187 }
188 wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
189 wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
190 }
191 wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
192 if (t->bo_wt) {
193 i = 1;
194 wrt(fp, &i, sizeof(int), 1);
195 wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
196 } else {
197 i = 0;
198 wrt(fp, &i, sizeof(int), 1);
199 }
200 if (t->nnid2ctid_upper) {
201 i = 1;
202 wrt(fp, &i, sizeof(int), 1);
203 wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
204 wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
205 } else {
206 i = 0;
207 wrt(fp, &i, sizeof(int), 1);
208 }
209
210 }
211
212 /* write additional LR 2-gram */
213 if (ndata->bo_wt_1) {
214 i = 1;
215 wrt(fp, &i, sizeof(int), 1);
216 wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
217 } else {
218 i = 0;
219 wrt(fp, &i, sizeof(int), 1);
220 }
221 if (ndata->p_2) {
222 i = 1;
223 wrt(fp, &i, sizeof(int), 1);
224 wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
225 } else {
226 i = 0;
227 wrt(fp, &i, sizeof(int), 1);
228 }
229
230 len = get_wrt_counter();
231 jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0);
232 return TRUE;
233 }
234