1 /**
2 * @file factoring_sub.c
3 *
4 * <JA>
5 * @brief ���쥹������factoring������1�ѥ���
6 *
7 * ���Υե�����ˤϡ��裱�ѥ��ˤ����Ƹ��쥹������ factoring ��Ԥ������
8 * �ؿ����ޤޤ�Ƥ��ޤ�. �ڹ�¤�������ǤΥ��֥ĥ���ñ��ꥹ��
9 * (successor list) �ι��ۡ������ǧ����θ��쥹�������롼����
10 * �ޤޤ�ޤ�.
11 *
12 * successor list �ϡ��ڹ�¤������γƥΡ��ɤ˳���դ����롤
13 * ���ΥΡ��ɤ�ͭ����ñ��Υꥹ�ȤǤ�. �ڹ�¤������ˤ����ơ�
14 * ����ʬ�μ��ΥΡ��ɤ����Υꥹ�Ȥ��ݻ����ޤ�. �ºݤˤϥꥹ�Ȥ��Ѳ�����
15 * ��ꡤ���ʤ���ڹ�¤������λޤ�ʬ�����˳���դ����ޤ�.
16 * �㤨�С��ʲ��Τ褦���ڹ�¤������ξ�硤�����νƤ���Ρ��ɤ�
17 * successor list ������դ����ޤ�.
18 * <pre>
19 *
20 * 2-o-o - o-o-o - o-o-o word "A"
21 * /
22 * 1-o-o
23 * \ 4-o-o word "B"
24 * \ /
25 * 3-o-o - 5-o-o - 7-o-o word "C"
26 * \ \
27 * \ 8-o-o word "D"
28 * 6-o-o word "E"
29 * </pre>
30 *
31 * �� successor list �Ϥ��Υ��֥ĥ�˴ޤޤ��ñ��Υꥹ�ȤǤ�.
32 * ������Ǥϰʲ��Τ褦�ˤʤ�ޤ�.
33 *
34 * <pre>
35 * node | successor list (wchmm->state[node].sc)
36 * =======================
37 * 1 | A B C D E
38 * 2 | A
39 * 3 | B C D E
40 * 4 | B
41 * 5 | C D
42 * 6 | E
43 * 7 | C
44 * 8 | D
45 * </pre>
46 *
47 * ���� successor list �˴ޤޤ��ñ�줬���Ĥˤʤä��Ȥ������λ�����
48 * ñ�줬���ꤹ��. �嵭�ξ�硤ñ�� "A" �ϥΡ��� 2 �ΰ��֤Ǥ��Ǥ�
49 * ���θ�³ñ��Ȥ��� "A" �ʳ�̵���Τǡ������dz��ꤹ��.
50 * ���ʤ����ñ�� A �����Τʸ��쥹�����ϡ�ñ�콪ü���Ԥ����Ρ��� 2 �Ƿ�ޤ�.
51 *
52 * �裱�ѥ��ˤ����� factoring �η��ϡ��ºݤˤ� beam.c �ǹԤʤ���.
53 * 2-gram factoring�ξ�硤���Ρ��ɤ� successor list ��¸�ߤ����,
54 * ���� successor list ��ñ��� 2-gram �κ����ͤ���, ���¤��Ƥ��Ƥ���
55 * factoring �ͤ�����. successor list ��ñ�줬1�ĤΥΡ��ɤǤϡ�
56 * ������2-gram����ưŪ�˳�����Ƥ���.
57 * 1-gram factoring�ξ�硤���Ρ��ɤ� successor list ��¸�ߤ����硤
58 * ���� successor list ��ñ��� 1-gram �κ����ͤ��ᡤ���¤��Ƥ��Ƥ���
59 * factoring �ͤ�����. successor list ��ñ�줬���ĤΥΡ��ɤǡ��Ϥ����
60 * 2-gram �������.
61 *
62 * �ºݤǤ� 1-gram factoring �Ǥϳ� successor list �ˤ����� factoring ��
63 * ��ñ����������¸�ʤΤǡ�successor list ���ۻ������Ƥ��餫���������
64 * ����. ���ʤ��������ư�����ڹ�¤��������۸塤successor list
65 * ���ۤ����顤ñ���2�İʾ�ޤ� successor list �ˤĤ��ƤϤ��� 1-gram ��
66 * �����ͤ�����ơ�����ΥΡ��ɤ� fscore ���Ф˳�Ǽ���Ƥ���������
67 * successor list �� free ���Ƥ��ޤ��Ф褤. ñ�줬���ĤΤߤ� successor list
68 * �ˤĤ��ƤϤ���ñ��ID��Ĥ��Ƥ�����õ�����˥ѥ�����������ã������
69 * ���Τ�2-gram���������ɤ�.
70 *
71 * DFAʸˡ���ѻ��ϡ��ǥե���ȤǤϸ�������(���ƥ���������)��
72 * ���ƥ���ñ�̤��ڤ��ۤ��뤳�Ȥ���Ū��ɽ������. ���Τ��ᡤ
73 * ������ factoring �������Ѥ����ʤ�. ��������
74 * CATEGORY_TREE �� undefined �Ǥ���С�����Ū factoring ���Ѥ�����������
75 * Ŭ�Ѥ�Ԥ����Ȥ��ǽ�Ǥ���.
76 * ���ʤ�������Ρ��ɤ� successor list ��¸�ߤ����,
77 * ���� successor list ��γ�ñ���ľ��ñ���ñ���������Ĵ��,
78 * ���Τ�����ĤǤ���³��ǽ��ñ�줬����С��������ܤ��������Ĥ�
79 * �ʤ�������ܤ����ʤ�. ���ε�ǽ�ϵ��ѻ��ͤΤ���˻Ĥ���Ƥ���ΤߤǤ���.
80 * </JA>
81 *
82 * <EN>
83 * @brief LM factoring on 1st pass.
84 * </EN>
85 *
86 * This file contains functions to do language score factoring on the 1st
87 * pass. They build a successor lists which holds the successive words in
88 * each sub tree on the tree lexicon, and also provide a factored LM
89 * probability on each nodes on the tree lexicon.
90 *
91 * The "successor list" will be assigned for each lexicon tree node to
92 * represent a list of words that exist in the sub-tree and share the node.
93 * Actually they will be assigned to the branch node.
94 * Below is the example of successor lists on a tree lexicon, in which
95 * the lists is assigned to the numbered nodes.
96 *
97 * <pre>
98 * 2-o-o - o-o-o - o-o-o word "A"
99 * /
100 * 1-o-o
101 * \ 4-o-o word "B"
102 * \ /
103 * 3-o-o - 5-o-o - 7-o-o word "C"
104 * \ \
105 * \ 8-o-o word "D"
106 * 6-o-o word "E"
107 * </pre>
108 *
109 * The contents of the successor lists are the following:
110 *
111 * <pre>
112 * node | successor list (wchmm->state[node].sc)
113 * =======================
114 * 1 | A B C D E
115 * 2 | A
116 * 3 | B C D E
117 * 4 | B
118 * 5 | C D
119 * 6 | E
120 * 7 | C
121 * 8 | D
122 * </pre>
123 *
124 * When the 1st pass proceeds, if the next going node has a successor list,
125 * all the word 2-gram scores in the successor list on the next node
126 * will be computed, and the propagating LM value in the token on
127 * the current node will be replaced by the maximum value of the scores
128 * when copied to the next node. Appearently, if the successor list has
129 * only one word, it means that the word can be determined on that point,
130 * and the precise 2-gram value will be assigned as is.
131 *
132 * When using 1-gram factoring, the computation will be slightly different.
133 * Since the factoring value (maximum value of 1-gram scores on each successor
134 * list) is independent of the word context, they can be computed statically
135 * before the search. Thus, for all the successor lists that have more than
136 * two words, the maximum 1-gram value is computed and stored to
137 * "fscore" member in tree lexicon, and the successor lists will be freed.
138 * The successor lists with only one word should still remain in the
139 * tree lexicon, to compute the precise 2-gram scores for the words.
140 *
141 *
142 * When using DFA grammar, Julian builds separated lexicon trees for every
143 * word categories, to statically express the catergory-pair constraint.
144 * Thus these factoring scheme is not used by default.
145 * However you can still force Julian to use the grammar-based
146 * deterministic factoring scheme by undefining CATEGORY_TREE.
147 * If CATEGORY_TREE is undefined, the word connection constraint will be
148 * performed based on the successor list at the middle of tree lexicon.
149 * This enables single tree search on Julian. This function is left
150 * only for technical reference.
151 *
152 * @author Akinobu LEE
153 * @date Mon Mar 7 23:20:26 2005
154 *
155 * $Revision: 1.3 $
156 *
157 */
158
159 /*
160 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
161 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
162 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
163 * All rights reserved
164 */
165
166 #include <julius/julius.h>
167
168 /*----------------------------------------------------------------------*/
169
170 /**
171 * <JA>
172 * @brief �ڹ�¤�������Τ���Ρ��ɤ� successor list ��ñ����ɲä���.
173 *
174 * ���Ǥ�Ʊ��ñ�줬��Ͽ����Ƥ���С���������Ͽ�Ϥ���ʤ�.
175 * ñ���ID�Ǿ������¸�����.
176 *
177 * @param wchmm [i/o] �ڹ�¤������
178 * @param node [in] �Ρ����ֹ�
179 * @param w [in] ñ��ID
180 * </JA>
181 * <EN>
182 * @brief Add a word to the successor list on a node in tree lexicon.
183 * Words in lists should be ordered by ID.
184 *
185 * @param wchmm [i/o] tree lexicon
186 * @param node [in] node id
187 * @param w [in] word id
188 * </EN>
189 */
190 static void
add_successor(WCHMM_INFO * wchmm,int node,WORD_ID w)191 add_successor(WCHMM_INFO *wchmm, int node, WORD_ID w)
192 {
193 S_CELL *sctmp, *sc;
194
195 /* malloc a new successor list element */
196 sctmp=(S_CELL *) mymalloc(sizeof(S_CELL));
197 /* assign word ID to the new element */
198 sctmp->word = w;
199 /* add the new element to existing list (keeping order) */
200 if (wchmm->state[node].scid == 0) {
201 j_internal_error("add_successor: sclist id not assigned to branch node?\n");
202 }
203 sc = wchmm->sclist[wchmm->state[node].scid];
204 if (sc == NULL || sctmp->word < sc->word) {
205 sctmp->next = sc;
206 wchmm->sclist[wchmm->state[node].scid] = sctmp;
207 } else {
208 for(;sc;sc=sc->next) {
209 if (sc->next == NULL || sctmp->word < (sc->next)->word) {
210 if (sctmp->word == sc->word) break; /* avoid duplication */
211 sctmp->next = sc->next;
212 sc->next = sctmp;
213 break;
214 }
215 }
216 }
217 }
218
219 /**
220 * <JA>
221 * ���ĤΥΡ��ɾ�� successor list �����פ��뤫�ɤ��������å�����
222 *
223 * @param wchmm [in] �ڹ�¤������
224 * @param node1 [in] ���Ĥ�ΥΡ���ID
225 * @param node2 [in] ���Ĥ�ΥΡ���ID
226 *
227 * @return �����˰��פ���� TRUE�����פ��ʤ���� FALSE.
228 * </JA>
229 * <EN>
230 * Check if successor lists on two nodes are the same.
231 *
232 * @param wchmm [in] tree lexicon
233 * @param node1 [in] 1st node id
234 * @param node2 [in] 2nd node id
235 *
236 * @return TRUE if they have the same successor list, or FALSE if they differ.
237 * </EN>
238 */
239 static boolean
match_successor(WCHMM_INFO * wchmm,int node1,int node2)240 match_successor(WCHMM_INFO *wchmm, int node1, int node2)
241 {
242 S_CELL *sc1,*sc2;
243
244 /* assume successor is sorted by ID */
245 if (wchmm->state[node1].scid == 0 || wchmm->state[node2].scid == 0) {
246 j_internal_error("match_successor: sclist id not assigned to branch node?\n");
247 }
248 sc1 = wchmm->sclist[wchmm->state[node1].scid];
249 sc2 = wchmm->sclist[wchmm->state[node2].scid];
250 for (;;) {
251 if (sc1 == NULL || sc2 == NULL) {
252 if (sc1 == NULL && sc2 == NULL) {
253 return TRUE;
254 } else {
255 return FALSE;
256 }
257 } else if (sc1->word != sc2->word) {
258 return FALSE;
259 }
260 sc1 = sc1->next;
261 sc2 = sc2->next;
262 }
263 }
264
265 /**
266 * <JA>
267 * ����Ρ��ɾ�� successor list ����ˤ���.
268 *
269 * @param wchmm [i/o] �ڹ�¤������
270 * @param scid [in] node id
271 * </JA>
272 * <EN>
273 * Free successor list at the node
274 *
275 * @param wchmm [i/o] tree lexicon
276 * @param scid [in] node id
277 * </EN>
278 */
279 static void
free_successor(WCHMM_INFO * wchmm,int scid)280 free_successor(WCHMM_INFO *wchmm, int scid)
281 {
282 S_CELL *sc;
283 S_CELL *sctmp;
284
285 /* free sclist */
286 sc = wchmm->sclist[scid];
287 while (sc != NULL) {
288 sctmp = sc;
289 sc = sc->next;
290 free(sctmp);
291 }
292 }
293
294 /**
295 * <JA>
296 * �ڹ�¤������夫�����ä��줿 successor list �ˤĤ��ơ�
297 * ���μ��Τ������ƥꥹ�Ȥ�ͤ�륬���١������쥯������Ԥ�.
298 *
299 * @param wchmm [i/o] �ڹ�¤������
300 * </JA>
301 * <EN>
302 * Garbage collection of the successor list, by deleting successor lists
303 * to which the link was deleted on the lexicon tree.
304 *
305 * @param wchmm [i/o] tree lexiton
306 * </EN>
307 */
308 static void
compaction_successor(WCHMM_INFO * wchmm)309 compaction_successor(WCHMM_INFO *wchmm)
310 {
311 int src, dst;
312
313 dst = 1;
314 for(src=1;src<wchmm->scnum;src++) {
315 if (wchmm->state[wchmm->sclist2node[src]].scid <= 0) {
316 /* already freed, skip */
317 continue;
318 }
319 if (dst != src) {
320 wchmm->sclist[dst] = wchmm->sclist[src];
321 wchmm->sclist2node[dst] = wchmm->sclist2node[src];
322 wchmm->state[wchmm->sclist2node[dst]].scid = dst;
323 }
324 dst++;
325 }
326 if (debug2_flag) {
327 jlog("DEBUG: successor list shrinked from %d to %d\n", wchmm->scnum, dst);
328 }
329 wchmm->scnum = dst;
330 }
331
332 /**
333 * <JA>
334 * successor list �Ѥ˳���դ���줿�����ΰ��ͭ����Ĺ���˽̤��.
335 * ������ۻ��䡤1-gram factoring �Τ���˺�����줿 successor list ʬ��
336 * ������������.
337 *
338 * @param wchmm [i/o] �ڹ�¤������
339 * </JA>
340 * <EN>
341 * Shrink the memory area that has been allocated for building successor list.
342 *
343 * @param wchmm [i/o] tree lexicon
344 * </EN>
345 */
346 static void
shrink_successor(WCHMM_INFO * wchmm)347 shrink_successor(WCHMM_INFO *wchmm)
348 {
349 if (wchmm->sclist) {
350 wchmm->sclist = (S_CELL **)myrealloc(wchmm->sclist, sizeof(S_CELL *) * wchmm->scnum);
351 }
352 if (wchmm->sclist2node) {
353 wchmm->sclist2node = (int *)myrealloc(wchmm->sclist2node, sizeof(int) * wchmm->scnum);
354 }
355 }
356
357 /**
358 * <JA>
359 * �ڹ�¤�����������Ρ��ɤ� successor list ���ۤ���ᥤ��ؿ�
360 *
361 * @param wchmm [i/o] �ڹ�¤������
362 * </JA>
363 * <EN>
364 * Main function to build whole successor list to lexicon tree.
365 *
366 * @param wchmm [i/o] tree lexicon
367 * </EN>
368 *
369 * @callgraph
370 * @callergraph
371 *
372 */
373 void
make_successor_list(WCHMM_INFO * wchmm)374 make_successor_list(WCHMM_INFO *wchmm)
375 {
376 int node;
377 WORD_ID w;
378 int i;
379 boolean *freemark;
380 int s;
381
382 jlog("STAT: make successor lists for factoring\n");
383
384 /* 1. initialize */
385 /* initialize node->sclist index on wchmm tree */
386 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0;
387
388 /* parse the tree to get the maximum size of successor list */
389 s = 1;
390 for (w=0;w<wchmm->winfo->num;w++) {
391 for (i=0;i<wchmm->winfo->wlen[w];i++) {
392 if (wchmm->state[wchmm->offset[w][i]].scid == 0) {
393 wchmm->state[wchmm->offset[w][i]].scid = s;
394 s++;
395 }
396 }
397 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
398 wchmm->state[wchmm->wordend[w]].scid = s;
399 s++;
400 }
401 }
402 wchmm->scnum = s;
403 if (debug2_flag) {
404 jlog("DEBUG: initial successor list size = %d\n", wchmm->scnum);
405 }
406
407 /* allocate successor list for the maximum size */
408 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum);
409 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL;
410 wchmm->sclist2node = (int *)mymalloc(sizeof(int) * wchmm->scnum);
411
412 /* allocate misc. work area */
413 freemark = (boolean *)mymalloc(sizeof(boolean) * wchmm->scnum);
414 for (i=1;i<wchmm->scnum;i++) freemark[i] = FALSE;
415
416 /* 2. make initial successor list: assign at all possible nodes */
417 for (w=0;w<wchmm->winfo->num;w++) {
418 /* at each start node of phonemes */
419 for (i=0;i<wchmm->winfo->wlen[w];i++) {
420 wchmm->sclist2node[wchmm->state[wchmm->offset[w][i]].scid] = wchmm->offset[w][i];
421 add_successor(wchmm, wchmm->offset[w][i], w);
422 }
423 /* at word end */
424 wchmm->sclist2node[wchmm->state[wchmm->wordend[w]].scid] = wchmm->wordend[w];
425 add_successor(wchmm, wchmm->wordend[w], w);
426 }
427
428 /* 3. erase unnecessary successor list */
429 /* sucessor list same as the previous node is not needed, so */
430 /* parse lexicon tree from every leaf to find the same succesor list */
431 for (w=0;w<wchmm->winfo->num;w++) {
432 node = wchmm->wordend[w]; /* begin from the word end node */
433 i = wchmm->winfo->wlen[w]-1;
434 while (i >= 0) { /* for each phoneme start node */
435 if (node == wchmm->offset[w][i]) {
436 /* word with only 1 state: skip */
437 i--;
438 continue;
439 }
440 if (match_successor(wchmm, node, wchmm->offset[w][i])) {
441 freemark[wchmm->state[node].scid] = TRUE; /* mark the node */
442 }
443 /*
444 * if (freemark[wchmm->offset[w][i]] != FALSE) {
445 * break;
446 * }
447 */
448 node = wchmm->offset[w][i];
449 i--;
450 }
451 }
452 /* really free */
453 for (i=1;i<wchmm->scnum;i++) {
454 if (freemark[i] == TRUE) {
455 free_successor(wchmm, i);
456 /* reset node -> sclist link */
457 wchmm->state[wchmm->sclist2node[i]].scid = 0;
458 }
459 }
460 /* garbage collection of deleted sclist */
461 compaction_successor(wchmm);
462
463 free(freemark);
464
465 jlog("STAT: done\n");
466 }
467
468 #ifdef UNIGRAM_FACTORING
469
470 /**
471 * <JA>
472 * �ڹ�¤�����������Ρ��ɤ� successor list ���ۤ���ᥤ��ؿ�(unigram factoring ��
473 *
474 * @param wchmm [i/o] �ڹ�¤������
475 * </JA>
476 * <EN>
477 * Main function to build whole successor list to lexicon tree for unigram factoring
478 *
479 * @param wchmm [i/o] tree lexicon
480 * </EN>
481 *
482 * @callgraph
483 * @callergraph
484 *
485 */
486 void
make_successor_list_unigram_factoring(WCHMM_INFO * wchmm)487 make_successor_list_unigram_factoring(WCHMM_INFO *wchmm)
488 {
489
490 #ifndef FAST_FACTOR1_SUCCESSOR_LIST
491
492 /* old way */
493 make_successor_list(wchmm);
494 calc_all_unigram_factoring_values(wchmm);
495
496 #else /* ~FAST_FACTOR1_SUCCESSOR_LIST */
497
498 /* new way */
499
500 int node, node2;
501 WORD_ID w, w2;
502 int i, j, n, f;
503 int s;
504 LOGPROB tmpprob;
505
506 jlog("STAT: make successor lists for unigram factoring\n");
507
508 /* 1. initialize */
509 /* initialize node->sclist index on wchmm tree */
510 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0;
511
512 /* in unigram factoring, number of successor = vocabulary size */
513 wchmm->scnum = wchmm->winfo->num + 1;
514 if (debug2_flag) {
515 jlog("DEBUG: successor list size = %d\n", wchmm->scnum);
516 }
517
518 /* allocate successor list */
519 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum);
520 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL;
521 /* sclist2node is not used */
522
523 /* 2. make successor list, and count needed fscore num */
524 f = 1;
525 s = 1;
526 for (w=0;w<wchmm->winfo->num;w++) {
527 for (i=0;i<wchmm->winfo->wlen[w] + 1;i++) {
528 if (i < wchmm->winfo->wlen[w]) {
529 node = wchmm->offset[w][i];
530 } else {
531 node = wchmm->wordend[w];
532 }
533 if (wchmm->state[node].scid == 0) { /* not assigned */
534 /* new node found, assign new and exit here */
535 wchmm->state[node].scid = s++;
536 if (s > wchmm->scnum) {
537 jlog("InternalError: make_successor_list_unigram_factoring: scid num exceeded?\n");
538 return;
539 }
540 add_successor(wchmm, node, w);
541 break;
542 } else if (wchmm->state[node].scid > 0) {
543 /* that node has sclist */
544 /* move it to the current first isolated node in that word */
545 w2 = wchmm->sclist[wchmm->state[node].scid]->word;
546 for(j=i+1;j<wchmm->winfo->wlen[w2] + 1;j++) {
547 if (j < wchmm->winfo->wlen[w2]) {
548 node2 = wchmm->offset[w2][j];
549 } else {
550 node2 = wchmm->wordend[w2];
551 }
552 if (wchmm->state[node2].scid == 0) { /* not assigned */
553 /* move sclist to there */
554 wchmm->state[node2].scid = wchmm->state[node].scid;
555 break;
556 }
557 }
558 if (j >= wchmm->winfo->wlen[w2] + 1) {
559 /* not found? */
560 jlog("InternalError: make_successor_list_unigram_factoring: no isolated word for %d\n", w2);
561 return;
562 }
563 /* make current node as fscore node */
564 n = f++;
565 wchmm->state[node].scid = -n;
566 /* not compute unigram factoring value yet */
567 }
568
569 }
570 }
571
572 /* 2. allocate fscore buffer */
573 wchmm->fsnum = f;
574 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum);
575 for(n=0;n<wchmm->fsnum;n++) wchmm->fscore[n] = LOG_ZERO;
576
577 /* 3. parse again to assign fscore values */
578 for (w=0;w<wchmm->winfo->num;w++) {
579 for (i=0;i<wchmm->winfo->wlen[w] + 1;i++) {
580 if (i < wchmm->winfo->wlen[w]) {
581 node = wchmm->offset[w][i];
582 } else {
583 node = wchmm->wordend[w];
584 }
585 if (wchmm->state[node].scid < 0) {
586 /* update max */
587 if (wchmm->ngram) {
588 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[w])
589 #ifdef CLASS_NGRAM
590 + wchmm->winfo->cprob[w]
591 #endif
592 ;
593 } else {
594 tmpprob = LOG_ZERO;
595 }
596 if (wchmm->lmvar == LM_NGRAM_USER) {
597 tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, w, tmpprob);
598 }
599 n = - wchmm->state[node].scid;
600 if (wchmm->fscore[n] < tmpprob) {
601 wchmm->fscore[n] = tmpprob;
602 }
603 }
604
605 }
606 }
607
608 #endif /* ~FAST_FACTOR1_SUCCESSOR_LIST */
609
610 jlog("STAT: done\n");
611 }
612
613 #endif /* UNIGRAM_FACTORING */
614
615
616 /**
617 * <JA>
618 * ���ۤ��줿 factoring ����� multipath �Ѥ�Ĵ������. factoring �����,
619 * ��ǥ����Τ��åפ������ܤ�������Ϥ�����β��Ǥإ��ԡ�����.
620 * �ޤ���(���Ϥ�����ʤ�)ʸƬʸˡ�Ρ��ɤ�ñ����Ƭ�Ρ��ɤ��饳�ԡ�����.
621 *
622 * @param wchmm [in] �ڹ�¤������
623 * </JA>
624 * <EN>
625 * Adjust factoring data in tree lexicon for multipath transition handling.
626 *
627 * @param wchmm [in] tree lexicon
628 * </EN>
629 *
630 * @callgraph
631 * @callergraph
632 *
633 */
634 void
adjust_sc_index(WCHMM_INFO * wchmm)635 adjust_sc_index(WCHMM_INFO *wchmm)
636 {
637 WORD_ID w;
638 int i,j,k;
639 HMM_Logical *ltmp;
640 int ltmp_state_num;
641 int ato;
642 LOGPROB prob;
643 int node, scid;
644 A_CELL2 *ac;
645
646 /* duplicate scid for HMMs with more than one arc from initial state */
647 for(w=0;w<wchmm->winfo->num;w++) {
648 for(k=0;k<wchmm->winfo->wlen[w];k++) {
649 node = wchmm->offset[w][k];
650 scid = wchmm->state[node].scid;
651 if (scid == 0) continue;
652 ltmp = wchmm->winfo->wseq[w][k];
653 ltmp_state_num = hmm_logical_state_num(ltmp);
654 if ((hmm_logical_trans(ltmp))->a[0][ltmp_state_num-1] != LOG_ZERO) {
655 j = k + 1;
656 if (j == wchmm->winfo->wlen[w]) {
657 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
658 jlog("STAT: word %d: factoring node copied for skip phone\n", w);
659 wchmm->state[wchmm->wordend[w]].scid = scid;
660 }
661 } else {
662 if (wchmm->state[wchmm->offset[w][j]].scid == 0) {
663 jlog("STAT: word %d: factoring node copied for skip phone\n", w);
664 wchmm->state[wchmm->offset[w][j]].scid = scid;
665 }
666 }
667 }
668 for(ato=1;ato<ltmp_state_num;ato++) {
669 prob = (hmm_logical_trans(ltmp))->a[0][ato];
670 if (prob != LOG_ZERO) {
671 wchmm->state[node+ato-1].scid = scid;
672 }
673 }
674 }
675 }
676
677 /* move scid and fscore on the head state to the head grammar state */
678 for(i=0;i<wchmm->startnum;i++) {
679 node = wchmm->startnode[i];
680 if (wchmm->state[node].out.state != NULL) {
681 j_internal_error("adjust_sc_index: outprob exist in word-head node??\n");
682 }
683 if (wchmm->next_a[node] != LOG_ZERO) {
684 if (wchmm->state[node+1].scid != 0) {
685 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[node+1].scid) {
686 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n");
687 }
688 wchmm->state[node].scid = wchmm->state[node+1].scid;
689 wchmm->state[node+1].scid = 0;
690 }
691 }
692 for(ac=wchmm->ac[node];ac;ac=ac->next) {
693 for(k=0;k<ac->n;k++) {
694 if (wchmm->state[ac->arc[k]].scid != 0) {
695 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[ac->arc[k]].scid) {
696 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n");
697 }
698 wchmm->state[node].scid = wchmm->state[ac->arc[k]].scid;
699 wchmm->state[ac->arc[k]].scid = 0;
700 }
701 }
702 }
703 }
704 }
705
706
707 /* -------------------------------------------------------------------- */
708 /* factoring computation */
709
710 /**
711 * <JA>
712 * �ڹ�¤�������Ѥ� factoring ����å����������դ����ƽ��������.
713 * ���δؿ��ϥץ���೫�ϻ��˰��٤����ƤФ��.
714 *
715 * @param wchmm [i/o] �ڹ�¤������
716 * </JA>
717 * <EN>
718 * Initialize factoring cache for a tree lexicon, allocating memory for
719 * cache. This should be called only once on start up.
720 *
721 * @param wchmm [i/o] tree lexicon
722 * </EN>
723 *
724 * @callgraph
725 * @callergraph
726 *
727 */
728 void
max_successor_cache_init(WCHMM_INFO * wchmm)729 max_successor_cache_init(WCHMM_INFO *wchmm)
730 {
731 int i;
732 LM_PROB_CACHE *l;
733 WORD_ID wnum;
734
735 /* finally shrink the memory area of successor list here */
736 shrink_successor(wchmm);
737
738 /* for word-internal */
739 l = &(wchmm->lmcache);
740
741 l->probcache = (LOGPROB *) mymalloc(sizeof(LOGPROB) * wchmm->scnum);
742 l->lastwcache = (WORD_ID *) mymalloc(sizeof(WORD_ID) * wchmm->scnum);
743 for (i=0;i<wchmm->scnum;i++) {
744 l->lastwcache[i] = WORD_INVALID;
745 }
746 /* for cross-word */
747 if (wchmm->ngram) {
748 wnum = wchmm->ngram->max_word_num;
749 } else {
750 wnum = wchmm->winfo->num;
751 }
752 #ifdef HASH_CACHE_IW
753 l->iw_cache_num = wnum * jconf.search.pass1.iw_cache_rate / 100;
754 if (l->iw_cache_num < 10) l->iw_cache_num = 10;
755 #else
756 l->iw_cache_num = wnum;
757 #endif /* HASH_CACHE_IW */
758 l->iw_sc_cache = (LOGPROB **)mymalloc(sizeof(LOGPROB *) * l->iw_cache_num);
759 for (i=0;i<l->iw_cache_num;i++) {
760 l->iw_sc_cache[i] = NULL;
761 }
762 #ifdef HASH_CACHE_IW
763 l->iw_lw_cache = (WORD_ID *)mymalloc(sizeof(WORD_ID) * l->iw_cache_num);
764 for (i=0;i<l->iw_cache_num;i++) {
765 l->iw_lw_cache[i] = WORD_INVALID;
766 }
767 #endif
768 }
769
770 /**
771 * <JA>
772 * ñ��֤� factoring cache �Υ����ΰ���������.
773 *
774 * @param wchmm [i/o] �ڹ�¤������
775 * </JA>
776 * <EN>
777 * Free cross-word factoring cache.
778 *
779 * @param wchmm [i/o] tree lexicon
780 * </EN>
781 */
782 static void
max_successor_prob_iw_free(WCHMM_INFO * wchmm)783 max_successor_prob_iw_free(WCHMM_INFO *wchmm)
784 {
785 int i;
786 LM_PROB_CACHE *l;
787 l = &(wchmm->lmcache);
788 for (i=0;i<l->iw_cache_num;i++) {
789 if (l->iw_sc_cache[i] != NULL) free(l->iw_sc_cache[i]);
790 l->iw_sc_cache[i] = NULL;
791 }
792 }
793
794 /**
795 * <JA>
796 * factoring �� cache �Υ����ΰ�����Ʋ�������.
797 *
798 * @param wchmm [i/o] �ڹ�¤������
799 * </JA>
800 * <EN>
801 * Free all memory for factoring cache.
802 *
803 * @param wchmm [i/o] tree lexicon
804 * </EN>
805 *
806 * @callgraph
807 * @callergraph
808 *
809 */
810 void
max_successor_cache_free(WCHMM_INFO * wchmm)811 max_successor_cache_free(WCHMM_INFO *wchmm)
812 {
813 free(wchmm->lmcache.probcache);
814 free(wchmm->lmcache.lastwcache);
815 max_successor_prob_iw_free(wchmm);
816 free(wchmm->lmcache.iw_sc_cache);
817 #ifdef HASH_CACHE_IW
818 free(wchmm->lmcache.iw_lw_cache);
819 #endif
820 }
821
822 #ifdef UNIGRAM_FACTORING
823
824 /**
825 * <JA>
826 * @brief ñ����Ƭ�Ρ��ɤΤ���Factoring �ˤ����ƥ���å��夬ɬ�פʥΡ��ɤ�
827 * �ꥹ�Ȥ��������.
828 *
829 * 1-gram factoring �ϡ��ޥΡ��ɤˤ�����ľ��ñ��˰�¸���ʤ�������
830 * (unigram�κ�����)��Ϳ����. ���Τ��ᡤñ��֤� factoring ���ˤ����ơ�
831 * �ڹ�¤��������ʣ����ñ��Ƕ�ͭ����Ƥ���ñ����Ƭ�Ρ��ɤˤĤ��Ƥϡ�
832 * �����ͤ�ľ��ñ��ˤ�餺�����ͤǤ��ꡤǧ������ñ��֥���å�����ݻ�
833 * ����ɬ�פϤʤ�.
834 *
835 * ���δؿ��Ǥϡ�ñ����Ƭ�Ρ��ɤΥꥹ�Ȥ��餽�Τ褦�� factoring ����å��夬
836 * ���פʥΡ��ɤ�������ơ�1-gram factoring ����ñ��֥���å��夬ɬ�פ�
837 * ñ����Ƭ�Ρ��ɡʡ�¾��ñ��ȶ�ͭ����Ƥ��ʤ���Ω����ñ����Ƭ�Ρ��ɡˤ�
838 * �ꥹ�Ȥ��������wchmm->start2isolate ����� wchmm->isolatenum �˳�Ǽ����.
839 *
840 * @param wchmm [i/o] �ڹ�¤������
841 * </JA>
842 * <EN>
843 * @brief Make a list of word head nodes on which cross-word factoring cache
844 * is needed.
845 *
846 * On 1-gram factoring, the branch nodes on tree lexicon has a fixed
847 * factoring value (maximum 1-gram score of all sub-tree words). Thus, when
848 * computing cross-word factoring at word head nodes on inter-word
849 * transition, such 1-gram factoring nodes on word head, shared by several
850 * words, need not be cached in inter-word factoring cache.
851 *
852 * This function make a list of word-head nodes which requires inter-word
853 * factoring caching (i.e. isolated word head nodes, does not shared by other
854 * words) from the existing list of word head nodes, and set it to
855 * wchmm->start2isolate and wchmm->isolatenum.
856 *
857 * @param wchmm [i/o] tree lexicon
858 * </EN>
859 *
860 * @callgraph
861 * @callergraph
862 *
863 */
864 void
make_iwcache_index(WCHMM_INFO * wchmm)865 make_iwcache_index(WCHMM_INFO *wchmm)
866 {
867 int i, node, num;
868
869 wchmm->start2isolate = (int *)mymalloc(sizeof(int) * wchmm->startnum);
870 num = 0;
871 for(i=0;i<wchmm->startnum;i++) {
872 node = wchmm->startnode[i];
873 if (wchmm->state[node].scid >= 0) { /* not a factoring node (isolated node, has no 1-gram factoring value) */
874 wchmm->start2isolate[i] = num;
875 num++;
876 } else { /* factoring node (shared) */
877 wchmm->start2isolate[i] = -1;
878 }
879 }
880 wchmm->isolatenum = num;
881 }
882
883 /**
884 * <JA>
885 * @brief �ڹ�¤�������� 1-gram factoring �ͤ�����Ƴ�Ǽ����.
886 *
887 * 1-gram factoring �Ǥ�ñ��֤Ƕ�ͭ����Ƥ���ޥΡ��ɤǤ� 1-gram �κ�����
888 * ��Ϳ����. ñ������ˤ��ʤ����ᡤ�����ͤ�ǧ����������
889 * �����Ƥ������Ȥ��Ǥ���. ���δؿ����ڹ�¤������
890 * ���ΤˤĤ��ơ���ͭ����Ƥ����successor list �ˣ��İʾ��ñ�����ĥΡ��ɡ�
891 * �Ρ��ɤ� 1-gram factoring �ͤ�����Ƴ�Ǽ����. 1-gram factoring�ͤ�
892 * ����ϡ����ΥΡ��ɤ� successor list �Ϥ�Ϥ����פǤ��뤿�ᡤ������
893 * �������.
894 *
895 * �ºݤˤϡ�factoring �ͤ� wchmm->fscore �˽缡��¸���졤�Ρ��ɤ�
896 * scid �ˤ�����¸�ͤؤΥ���ǥå���(1-)������ͤ���Ǽ�����. ���פˤʤä�
897 * successor list �ϡ��ºݤˤ� compaction_successor ��ǡ��б�����Ρ��ɤ�
898 * scid ����ˤʤäƤ��� successor list �������뤳�ȤǹԤʤ���.
899 *
900 * @param wchmm [i/o] �ڹ�¤������
901 * </JA>
902 * <EN>
903 * @brief Calculate all the 1-gram factoring values on tree lexicon.
904 *
905 * On 1-gram factoring, the shared nodes on branch has fixed factoring score
906 * from 1-gram values, independent of the word context on recognition. So
907 * the values are fixed for all recognition and can be calculated before
908 * search. This function stores all the neede 1-gram factoring value by
909 * traversing tree lexicon with successor lists and compute maximum 1-gram
910 * for each successor lists that has more than two words (=shared).
911 * Since a successor list is no more neede after the 1-gram value is computed,
912 * they will be freed.
913 *
914 * Actually, computed factoring scores will be stored in wchmm->fscore
915 * sequencially, and the index value, starting from 1,
916 * to the fscore list is stored in scid of each nodes as a negative value.
917 * The free will be performed in compaction_successor() by checking if a
918 * successor's corresponding scid on tree lexicon has negative value.
919 *
920 * @param wchmm [i/o] tree lexicon
921 * </EN>
922 *
923 * @callgraph
924 * @callergraph
925 *
926 */
927 void
calc_all_unigram_factoring_values(WCHMM_INFO * wchmm)928 calc_all_unigram_factoring_values(WCHMM_INFO *wchmm)
929 {
930 S_CELL *sc, *sctmp;
931 LOGPROB tmpprob, maxprob;
932 int i, n;
933
934 /* count needed number of 1-gram factoring nodes */
935 n = 0;
936 for (i=1;i<wchmm->scnum;i++) {
937 sc = wchmm->sclist[i];
938 if (sc == NULL) {
939 j_internal_error("call_all_unigram_factoring_values: sclist has no sc?\n");
940 }
941 if (sc->next != NULL) {
942 /* more than two words, so compute maximum 1-gram probability */
943 n++;
944 }
945 }
946 wchmm->fsnum = n + 1;
947 /* allocate area */
948 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum);
949 /* assign values */
950 n = 1;
951 for (i=1;i<wchmm->scnum;i++) {
952 sc = wchmm->sclist[i];
953 if (sc->next != NULL) {
954 maxprob = LOG_ZERO;
955 for (sctmp = sc; sctmp; sctmp = sctmp->next) {
956 if (wchmm->ngram) {
957 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[sctmp->word])
958 #ifdef CLASS_NGRAM
959 + wchmm->winfo->cprob[sctmp->word]
960 #endif
961 ;
962 } else {
963 tmpprob = LOG_ZERO;
964 }
965 if (wchmm->lmvar == LM_NGRAM_USER) {
966 tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, sctmp->word, tmpprob);
967 }
968 if (maxprob < tmpprob) maxprob = tmpprob;
969 }
970 wchmm->fscore[n] = maxprob;
971 free_successor(wchmm, i);
972 wchmm->state[wchmm->sclist2node[i]].scid = - n;
973 n++;
974 }
975 }
976 /* garbage collection of factored sclist */
977 compaction_successor(wchmm);
978 }
979
980 #else /* ~UNIGRAM_FACTORING */
981
982 /**
983 * <JA>
984 * �ڹ�¤�������Τ���Ρ��ɤˤĤ��ơ�Ϳ����줿ñ��������Ф���2-gram
985 * �������������.
986 *
987 * @param wchmm [in] �ڹ�¤������
988 * @param lastword [in] ľ��ñ��
989 * @param node [in] @a wchmm ��ΥΡ����ֹ�
990 *
991 * @return 2-gram ��Ψ.
992 * </JA>
993 * <EN>
994 * Compute 2-gram factoring value for the node and return the probability.
995 *
996 * @param wchmm [in] tree lexicon
997 * @param lastword [in] the last context word
998 * @param node [in] node ID on @a wchmm
999 *
1000 * @return the log probability of 2-gram on that node.
1001 * </EN>
1002 *
1003 */
1004 static LOGPROB
calc_successor_prob(WCHMM_INFO * wchmm,WORD_ID lastword,int node)1005 calc_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
1006 {
1007 S_CELL *sc;
1008 LOGPROB tmpprob, maxprob;
1009 WORD_ID lw;
1010
1011 maxprob = LOG_ZERO;
1012 if (wchmm->ngram) {
1013 lw = wchmm->winfo->wton[lastword];
1014 }
1015
1016 for (sc = wchmm->sclist[wchmm->state[node].scid]; sc; sc = sc->next) {
1017 if (wchmm->ngram) {
1018 tmpprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, lw , wchmm->winfo->wton[sc->word])
1019 #ifdef CLASS_NGRAM
1020 + wchmm->winfo->cprob[sc->word]
1021 #endif
1022 ;
1023 } else {
1024 tmpprob = LOG_ZERO;
1025 }
1026 if (wchmm->lmvar == LM_NGRAM_USER) {
1027 tmpprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, sc->word, tmpprob);
1028 }
1029 if (maxprob < tmpprob) maxprob = tmpprob;
1030 }
1031
1032 return(maxprob);
1033 }
1034
1035 #endif /* ~UNIGRAM_FACTORING */
1036
1037 /**
1038 * <JA>
1039 * @brief ñ����Τ���Ρ��ɤˤĤ��� factoring �ͤ������.
1040 *
1041 * 1-gram factoring �Ǹ���factoring�ͤ�������Ϥ����ͤ�¨�¤��֤����.
1042 * ¾�ξ��ϡ����ΥΡ��ɤΥ��֥ĥ���ñ��� 2-gram��Ψ�ʤκ����͡ˤ�
1043 * �������.
1044 *
1045 * ñ���� factoring ����å��夬��θ�����. ���ʤ���ƥΡ��ɤˤĤ���
1046 * ľ��ñ�줬�����������줿�Ȥ���Ʊ���Ǥ���С�
1047 * ������ͤ��֤��졤�����Ǥʤ�����ͤ����������å��夬���������.
1048 *
1049 * @param wchmm [in] �ڹ�¤������
1050 * @param lastword [in] ľ��ñ���ID
1051 * @param node [in] �Ρ����ֹ�
1052 *
1053 * @return �����ǥ륹����
1054 * </JA>
1055 * <EN>
1056 * @brief compute factoring LM score for the given word-internal node.
1057 *
1058 * If it is a shared branch node and 1-gram factoring is used, the
1059 * constant factoring value which has already been assigned before search
1060 * will be returned immediately. Else, the maximum 2-gram probability
1061 * of corresponding successor words are computed.
1062 *
1063 * The word-internal factoring cache is consulted within this function.
1064 * If the given last word is the same as the last call on that node,
1065 * the last computed value will be returned, else the maximum value
1066 * will be computed update the cache with the last word and value.
1067 *
1068 * @param wchmm [in] tree lexicon
1069 * @param lastword [in] word ID of last context word
1070 * @param node [in] node ID
1071 *
1072 * @return the LM factoring score.
1073 * </EN>
1074 *
1075 * @callgraph
1076 * @callergraph
1077 *
1078 */
1079 LOGPROB
max_successor_prob(WCHMM_INFO * wchmm,WORD_ID lastword,int node)1080 max_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
1081 {
1082 LOGPROB maxprob;
1083 WORD_ID last_nword, w;
1084 int scid;
1085 LM_PROB_CACHE *l;
1086
1087 l = &(wchmm->lmcache);
1088
1089 if (lastword != WORD_INVALID) { /* return nothing if no previous word */
1090 if (wchmm->ngram) {
1091 last_nword = wchmm->winfo->wton[lastword];
1092 } else {
1093 last_nword = lastword;
1094 }
1095 scid = wchmm->state[node].scid;
1096 #ifdef UNIGRAM_FACTORING
1097 if (scid < 0) {
1098 /* return 1-gram factoring value already calced */
1099 return(wchmm->fscore[(- scid)]);
1100 } else {
1101 /* this node has only one successor */
1102 /* return precise 2-gram score */
1103 if (last_nword != l->lastwcache[scid]) {
1104 /* calc and cache */
1105 w = (wchmm->sclist[scid])->word;
1106 if (wchmm->ngram) {
1107 maxprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])
1108 #ifdef CLASS_NGRAM
1109 + wchmm->winfo->cprob[w]
1110 #endif
1111 ;
1112 } else {
1113 maxprob = LOG_ZERO;
1114 }
1115 if (wchmm->lmvar == LM_NGRAM_USER) {
1116 maxprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, maxprob);
1117 }
1118 l->lastwcache[scid] = last_nword;
1119 l->probcache[scid] = maxprob;
1120 return(maxprob);
1121 } else {
1122 /* return cached */
1123 return (l->probcache[scid]);
1124 }
1125 }
1126 #else /* UNIGRAM_FACTORING */
1127 /* 2-gram */
1128 if (last_nword != l->lastwcache[scid]) {
1129 maxprob = calc_successor_prob(wchmm, lastword, node);
1130 /* store to cache */
1131 l->lastwcache[scid] = last_nword;
1132 l->probcache[scid] = maxprob;
1133 return(maxprob);
1134 } else {
1135 return (l->probcache[scid]);
1136 }
1137 #endif /* UNIGRAM_FACTORING */
1138 } else {
1139 return(0.0);
1140 #if 0
1141 maxprob = LOG_ZERO;
1142 for (sc=wchmm->state[node].sc;sc;sc=sc->next) {
1143 tmpprob = uni_prob(wchmm->ngram, sc->word);
1144 if (maxprob < tmpprob) maxprob = tmpprob;
1145 }
1146 return(maxprob);
1147 #endif
1148 }
1149
1150 }
1151
1152 /**
1153 * <JA>
1154 * @brief ñ��֤� factoring �ͤΥꥹ�Ȥ��֤�.
1155 *
1156 * Ϳ����줿ľ��ñ����Ф��ơ�factoring�ͤ�����٤����Ƥ�ñ����Ƭ�ؤ�
1157 * factoring �ͤ���������Υꥹ�Ȥ��֤�. ����factoring�ͤ�
1158 * ľ��ñ�줴�Ȥ˥ꥹ��ñ�̤ǥ���å��夵���. ���ʤ��������ľ��ñ�줬
1159 * ����ޤǤ˰��٤Ǥ�ľ��ñ��Ȥ��ƽи����Ƥ�����硤���Υꥹ�ȤΤޤ�
1160 * �֤�.
1161 *
1162 * @param wchmm [in] �ڹ�¤������
1163 * @param lastword [in] ľ��ñ��
1164 *
1165 * @return ��ñ����Ƭ�Ρ��ɤؤ� factoring �������Υꥹ��
1166 * </JA>
1167 * <EN>
1168 * @brief Compute cross-word facgtoring values for word head nodes and return
1169 * the list.
1170 *
1171 * Given a last word, this function compute the factoring LM scores for all
1172 * the word head node to which the context-dependent (not 1-gram) factoring
1173 * values should be computed. The resulting list of factoring values are
1174 * cached within this function per the last word.
1175 *
1176 * @param wchmm [in] tree lexicon
1177 * @param lastword [in] last word
1178 *
1179 * @return the list of factoring LM scores for all the needed word-head nodes.
1180 * </EN>
1181 *
1182 * @callgraph
1183 * @callergraph
1184 *
1185 */
1186 LOGPROB *
max_successor_prob_iw(WCHMM_INFO * wchmm,WORD_ID lastword)1187 max_successor_prob_iw(WCHMM_INFO *wchmm, WORD_ID lastword)
1188 {
1189 int i, j, x, node;
1190 int last_nword;
1191 WORD_ID w;
1192 LM_PROB_CACHE *l;
1193 LOGPROB p;
1194
1195 l = &(wchmm->lmcache);
1196
1197 if (wchmm->ngram) {
1198 last_nword = wchmm->winfo->wton[lastword];
1199 } else {
1200 last_nword = lastword;
1201 }
1202
1203 #ifdef HASH_CACHE_IW
1204 x = last_nword % l->iw_cache_num;
1205 if (l->iw_lw_cache[x] == last_nword) { /* cache hit */
1206 return(l->iw_sc_cache[x]);
1207 }
1208 #else /* full cache */
1209 if (l->iw_sc_cache[last_nword] != NULL) { /* cache hit */
1210 return(l->iw_sc_cache[last_nword]);
1211 }
1212 x = last_nword;
1213 /* cache mis-hit, calc probs and cache them as new */
1214 #endif
1215 /* allocate cache memory */
1216 if (l->iw_sc_cache[x] == NULL) {
1217 #ifdef UNIGRAM_FACTORING
1218 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
1219 #else
1220 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
1221 #endif
1222 if (l->iw_sc_cache[x] == NULL) { /* malloc failed */
1223 /* clear existing cache, and retry */
1224 max_successor_prob_iw_free(wchmm);
1225 jlog("STAT: inter-word LM cache (%dMB) rehashed\n",
1226 (l->iw_cache_num *
1227 #ifdef UNIGRAM_FACTORING
1228 wchmm->isolatenum
1229 #else
1230 wchmm->startnum
1231 #endif
1232 ) / 1000 * sizeof(LOGPROB) / 1000);
1233 #ifdef UNIGRAM_FACTORING
1234 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
1235 #else
1236 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
1237 #endif
1238 if (l->iw_sc_cache[x] == NULL) { /* malloc failed again? */
1239 j_internal_error("max_successor_prob_iw: cannot malloc\n");
1240 }
1241 }
1242 }
1243
1244 /* calc prob for all startid */
1245 #ifdef UNIGRAM_FACTORING
1246 for (j=0;j<wchmm->startnum;j++) {
1247 i = wchmm->start2isolate[j];
1248 if (i == -1) continue;
1249 node = wchmm->startnode[j];
1250 if (wchmm->state[node].scid <= 0) {
1251 /* should not happen!!! below is just for debugging */
1252 j_internal_error("max_successor_prob_iw: isolated (not shared) tree root node has unigram factoring value??\n");
1253 } else {
1254 w = (wchmm->sclist[wchmm->state[node].scid])->word;
1255 if (wchmm->ngram) {
1256 p = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w])
1257 #ifdef CLASS_NGRAM
1258 + wchmm->winfo->cprob[w]
1259 #endif
1260 ;
1261 } else {
1262 p = LOG_ZERO;
1263 }
1264 if (wchmm->lmvar == LM_NGRAM_USER) {
1265 p = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, p);
1266 }
1267 l->iw_sc_cache[x][i] = p;
1268 }
1269 }
1270 #else /* ~UNIGRAM_FACTORING */
1271 for (i=0;i<wchmm->startnum;i++) {
1272 node = wchmm->startnode[i];
1273 l->iw_sc_cache[x][i] = calc_successor_prob(wchmm, lastword, node);
1274 }
1275 #endif
1276 #ifdef HASH_CACHE_IW
1277 l->iw_lw_cache[x] = last_nword;
1278 #endif
1279
1280 return(l->iw_sc_cache[x]);
1281 }
1282
1283 /**
1284 * <JA>
1285 * @brief ʸˡ�ˤ��ñ�������Ū factoring
1286 *
1287 * Julian �ˤ����� CATEGORY_TREE ���������Ƥ���Ȥ��ʥǥե���ȡˡ�
1288 * �ڹ�¤������ϥ��ƥ���ñ�̡ʤ��ʤ����ʸ����ε���ñ�̡ˤǹ��ۤ���뤿�ᡤ
1289 * ��1�ѥ��Ǥθ����ǥ�Ǥ��륫�ƥ����������ñ��λϽ�ü��Ŭ�ѤǤ���.
1290 *
1291 * ���� CATEGORY_TREE ���������Ƥ��ʤ���硤�ڹ�¤�������
1292 * �������Τ�ñ����ڤ�����뤿�ᡤ���ƥ���������� N-gram (Julius) ��
1293 * Ʊ�ͤ�ñ����� factoring ��Ʊ�ͤε�����Ŭ�Ѥ����ɬ�פ�����.
1294 *
1295 * ���δؿ��� CATEGORY_TREE ���������Ƥ��ʤ��Ȥ��ˡ��嵭�� factoring
1296 * �ʷ���Ū factoring �ȸƤФ��ˤ�Ԥʤ����������Ƥ���.
1297 *
1298 * @param wchmm [in] �ڹ�¤������
1299 * @param lastword [in] ľ��ñ��
1300 * @param node [in] �Ρ����ֹ�
1301 *
1302 * @return ���ƥ�������夽�λޤؤ����ܤ��������� TRUE, �Բ�ǽ�Ǥ���� FALSE
1303 * </JA>
1304 * <EN>
1305 * @brief Deterministic factoring for grammar-based recognition (Julian)
1306 *
1307 * If CATEGORY_TREE is defined (this is default) on Julian, the tree lexicon
1308 * will be organized per category and the category-pair constraint used
1309 * in the 1st pass can be applied statically at cross-word transition.
1310 *
1311 * If the CATEGORY_TREE is not defined, a single tree lexicon will be
1312 * constucted for a whole dictionary. In this case, the category-pair
1313 * constraint should be applied dynamically in the word-internal transition,
1314 * like the factoring scheme with N-gram (Julius).
1315 *
1316 * This function provides such word-internal factoring for grammar-based
1317 * recognition (called deterministic factoring) when CATEGORY_TREE is
1318 * undefined in Julian.
1319 *
1320 * @param wchmm [in] tree lexicon
1321 * @param lastword [in] last word
1322 * @param node [in] node ID to check the constraint
1323 *
1324 * @return TRUE if the transition to the branch is allowed on the category-pair
1325 * constraint, or FALSE if not allowed.
1326 * </EN>
1327 *
1328 * @callgraph
1329 * @callergraph
1330 *
1331 */
1332 boolean
can_succeed(WCHMM_INFO * wchmm,WORD_ID lastword,int node)1333 can_succeed(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
1334 {
1335 int lc;
1336 S_CELL *sc;
1337
1338 /* return TRUE if at least one subtree word can connect */
1339
1340 if (lastword == WORD_INVALID) { /* case at beginning-of-word */
1341 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
1342 if (dfa_cp_begin(wchmm->dfa, sc->word) == TRUE) return(TRUE);
1343 }
1344 return(FALSE);
1345 } else {
1346 lc = wchmm->winfo->wton[lastword];
1347 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
1348 if (dfa_cp(wchmm->dfa, lc, sc->word) == TRUE) return(TRUE);
1349 }
1350 return(FALSE);
1351 }
1352 }
1353
1354 /* end of file */
1355