1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * corpus.h -- Corpus-file related misc functions.
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1996 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log$
49  * Revision 1.1  2006/04/05  20:27:30  dhdfu
50  * A Great Reorganzation of header files and executables
51  *
52  * Revision 1.13  2006/02/22 19:49:25  arthchan2003
53  * Merged from SPHINX3_5_2_RCI_IRII:
54  * 1, Add structure utt_res_t, this is an utterance-based resouce
55  * structure. Add basic operation such as free and report.
56  * 2, Modify the structure of the loop in ctl_corpus to make it not so
57  * clunky. Tested with make check .
58  * 3, Completely removed ctl_process_dyn_lm, it is a product of code
59  * duplication (alright, it is written by me......)
60  * 4, Fixed doc-dox.
61  *
62  * Revision 1.12.4.3  2005/07/27 23:19:11  arthchan2003
63  * 1, Added utt_res_t structure and its methods. 2, Changed the function pointer prototype. 3, Removed the lm and mllr set process out of ctl_process
64  *
65  * Revision 1.12.4.2  2005/07/26 03:14:17  arthchan2003
66  * Removed ctl_process_dyn_lm. One of my sin.
67  *
68  * Revision 1.12.4.1  2005/07/05 06:25:40  arthchan2003
69  * Fixed dox-doc.
70  *
71  * Revision 1.12  2005/06/21 20:44:34  arthchan2003
72  * 1, Fixed doxygen documentation, 2, Add the $ keyword.
73  *
74  * Revision 1.4  2005/06/18 20:05:23  archan
75  * Sphinx3 to s3.generic: Set lm correctly in dag.c and astar.c.  Same changes should also be applied to decode_anytopo.
76  *
77  * Revision 1.3  2005/03/30 01:22:46  archan
78  * Fixed mistakes in last updates. Add
79  *
80  *
81  * 09-Dec-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
82  * 		Added ctl_process_utt ().
83  *
84  * 01-Mar-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
85  * 		Updated ctl_infile() spec to included check for already existing file extension.
86  *
87  * 23-Mar-1998	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
88  * 		Added a general purpose data argument to ctl_process() and its function
89  * 		argument func.
90  *
91  * 22-Nov-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
92  * 		Added an optional validation function argument and an optional
93  *		duplicate-resolution function argument to both corpus_load_headid() and
94  * 		corpus_load_tailid().
95  *
96  * 25-Oct-1997	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
97  * 		Created.
98  */
99 
100 
101 #ifndef _S3_CORPUS_H_
102 #define _S3_CORPUS_H_
103 
104 #include <stdio.h>
105 
106 #include <hash_table.h>
107 #include <profile.h>
108 #include <s3types.h>
109 
110 
111 
112 /** \file corpus.h
113  *  \brief Operations on corpus
114  */
115 #ifdef __cplusplus
116 extern "C" {
117 #endif
118 #if 0
119 } /* Fool Emacs into not indenting things. */
120 #endif
121 
/** \struct utt_res_t
    \brief Per-utterance resource names.

    Most of the resources are held as plain string pointers; the
    strings themselves are assumed to be pre-allocated somewhere else.
*/
typedef struct
{
    char *uttfile;      /**< Name of the utterance file */
    char *lmname;       /**< Name of the LM file for this utterance */
    char *fsgname;      /**< Name of the FSG file for this utterance.  A
                           single utterance can use either an LM or an
                           FSG, not both */
    char *regmatname;   /**< Name of the regression matrix file for this
                           utterance */
    char *cb2mllrname;  /**< Name of the code book to regression matrix
                           file for this utterance */
} utt_res_t;
140 
/*
 * Field setters for utt_res_t.
 *
 * Each macro stores the pointer `name` in the matching member of the
 * structure that `ur` points to; the string is not copied.  Both
 * arguments and the whole expansion are parenthesized so that a compound
 * pointer expression such as `base + i` binds correctly and the macro
 * can be used safely inside a larger expression.  The value of each
 * macro is the value that was assigned.
 */
#define utt_res_set_uttfile(ur,name)     ((ur)->uttfile = (name))
#define utt_res_set_lmname(ur,name)      ((ur)->lmname = (name))
#define utt_res_set_fsgname(ur,name)     ((ur)->fsgname = (name))
#define utt_res_set_regmatname(ur,name)  ((ur)->regmatname = (name))
#define utt_res_set_cb2mllrname(ur,name) ((ur)->cb2mllrname = (name))
146 
147 /** This just return a new utter_res_t */
148 utt_res_t* new_utt_res(void);
149 
150 /** Free utt_res_t */
151 void free_utt_res(
152     utt_res_t* ur /**< an utt_res_t */
153     );
154 
155 /** Report what's inside utt_res_t */
156 void report_utt_res(
157     utt_res_t *ur /**< an utt_res_t */
158     );
159 
160 /**
161  * \struct corpus_t
162  * \brief  Structure for a corpus: essentially a set of strings each associated with a
163  * unique ID.
164  * Structure for a corpus: essentially a set of strings each associated with a
165  * unique ID.  (Such as a reference sentence file, hypothesis file, and various
166  * control files.)
167  * NOTE: IDs are CASE-SENSITIVE.
168  */
169 typedef struct {
170     hash_table_t *ht;	/**< Hash table for IDs; CASE-SENSITIVE */
171     int32 n;		/**< #IDs (and corresponding argument strings) in the corpus */
172     char **str;		/**< The argument strings */
173 } corpus_t;
174 
175 
176 /**
177  * Load a corpus from the given file and return it.
178  * Each line is a separate entry in the corpus.  Blank lines are skipped.
179  * The ID is the FIRST word in a line.
180  *
181  * Validation:
182  *
183  * validate is an optional, application-supplied function to determine if each input
184  * corpus data entry is eligible (valid) for inclusion in the final corpus.  It should
185  * return an integer value signifying the following actions:
186  *      0: Not valid, skip the entry;
187  *     !0: Valid, include the entry.
188  * If validate is NULL, every input entry is included in the corpus.
189  *
190  * Duplicate resolution:
191  *
192  * dup_resolve is an optional, application-supplied function to resolve duplicate keys
193  * (IDs).  It may be NULL if none is available.  If present, and a duplicate key is
194  * encountered, the function is invoked with the original and the duplicate corpus
195  * strings as arguments (s1 and s2, respectively).  It should return an integer value
196  * signifying the following actions:
197  *      0: Retain the original string, discard the new one;
198  *     >0: Replace the original string with the new one;
199  *     <0: Error (causes a FATAL_ERROR).
200  * If dup_resolve is NULL, any duplicate ID causes a FATAL_ERROR.
201  *
202  * Return value: Ptr to corpus if successful.
203  */
204 corpus_t *corpus_load_headid (const char *file,	/**< Input file name, the file must be seekable and rewindable */
205 			      int32 (*validate)(char *str),
206 			      int32 (*dup_resolve)(char *s1, char *s2));
207 
208 /**
209  * Similar to corpus_load_headid, but the ID is at the END of each line, in parentheses.
210  */
211 corpus_t *corpus_load_tailid (const char *file,	/**< Input file name, the file must be seekable and rewindable */
212 			      int32 (*validate)(char *str),
213 			      int32 (*dup_resolve)(char *s1, char *s2));
214 
215 /**
216  * Lookup the given corpus for the given ID and return the associated string.
217  * Return NULL if ID not found.
218  */
219 char *corpus_lookup (corpus_t *corp, const char *id);
220 
221 
222 /**
223  * Read another entry from a S3 format "control file" and parse its various fields.
224  * Blank lines and lines beginning with a hash-character (#) are omitted.
225  * Control file entry format:
226  *     uttfile(usually cepstrum file) [startframe endframe [uttid]]
227  * Any error in control file entry format is FATAL.
228  * Return value: 0 if successful, -1 if no more entries left.
229  */
230 
231 int32 ctl_read_entry (FILE *fp,         /**< In: an input file pointer */
232 		      char *uttfile,	/**< Out: (Cep)file containing utterance data */
233 		      int32 *sf,	/**< Out: Start frame in uttfile; 0 if omitted */
234 		      int32 *ef,	/**< Out: End frame in uttfile; -1 (signifying
235 					   until EOF) if omitted */
236 		      char *uttid	/**< Out: Utterance ID (generated from uttfile/sf/ef
237 					   if omitted) */
238     );
239 
240 
241 /**
242  * Process the given control file (or stdin if NULL): Skip the first
243  * nskip entries, and process the next count entries by calling the
244  * given function (*func) for each entry.  Any error in reading the
245  * control file is FATAL.  ctllmfile and ctlmllrfile can be specified
246  * optionally. If they are not specified, then NULL could be used.
247  *
248  * Return value: ptmr_t structure containing cpu/elapsed time stats for the run.
249  */
250 S3DECODER_EXPORT
251 ptmr_t ctl_process (const char *ctlfile,	/**< In: Control file to read; use stdin if NULL */
252 		    const char *ctllmfile,     /**< In: Control file that specify the lm used for the corresponding utterance */
253 		    const char *ctlmllrfile,   /**< In: Contorl file that specify the mllr used for the corresponding utterance */
254 		    int32 nskip,	/**< In: No. of entries to skip at the head */
255 		    int32 count,	/**< In: No. of entries to process after nskip */
256 		    void (*func) (void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid),
257 		    /**< In: Function to be invoked for each of the
258 		       count entries processed. */
259 		    void *kb		/**< In: A catch-all data pointer to be passed as
260 					   the first argument to func above */
261     );
262 
263 
264 /**
265  * Like ctl_process, but process the single filename given (uttfile), count times.  After each
266  * processing, wait for the time of modification on the given file to change.  In this mode,
267  * the decoder can be used to process a dynamically generated sequence of utterances.  To avoid
268  * race conditions, each new instance of the file should be created "in an instant": by creating
269  * it under a temporary name and finally renaming it to the given filename atomically.
270  * @return: ptmr_t structure containing cpu/elapsed time stats for the run.
271  */
272 S3DECODER_EXPORT
273 ptmr_t ctl_process_utt (const char *uttfile,	/**< In: Filename to be process (in its entirety) */
274 			int32 count,	/**< In: No. of iterations to process uttfile */
275 			void (*func) (void *kb, utt_res_t *ur, int32 sf, int32 ef, char *uttid),/**< A function pointer that do the actual processing */
276 
277 			void *kb);
278 
/**
 * Assemble a complete input filename from the given uttname, directory
 * and file-extension:
 *   If utt begins with a / dir is ignored, otherwise dir/ is prefixed to
 *   utt;
 *   If a non-empty file extension is provided, and utt doesn't already
 *   have that extension, .ext is appended to the filename.
 *
 * @param file  Out: Generated filename (allocated by caller)
 * @param dir   In: Optional directory spec if relative utt specified
 * @param ext   In: File extension to be appended to utt to generate the
 *              complete filename
 * @param utt   In: Utterance file pathname, absolute or relative, with
 *              or without file extension.  This is usually the first
 *              field in a control file
 */
void ctl_infile(char *file,
                const char *dir,
                const char *ext,
                const char *utt);
293 
/**
 * Assemble a complete output filename from the given components, as
 * follows:
 *     if dir ends with ,CTL and utt does not begin with /, use dir/utt
 *     if dir ends with ,CTL and utt DOES begin with /, filename is utt
 *     if dir does not end with ,CTL, filename is dir/uttid.
 * If a non-empty ext is specified, .ext is appended to the generated
 * filename.
 *
 * @param file   Out: Generated filename (allocated by caller)
 * @param dir    In: Directory for the generated filename; see above for
 *               the special handling of the ,CTL suffix
 * @param ext    In: File-extension applied to the generated filename
 * @param utt    In: Utterance file pathname, absolute or relative, with
 *               or without extension.  This is usually the first field
 *               in a control file.
 * @param uttid  In: Utterance ID (derived from the control file)
 */
void ctl_outfile(char *file,
                 const char *dir,
                 const char *ext,
                 const char *utt,
                 const char *uttid);
310 
311 #if 0
312 { /* Stop indent from complaining */
313 #endif
314 #ifdef __cplusplus
315 }
316 #endif
317 
318 #endif
319