1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 2009 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /** 38 * \file sphinx_lm_convert.c 39 * Language model conversion tool. 40 */ 41 #include <sphinxbase/logmath.h> 42 #include <sphinxbase/ngram_model.h> 43 #include <sphinxbase/cmd_ln.h> 44 #include <sphinxbase/ckd_alloc.h> 45 #include <sphinxbase/err.h> 46 #include <sphinxbase/pio.h> 47 #include <sphinxbase/strfuncs.h> 48 49 #include <stdio.h> 50 #include <string.h> 51 #include <math.h> 52 53 static const arg_t defn[] = { 54 { "-help", 55 ARG_BOOLEAN, 56 "no", 57 "Shows the usage of the tool"}, 58 59 { "-logbase", 60 ARG_FLOAT64, 61 "1.0001", 62 "Base in which all log-likelihoods calculated" }, 63 64 { "-i", 65 REQARG_STRING, 66 NULL, 67 "Input language model file (required)"}, 68 69 { "-o", 70 REQARG_STRING, 71 NULL, 72 "Output language model file (required)"}, 73 74 { "-ifmt", 75 ARG_STRING, 76 NULL, 77 "Input language model format (will guess if not specified)"}, 78 79 { "-ofmt", 80 ARG_STRING, 81 NULL, 82 "Output language model file (will guess if not specified)"}, 83 84 { "-ienc", 85 ARG_STRING, 86 NULL, 87 "Input language model text encoding (no conversion done if not specified)"}, 88 89 { "-oenc", 90 ARG_STRING, 91 "utf8", 92 "Output language model text encoding"}, 93 94 { "-case", 95 ARG_STRING, 96 NULL, 97 "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" }, 98 99 { "-mmap", 100 ARG_BOOLEAN, 101 "no", 102 "Use memory-mapped I/O for reading binary LM files"}, 103 104 { "-debug", 105 ARG_INT32, 106 NULL, 107 "Verbosity level for debugging messages" 108 }, 109 110 { NULL, 0, NULL, NULL } 111 }; 112 113 static void 114 usagemsg(char *pgm) 115 { 116 E_INFO("Usage: %s -i <input.lm> \\\n", pgm); 117 E_INFOCONT("\t[-ifmt txt] [-ofmt dmp]\n"); 118 E_INFOCONT("\t-o <output.lm.DMP>\n"); 119 120 exit(0); 121 } 122 123 124 int 125 main(int argc, char *argv[]) 126 { 127 cmd_ln_t *config; 128 ngram_model_t *lm = NULL; 129 logmath_t *lmath; 130 int itype, otype; 131 char const *kase; 132 133 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) 134 return 1; 135 136 if (cmd_ln_boolean_r(config, "-help")) { 137 usagemsg(argv[0]); 138 } 139 140 err_set_debug_level(cmd_ln_int32_r(config, "-debug")); 141 142 /* Create log math object. */ 143 if ((lmath = logmath_init 144 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) { 145 E_FATAL("Failed to initialize log math\n"); 146 } 147 148 if (cmd_ln_str_r(config, "-i") == NULL || cmd_ln_str_r(config, "-i") == NULL) { 149 E_ERROR("Please specify both input and output models\n"); 150 goto error_out; 151 } 152 153 154 /* Load the input language model. */ 155 if (cmd_ln_str_r(config, "-ifmt")) { 156 if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt"))) 157 == NGRAM_INVALID) { 158 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt")); 159 goto error_out; 160 } 161 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"), 162 itype, lmath); 163 } 164 else { 165 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"), 166 NGRAM_AUTO, lmath); 167 } 168 169 if (lm == NULL) { 170 E_FATAL("Failed to read the model from the file '%s'", cmd_ln_str_r(config, "-i")); 171 } 172 173 /* Guess or set the output language model type. */ 174 if (cmd_ln_str_r(config, "-ofmt")) { 175 if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt"))) 176 == NGRAM_INVALID) { 177 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt")); 178 goto error_out; 179 } 180 } 181 else { 182 otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o")); 183 } 184 185 /* Recode the language model if desired. */ 186 if (cmd_ln_str_r(config, "-ienc")) { 187 if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"), 188 cmd_ln_str_r(config, "-oenc")) != 0) { 189 E_ERROR("Failed to recode language model from %s to %s\n", 190 cmd_ln_str_r(config, "-ienc"), 191 cmd_ln_str_r(config, "-oenc")); 192 goto error_out; 193 } 194 } 195 196 /* Case fold if requested. */ 197 if ((kase = cmd_ln_str_r(config, "-case"))) { 198 if (0 == strcmp(kase, "lower")) { 199 ngram_model_casefold(lm, NGRAM_LOWER); 200 } 201 else if (0 == strcmp(kase, "upper")) { 202 ngram_model_casefold(lm, NGRAM_UPPER); 203 } 204 else { 205 E_ERROR("Unknown value for -case: %s\n", kase); 206 goto error_out; 207 } 208 } 209 210 /* Write the output language model. */ 211 if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) { 212 E_ERROR("Failed to write language model in format %s to %s\n", 213 ngram_type_to_str(otype), cmd_ln_str_r(config, "-o")); 214 goto error_out; 215 } 216 217 /* That's all folks! */ 218 ngram_model_free(lm); 219 return 0; 220 221 error_out: 222 ngram_model_free(lm); 223 return 1; 224 } 225