1 /*************************************************************************/ 2 /* */ 3 /* Language Technologies Institute */ 4 /* Carnegie Mellon University */ 5 /* Copyright (c) 2007-2017 */ 6 /* All Rights Reserved. */ 7 /* */ 8 /* Permission is hereby granted, free of charge, to use and distribute */ 9 /* this software and its documentation without restriction, including */ 10 /* without limitation the rights to use, copy, modify, merge, publish, */ 11 /* distribute, sublicense, and/or sell copies of this work, and to */ 12 /* permit persons to whom this work is furnished to do so, subject to */ 13 /* the following conditions: */ 14 /* 1. The code must retain the above copyright notice, this list of */ 15 /* conditions and the following disclaimer. */ 16 /* 2. Any modifications must be clearly marked as such. */ 17 /* 3. Original authors' names are not deleted. */ 18 /* 4. The authors' names are not used to endorse or promote products */ 19 /* derived from this software without specific prior written */ 20 /* permission. */ 21 /* */ 22 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ 23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ 24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ 25 /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ 26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ 27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ 28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ 29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ 30 /* THIS SOFTWARE. */ 31 /* */ 32 /*************************************************************************/ 33 /* Author: Alan W Black (awb@cs.cmu.edu) */ 34 /* Date: November 2007 */ 35 /*************************************************************************/ 36 /* */ 37 /* clustergen db */ 38 /* */ 39 /* A statistical corpus based synthesizer. */ 40 /* See Black, A. (2006), CLUSTERGEN: A Statistical Parametric */ 41 /* Synthesizer using Trajectory Modeling", Interspeech 2006 - ICSLP, */ 42 /* Pittsburgh, PA. */ 43 /* http://www.cs.cmu.edu/~awb/papers/is2006/IS061394.PDF */ 44 /* */ 45 /*************************************************************************/ 46 #ifndef _CST_CG_H__ 47 #define _CST_CG_H__ 48 49 #include "cst_cart.h" 50 #include "cst_track.h" 51 #include "cst_wave.h" 52 #include "cst_audio.h" 53 #include "cst_synth.h" /* for dur_stat */ 54 55 /* The parameters in the spectral model(s) may be encoded in various ways */ 56 /* mostly to preserve space. The decoded can be dependent on the shape type */ 57 /* This needs to be coordinated with the model creation in make_cg.scm */ 58 /* This is currently very BIG_HAMMER-like and hard coded */ 59 60 /* Original rf3 (and before) encoding stddev follows mean in pairs: 61 F0 (1), MCEP_STATIC (25), MCEP_DELTA (25), ME (5), Voicing (1) (114 shorts) 62 Encoded as linear range in unsigned short from min over range */ 63 #define CST_CG_MODEL_SHAPE_BASE_MINRANGE 1 64 /* Quantized rf3, encode everything as two 8 bit values in each short. 65 Good generic quantized version, but the next one is used for most models 66 (57 shorts) 67 */ 68 #define CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS 2 69 /* Quantized rf3, (mostly stddev follows mean in pairs -- except deltas 70 no F0, MCEP_STATIC (50) MCEP_DELTAS stddev (25), ME (5), Voice (2) 71 Encoded as unsigned chars that index into qtable. we don't include 72 the mcep_deltas means, they aren't needed. They are still held as shorts 73 so you need to sub index them see cst_cg_quantized_params_index() 74 (41 shorts) */ 75 #define CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS_41 3 76 77 typedef struct cst_cg_db_struct { 78 /* Please do not change this structure, but if you do, only add things 79 to the end of the struct. If do you change it , please modify 80 dump/load voice too (in cst_cg_dump_voice and cst_cg_map) */ 81 const char *name; 82 const char * const *types; 83 int num_types; 84 85 int sample_rate; 86 87 float f0_mean, f0_stddev; 88 89 /* Cluster trees */ 90 int num_f0_models; 91 const cst_cart ***f0_trees; 92 93 int num_param_models; 94 const cst_cart *** param_trees; 95 96 const cst_cart *spamf0_accent_tree; /* spam accent tree */ 97 const cst_cart *spamf0_phrase_tree; /* spam phrase tree */ 98 99 /* Model params e.g. mceps, deltas intersliced with stddevs */ 100 /* may be compressed/quantized based on value of model_shape */ 101 int *num_channels; 102 int *num_frames; 103 const unsigned short *** model_vectors; 104 105 int num_channels_spamf0_accent; 106 int num_frames_spamf0_accent; 107 const float * const * spamf0_accent_vectors; 108 109 /* Currently shared between different models */ 110 const float *model_min; /* for vector coeffs encoding */ 111 const float *model_range; /* for vector coeffs encoding */ 112 113 const float ***qtable; /* q(uantization) tables for each model */ 114 115 int model_shape; /* model compression technique */ 116 117 float frame_advance; 118 119 /* duration models (cart + phonedurs) */ 120 int num_dur_models; 121 const dur_stat *** dur_stats; 122 const cst_cart ** dur_cart; 123 124 /* phone to states map */ 125 const char * const * const *phone_states; 126 127 /* Other parameters */ 128 int do_mlpg; /* implies deltas are in the model_vectors */ 129 float *dynwin; 130 int dynwinsize; 131 132 float mlsa_alpha; 133 float mlsa_beta; 134 135 int multimodel; 136 int mixed_excitation; 137 138 /* filters for Mixed Excitation */ 139 int ME_num; 140 int ME_order; 141 const double * const *me_h; 142 143 int spamf0; 144 float gain; 145 146 int freeable; /* doesn't get dumped, but 1 when this a freeable struct */ 147 148 } cst_cg_db; 149 150 CST_VAL_USER_TYPE_DCLS(cg_db,cst_cg_db) 151 void delete_cg_db(cst_cg_db *db); 152 153 cst_utterance *cg_synth(cst_utterance *utt); 154 cst_wave *mlsa_resynthesis(const cst_track *t, 155 const cst_track *str, 156 cst_cg_db *cg_db, 157 cst_audio_streaming_info *asc, 158 int mlsa_speech_param); 159 cst_track *mlpg(const cst_track *param_track, cst_cg_db *cg_db); 160 161 cst_voice *cst_cg_load_voice(const char *voxdir, 162 const cst_lang lang_table[]); 163 int cst_cg_dump_voice(const cst_voice *v,const cst_string *filename); 164 165 #endif 166