1 /*************************************************************************/
2 /*                                                                       */
3 /*                  Language Technologies Institute                      */
4 /*                     Carnegie Mellon University                        */
5 /*                      Copyright (c) 2007-2017                          */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
34 /*               Date:  November 2007                                    */
35 /*************************************************************************/
36 /*                                                                       */
37 /*  clustergen db                                                        */
38 /*                                                                       */
39 /*  A statistical corpus based synthesizer.                              */
/*  See Black, A. (2006), "CLUSTERGEN: A Statistical Parametric          */
/*  Synthesizer using Trajectory Modeling", Interspeech 2006 - ICSLP,    */
42 /*  Pittsburgh, PA.                                                      */
43 /*  http://www.cs.cmu.edu/~awb/papers/is2006/IS061394.PDF                */
44 /*                                                                       */
45 /*************************************************************************/
46 #ifndef _CST_CG_H__
47 #define _CST_CG_H__
48 
49 #include "cst_cart.h"
50 #include "cst_track.h"
51 #include "cst_wave.h"
52 #include "cst_audio.h"
53 #include "cst_synth.h" /* for dur_stat */
54 
/* The parameters in the spectral model(s) may be encoded in various ways, */
/* mostly to save space.  The decoding depends on the shape type.          */
/* This needs to be coordinated with the model creation in make_cg.scm.    */
/* This is currently very BIG_HAMMER-like and hard coded.                  */
59 
/* Model shape identifiers (see the model_shape field of cst_cg_db). */

/* Original rf3 (and before) encoding; stddev follows mean in pairs:
   F0 (1), MCEP_STATIC (25), MCEP_DELTA (25), ME (5), Voicing (1)  (114 shorts)
   Encoded as linear range in unsigned short from min over range */
#define CST_CG_MODEL_SHAPE_BASE_MINRANGE 1
/* Quantized rf3; encodes everything as two 8 bit values in each short.
   Good generic quantized version, but the next one is used for most models
   (57 shorts) */
#define CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS 2
/* Quantized rf3 (mostly stddev follows mean in pairs -- except deltas):
   no F0, MCEP_STATIC (50), MCEP_DELTAS stddev (25), ME (5), Voice (2).
   Encoded as unsigned chars that index into qtable.  We don't include
   the mcep_deltas means; they aren't needed.  They are still held as shorts,
   so you need to sub-index them; see cst_cg_quantized_params_index()
   (41 shorts) */
#define CST_CG_MODEL_SHAPE_QUANTIZED_PARAMS_41 3
76 
77 typedef struct cst_cg_db_struct {
78     /* Please do not change this structure, but if you do, only add things
79        to the end of the struct.  If do you change it , please modify
80        dump/load voice too (in cst_cg_dump_voice and cst_cg_map) */
81     const char *name;
82     const char * const *types;
83     int num_types;
84 
85     int sample_rate;
86 
87     float f0_mean, f0_stddev;
88 
89     /* Cluster trees */
90     int num_f0_models;
91     const cst_cart ***f0_trees;
92 
93     int num_param_models;
94     const cst_cart *** param_trees;
95 
96     const cst_cart *spamf0_accent_tree; /* spam accent tree */
97     const cst_cart *spamf0_phrase_tree; /* spam phrase tree */
98 
99     /* Model params e.g. mceps, deltas intersliced with stddevs */
100     /* may be compressed/quantized based on value of model_shape */
101     int *num_channels;
102     int *num_frames;
103     const unsigned short *** model_vectors;
104 
105     int num_channels_spamf0_accent;
106     int num_frames_spamf0_accent;
107     const float * const * spamf0_accent_vectors;
108 
109     /* Currently shared between different models */
110     const float *model_min;    /* for vector coeffs encoding */
111     const float *model_range;  /* for vector coeffs encoding */
112 
113     const float ***qtable;     /* q(uantization) tables for each model */
114 
115     int model_shape;           /* model compression technique */
116 
117     float frame_advance;
118 
119     /* duration models (cart + phonedurs) */
120     int num_dur_models;
121     const dur_stat *** dur_stats;
122     const cst_cart ** dur_cart;
123 
124     /* phone to states map */
125     const char * const * const *phone_states;
126 
127     /* Other parameters */
128     int do_mlpg;  /* implies deltas are in the model_vectors */
129     float *dynwin;
130     int dynwinsize;
131 
132     float mlsa_alpha;
133     float mlsa_beta;
134 
135     int multimodel;
136     int mixed_excitation;
137 
138     /* filters for Mixed Excitation */
139     int ME_num;
140     int ME_order;
141     const double * const *me_h;
142 
143     int spamf0;
144     float gain;
145 
146     int freeable;  /* doesn't get dumped, but 1 when this a freeable struct */
147 
148 } cst_cg_db;
149 
150 CST_VAL_USER_TYPE_DCLS(cg_db,cst_cg_db)
151 void delete_cg_db(cst_cg_db *db);
152 
153 cst_utterance *cg_synth(cst_utterance *utt);
154 cst_wave *mlsa_resynthesis(const cst_track *t,
155                            const cst_track *str,
156                            cst_cg_db *cg_db,
157                            cst_audio_streaming_info *asc,
158                            int mlsa_speech_param);
159 cst_track *mlpg(const cst_track *param_track, cst_cg_db *cg_db);
160 
161 cst_voice *cst_cg_load_voice(const char *voxdir,
162                              const cst_lang lang_table[]);
163 int cst_cg_dump_voice(const cst_voice *v,const cst_string *filename);
164 
165 #endif
166