1 /*********************************************************************/
2 /* */
3 /* Language Technologies Institute */
4 /* Carnegie Mellon University */
5 /* Copyright (c) 2013 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and */
9 /* distribute this software and its documentation without */
10 /* restriction, including without limitation the rights to use, */
11 /* copy, modify, merge, publish, distribute, sublicense, and/or */
12 /* sell copies of this work, and to permit persons to whom this */
13 /* work is furnished to do so, subject to the following conditions: */
14
15 /* 1. The code must retain the above copyright notice, this list */
16 /* of conditions and the following disclaimer. */
17 /* 2. Any modifications must be clearly marked as such. */
18 /* 3. Original authors' names are not deleted. */
19 /* 4. The authors' names are not used to endorse or promote */
20 /* products derived from this software without specific */
21 /* prior written permission. */
22 /* */
23 /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
24 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
25 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN */
26 /* NO EVENT SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS */
27 /* BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES */
28 /* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA */
29 /* OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR */
30 /* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH */
31 /* THE USE OR PERFORMANCE OF THIS SOFTWARE. */
32 /* */
33 /*********************************************************************/
34 /* Author: Alok Parlikar (aup@cs.cmu.edu) */
35 /* Date: January 2013 */
36 /*********************************************************************/
37 /*
38 Interface to the MLSA code within HTS Engine API
39 (in the src/modules/hts_engine directory)
40
41 Written for HTS Engine API 1.07 (Jan 2013)
42 */
43
44 #include <sys/types.h>
45
46 #include <EST_walloc.h>
47 #include <festival.h>
48 #include <HTS_hidden.h>
49 #include "./vc.h"
50 #include "./HTS_vocoder_me.h"
51
52
53
54 DVECTOR synthesis_body(DMATRIX mcep, // input mel-cep sequence
55 DVECTOR f0v, // input F0 sequence
56 EST_Track *str, // str for mixed excitation
57 EST_Track *filtertrack,
58 double framerate, // sampling frequency (Hz)
59 int framem, // FFT length
60 double alpha,
61 double beta);
62
mlsa_resynthesis(LISP ltrack,LISP strtrack,LISP filtrack)63 LISP mlsa_resynthesis(LISP ltrack, LISP strtrack, LISP filtrack) {
64 EST_Track *t;
65 EST_Track *str = 0;
66 EST_Track *filter_track = 0;
67 EST_Wave *wave = 0;
68
69 DVECTOR w;
70 DMATRIX mcep;
71 DVECTOR f0v;
72
73 int framerate = 16000;
74 int i, j;
75 int shift;
76 double ALPHA = 0.42;
77 double BETA = 0.0;
78
79 LISP lispframerate = siod_get_lval("framerate",
80 NULL);
81 if (lispframerate == NIL) {
82 framerate = 16000;
83 } else {
84 framerate = FLONM(lispframerate);
85 }
86
87 ALPHA = FLONM(siod_get_lval("mlsa_alpha_param",
88 "mlsa: mlsa_alpha_param not set"));
89 BETA = FLONM(siod_get_lval("mlsa_beta_param",
90 "mlsa: mlsa_beta_param not set"));
91
92 if ((ltrack == NULL) ||
93 (TYPEP(ltrack, tc_string) &&
94 (streq(get_c_string(ltrack), "nil"))))
95 return siod(new EST_Wave(0, 1, framerate));
96
97 t = track(ltrack);
98
99 if (strtrack != NULL) {
100 /* We have to do mixed-excitation */
101 str = track(strtrack);
102 }
103
104 if (filtrack != NULL) {
105 filter_track = track(filtrack);
106 }
107
108 f0v = xdvalloc(t->num_frames());
109 mcep = xdmalloc(t->num_frames(), t->num_channels()-1);
110
111 for (i = 0; i < t->num_frames(); i++) {
112 f0v->data[i] = t->a(i, 0);
113 for (j = 1; j < t->num_channels(); j++)
114 mcep->data[i][j-1] = t->a(i, j);
115 }
116
117 if (t->num_frames() > 1) {
118 // Hacky way to get floats doing the right thing
119 // Seems to work for 16K and 48KHz -- aup
120
121 shift = (100000.0 * t->t(1)) - (100000.0 * t->t(0));
122 shift = shift/10;
123 if (shift % 10 > 5)
124 shift = ceil(shift/10.0);
125 else
126 shift = floor(shift/10.0);
127 } else {
128 shift = 5.0;
129 }
130
131 if (shift == 0)
132 shift = 5.0;
133
134 w = synthesis_body(mcep, f0v, str, filter_track,
135 framerate, shift, ALPHA, BETA);
136
137 wave = new EST_Wave(w->length, 1, framerate);
138
139 for (i = 0; i < w->length; i++)
140 wave->a(i) = (int16_t)w->data[i]; //NOLINT
141
142 xdmfree(mcep);
143 xdvfree(f0v);
144 xdvfree(w);
145
146 return siod(wave);
147 }
148
synthesis_body(DMATRIX mcep,DVECTOR f0v,EST_Track * str,EST_Track * filter_track,double framerate,int framem,double alpha,double beta)149 DVECTOR synthesis_body(DMATRIX mcep, // input mel-cep sequence
150 DVECTOR f0v, // input F0 sequence
151 EST_Track *str, // str for mixed excitation
152 EST_Track *filter_track, // Track for LPF or
153 // Mixed Excitation
154 // Filters
155 double framerate, // sampling frequency (Hz)
156 int framem, // FFT length
157 double alpha,
158 double beta) {
159 int64_t t, pos;
160 int framel;
161 double f0;
162 HTS_Vocoder v;
163 HTS_Vocoder_ME v_me;
164 DVECTOR xd = NODATA;
165
166 HTS_Boolean use_log_gain = FALSE;
167
168 size_t stage = 0; // MGC or LSP
169
170 size_t nlpf = 0;
171 double *lpf = NULL;
172
173 double volume = 1.0;
174
175 // Mixed Excitation Stuff
176 LISP filters;
177 LISP f;
178 int fl;
179 int i, j;
180 int me_num_filters = 0;
181 int me_filter_order = 0;
182 double **me_filter = NULL;
183 double *xp_sig = NULL;
184 double *xn_sig = NULL;
185 double *hp = NULL;
186 double *hn = NULL;
187 double *strengths;
188
189 if (str == NULL) {
190 // Not using mixed excitation, so use the lpf that HTS uses
191 // in the SLT voice as of Jan 2013 -- aup
192
193 // Check if LPF filter is to be applied to signal.
194 if (filter_track != NULL) {
195 if (filter_track->num_frames() != 1) {
196 printf("Warning: Wrong filter passed. Ignoring LPF\n");
197 printf("Expected single row for pulse-noise-excitation voice");
198 } else {
199 nlpf = (filter_track->num_channels() - 1)/2;
200 lpf = (double*) calloc(filter_track->num_channels(), sizeof(double));
201 for (i = 0; i < filter_track->num_channels(); i++) {
202 lpf[i] = filter_track->a(0,i);
203 }
204 // printf("aup_debug: Using LPF filter of nlpf %d\n", nlpf);
205 }
206 }
207 }
208
209 // floats may not do the right thing, but this seems to work
210 // for 16KHz and 48KHz -- aup
211 framel = framem * framerate/1000.0;
212
213 if (str == NULL) {
214 // Not Mixed Excitation
215 HTS_Vocoder_initialize(&v, mcep->col - 1,
216 stage, use_log_gain,
217 framerate, framel);
218 } else {
219 // Mixed Excitation
220 if (filter_track != NULL) {
221 me_num_filters = filter_track->num_frames();
222 me_filter_order = filter_track->num_channels();
223 me_filter = walloc(double*, me_num_filters);
224
225 for (i = 0; i < me_num_filters; i++) {
226 me_filter[i] = walloc(double, me_filter_order);
227 for (j = 0; j < me_filter_order; j++) {
228 me_filter[i][j] = filter_track->a(i, j);
229 }
230 }
231 } else {
232 printf("Warning: Attempting to use Mixed Excitation without Filters");
233 }
234
235 xp_sig = (double*) calloc(me_filter_order, sizeof(double));
236 xn_sig = (double*) calloc(me_filter_order, sizeof(double));
237 hp = (double*) calloc(me_filter_order, sizeof(double));
238 hn = (double*) calloc(me_filter_order, sizeof(double));
239
240 v_me.v = &v;
241 HTS_Vocoder_initialize_me(&v_me,
242 mcep->col -1,
243 stage, use_log_gain,
244 framerate, framel,
245 me_num_filters,
246 me_filter_order,
247 me_filter,
248 xp_sig, xn_sig, hp, hn);
249 }
250
251 // synthesize waveforms by MLSA filter
252 xd = xdvalloc(mcep->row * (framel + 2));
253 for (t = 0, pos = 0; t < mcep->row; t++) {
254 if (t >= f0v->length)
255 f0 = LZERO;
256 else
257 f0 = f0v->data[t];
258
259 if (f0 == 0)
260 f0 = LZERO;
261 else
262 f0 = log(f0);
263
264 if (str == NULL) {
265 // Not Mixed Excitation
266 // printf("aup_debug %d %d %d\n", t, pos, framel);
267 HTS_Vocoder_synthesize(&v, mcep->col - 1,
268 f0, mcep->data[t],
269 nlpf, lpf,
270 alpha, beta, volume,
271 &xd->data[pos], NULL);
272 } else {
273 strengths = (double*) calloc(me_filter_order, sizeof(double));
274 for (i = 0; i < me_num_filters; i++) {
275 strengths[i] = str->a((int)t, i);
276 }
277 HTS_Vocoder_synthesize_me(&v_me, mcep->col - 1,
278 f0, mcep->data[t],
279 strengths,
280 nlpf, lpf,
281 alpha, beta, volume,
282 &xd->data[pos], NULL);
283 free(strengths);
284 }
285
286 pos += framel;
287 }
288
289 if (lpf != NULL)
290 free(lpf);
291
292 if (str != NULL) {
293 // Mixed Excitation
294 free(xp_sig);
295 free(xn_sig);
296 free(hp);
297 free(hn);
298 }
299
300 HTS_Vocoder_clear(&v);
301
302 return xd;
303 }
304
305