1 /*********************************************************************/
2 /*                                                                   */
3 /*                  Language Technologies Institute                  */
4 /*                     Carnegie Mellon University                    */
5 /*                         Copyright (c) 2013                        */
6 /*                        All Rights Reserved.                       */
7 /*                                                                   */
8 /*  Permission is hereby granted, free of charge, to use and         */
9 /*  distribute this software and its documentation without           */
10 /*  restriction, including without limitation the rights to use,     */
11 /*  copy, modify, merge, publish, distribute, sublicense, and/or     */
12 /*  sell copies of this work, and to permit persons to whom this     */
13 /*  work is furnished to do so, subject to the following conditions: */
14 
15 /*   1. The code must retain the above copyright notice, this list   */
16 /*      of conditions and the following disclaimer.                  */
17 /*   2. Any modifications must be clearly marked as such.            */
18 /*   3. Original authors' names are not deleted.                     */
19 /*   4. The authors' names are not used to endorse or promote        */
20 /*      products derived from this software without specific         */
21 /*      prior written permission.                                    */
22 /*                                                                   */
23 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK     */
24 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING  */
25 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN        */
26 /*  NO EVENT SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS   */
27 /*  BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES     */
28 /*  OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA       */
29 /*  OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR      */
30 /*  OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH      */
31 /*  THE USE OR PERFORMANCE OF THIS SOFTWARE.                         */
32 /*                                                                   */
33 /*********************************************************************/
34 /*             Author:  Alok Parlikar (aup@cs.cmu.edu)               */
35 /*               Date:  January 2013                                 */
36 /*********************************************************************/
37 /*
38   Interface to the MLSA code within HTS Engine API
39   (in the src/modules/hts_engine directory)
40 
41   Written for HTS Engine API 1.07 (Jan 2013)
42 */
43 
44 #include <sys/types.h>
45 
46 #include <EST_walloc.h>
47 #include <festival.h>
48 #include <HTS_hidden.h>
49 #include "./vc.h"
50 #include "./HTS_vocoder_me.h"
51 
52 
53 
54 DVECTOR synthesis_body(DMATRIX mcep,      // input mel-cep sequence
55                        DVECTOR f0v,       // input F0 sequence
56                        EST_Track *str,    // str for mixed excitation
57                        EST_Track *filtertrack,
58                        double framerate,  // sampling frequency (Hz)
59                        int framem,        // FFT length
60                        double alpha,
61                        double beta);
62 
mlsa_resynthesis(LISP ltrack,LISP strtrack,LISP filtrack)63 LISP mlsa_resynthesis(LISP ltrack, LISP strtrack, LISP filtrack) {
64   EST_Track *t;
65   EST_Track *str = 0;
66   EST_Track *filter_track = 0;
67   EST_Wave *wave = 0;
68 
69   DVECTOR w;
70   DMATRIX mcep;
71   DVECTOR f0v;
72 
73   int framerate = 16000;
74   int i, j;
75   int shift;
76   double ALPHA = 0.42;
77   double BETA = 0.0;
78 
79   LISP lispframerate = siod_get_lval("framerate",
80                                      NULL);
81   if (lispframerate == NIL) {
82     framerate = 16000;
83   } else {
84     framerate = FLONM(lispframerate);
85   }
86 
87   ALPHA = FLONM(siod_get_lval("mlsa_alpha_param",
88                               "mlsa: mlsa_alpha_param not set"));
89   BETA = FLONM(siod_get_lval("mlsa_beta_param",
90                              "mlsa: mlsa_beta_param not set"));
91 
92   if ((ltrack == NULL) ||
93       (TYPEP(ltrack, tc_string) &&
94        (streq(get_c_string(ltrack), "nil"))))
95     return siod(new EST_Wave(0, 1, framerate));
96 
97   t = track(ltrack);
98 
99   if (strtrack != NULL) {
100     /* We have to do mixed-excitation */
101     str = track(strtrack);
102   }
103 
104   if (filtrack != NULL) {
105     filter_track = track(filtrack);
106   }
107 
108   f0v = xdvalloc(t->num_frames());
109   mcep = xdmalloc(t->num_frames(), t->num_channels()-1);
110 
111   for (i = 0; i < t->num_frames(); i++) {
112     f0v->data[i] = t->a(i, 0);
113     for (j = 1; j < t->num_channels(); j++)
114       mcep->data[i][j-1] = t->a(i, j);
115   }
116 
117   if (t->num_frames() > 1) {
118     // Hacky way to get floats doing the right thing
119     // Seems to work for 16K and 48KHz -- aup
120 
121     shift = (100000.0 * t->t(1)) - (100000.0 * t->t(0));
122     shift = shift/10;
123     if (shift % 10 > 5)
124       shift = ceil(shift/10.0);
125     else
126       shift = floor(shift/10.0);
127   } else {
128     shift = 5.0;
129   }
130 
131   if (shift == 0)
132     shift = 5.0;
133 
134   w = synthesis_body(mcep, f0v, str, filter_track,
135                      framerate, shift, ALPHA, BETA);
136 
137   wave = new EST_Wave(w->length, 1, framerate);
138 
139   for (i = 0; i < w->length; i++)
140     wave->a(i) = (int16_t)w->data[i];  //NOLINT
141 
142   xdmfree(mcep);
143   xdvfree(f0v);
144   xdvfree(w);
145 
146   return siod(wave);
147 }
148 
synthesis_body(DMATRIX mcep,DVECTOR f0v,EST_Track * str,EST_Track * filter_track,double framerate,int framem,double alpha,double beta)149 DVECTOR synthesis_body(DMATRIX mcep,      // input mel-cep sequence
150                        DVECTOR f0v,       // input F0 sequence
151                        EST_Track *str,    // str for mixed excitation
152                        EST_Track *filter_track,  // Track for LPF or
153                                                  // Mixed Excitation
154                                                  // Filters
155                        double framerate,  // sampling frequency (Hz)
156                        int framem,     // FFT length
157                        double alpha,
158                        double beta) {
159   int64_t t, pos;
160   int framel;
161   double f0;
162   HTS_Vocoder v;
163   HTS_Vocoder_ME v_me;
164   DVECTOR xd = NODATA;
165 
166   HTS_Boolean use_log_gain = FALSE;
167 
168   size_t stage = 0;  // MGC or LSP
169 
170   size_t nlpf = 0;
171   double *lpf = NULL;
172 
173   double volume = 1.0;
174 
175   // Mixed Excitation Stuff
176   LISP filters;
177   LISP f;
178   int fl;
179   int i, j;
180   int me_num_filters = 0;
181   int me_filter_order = 0;
182   double **me_filter = NULL;
183   double *xp_sig = NULL;
184   double *xn_sig = NULL;
185   double *hp = NULL;
186   double *hn = NULL;
187   double *strengths;
188 
189   if (str == NULL) {
190     // Not using mixed excitation, so use the lpf that HTS uses
191     // in the SLT voice as of Jan 2013 -- aup
192 
193     // Check if LPF filter is to be applied to signal.
194     if (filter_track != NULL) {
195       if (filter_track->num_frames() != 1) {
196         printf("Warning: Wrong filter passed. Ignoring LPF\n");
197         printf("Expected single row for pulse-noise-excitation voice");
198       } else {
199         nlpf = (filter_track->num_channels() - 1)/2;
200         lpf = (double*) calloc(filter_track->num_channels(), sizeof(double));
201         for (i = 0; i < filter_track->num_channels(); i++) {
202           lpf[i] = filter_track->a(0,i);
203         }
204         //        printf("aup_debug: Using LPF filter of nlpf %d\n", nlpf);
205       }
206     }
207   }
208 
209   // floats may not do the right thing, but this seems to work
210   // for 16KHz and 48KHz -- aup
211   framel = framem * framerate/1000.0;
212 
213   if (str == NULL) {
214     // Not Mixed Excitation
215     HTS_Vocoder_initialize(&v, mcep->col - 1,
216                            stage, use_log_gain,
217                            framerate, framel);
218   } else {
219     // Mixed Excitation
220     if (filter_track != NULL) {
221       me_num_filters = filter_track->num_frames();
222       me_filter_order = filter_track->num_channels();
223       me_filter = walloc(double*, me_num_filters);
224 
225       for (i = 0; i < me_num_filters; i++) {
226         me_filter[i] = walloc(double, me_filter_order);
227         for (j = 0; j < me_filter_order; j++) {
228           me_filter[i][j] = filter_track->a(i, j);
229         }
230       }
231     } else {
232       printf("Warning: Attempting to use Mixed Excitation without Filters");
233     }
234 
235     xp_sig =  (double*) calloc(me_filter_order, sizeof(double));
236     xn_sig = (double*) calloc(me_filter_order, sizeof(double));
237     hp = (double*) calloc(me_filter_order, sizeof(double));
238     hn = (double*) calloc(me_filter_order, sizeof(double));
239 
240     v_me.v = &v;
241     HTS_Vocoder_initialize_me(&v_me,
242                               mcep->col -1,
243                               stage, use_log_gain,
244                               framerate, framel,
245                               me_num_filters,
246                               me_filter_order,
247                               me_filter,
248                               xp_sig, xn_sig, hp, hn);
249   }
250 
251   // synthesize waveforms by MLSA filter
252   xd = xdvalloc(mcep->row * (framel + 2));
253   for (t = 0, pos = 0; t < mcep->row; t++) {
254     if (t >= f0v->length)
255       f0 = LZERO;
256     else
257       f0 = f0v->data[t];
258 
259     if (f0 == 0)
260       f0 = LZERO;
261     else
262       f0 = log(f0);
263 
264     if (str == NULL) {
265       // Not Mixed Excitation
266       //      printf("aup_debug %d %d %d\n", t, pos, framel);
267       HTS_Vocoder_synthesize(&v, mcep->col - 1,
268                              f0, mcep->data[t],
269                              nlpf, lpf,
270                              alpha, beta, volume,
271                              &xd->data[pos], NULL);
272     } else {
273       strengths =  (double*) calloc(me_filter_order, sizeof(double));
274       for (i = 0; i < me_num_filters; i++) {
275         strengths[i] = str->a((int)t, i);
276       }
277       HTS_Vocoder_synthesize_me(&v_me, mcep->col - 1,
278                                 f0, mcep->data[t],
279                                 strengths,
280                                 nlpf, lpf,
281                                 alpha, beta, volume,
282                                 &xd->data[pos], NULL);
283       free(strengths);
284     }
285 
286     pos += framel;
287   }
288 
289   if (lpf != NULL)
290     free(lpf);
291 
292   if (str != NULL) {
293     // Mixed Excitation
294     free(xp_sig);
295     free(xn_sig);
296     free(hp);
297     free(hn);
298   }
299 
300   HTS_Vocoder_clear(&v);
301 
302   return xd;
303 }
304 
305