1 /*
2 * epos/src/tdpsyn.cc
3 * (c) 2000-2002 Petr Horak, horak@petr.cz
4 * (c) 2001-2002 Jirka Hanika, geo@cuni.cz
5 *
6 * tdpsyn version 2.5.0 (20.9.2002)
7 *
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License in doc/COPYING for more details.
17 *
18 *
19 */
20
21 #include "epos.h"
22 #include "tdpsyn.h"
23 #include <math.h>
24 #include "endian_utils.h"
25
26 #define MAX_STRETCH 30 /* stretching beyond 30 samples per OLA frame must be done through repeating frames */
27 #define MAX_OLA_FRAME 4096 /* sanity check only */
28 #define HAMMING_PRECISION 15 /* hamming coefficient precision in bits */
29 #define LP_F0_STEP 8 /* step of F0 analysis for linear prediction */
30 #define LP_DECIM 10 /* F0 analysis decimation coeff. for linear prediction */
31 #define LP_F0_ORD 4 /* order of F0 contour LP analysis */
32 #define F0_FILT_ORD 9 /* F0 contour filter order */
33 #define LP_EXC_MUL 1.0 /* LP excitation multipicator */
34
35 #ifdef WIN32
36 #define snprintf _snprintf
37 #endif
38
39 void tdioven(char *p, int l);
40
41 /* F0 contour filter coefficients */
42 const double a[9] = {1,-6.46921563821389,18.43727805607084,-30.21344177474595,31.11962012720199,
43 -20.62061607537661,8.58111044795433,-2.04983423923570,0.21516477151414};
44
45 const double b[9] = {0.01477848982115,-0.08930749676388,0.25181616063735,-0.43840842338467,0.52230821454923,
46 -0.43840842338467,0.25181616063735,-0.08930749676388,0.01477848982115};
47
48 /* lp f0 contour filter coefficients (mean of 144 sentences from speaker Machac) */
49 // const double lp[LP_F0_ORD] = {-1.23761, 0.60009, -0.32046, 0.10699};
50
51 // these coeffs are from the version where the f0 is constant inside a syllable
52 const double lp[LP_F0_ORD] = {-0.900693, 0.043125, -0.003700, 0.069916};
53 // FIXME! lp coefficients must be configurable
54
55 /* Hamming coefficients for TD-PSOLA algorithm */
hamkoe(int winlen,uint16_t * data,int e,int e_base)56 int hamkoe(int winlen, uint16_t *data, int e, int e_base)
57 {
58 int i;
59 double fn;
60 fn = 2 * pii / (winlen - 1);
61 for (i=0; i < winlen; i++) {
62 data[i] = (uint16_t)((0.53999 - 0.46 * cos(fn * i)) * e / e_base * (1 << HAMMING_PRECISION));
63 }
64 return 0;
65 }
66
67
68 #if 0
69
70 int hamkoe(int winlen, double *data)
71 {
72 D_PRINT(0, "%d\n", winlen);
73 int i;
74 double fn;
75 fn = 2 * pii / (winlen - 1);
76 for (i=0; i < winlen; i++)
77 data[i] = 0.54 - 0.46 * cos(fn * i);
78 return 0;
79 }
80
81 int median(int prev, int curr, int next, int ibonus)
82 {
83 int lm;
84 if ((prev <= curr) == (curr <= next)) lm = curr;
85 else if ((prev <= curr) == (next <= prev)) lm = prev;
86 else lm = next;
87 return (abs(curr - lm) > ibonus) ? lm : curr;
88 }
89
90 #endif
91
92
93 /* Inventory file header structure */
94 struct tdi_hdr
95 {
96 int32_t magic;
97 int32_t samp_rate;
98 int32_t samp_size;
99 int32_t bufpos;
100 int32_t n_segs;
101 int32_t diph_offs;
102 int32_t diph_len;
103 int32_t res1;
104 int32_t res2;
105 int32_t ppulses;
106 int32_t res3;
107 int32_t res4;
108 };
109
tdpsyn(voice * v)110 tdpsyn::tdpsyn(voice *v)
111 {
112 tdi_hdr *hdr;
113
114 difpos = 0;
115
116 tdi = claim(v->models, v->location, scfg->inv_base_dir, "rb", "inventory", tdioven);
117 hdr = (tdi_hdr *)tdi->data;
118 D_PRINT(0, "Got %d and config says %d\n", hdr->n_segs, v->n_segs);
119 if (v->n_segs != hdr->n_segs) shriek(463, "inconsistent n_segs");
120 if (sizeof(SAMPLE) != hdr->samp_size) shriek(463, "inconsistent samp_size");
121 tdp_buff = (SAMPLE *)(hdr + 1);
122 diph_offs = (int *)((char *)tdp_buff + sizeof(SAMPLE) * hdr->bufpos);
123 diph_len = diph_offs + v->n_segs;
124 ppulses = diph_len + v->n_segs;
125
126 // this is debugging only!
127 D_PRINT(0, "Samples are %d bytes long.\n", sizeof(SAMPLE) * hdr->bufpos);
128
129 /* allocate the maximum necessary space for Hamming windows: */
130 // max_frame = 0;
131 // for (int k = 0; k < v->n_segs; k++) {
132 // int avpitch = average_pitch(diph_offs[k], diph_len[k]);
133 // int maxwin = avpitch + MAX_STRETCH; //(int)(w->inv_samping_rate / 500);
134 // if (max_frame < maxwin) max_frame = maxwin;
135 // }
136 // max_frame++;
137 // if (max_frame >= MAX_OLA_FRAME || max_frame == 0) shriek(463, "Inconsistent OLA frame buffer size");
138 //FIXME! max_frame=maxwin * (max_L - min_ori_L)
139
140 max_frame = MAX_OLA_FRAME;
141
142 wwin = (uint16_t *)xmalloc(sizeof(uint16_t) * (max_frame * 2));
143 memset(wwin, 0, (max_frame * 2) * sizeof(*wwin));
144
145 out_buff = (SAMPLE *)xmalloc(sizeof(SAMPLE) * max_frame * 2);
146 memset(out_buff, 0, max_frame * 2 * sizeof(*out_buff));
147
148 /* initialisation of lp prosody engine */
149 if (v->lpcprosody) {
150 int i;
151 for (i = 0; i < LPC_PROS_ORDER; lpfilt[i++] = 0);
152 for (i = 0; i < MAX_OFILT_ORDER; ofilt[i++] = 0);
153 sigpos = 0;
154 lppstep = LP_F0_STEP * v->inv_sampling_rate / 1000;
155 lpestep = LP_DECIM * lppstep;
156 basef0 = v->init_f;
157 lppitch = v->inv_sampling_rate / basef0;;
158 }
159
160 // init the smoothing filter
161 if (v->f0_smoothing) {
162 int i;
163 for (i = 0; i < MAX_OFILT_ORDER; smoothfilt[i++] = 0);
164 // CHECKME: the following two lines may be superfluous
165 basef0 = v->init_f;
166 lppitch = v->inv_sampling_rate / basef0;
167 }
168 }
169
~tdpsyn(void)170 tdpsyn::~tdpsyn(void)
171 {
172 free(out_buff);
173 free(wwin);
174 unclaim(tdi);
175 }
176
average_pitch(int offs,int len)177 inline int tdpsyn::average_pitch(int offs, int len)
178 {
179 // const int npitch = 145;
180 const int npitch = 580;
181 int tmp;
182
183 int total = 0;
184 int i = 0;
185 for (int j = 0; j <= len + 1; j++) {
186 tmp = ppulses[offs + j] - ppulses[offs + j - 1];
187 if (tmp < 2 * npitch) {
188 total += tmp;
189 i++;
190 }
191 }
192 if (i <= 0) {
193 if (cfg->paranoid) shriek(463,"pitch marks not found");
194 return 160;
195 }
196 return total / i;
197 }
198
synseg(voice * v,segment d,wavefm * w)199 void tdpsyn::synseg(voice *v, segment d, wavefm *w)
200 {
201 int i, j, k, l, m, slen, nlen, pitch, avpitch, origlen, newlen, maxwin, skip, reply, diflen;
202 double outf0, synf0, exc;
203 static double old_exc = 0;
204 SAMPLE poms;
205
206 const int max_frame = this->max_frame;
207 int pitch_saved = 0;
208 int segment_pitch;
209
210 if (diph_len[d.code] == 0) {
211 D_PRINT(2, "missing speech unit No: %d\n", d.code);
212 if (!cfg->paranoid) return;
213 shriek(463, "missing speech unit No: %d\n",d.code);
214 }
215
216 /* lp prosody reconstruction filter excitation signal computing */
217 if (v->lpcprosody) { // in d.f is excitation signal value
218
219 // impuls at start of each sentence clears the lpc filter
220 if (d.e >= v->init_i * 9) {
221 for (i = 0; i < LPC_PROS_ORDER; lpfilt[i++] = 0);
222 d.e = (d.e * 100 / v->init_i - 1000) * v->init_i / 100;
223 }
224
225 if (v->bang_nnet) {
226 // multiply with 13.4 if f0 filter is used
227 exc = ((float) d.f) / 1000;
228 // exc = (int) (((float) d.f) * 13.4);
229
230 // delete excitation if it was the same (do not allow more than one exc per syllable)
231 if (exc == old_exc) {
232 exc = 0;
233 }
234 else {
235 old_exc = exc;
236 }
237 D_PRINT(0, "Version lpc with nnet, exc. is %f\n", exc);
238 }
239 else {
240 exc = LP_EXC_MUL * (d.f - v->init_f);
241 D_PRINT(0, "Version lpc, exc is %f\n", exc);
242 }
243
244 pitch = lppitch;
245
246 }
247 else { // in d.f is f0 contour value
248 pitch = v->inv_sampling_rate / d.f;
249 D_PRINT(0, "Version without lpc, pitch is %d\n", pitch);
250 }
251
252 if (v->f0_smoothing) {
253 segment_pitch = pitch; // remember pitch for whole segment
254 if (lppitch > 0) {
255 pitch = lppitch; // use pitch from last segment
256 }
257 }
258
259 slen = diph_len[d.code];
260 avpitch = average_pitch(diph_offs[d.code], slen);
261 maxwin = avpitch + MAX_STRETCH;
262 maxwin = (pitch > maxwin) ? maxwin : pitch;
263 if (maxwin >= max_frame) shriek(461, "pitch too large");
264
265 if (d.t > 0) origlen = avpitch * slen * d.t / 100; else origlen = avpitch * slen;
266 newlen = pitch * slen;
267 //diflen = (newlen - origlen) / slen;
268 D_PRINT(0, "\navp=%d L=%d oril=%d newl=%d | %d (%d)\n",avpitch,pitch,origlen,newlen,diph_len[d.code],d.code);
269 D_PRINT(0, "unit:%4d f=%3d i=%3d t=%3d - pitch=%d\n",d.code,d.f,d.e,d.t,pitch);
270
271 hamkoe(2 * maxwin + 1, wwin, d.e, 100);
272 skip = 1; reply = 1;
273 if (newlen > origlen) skip = newlen / origlen;
274 if (origlen > newlen) reply = origlen / newlen;
275 D_PRINT(0, "dlen=%d p:%d avp=%d oril=%d newl=%d difl=%d",diph_len[d.code],pitch,avpitch,origlen,newlen,diflen);
276 nlen = slen - (skip - 1) * slen / skip + (reply - 1) * slen;
277 if (nlen == 0) {
278 D_PRINT(0, "Error: Pitch modelling exceeds range!\n");
279 nlen = 1;
280 }
281 diflen = (newlen - origlen - (skip - 1) * slen * pitch / skip + (reply - 1) * slen * pitch) / nlen;
282 D_PRINT(0, " -> diflen:%d sk:%d rp:%d\n",diflen,skip,reply);
283 for (j = 1; j <= diph_len[d.code]; j += skip) for (k = 0; k < reply; k++) {
284 memcpy(out_buff + max_frame - pitch, out_buff + max_frame, pitch * sizeof(*out_buff));
285 memset(out_buff + max_frame, 0, max_frame * sizeof(*out_buff));
286 for (i = -maxwin;i <= maxwin; i++) {
287 int ttemp;
288 poms = tdp_buff[i + ppulses[diph_offs[d.code] + j - 1]];
289 ttemp = wwin[i + pitch];
290 ttemp *= poms;
291 ttemp = ttemp >> HAMMING_PRECISION;
292 poms = (SAMPLE) ttemp;
293 // poms = (SAMPLE)(wwin[i + pitch] * poms >> HAMMING_PRECISION);
294 // poms = poms * d.e / 100;
295 out_buff[max_frame + i] += poms;
296 }
297
298 /* lpc synthesis of F0 contour */
299 if (v->lpcprosody) {
300 synf0 = 0; outf0 = 0;
301 for (l = 0; l < 2 * pitch; l++) {
302 sigpos++;
303 if (!(sigpos % lppstep)) { // new pitch value into f0 output filter
304 D_PRINT(0, "LPP position point %d, exc=%.2f synf0=%d otf0=%.2f L=%d\n",sigpos,exc,synf0,outf0,lppitch);
305 synf0 = 0;
306 if (!(sigpos % lpestep)) { // new excitation value into recontruction filter
307 D_PRINT(0, " >> LPE position point %d (exc=%.4f) << \n",sigpos,exc);
308 D_PRINT(0, "lp=[%.3f %.3f %.3f %.3f] lpfilt=[%.3f %.3f %.3f %.3f]\n",lp[0],lp[1],lp[2],lp[3],lpfilt[0],lpfilt[1],lpfilt[2],lpfilt[3]);
309 synf0 = exc - lpfilt[0]*lp[0];
310 exc = 0;
311 for (m = LP_F0_ORD - 1; m > 0; m--) {
312 synf0 -= lp[m] * lpfilt[m];
313 lpfilt[m] = lpfilt[m-1];
314 }
315 lpfilt[0] = synf0;
316 pitch_saved = 1;
317 D_PRINT(0, "Unsmoothed synf0 is: %f\n", synf0);
318 }
319
320 // skip f0 filter here
321 if (! (v->bang_nnet)) {
322 // if (1) {
323 ofilt[0] = synf0;
324 synf0 = 0;
325 for (m = 1; m < F0_FILT_ORD; m++) ofilt[0] -= a[m] * ofilt[m];
326 outf0 = 0;
327 for (m = 0; m < F0_FILT_ORD; m++) outf0 += b[m] * ofilt[m];
328 D_PRINT(0, "of=[%.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f]\n",ofilt[0],ofilt[1],
329 ofilt[2],ofilt[3],ofilt[4],ofilt[5],ofilt[6],ofilt[7],ofilt[8]);
330
331 // amplify the signal
332 // outf0 = outf0 * 13.4;
333 }
334 else {
335 outf0 = lpfilt[0];
336 }
337
338 // apply the prosody factor
339 // 1000 = prosody unchanged
340 // > 1000 = prosody enhanced
341 // < 1000 = prosody lowered
342 outf0 = outf0 * ((double) cfg->pros_factor) / 1000.0;
343
344 // do remember the first output value from the lpc filter
345 if (pitch_saved == 1) {
346 pitch_saved = 2;
347 lppitch = (int)(v->inv_sampling_rate / (basef0 + outf0));
348 }
349 D_PRINT(0, "New values: outf0 : %f, lppitch : %d\n", outf0, lppitch);
350 outf0 = 0;
351 for (m = F0_FILT_ORD - 1; m > 0; m--) ofilt[m] = ofilt[m - 1];
352 }
353 }
354 }
355
356 D_PRINT(0, "Pitch: %d\n", pitch);
357
358 w->sample((SAMPLE *)out_buff + max_frame - pitch, pitch);
359 D_PRINT(0, " j:%d difpos:%d diflen:%d",j,difpos,diflen);
360 difpos += diflen;
361 if (difpos < -pitch) {
362 if (reply == 1) j--; else k--;
363 difpos += pitch;
364 }
365 if (difpos > pitch) {
366 if (reply == 1) j++;
367 else if (k == reply - 1) { j++; k = 1; }
368 else k++;
369 difpos -= pitch;
370 }
371 D_PRINT(0, " -> j:%d difpos:%d\n",j,difpos);
372
373 // inserted to label f0 in wav file
374 if (cfg->label_f0) {
375
376 D_PRINT(0, "Labelling file!\n");
377
378 char tmp[4];
379 snprintf (tmp, 4, "%d", pitch);
380 tmp[3] = 0;
381 w->label(0, tmp, "pitch");
382 }
383
384 // f0 smoothing, when chosen
385 if (v->f0_smoothing) {
386
387 smoothfilt[0] = segment_pitch;
388
389 for (m = 1; m < F0_FILT_ORD; m++) {
390 smoothfilt[0] -= a[m] * smoothfilt[m];
391 }
392
393 lppitch = 0;
394 for (m = 0; m < F0_FILT_ORD; m++) lppitch += b[m] * smoothfilt[m];
395
396 for (m = F0_FILT_ORD - 1; m > 0; m--) smoothfilt[m] = smoothfilt[m - 1];
397
398 if (lppitch > 0)
399 pitch = lppitch;
400
401 slen = diph_len[d.code] - j + 1;
402 avpitch = average_pitch(diph_offs[d.code] + j - 1, slen);
403 maxwin = avpitch + MAX_STRETCH;
404 maxwin = (pitch > maxwin) ? maxwin : pitch;
405 if (maxwin >= max_frame) shriek(461, "pitch too large");
406
407 if (d.t > 0) origlen = avpitch * slen * d.t / 100; else origlen = avpitch * slen;
408 newlen = pitch * slen;
409
410 hamkoe(2 * maxwin + 1, wwin, d.e, 100);
411 skip = 1; reply = 1;
412 if (newlen > origlen) skip = newlen / origlen;
413 if (origlen > newlen) reply = origlen / newlen;
414
415 nlen = slen - (skip - 1) * slen / skip + (reply - 1) * slen;
416 if (nlen == 0) {
417 D_PRINT(0, "Error: Pitch modelling exceeds range!\n");
418 nlen = 1;
419 }
420 diflen = (newlen - origlen - (skip - 1) * slen * pitch / skip + (reply - 1) * slen * pitch) / nlen;
421 }
422 }
423 }
424
425
tdioven(char * p,int l)426 void tdioven(char *p, int l){
427 if (!scfg->_big_endian)
428 return;
429 /* Convert the header */
430 tdi_hdr* hdr = (tdi_hdr*)p;
431 hdr->samp_rate = from_le32s(hdr->samp_rate);
432 hdr->samp_size = from_le32s(hdr->samp_size);
433 hdr->bufpos = from_le32s(hdr->bufpos);
434 hdr->n_segs = from_le32s(hdr->n_segs);
435 hdr->diph_offs = from_le32s(hdr->diph_offs);
436 hdr->diph_len = from_le32s(hdr->diph_len);
437 hdr->res1 = from_le32s(hdr->res1);
438 hdr->res2 = from_le32s(hdr->res2);
439 hdr->ppulses = from_le32s(hdr->ppulses);
440 hdr->res3 = from_le32s(hdr->res3);
441 hdr->res4 = from_le32s(hdr->res4);
442
443 /* Convert the buffer */
444 SAMPLE *bufS = (SAMPLE *)(hdr + 1);
445 uint32_t *bufL = (uint32_t *)(bufS + hdr->bufpos);
446 char *stop = p + l;
447
448 for ( ; bufS < (SAMPLE *)bufL; bufS++)
449 *bufS = from_le16s(*bufS);
450
451 for ( ; bufL < (uint32_t*)stop; bufL++)
452 *bufL = from_le32u(*bufL);
453 }
454
455