1 /*
2  * 	epos/src/tdpsyn.cc
3  * 	(c) 2000-2002 Petr Horak, horak@petr.cz
4  * 	(c) 2001-2002 Jirka Hanika, geo@cuni.cz
5  *
6  *	tdpsyn version 2.5.0 (20.9.2002)
7  *
8     This program is free software; you can redistribute it and/or modify
9     it under the terms of the GNU General Public License as published by
10     the Free Software Foundation; either version 2 of the License, or
11     (at your option) any later version.
12 
13     This program is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16     GNU General Public License in doc/COPYING for more details.
17  *
18  *
19  */
20 
21 #include "epos.h"
22 #include "tdpsyn.h"
23 #include <math.h>
24 #include "endian_utils.h"
25 
26 #define MAX_STRETCH	30	/* stretching beyond 30 samples per OLA frame must be done through repeating frames */
27 #define MAX_OLA_FRAME	4096	/* sanity check only */
28 #define HAMMING_PRECISION 15	/* hamming coefficient precision in bits */
29 #define LP_F0_STEP 8			/* step of F0 analysis for linear prediction */
30 #define LP_DECIM 10				/* F0 analysis decimation coeff. for linear prediction */
31 #define LP_F0_ORD 4				/* order of F0 contour LP analysis */
32 #define F0_FILT_ORD 9			/* F0 contour filter order */
33 #define LP_EXC_MUL 1.0			/* LP excitation multipicator */
34 
35 #ifdef WIN32
36 #define snprintf _snprintf
37 #endif
38 
39 void tdioven(char *p, int l);
40 
41 /* F0 contour filter coefficients */
42 const double a[9] = {1,-6.46921563821389,18.43727805607084,-30.21344177474595,31.11962012720199,
43 					-20.62061607537661,8.58111044795433,-2.04983423923570,0.21516477151414};
44 
45 const double b[9] = {0.01477848982115,-0.08930749676388,0.25181616063735,-0.43840842338467,0.52230821454923,
46 					-0.43840842338467,0.25181616063735,-0.08930749676388,0.01477848982115};
47 
48 /* lp f0 contour filter coefficients (mean of 144 sentences from speaker Machac) */
49 // const double lp[LP_F0_ORD] = {-1.23761, 0.60009, -0.32046, 0.10699};
50 
51 // these coeffs are from the version where the f0 is constant inside a syllable
52 const double lp[LP_F0_ORD] = {-0.900693, 0.043125, -0.003700, 0.069916};
53 // FIXME! lp coefficients must be configurable
54 
55 /* Hamming coefficients for TD-PSOLA algorithm */
hamkoe(int winlen,uint16_t * data,int e,int e_base)56 int hamkoe(int winlen, uint16_t *data, int e, int e_base)
57 {
58 	int i;
59 	double fn;
60 	fn = 2 * pii / (winlen - 1);
61 	for (i=0; i < winlen; i++) {
62 		data[i] = (uint16_t)((0.53999 - 0.46 * cos(fn * i)) * e / e_base * (1 << HAMMING_PRECISION));
63 	}
64 	return 0;
65 }
66 
67 
68 #if 0
69 
70 int hamkoe(int winlen, double *data)
71 {
72 	D_PRINT(0, "%d\n", winlen);
73 	int i;
74 	double fn;
75 	fn = 2 * pii / (winlen - 1);
76 	for (i=0; i < winlen; i++)
77 		data[i] = 0.54 - 0.46 * cos(fn * i);
78 	return 0;
79 }
80 
81 int median(int prev, int curr, int next, int ibonus)
82 {
83 	int lm;
84 	if ((prev <= curr) == (curr <= next)) lm = curr;
85 	else if ((prev <= curr) == (next <= prev)) lm = prev;
86 	else lm = next;
87 	return (abs(curr - lm) > ibonus) ? lm : curr;
88 }
89 
90 #endif
91 
92 
93 /* Inventory file header structure */
94 struct tdi_hdr
95 {
96 	int32_t  magic;
97 	int32_t  samp_rate;
98 	int32_t  samp_size;
99 	int32_t  bufpos;
100 	int32_t  n_segs;
101 	int32_t  diph_offs;
102 	int32_t  diph_len;
103 	int32_t  res1;
104 	int32_t  res2;
105 	int32_t  ppulses;
106 	int32_t  res3;
107 	int32_t  res4;
108 };
109 
tdpsyn(voice * v)110 tdpsyn::tdpsyn(voice *v)
111 {
112 	tdi_hdr *hdr;
113 
114 	difpos = 0;
115 
116 	tdi = claim(v->models, v->location, scfg->inv_base_dir, "rb", "inventory", tdioven);
117 	hdr = (tdi_hdr *)tdi->data;
118 	D_PRINT(0, "Got %d and config says %d\n", hdr->n_segs, v->n_segs);
119 	if (v->n_segs != hdr->n_segs) shriek(463, "inconsistent n_segs");
120 	if (sizeof(SAMPLE) != hdr->samp_size) shriek(463, "inconsistent samp_size");
121 	tdp_buff = (SAMPLE *)(hdr + 1);
122 	diph_offs = (int *)((char *)tdp_buff + sizeof(SAMPLE) * hdr->bufpos);
123 	diph_len = diph_offs + v->n_segs;
124 	ppulses = diph_len + v->n_segs;
125 
126 	// this is debugging only!
127 	D_PRINT(0, "Samples are %d bytes long.\n", sizeof(SAMPLE) * hdr->bufpos);
128 
129 	/* allocate the maximum necessary space for Hamming windows: */
130 //	max_frame = 0;
131 //	for (int k = 0; k < v->n_segs; k++) {
132 //		int avpitch = average_pitch(diph_offs[k], diph_len[k]);
133 //		int maxwin = avpitch + MAX_STRETCH; //(int)(w->inv_samping_rate / 500);
134 //		if (max_frame < maxwin) max_frame = maxwin;
135 //	}
136 //	max_frame++;
137 //	if (max_frame >= MAX_OLA_FRAME || max_frame == 0) shriek(463, "Inconsistent OLA frame buffer size");
138 	//FIXME! max_frame=maxwin * (max_L - min_ori_L)
139 
140 	max_frame = MAX_OLA_FRAME;
141 
142 	wwin = (uint16_t *)xmalloc(sizeof(uint16_t) * (max_frame * 2));
143 	memset(wwin, 0, (max_frame * 2) * sizeof(*wwin));
144 
145 	out_buff = (SAMPLE *)xmalloc(sizeof(SAMPLE) * max_frame * 2);
146 	memset(out_buff, 0, max_frame * 2 * sizeof(*out_buff));
147 
148 	/* initialisation of lp prosody engine */
149 	if (v->lpcprosody) {
150 		int i;
151 		for (i = 0; i < LPC_PROS_ORDER; lpfilt[i++] = 0);
152 		for (i = 0; i < MAX_OFILT_ORDER; ofilt[i++] = 0);
153 		sigpos = 0;
154 		lppstep = LP_F0_STEP * v->inv_sampling_rate / 1000;
155 		lpestep = LP_DECIM * lppstep;
156 		basef0 = v->init_f;
157 		lppitch = v->inv_sampling_rate / basef0;;
158 	}
159 
160 	// init the smoothing filter
161 	if (v->f0_smoothing) {
162 		int i;
163 		for (i = 0; i < MAX_OFILT_ORDER; smoothfilt[i++] = 0);
164 		// CHECKME: the following two lines may be superfluous
165 		basef0 = v->init_f;
166 		lppitch = v->inv_sampling_rate / basef0;
167 	}
168 }
169 
~tdpsyn(void)170 tdpsyn::~tdpsyn(void)
171 {
172 	free(out_buff);
173 	free(wwin);
174 	unclaim(tdi);
175 }
176 
average_pitch(int offs,int len)177 inline int tdpsyn::average_pitch(int offs, int len)
178 {
179 	// const int npitch = 145;
180 	const int npitch = 580;
181 	int tmp;
182 
183 	int total = 0;
184 	int i = 0;
185 	for (int j = 0; j <= len + 1; j++) {
186 		tmp = ppulses[offs + j] - ppulses[offs + j - 1];
187 		if (tmp < 2 * npitch) {
188 			total += tmp;
189 			i++;
190 		}
191 	}
192 	if (i <= 0) {
193 		if (cfg->paranoid) shriek(463,"pitch marks not found");
194 		return 160;
195 	}
196 	return total / i;
197 }
198 
synseg(voice * v,segment d,wavefm * w)199 void tdpsyn::synseg(voice *v, segment d, wavefm *w)
200 {
201 	int i, j, k, l, m, slen, nlen, pitch, avpitch, origlen, newlen, maxwin, skip, reply, diflen;
202 	double outf0, synf0, exc;
203 	static double old_exc = 0;
204 	SAMPLE poms;
205 
206 	const int max_frame = this->max_frame;
207 	int pitch_saved = 0;
208 	int segment_pitch;
209 
210 	if (diph_len[d.code] == 0) {
211 		D_PRINT(2, "missing speech unit No: %d\n", d.code);
212 		if (!cfg->paranoid) return;
213 		shriek(463, "missing speech unit No: %d\n",d.code);
214 	}
215 
216 	/* lp prosody reconstruction filter excitation signal computing */
217 	if (v->lpcprosody) { 	// in d.f is excitation signal value
218 
219 	  // impuls at start of each sentence clears the lpc filter
220 		if (d.e >= v->init_i * 9) {
221 			for (i = 0; i < LPC_PROS_ORDER; lpfilt[i++] = 0);
222 			d.e = (d.e * 100 / v->init_i - 1000) * v->init_i / 100;
223 		}
224 
225 	  if (v->bang_nnet) {
226 	    // multiply with 13.4 if f0 filter is used
227 	    exc = ((float) d.f) / 1000;
228 	    // exc = (int) (((float) d.f) * 13.4);
229 
230 	    // delete excitation if it was the same (do not allow more than one exc per syllable)
231 	    if (exc == old_exc) {
232 	      exc = 0;
233 	    }
234 	    else {
235 	      old_exc = exc;
236 	    }
237 	    D_PRINT(0, "Version lpc with nnet, exc. is %f\n", exc);
238 	  }
239 	  else {
240 		exc = LP_EXC_MUL * (d.f - v->init_f);
241 	    D_PRINT(0, "Version lpc, exc is %f\n", exc);
242 	  }
243 
244 		pitch = lppitch;
245 
246 	}
247 	else {				// in d.f is f0 contour value
248 	pitch = v->inv_sampling_rate / d.f;
249 	  D_PRINT(0, "Version without lpc, pitch is %d\n", pitch);
250 	}
251 
252 	if (v->f0_smoothing) {
253 		segment_pitch = pitch; // remember pitch for whole segment
254 		if (lppitch > 0) {
255 			pitch = lppitch; // use pitch from last segment
256 		}
257 	}
258 
259 	slen = diph_len[d.code];
260 	avpitch = average_pitch(diph_offs[d.code], slen);
261 	maxwin = avpitch + MAX_STRETCH;
262 	maxwin = (pitch > maxwin) ? maxwin : pitch;
263 	if (maxwin >= max_frame) shriek(461, "pitch too large");
264 
265 	if (d.t > 0) origlen = avpitch * slen * d.t / 100; else origlen = avpitch * slen;
266 	newlen = pitch * slen;
267 	//diflen = (newlen - origlen) / slen;
268 	D_PRINT(0, "\navp=%d L=%d oril=%d newl=%d | %d (%d)\n",avpitch,pitch,origlen,newlen,diph_len[d.code],d.code);
269 	D_PRINT(0, "unit:%4d f=%3d i=%3d t=%3d - pitch=%d\n",d.code,d.f,d.e,d.t,pitch);
270 
271 	hamkoe(2 * maxwin + 1, wwin, d.e, 100);
272 	skip = 1; reply = 1;
273 	if (newlen > origlen) skip = newlen / origlen;
274 	if (origlen > newlen) reply = origlen / newlen;
275 	D_PRINT(0, "dlen=%d p:%d avp=%d oril=%d newl=%d difl=%d",diph_len[d.code],pitch,avpitch,origlen,newlen,diflen);
276 	nlen = slen - (skip - 1) * slen / skip + (reply - 1) * slen;
277 	if (nlen == 0) {
278 	  D_PRINT(0, "Error: Pitch modelling exceeds range!\n");
279 	  nlen = 1;
280 	}
281 	diflen = (newlen - origlen - (skip - 1) * slen * pitch / skip + (reply - 1) * slen * pitch) / nlen;
282 	D_PRINT(0, " -> diflen:%d sk:%d rp:%d\n",diflen,skip,reply);
283 	for (j = 1; j <= diph_len[d.code]; j += skip) for (k = 0; k < reply; k++) {
284 		memcpy(out_buff + max_frame - pitch, out_buff + max_frame, pitch * sizeof(*out_buff));
285 		memset(out_buff + max_frame, 0, max_frame * sizeof(*out_buff));
286 		for (i = -maxwin;i <= maxwin; i++) {
287 			int ttemp;
288 			poms = tdp_buff[i + ppulses[diph_offs[d.code] + j - 1]];
289 			ttemp = wwin[i + pitch];
290 			ttemp *= poms;
291 			ttemp = ttemp >> HAMMING_PRECISION;
292 			poms = (SAMPLE) ttemp;
293 			//			poms = (SAMPLE)(wwin[i + pitch] * poms >> HAMMING_PRECISION);
294 //			poms = poms * d.e / 100;
295 			out_buff[max_frame + i] += poms;
296 		}
297 
298 		/* lpc synthesis of F0 contour */
299 		if (v->lpcprosody) {
300 			synf0 = 0; outf0 = 0;
301 			for (l = 0; l < 2 * pitch; l++) {
302 				sigpos++;
303 				if (!(sigpos % lppstep)) {	// new pitch value into f0 output filter
304 					D_PRINT(0, "LPP position point %d, exc=%.2f synf0=%d otf0=%.2f L=%d\n",sigpos,exc,synf0,outf0,lppitch);
305 					synf0 = 0;
306 					if (!(sigpos % lpestep)) {	// new excitation value into recontruction filter
307 						D_PRINT(0, "   >> LPE position point %d (exc=%.4f) <<   \n",sigpos,exc);
308 						D_PRINT(0, "lp=[%.3f %.3f %.3f %.3f] lpfilt=[%.3f %.3f %.3f %.3f]\n",lp[0],lp[1],lp[2],lp[3],lpfilt[0],lpfilt[1],lpfilt[2],lpfilt[3]);
309 						synf0 = exc - lpfilt[0]*lp[0];
310 						exc = 0;
311 						for (m = LP_F0_ORD - 1; m > 0; m--) {
312 							synf0 -= lp[m] * lpfilt[m];
313 							lpfilt[m] = lpfilt[m-1];
314 						}
315 						lpfilt[0] = synf0;
316 						pitch_saved = 1;
317 						D_PRINT(0, "Unsmoothed synf0 is: %f\n", synf0);
318 					}
319 
320 					// skip f0 filter here
321 					if (! (v->bang_nnet)) {
322 					// if (1) {
323 					ofilt[0] = synf0;
324 					synf0 = 0;
325 					for (m = 1; m < F0_FILT_ORD; m++) ofilt[0] -= a[m] * ofilt[m];
326 					outf0 = 0;
327 					for (m = 0; m < F0_FILT_ORD; m++) outf0 += b[m] * ofilt[m];
328 					D_PRINT(0, "of=[%.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f]\n",ofilt[0],ofilt[1],
329 						ofilt[2],ofilt[3],ofilt[4],ofilt[5],ofilt[6],ofilt[7],ofilt[8]);
330 
331 					  // amplify the signal
332 					  // outf0 = outf0 * 13.4;
333 					}
334 					else {
335 					  outf0 = lpfilt[0];
336 					}
337 
338 					// apply the prosody factor
339 					// 1000 = prosody unchanged
340 					// > 1000 = prosody enhanced
341 					// < 1000 = prosody lowered
342 					outf0 = outf0 * ((double) cfg->pros_factor) / 1000.0;
343 
344 					// do remember the first output value from the lpc filter
345 					if (pitch_saved == 1) {
346 					  pitch_saved = 2;
347 					lppitch = (int)(v->inv_sampling_rate / (basef0 + outf0));
348 					}
349 					D_PRINT(0, "New values: outf0 : %f, lppitch : %d\n", outf0, lppitch);
350 					outf0 = 0;
351 					for (m = F0_FILT_ORD - 1; m > 0; m--) ofilt[m] = ofilt[m - 1];
352 				}
353 			}
354 		}
355 
356 		D_PRINT(0, "Pitch: %d\n", pitch);
357 
358 		w->sample((SAMPLE *)out_buff + max_frame - pitch, pitch);
359 		D_PRINT(0, "  j:%d difpos:%d diflen:%d",j,difpos,diflen);
360 		difpos += diflen;
361 		if (difpos < -pitch) {
362 			if (reply == 1) j--; else k--;
363 			difpos += pitch;
364 		}
365 		if (difpos > pitch) {
366 			if (reply == 1) j++;
367 			else if (k == reply - 1) { j++; k = 1; }
368 			else k++;
369 			difpos -= pitch;
370 		}
371 		D_PRINT(0, " -> j:%d difpos:%d\n",j,difpos);
372 
373 		// inserted to label f0 in wav file
374 		if (cfg->label_f0) {
375 
376 			D_PRINT(0, "Labelling file!\n");
377 
378 			char tmp[4];
379 			snprintf (tmp, 4, "%d", pitch);
380 			tmp[3] = 0;
381 			w->label(0, tmp, "pitch");
382 		}
383 
384 		// f0 smoothing, when chosen
385 		if (v->f0_smoothing) {
386 
387 			smoothfilt[0] = segment_pitch;
388 
389 			for (m = 1; m < F0_FILT_ORD; m++) {
390 				smoothfilt[0] -= a[m] * smoothfilt[m];
391 			}
392 
393 			lppitch = 0;
394 			for (m = 0; m < F0_FILT_ORD; m++) lppitch += b[m] * smoothfilt[m];
395 
396 			for (m = F0_FILT_ORD - 1; m > 0; m--) smoothfilt[m] = smoothfilt[m - 1];
397 
398 			if (lppitch > 0)
399 				pitch = lppitch;
400 
401 			slen = diph_len[d.code] - j + 1;
402 			avpitch = average_pitch(diph_offs[d.code] + j - 1, slen);
403 			maxwin = avpitch + MAX_STRETCH;
404 			maxwin = (pitch > maxwin) ? maxwin : pitch;
405 			if (maxwin >= max_frame) shriek(461, "pitch too large");
406 
407 			if (d.t > 0) origlen = avpitch * slen * d.t / 100; else origlen = avpitch * slen;
408 			newlen = pitch * slen;
409 
410 			hamkoe(2 * maxwin + 1, wwin, d.e, 100);
411 			skip = 1; reply = 1;
412 			if (newlen > origlen) skip = newlen / origlen;
413 			if (origlen > newlen) reply = origlen / newlen;
414 
415 			nlen = slen - (skip - 1) * slen / skip + (reply - 1) * slen;
416 			if (nlen == 0) {
417 				D_PRINT(0, "Error: Pitch modelling exceeds range!\n");
418 				nlen = 1;
419 			}
420 			diflen = (newlen - origlen - (skip - 1) * slen * pitch / skip + (reply - 1) * slen * pitch) / nlen;
421 		}
422 	}
423 }
424 
425 
tdioven(char * p,int l)426 void tdioven(char *p, int l){
427 	if (!scfg->_big_endian)
428 		return;
429 	/* Convert the header */
430 	tdi_hdr* hdr = (tdi_hdr*)p;
431 	hdr->samp_rate = from_le32s(hdr->samp_rate);
432 	hdr->samp_size = from_le32s(hdr->samp_size);
433 	hdr->bufpos = from_le32s(hdr->bufpos);
434 	hdr->n_segs = from_le32s(hdr->n_segs);
435 	hdr->diph_offs = from_le32s(hdr->diph_offs);
436 	hdr->diph_len = from_le32s(hdr->diph_len);
437 	hdr->res1 = from_le32s(hdr->res1);
438 	hdr->res2 = from_le32s(hdr->res2);
439 	hdr->ppulses = from_le32s(hdr->ppulses);
440 	hdr->res3 = from_le32s(hdr->res3);
441 	hdr->res4 = from_le32s(hdr->res4);
442 
443 	/* Convert the buffer */
444 	SAMPLE *bufS = (SAMPLE *)(hdr + 1);
445 	uint32_t *bufL = (uint32_t *)(bufS + hdr->bufpos);
446 	char *stop = p + l;
447 
448 	for ( ; bufS < (SAMPLE *)bufL; bufS++)
449 		*bufS = from_le16s(*bufS);
450 
451 	for ( ; bufL < (uint32_t*)stop; bufL++)
452 		*bufL = from_le32u(*bufL);
453 }
454 
455