1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                       Copyright (c) 1996,1997                         */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*                      Author :  Alan W Black                           */
34 /*                      Date   :  December 1996                          */
35 /*-----------------------------------------------------------------------*/
36 /*   Optimal Coupling for two pieces of speech                           */
37 /*                                                                       */
38 /*   Given two Tracks find the best point (minimised weighted euclidean    */
39 /*   distance between two vectors).                                      */
40 /*                                                                       */
41 /*=======================================================================*/
42 #include <iostream>
43 #include "festival.h"
44 
// Field accessors for one diphone table entry, which is a LISP list laid
// out as (left_phone right_phone file_id start mid end).  Each accessor
// is the previous one applied to the cdr of its argument.
#define LEFT_PHONE(x)  (car(x))
#define RIGHT_PHONE(x) (LEFT_PHONE(cdr(x)))
#define FILEID(x)      (RIGHT_PHONE(cdr(x)))
#define START(x)       (FILEID(cdr(x)))
#define MID(x)         (START(cdr(x)))
#define END(x)         (MID(cdr(x)))
51 
52 static int get_track_for_phone(EST_Track &a,const EST_String &fileid,float st,float end);
53 static LISP before_ds(LISP d1, LISP ds);
54 static LISP after_ds(LISP d1, LISP ds);
55 static float find_best_left(LISP d,LISP ds,LISP weights);
56 static float find_best_right(LISP d,LISP ds,LISP weights);
57 static float frametoms(int frame,float frame_shift);
58 static int mstoframe(float ms,float frame_shift);
59 static float frame_distance(EST_Track &a, int fa,
60 			    EST_Track &b, int fb,
61 			    int size, double *weights);
62 
63 static EST_String coeffs_dir = "coeffs/";
64 static EST_String coeffs_ext = ".cep";
65 
find_optimal_coupling(LISP table,LISP weights)66 LISP find_optimal_coupling(LISP table, LISP weights)
67 {
68     // For each diphone description in table find the best overall
69     // join point between it and all other diphone sthat can join with it
70     LISP d,newtab,newent;
71     float best_left,best_right;
72 
73     newtab = NIL;
74     gc_protect(&newtab);
75     coeffs_dir = get_c_string(siod_get_lval("oc_coeffs_dir","no coeffs dir"));
76     coeffs_ext = get_c_string(siod_get_lval("oc_coeffs_ext","no coeffs ext"));
77 
78     for (d=table; d != NIL; d=cdr(d))
79     {
80 	pprint(car(d));
81 	if (ph_is_silence(get_c_string(LEFT_PHONE(car(d)))))
82 	    best_left = get_c_float(START(car(d)));
83 	else
84 	    best_left=find_best_left(car(d),before_ds(car(d),table),weights);
85 	if (ph_is_silence(get_c_string(RIGHT_PHONE(car(d)))))
86 	    best_right = get_c_float(END(car(d)));
87 	else
88 	    best_right=find_best_right(car(d),after_ds(car(d),table),weights);
89 	newent = cons(LEFT_PHONE(car(d)),              // left phone
90 		 cons(RIGHT_PHONE(car(d)),             // right phone
91 		 cons(FILEID(car(d)),                  // file_id
92    	         cons(flocons(best_left),              // left cut point
93 		 cons(MID(car(d)),                     // mid point
94 		 cons(flocons(best_right),NIL))))));   // right cut point
95 	newtab = cons(newent,newtab);
96     }
97     newtab = reverse(newtab);
98     gc_unprotect(&newtab);
99     return newtab;
100 }
101 
get_track_for_phone(EST_Track & a,const EST_String & fileid,float st,float end)102 static int get_track_for_phone(EST_Track &a, const EST_String &fileid, float st, float end)
103 {
104     // Load EST_Track from fileid from st to end (in ms)
105     EST_Track whole;
106     int start_frame, end_frame, i;
107 
108     if (whole.load(coeffs_dir+fileid+coeffs_ext) != 0)
109 	return -1;
110     start_frame = mstoframe(st-12.8,whole.shift());
111     end_frame = mstoframe(end-12.8,whole.shift())+1;
112     a.resize(end_frame-start_frame,whole.num_channels());
113     for (i=start_frame; i < end_frame; i++)
114 	a.copy_frame_in(i-start_frame,
115 			whole, i, 0,
116 			0, whole.num_channels());
117     a.fill_time(whole.shift());
118 
119     return 0;
120 }
121 
before_ds(LISP d1,LISP ds)122 static LISP before_ds(LISP d1, LISP ds)
123 {
124     // Return all entries in ds whose cadr equals d1's car
125     // i.e. all diphones which match d1's left phone
126     LISP m=NIL,l;
127 
128     for (l=ds; l != NIL; l=cdr(l))
129 	if (streq(get_c_string(car(d1)),get_c_string(car(cdr(car(l))))))
130 	    m=cons(car(l),m);
131     return m;
132 }
133 
after_ds(LISP d1,LISP ds)134 static LISP after_ds(LISP d1, LISP ds)
135 {
136     // Return all entries in ds whose car equals d1's cadr
137     // i..e all diphones which match d1's right phone
138     LISP m=NIL,l;
139 
140     for (l=ds; l != NIL; l=cdr(l))
141 	if (streq(get_c_string(car(cdr(d1))),get_c_string(car(car(l)))))
142 	    m=cons(car(l),m);
143     return m;
144 }
145 
find_best_left(LISP d,LISP ds,LISP weights)146 static float find_best_left(LISP d,LISP ds,LISP weights)
147 {
148     // Find the best join point with each of phones described
149     // in d
150     EST_Track a,b;
151     LISP l;
152     int i,j,best,bestj;;
153     double b_dist,dist;
154     float best_pos;
155 
156     get_track_for_phone(a,get_c_string(FILEID(d)),
157 			get_c_float(START(d)),get_c_float(MID(d)));
158 
159     // Cummulate the costs for each possible cut point
160     double *counts = new double[a.num_frames()];
161     for (i=0; i < a.num_frames(); i++)
162 	counts[i] = 0;
163     double *w = new double[siod_llength(weights)];
164     for (l=weights,i=0; i < siod_llength(weights); i++,l=cdr(l))
165 	w[i] = get_c_float(car(l));
166 
167     for (l=ds; l != NIL; l=cdr(l))
168     {   // for each matching phone
169 	get_track_for_phone(b,get_c_string(FILEID(car(l))),
170 			  get_c_float(MID(car(l))),get_c_float(END(car(l))));
171 	best=1;
172 
173 	b_dist = frame_distance(a, 1, b, 0, a.num_channels(),w);
174 
175 	for (i=1; i < a.num_frames()-1; i++)
176 	{
177 	    for (j=0; j < b.num_frames(); j++)
178 	    {
179 		dist = frame_distance(a, i, b, j, a.num_channels(),w);
180 		if (dist < b_dist)
181 		{
182 		    b_dist = dist;
183 		    best = i;
184 		    bestj = j;
185 		}
186 	    }
187 	}
188 	// You should probably find minimise the std
189 //	printf("best pos %d %s-%s %s-%s\n",best,
190 //	       get_c_string(LEFT_PHONE(d)),get_c_string(RIGHT_PHONE(d)),
191 //	       get_c_string(LEFT_PHONE(car(l))),get_c_string(RIGHT_PHONE(car(l))));
192 	counts[best] += 1;  // sum the best possible
193 //	counts[best] += b_dist;  // sum the best possible
194     }
195 
196     // Now find out the best position
197     best = 0;
198     for (i=0; i < a.num_frames(); i++)
199     {
200 	if (counts[i] > counts[best])
201 	    best = i;
202     }
203 
204     // Change frame number back to ms offset
205     best_pos = frametoms(mstoframe(get_c_float(START(d)),a.shift())
206 			 + best,a.shift());
207     delete counts;
208     delete w;
209     return best_pos;
210 }
211 
find_best_right(LISP d,LISP ds,LISP weights)212 static float find_best_right(LISP d,LISP ds,LISP weights)
213 {
214     // Find the best join point with each of phones described
215     // in d
216     EST_Track a,b;
217     LISP l;
218     int i,j,best,bestj;;
219     double b_dist,dist;
220     float best_pos;
221 
222     get_track_for_phone(a,get_c_string(FILEID(d)),
223 			get_c_float(MID(d)),get_c_float(END(d)));
224 
225     // Cummulate the costs for each possible cut point
226     double *counts = new double[a.num_frames()];
227     for (i=0; i < a.num_frames(); i++)
228 	counts[i] = 0;
229     double *w = new double[siod_llength(weights)];
230     for (l=weights,i=0; i < siod_llength(weights); i++,l=cdr(l))
231 	w[i] = get_c_float(car(l));
232 
233     for (l=ds; l != NIL; l=cdr(l))
234     {   // for each matching phone
235 	get_track_for_phone(b,get_c_string(FILEID(car(l))),
236 			    get_c_float(START(car(l))),
237 			    get_c_float(MID(car(l))));
238 	best=1;
239 	b_dist = frame_distance( a, 1,  b, 0, a.num_channels(),w);
240 	for (i=1; i < a.num_frames()-1; i++)
241 	{
242 	    for (j=0; j < b.num_frames(); j++)
243 	    {
244 		dist = frame_distance( a, i,  b, j, a.num_channels(),w);
245 		if (dist < b_dist)
246 		{
247 		    b_dist = dist;
248 		    best = i;
249 		    bestj = j;
250 		}
251 	    }
252 	}
253 	// You should probably find minimise the std
254 	counts[best] += 1;  // sum the best possible
255 //	counts[best] += b_dist;  // sum the best possible
256     }
257 
258     // Now find out the best position
259     best = 0;
260     for (i=0; i < a.num_frames(); i++)
261     {
262 	if (counts[i] > counts[best])
263 	    best = i;
264     }
265 
266     // Change frame number back to ms offset
267     best_pos = frametoms(mstoframe(get_c_float(MID(d)),a.shift())
268 			 + best,a.shift());
269     delete counts;
270     delete w;
271     return best_pos;
272 }
273 
static float frametoms(int frame,float frame_shift)
{
    // Convert a frame index to a time in milliseconds, given the frame
    // shift in seconds.
    const float seconds = frame*frame_shift;

    return seconds*1000.0;
}
278 
static int mstoframe(float ms,float frame_shift)
{
    // Convert a time in milliseconds to a frame index, given the frame
    // shift in seconds.  The result is truncated towards zero.
    const double seconds = ms/1000.0;

    return (int)(seconds/frame_shift);
}
283 
284 // RJC - change for Track reorg.
285 
frame_distance(EST_Track & a,int fa,EST_Track & b,int fb,int size,double * weights)286 static float frame_distance(EST_Track &a, int fa,
287 			    EST_Track &b, int fb,
288 			    int size, double *weights)
289 {
290     float cost = 0.0,diff;
291     int i;
292 
293     for (i=0; i < size; i++)
294     {
295 	if (weights[i] != 0.0)
296 	{
297 	    diff = (a(fa,i)-b(fb,i));
298 	    cost += diff*diff*weights[i];
299 	}
300     }
301 
302     return cost;
303 }
304 
305