1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : December 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* Optimal Coupling for two pieces of speech */
37 /* */
38 /* Given two Tracks find the best point (minimised weighted euclidean */
39 /* distance between two vectors). */
40 /* */
41 /*=======================================================================*/
42 #include <iostream>
43 #include "festival.h"
44
// Field accessors for one diphone table entry.  An entry is the list
//     (left_phone right_phone fileid start mid end)
// and each accessor picks the element one position further down the
// list than the accessor before it.
#define LEFT_PHONE(x)  (car(x))
#define RIGHT_PHONE(x) (LEFT_PHONE(cdr(x)))
#define FILEID(x)      (RIGHT_PHONE(cdr(x)))
#define START(x)       (FILEID(cdr(x)))
#define MID(x)         (START(cdr(x)))
#define END(x)         (MID(cdr(x)))
51
52 static int get_track_for_phone(EST_Track &a,const EST_String &fileid,float st,float end);
53 static LISP before_ds(LISP d1, LISP ds);
54 static LISP after_ds(LISP d1, LISP ds);
55 static float find_best_left(LISP d,LISP ds,LISP weights);
56 static float find_best_right(LISP d,LISP ds,LISP weights);
57 static float frametoms(int frame,float frame_shift);
58 static int mstoframe(float ms,float frame_shift);
59 static float frame_distance(EST_Track &a, int fa,
60 EST_Track &b, int fb,
61 int size, double *weights);
62
63 static EST_String coeffs_dir = "coeffs/";
64 static EST_String coeffs_ext = ".cep";
65
find_optimal_coupling(LISP table,LISP weights)66 LISP find_optimal_coupling(LISP table, LISP weights)
67 {
68 // For each diphone description in table find the best overall
69 // join point between it and all other diphone sthat can join with it
70 LISP d,newtab,newent;
71 float best_left,best_right;
72
73 newtab = NIL;
74 gc_protect(&newtab);
75 coeffs_dir = get_c_string(siod_get_lval("oc_coeffs_dir","no coeffs dir"));
76 coeffs_ext = get_c_string(siod_get_lval("oc_coeffs_ext","no coeffs ext"));
77
78 for (d=table; d != NIL; d=cdr(d))
79 {
80 pprint(car(d));
81 if (ph_is_silence(get_c_string(LEFT_PHONE(car(d)))))
82 best_left = get_c_float(START(car(d)));
83 else
84 best_left=find_best_left(car(d),before_ds(car(d),table),weights);
85 if (ph_is_silence(get_c_string(RIGHT_PHONE(car(d)))))
86 best_right = get_c_float(END(car(d)));
87 else
88 best_right=find_best_right(car(d),after_ds(car(d),table),weights);
89 newent = cons(LEFT_PHONE(car(d)), // left phone
90 cons(RIGHT_PHONE(car(d)), // right phone
91 cons(FILEID(car(d)), // file_id
92 cons(flocons(best_left), // left cut point
93 cons(MID(car(d)), // mid point
94 cons(flocons(best_right),NIL)))))); // right cut point
95 newtab = cons(newent,newtab);
96 }
97 newtab = reverse(newtab);
98 gc_unprotect(&newtab);
99 return newtab;
100 }
101
get_track_for_phone(EST_Track & a,const EST_String & fileid,float st,float end)102 static int get_track_for_phone(EST_Track &a, const EST_String &fileid, float st, float end)
103 {
104 // Load EST_Track from fileid from st to end (in ms)
105 EST_Track whole;
106 int start_frame, end_frame, i;
107
108 if (whole.load(coeffs_dir+fileid+coeffs_ext) != 0)
109 return -1;
110 start_frame = mstoframe(st-12.8,whole.shift());
111 end_frame = mstoframe(end-12.8,whole.shift())+1;
112 a.resize(end_frame-start_frame,whole.num_channels());
113 for (i=start_frame; i < end_frame; i++)
114 a.copy_frame_in(i-start_frame,
115 whole, i, 0,
116 0, whole.num_channels());
117 a.fill_time(whole.shift());
118
119 return 0;
120 }
121
before_ds(LISP d1,LISP ds)122 static LISP before_ds(LISP d1, LISP ds)
123 {
124 // Return all entries in ds whose cadr equals d1's car
125 // i.e. all diphones which match d1's left phone
126 LISP m=NIL,l;
127
128 for (l=ds; l != NIL; l=cdr(l))
129 if (streq(get_c_string(car(d1)),get_c_string(car(cdr(car(l))))))
130 m=cons(car(l),m);
131 return m;
132 }
133
after_ds(LISP d1,LISP ds)134 static LISP after_ds(LISP d1, LISP ds)
135 {
136 // Return all entries in ds whose car equals d1's cadr
137 // i..e all diphones which match d1's right phone
138 LISP m=NIL,l;
139
140 for (l=ds; l != NIL; l=cdr(l))
141 if (streq(get_c_string(car(cdr(d1))),get_c_string(car(car(l)))))
142 m=cons(car(l),m);
143 return m;
144 }
145
find_best_left(LISP d,LISP ds,LISP weights)146 static float find_best_left(LISP d,LISP ds,LISP weights)
147 {
148 // Find the best join point with each of phones described
149 // in d
150 EST_Track a,b;
151 LISP l;
152 int i,j,best,bestj;;
153 double b_dist,dist;
154 float best_pos;
155
156 get_track_for_phone(a,get_c_string(FILEID(d)),
157 get_c_float(START(d)),get_c_float(MID(d)));
158
159 // Cummulate the costs for each possible cut point
160 double *counts = new double[a.num_frames()];
161 for (i=0; i < a.num_frames(); i++)
162 counts[i] = 0;
163 double *w = new double[siod_llength(weights)];
164 for (l=weights,i=0; i < siod_llength(weights); i++,l=cdr(l))
165 w[i] = get_c_float(car(l));
166
167 for (l=ds; l != NIL; l=cdr(l))
168 { // for each matching phone
169 get_track_for_phone(b,get_c_string(FILEID(car(l))),
170 get_c_float(MID(car(l))),get_c_float(END(car(l))));
171 best=1;
172
173 b_dist = frame_distance(a, 1, b, 0, a.num_channels(),w);
174
175 for (i=1; i < a.num_frames()-1; i++)
176 {
177 for (j=0; j < b.num_frames(); j++)
178 {
179 dist = frame_distance(a, i, b, j, a.num_channels(),w);
180 if (dist < b_dist)
181 {
182 b_dist = dist;
183 best = i;
184 bestj = j;
185 }
186 }
187 }
188 // You should probably find minimise the std
189 // printf("best pos %d %s-%s %s-%s\n",best,
190 // get_c_string(LEFT_PHONE(d)),get_c_string(RIGHT_PHONE(d)),
191 // get_c_string(LEFT_PHONE(car(l))),get_c_string(RIGHT_PHONE(car(l))));
192 counts[best] += 1; // sum the best possible
193 // counts[best] += b_dist; // sum the best possible
194 }
195
196 // Now find out the best position
197 best = 0;
198 for (i=0; i < a.num_frames(); i++)
199 {
200 if (counts[i] > counts[best])
201 best = i;
202 }
203
204 // Change frame number back to ms offset
205 best_pos = frametoms(mstoframe(get_c_float(START(d)),a.shift())
206 + best,a.shift());
207 delete counts;
208 delete w;
209 return best_pos;
210 }
211
find_best_right(LISP d,LISP ds,LISP weights)212 static float find_best_right(LISP d,LISP ds,LISP weights)
213 {
214 // Find the best join point with each of phones described
215 // in d
216 EST_Track a,b;
217 LISP l;
218 int i,j,best,bestj;;
219 double b_dist,dist;
220 float best_pos;
221
222 get_track_for_phone(a,get_c_string(FILEID(d)),
223 get_c_float(MID(d)),get_c_float(END(d)));
224
225 // Cummulate the costs for each possible cut point
226 double *counts = new double[a.num_frames()];
227 for (i=0; i < a.num_frames(); i++)
228 counts[i] = 0;
229 double *w = new double[siod_llength(weights)];
230 for (l=weights,i=0; i < siod_llength(weights); i++,l=cdr(l))
231 w[i] = get_c_float(car(l));
232
233 for (l=ds; l != NIL; l=cdr(l))
234 { // for each matching phone
235 get_track_for_phone(b,get_c_string(FILEID(car(l))),
236 get_c_float(START(car(l))),
237 get_c_float(MID(car(l))));
238 best=1;
239 b_dist = frame_distance( a, 1, b, 0, a.num_channels(),w);
240 for (i=1; i < a.num_frames()-1; i++)
241 {
242 for (j=0; j < b.num_frames(); j++)
243 {
244 dist = frame_distance( a, i, b, j, a.num_channels(),w);
245 if (dist < b_dist)
246 {
247 b_dist = dist;
248 best = i;
249 bestj = j;
250 }
251 }
252 }
253 // You should probably find minimise the std
254 counts[best] += 1; // sum the best possible
255 // counts[best] += b_dist; // sum the best possible
256 }
257
258 // Now find out the best position
259 best = 0;
260 for (i=0; i < a.num_frames(); i++)
261 {
262 if (counts[i] > counts[best])
263 best = i;
264 }
265
266 // Change frame number back to ms offset
267 best_pos = frametoms(mstoframe(get_c_float(MID(d)),a.shift())
268 + best,a.shift());
269 delete counts;
270 delete w;
271 return best_pos;
272 }
273
static float frametoms(int frame,float frame_shift)
{
    // Convert a frame index into a time in milliseconds: the index is
    // scaled by frame_shift (single precision) and the product is then
    // multiplied by 1000, so frame_shift is taken to be in seconds.
    const float seconds = frame*frame_shift;
    return seconds*1000.0;
}
278
static int mstoframe(float ms,float frame_shift)
{
    // Convert a time in milliseconds into a frame index: ms is divided
    // by 1000 and then by frame_shift (in double precision), and the
    // quotient is truncated towards zero.
    const double seconds = ms/1000.0;
    const double frames = seconds/frame_shift;
    return (int)frames;
}
283
284 // RJC - change for Track reorg.
285
frame_distance(EST_Track & a,int fa,EST_Track & b,int fb,int size,double * weights)286 static float frame_distance(EST_Track &a, int fa,
287 EST_Track &b, int fb,
288 int size, double *weights)
289 {
290 float cost = 0.0,diff;
291 int i;
292
293 for (i=0; i < size; i++)
294 {
295 if (weights[i] != 0.0)
296 {
297 diff = (a(fa,i)-b(fb,i));
298 cost += diff*diff*weights[i];
299 }
300 }
301
302 return cost;
303 }
304
305