1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2;;; ;; 3;;; Carnegie Mellon University ;; 4;;; Copyright (c) 2005-2006 ;; 5;;; All Rights Reserved. ;; 6;;; ;; 7;;; Permission is hereby granted, free of charge, to use and distribute ;; 8;;; this software and its documentation without restriction, including ;; 9;;; without limitation the rights to use, copy, modify, merge, publish, ;; 10;;; distribute, sublicense, and/or sell copies of this work, and to ;; 11;;; permit persons to whom this work is furnished to do so, subject to ;; 12;;; the following conditions: ;; 13;;; 1. The code must retain the above copyright notice, this list of ;; 14;;; conditions and the following disclaimer. ;; 15;;; 2. Any modifications must be clearly marked as such. ;; 16;;; 3. Original authors' names are not deleted. ;; 17;;; 4. The authors' names are not used to endorse or promote products ;; 18;;; derived from this software without specific prior written ;; 19;;; permission. ;; 20;;; ;; 21;;; CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ;; 22;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;; 23;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO ;; 24;;; EVENT SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE ;; 25;;; LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY ;; 26;;; DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, ;; 27;;; WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ;; 28;;; ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR ;; 29;;; PERFORMANCE OF THIS SOFTWARE. ;; 30;;; ;; 31;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 32;;; ;; 33;;; Author: Alan W Black (awb@cs.cmu.edu) Nov 2005 ;; 34;;; ;; 35;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 36;;; ;; 37;;; Run Time Synthesis support for clustergen (HMM-generation) voices ;; 38;;; ;; 39;;; This is voice-independant, and should be in festival/lib but is ;; 40;;; currently copied into each voice ;; 41;;; ;; 42;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 43 44(defvar cluster_synth_pre_hooks nil) 45(defvar cluster_synth_post_hooks nil) 46(defvar clustergen_mcep_trees nil) 47(defvar cg:frame_shift 0.005) 48 49(defSynthType ClusterGen 50 51 (apply_hooks cluster_synth_pre_hooks utt) 52 53 (set! clustergen_utt utt) ;; for debugging 54 55 ;; Build the state relation 56 (ClusterGen_make_HMMstate utt) 57 ;; Predict number of frames, then predict the frame values 58 (ClusterGen_make_mcep utt) ;; durations for # of vectors 59 (if (assoc 'cg::trajectory clustergen_mcep_trees) 60 (ClusterGen_predict_trajectory utt) ;; predict trajectory 61 (ClusterGen_predict_mcep utt) ;; predict vector types 62 ) 63 64 ;; Convert predicted mcep track into a waveform 65 (cluster_synth_method utt) 66 67 (apply_hooks cluster_synth_post_hooks utt) 68 utt 69) 70 71(define (cg_wave_synth_external utt) 72 ;; before we had it built-in to Festival 73 (let ((trackname (make_tmp_filename)) 74 (wavename (make_tmp_filename)) 75 ) 76 (track.save (utt.feat utt "param_track") trackname "est") 77 (system 78 (format nil "$FESTVOXDIR/src/clustergen/cg_resynth %s %s" 79 trackname wavename)) 80 (utt.import.wave utt wavename) 81;; (delete-file trackname) 82 (delete-file wavename) 83 utt) 84) 85 86(define (cg_duration i) 87 (if (item.prev i) 88 (- (item.feat i "end") (item.feat i "p.end")) 89 (item.feat i "end"))) 90 91(define (cg_state_pos i) 92 (let ((n (item.name i))) 93 (cond 94 ((not (string-equal n (item.feat i "p.name"))) 95 "b") 96 ((string-equal n (item.feat i "n.name")) 97 "m") 98 (t 99 "e")))) 100 101(define (cg_wave_synth utt) 102 (utt.relation.create utt 'Wave) 103 (item.set_feat 104 (utt.relation.append utt 'Wave) 105 "wave" 106 (mlsa_resynthesis (utt.feat utt "param_track"))) 107 utt) 108 109(define (ClusterGen_predict_states seg) 110 ;; The names may change 111 (cdr (assoc_string (item.name seg) phone_to_states))) 112 113(define (ClusterGen_make_HMMstate utt) 114 (let ((states) 115 (segstate) 116 (statepos)) 117 ;; Make HMMstate relation and items (three per phone) 118 (utt.relation.create utt "HMMstate") 119 (utt.relation.create utt "segstate") 120 121 (mapcar 122 (lambda (seg) 123 (set! statepos 1) 124 (set! states (ClusterGen_predict_states seg)) 125 (set! segstate (utt.relation.append utt 'segstate seg)) 126 (while states 127 (set! state (utt.relation.append utt 'HMMstate)) 128 (item.append_daughter segstate state) 129 (item.set_feat state "name" (car states)) 130 (item.set_feat state "statepos" statepos) 131 (set! statepos (+ 1 statepos)) 132 (set! states (cdr states))) 133 ) 134 (utt.relation.items utt 'Segment)) 135 ) 136) 137 138(define (ClusterGen_state_duration state) 139 (let ((zdur (wagon_predict state duration_cart_tree_cg)) 140 (ph_info (assoc_string (item.name state) duration_ph_info_cg)) 141 (seg_stretch (item.feat state "R:segstate.parent.dur_stretch")) 142 (syl_stretch (item.feat state "R:segstate.parent.R:SylStructure.parent.dur_stretch")) 143 (tok_stretch (item.feat state "R:segstate.parent.R:SylStructure.parent.parent.R:Token.parent.dur_stretch")) 144 (global_stretch (Parameter.get 'Duration_Stretch)) 145 (stretch 1.0)) 146 (if (string-matches (item.name state) "#_.*") 147 ;; Its a pau so explicitly set the duration 148 ;; Note we want sentence internal pauses to be about 100ms 149 ;; and sentence final pauses to be 150ms, but there will also 150 ;; sentence initial pauses of 150ms so we can treat all pauses as 151 ;; 100ms, there are three states so we use 50ms 152 (set! zdur 153 (/ (- 0.05 (car (cdr ph_info))) 154 (car (cdr (cdr ph_info)))))) 155 (if (not (string-equal seg_stretch "0")) 156 (setq stretch (* stretch seg_stretch))) 157 (if (not (string-equal syl_stretch "0")) 158 (setq stretch (* stretch syl_stretch))) 159 (if (not (string-equal tok_stretch "0")) 160 (setq stretch (* stretch tok_stretch))) 161 (if (not (string-equal global_stretch "0")) 162 (setq stretch (* stretch global_stretch))) 163 (if ph_info 164 (* stretch 165 (+ (car (cdr ph_info)) ;; mean 166 (* (car (cdr (cdr ph_info))) ;; stddev 167 zdur))) 168 (begin 169 (format t "ClusterGen_state_duration: no dur phone info for %s\n" 170 (item.name state)) 171 0.1)))) 172 173(define (ClusterGen_make_mcep utt) 174 (let ((num_frames 0) 175 (frame_advance cg:frame_shift) 176 (end 0.0)) 177 178 ;; Make HMMstate relation and items (three per phone) 179 (utt.relation.create utt "mcep") 180 (utt.relation.create utt "mcep_link") 181 (mapcar 182 (lambda (state) 183 ;; Predict Duration 184 (set! start end) 185 (set! end (+ start (ClusterGen_state_duration state))) 186 (item.set_feat state "end" end) 187 ;; create that number of mcep frames up to state end 188 (set! mcep_parent (utt.relation.append utt 'mcep_link state)) 189 (while (<= (* num_frames frame_advance) end) 190 (set! mcep_frame (utt.relation.append utt 'mcep)) 191 (item.append_daughter mcep_parent mcep_frame) 192 (item.set_feat mcep_frame "frame_number" num_frames) 193 (item.set_feat mcep_frame "name" (item.name mcep_parent)) 194 (set! num_frames (+ 1 num_frames)) 195 ) 196 ) 197 (utt.relation.items utt 'HMMstate)) 198 199 ;; Copy the final state end back up on to the segment for consistency 200 (mapcar 201 (lambda (seg) 202 (item.set_feat seg "end" (item.feat seg "R:segstate.daughtern.end"))) 203 (utt.relation.items utt 'Segment)) 204 205 (utt.set_feat utt "param_track_num_frames" num_frames) 206 utt) 207) 208 209(define (cg_F0_smooth track j) 210 (let ((p 0.0) 211 (i 0) 212 (num_frames (- (track.num_frames track) 1))) 213 214 (set! i 1) 215 (while (< i num_frames) 216 (set! this (track.get track i j)) 217 (set! next (track.get track (+ i 1) j)) 218 (if (> this 0.0) 219 (track.set 220 track i j 221 (/ (+ (if (> p 0.0) p this) 222 this 223 (if (> next 0.0) next this)) 224 3.0))) 225 (set! p this) 226 (set! i (+ 1 i))) 227 ) 228) 229 230(define (cg_mcep_smooth track j) 231 (let ((p 0.0) 232 (i 0) 233 (num_frames (- (track.num_frames track) 1))) 234 235 (set! i 1) 236 (while (< i num_frames) 237 (set! this (track.get track i j)) 238 (set! next (track.get track (+ i 1) j)) 239 (track.set 240 track i j 241 (/ (+ p this next) 3.0)) 242 (set! p this) 243 (set! i (+ 1 i))) 244 ) 245) 246 247;; For normal synthesis make unvoiced states unvoiced, but we don't 248;; do this during testing 249(defvar cg_predict_unvoiced t) 250 251(define (ClusterGen_predict_F0 mcep npow_val f0_val param_track) 252 "(ClusterGen_predict_F0 mcep npow_val f0_val param_track) 253Predict the F0 (or not)." 254 (if (and cg_predict_unvoiced 255 (string-equal "-" 256 (item.feat 257 mcep "R:mcep_link.parent.R:segstate.parent.ph_vc")) 258 (or (string-equal "-" 259 (item.feat 260 mcep "R:mcep_link.parent.R:segstate.parent.ph_cvox")) 261 (and (string-equal "+" 262 (item.feat 263 mcep "R:mcep_link.parent.R:segstate.parent.ph_cvox")) 264 (< npow_val 0.0)))) 265 (track.set param_track i 0 0.0) ;; make it unvoiced 266 (track.set param_track i 0 f0_val)) ;; make it voiced 267 (format stderr "%d\n" i) 268 (track.set param_track i 0 0.0) ;; make it unvoiced 269 ) 270 271(define (ClusterGen_predict_mcep utt) 272 (let ((param_track nil) 273 (frame_advance cg:frame_shift) 274 (frame nil) (f nil) (f0_val) 275 (num_channels (/ (track.num_channels clustergen_param_vectors) 2)) 276 ) 277 278 ;; Predict mcep values 279 (set! i 0) 280 (set! param_track 281 (track.resize nil 282 (utt.feat utt "param_track_num_frames") 283 num_channels)) 284 (utt.set_feat utt "param_track" param_track) 285 (mapcar 286 (lambda (mcep) 287 ;; Predict mcep frame 288 (let ((mcep_tree (assoc_string (item.name mcep) clustergen_mcep_trees)) 289 (f0_tree (assoc_string (item.name mcep) clustergen_f0_trees)) 290 (npow_tree (assoc_string (item.name mcep) clustergen_npow_trees))) 291 (if (null mcep_tree) 292 (format t "ClusterGen: can't find cluster tree for %s\n" 293 (item.name mcep)) 294 (begin 295 ;; F0 prediction 296 297 (set! f0_val (wagon mcep (cadr f0_tree))) 298 (set! npow_val (wagon mcep (cadr npow_tree))) 299 300 (ClusterGen_predict_F0 mcep (cadr npow_val) (cadr f0_val) param_track) 301 302 ;; MCEP prediction 303 (set! frame (wagon mcep (cadr mcep_tree))) 304 (set! j 1) 305 (set! f (car frame)) 306 (while (< j num_channels) 307 (track.set param_track i j 308 (track.get clustergen_param_vectors f (* 2 j))) 309 (set! j (+ 1 j))))) 310 311 (track.set_time param_track i (* i frame_advance)) 312 (set! i (+ 1 i)))) 313 (utt.relation.items utt 'mcep)) 314 (cg_F0_smooth param_track 0) 315 (mapcar 316 (lambda (x) 317 (cg_mcep_smooth param_track x)) 318 '( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25)) 319 utt 320 ) 321) 322 323(define (cg_voiced state) 324 "(cg_voiced state) 325t if this state is voices, nil otherwise." 326 (if (and cg_predict_unvoiced 327 (string-equal "-" (item.feat state "R:segstate.parent.ph_vc")) 328 (string-equal "-" (item.feat state "R:segstate.parent.ph_cvox"))) 329 nil 330 t)) 331 332(define (ClusterGen_predict_trajectory utt) 333 (let ((param_track nil) 334 (frame_advance cg:frame_shift) 335 (frame nil) (f nil) (f0_val) 336 (num_channels (track.num_channels clustergen_param_vectors)) 337; (num_channels (track.num_channels traj::clustergen_param_vectors)) 338 ) 339 340 ;; Predict mcep values 341 (set! i 0) 342 (set! param_track 343 (track.resize nil 344 (utt.feat utt "param_track_num_frames") 345 num_channels)) 346 (utt.set_feat utt "param_track" param_track) 347; (set! param_track (utt.feat utt "param_track")) 348 (mapcar 349 (lambda (state) 350 ;; Predict mcep frame 351 (let ((mcep_tree (assoc_string (item.name state) clustergen_mcep_trees)) 352 (f0_tree (assoc_string (item.name state) clustergen_f0_trees)) 353 (npow_tree (assoc_string (item.name state) clustergen_npow_trees)) 354 ) 355 (set! f0_val (wagon state (cadr f0_tree))) 356 (set! npow_val (wagon state (cadr npow_tree))) 357; (set! npow_val '(0 0)) 358; (set! f0_val '(0 0)) 359 (if (null mcep_tree) 360 (format t "ClusterGen: can't find cluster tree for %s\n" 361 (item.name state)) 362 (begin 363 ;; feature prediction (F0 and mcep) 364 (set! trajectory (wagon state (cadr mcep_tree))) 365 (if (item.relation.daughters state 'mcep_link) 366 (begin 367 (if (assoc 'cg::trajectory_ola clustergen_mcep_trees) 368 (cg:add_trajectory_ola 369 (caar trajectory) 370 (cadr (car trajectory)) 371 state 372 num_channels 373 param_track 374 frame_advance) 375 (cg:add_trajectory 376 (caar trajectory) 377 (cadr (car trajectory)) 378 state 379 num_channels 380 param_track 381 frame_advance 382 (cadr f0_val) 383 (cadr npow_val)))) 384 385 ))) 386 387 )) 388 (utt.relation.items utt 'HMMstate)) 389 390; (track.save param_track "trajectory.track") 391 392 (cg_F0_smooth param_track 0) 393 (mapcar 394 (lambda (x) 395 (cg_mcep_smooth param_track x)) 396; '( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25) 397 '( 1 2 3 ) 398 ) 399 utt 400 ) 401) 402 403(define (cg:add_trajectory s_start s_frames state num_channels 404 param_track frame_advance f0_val npow_val) 405"(cg:add_trajectory start n state num_channels) 406Add trajectory to daughters of state, interpolating as necessary." 407 (let ((j 0) (i 0) 408 (mceps (item.relation.daughters state 'mcep_link))) 409 410 (set! t_start (item.feat (car mceps) "frame_number")) 411 (set! t_frames (length mceps)) 412 (set! m (/ s_frames t_frames)) 413 (set! f 0) 414 415 (while (< i t_frames) 416 ;; find f 417 (set! s_pos (+ s_start f)) 418 419 (if (and cg_predict_unvoiced 420 (string-equal "-" 421 (item.feat 422 state "R:segstate.parent.ph_vc")) 423 (or (string-equal "-" 424 (item.feat 425 state "R:segstate.parent.ph_cvox")) 426 (and (string-equal "+" 427 (item.feat 428 state "R:segstate.parent.ph_cvox")) 429 (< npow_val -7.0)))) 430 (track.set param_track (+ i t_start) 0 0.0) ;; make it unvoiced 431 (track.set param_track (+ i t_start) 0 f0_val)) ;; make it voiced 432 433;; (if (cg_voiced state) 434;; (track.set param_track (+ i t_start) 0 435;; (track.get clustergen_param_vectors s_pos 0))) 436 437 (set! j 1) 438 (while (< j num_channels) 439 (track.set param_track (+ i t_start) j 440 (track.get clustergen_param_vectors 441 s_pos j)) 442 (set! j (+ 1 j))) 443 (set! f (+ m f)) 444 (track.set_time param_track 445 (+ i t_start) (* (+ i t_start) frame_advance)) 446 (set! i (+ i 1)) 447 ) 448 ) 449 ) 450 451(define (cg:add_trajectory_ola s_start s_frames state num_channels 452 param_track frame_advance) 453"(cg:add_trajectory start n state num_channels) 454Add trajectory to daughters of state, interpolating as necessary." 455 (let ((j 0) (i 0) (s1l 0) (s2l 0) (m 0.0) (w 0.0) 456 (t_start 0) (t_frames 0) (s_offset 0) 457 (mceps1 nil) (mceps2 nil)) 458 459 (set! i 0) 460 (while (< i s_frames) 461 (if (equal? -1.0 (track.get clustergen_param_vectors (+ s_start i) 0)) 462 (set! s1l i)) 463 (set! i (+ i 1))) 464 465 (if (and (item.prev state) 466 (item.relation.daughters (item.prev state) 'mcep_link) 467 (> s1l 0)) 468 (begin ;; do overlap on previous 469 (set! mceps1 (item.relation.daughters (item.prev state) 'mcep_link)) 470 (set! first_half_delta (/ 1.0 (length mceps1))) 471 (set! t_start (item.feat (car mceps1) "frame_number")) 472 (set! t_frames (length mceps1)) 473 (set! m (/ s1l t_frames)) 474 (set! i 0) 475 (set! w 0.0) 476 (while (< i t_frames) 477 (set! s_offset (nint (* i m))) 478 (if (not (< s_offset s1l)) 479 (begin 480; (format t "boing pre\n") 481 (set! s_offset (- s1l 1)))) 482 (set! s_pos (+ s_start s_offset)) 483 (if (< (track.get clustergen_param_vectors s_pos 0) 0) 484 (format t "assigning pre -1/-2 %d %d %f\n" s_pos i m)) 485 ;; F0 Prediction 486 (if (cg_voiced state) 487 (track.set param_track (+ i t_start) 0 488 (+ (* (- 1.0 w) (track.get param_track (+ i t_start) 0)) 489 (* w (track.get clustergen_param_vectors s_pos 0))))) 490 491 ;; MCEP Prediction 492 (set! j 1) 493 (while (< j num_channels) 494 (track.set param_track (+ i t_start) j 495 (+ (* (- 1.0 w) (track.get param_track (+ i t_start) j)) 496 (* w 497 (track.get clustergen_param_vectors s_pos (* 2 j)) 498 ) 499 ) 500 ) 501 (set! j (+ 1 j))) 502 (set! i (+ 1 i)) 503 (set! w (+ w first_half_delta)) 504 (if (> w 1.0) (set! w 1.0)) 505 ) 506 )) 507 508 ;; do assignment on current unit 509 (set! mceps2 (item.relation.daughters state 'mcep_link)) 510 (set! t_start (item.feat (car mceps2) "frame_number")) 511 (set! t_frames (length mceps2)) 512 (set! s2l (- s_frames (+ s1l 2))) 513 (set! s2_start (+ s_start s1l 1)) 514 (set! m (/ s2l t_frames)) 515 (set! i 0) 516 (while (< i t_frames) 517 (set! s_offset (nint (* i m))) 518 (if (not (< s_offset s2l)) 519 (set! s_offset (- s2l 1))) 520 (set! s_pos (+ s2_start s_offset)) 521 (if (< (track.get clustergen_param_vectors s_pos 0) 0) 522 (format t "assigning -1/-2 %d %d %f %f\n" s_pos i m 523 (track.get clustergen_param_vectors s_pos 0))) 524 ;; F0 Prediction 525 (if (cg_voiced state) 526 (track.set param_track (+ i t_start) 0 527 (track.get clustergen_param_vectors s_pos 0))) 528 ;; MCEP Prediction 529 (set! j 1) 530 (while (< j num_channels) 531 (track.set param_track (+ i t_start) j 532 (track.get clustergen_param_vectors s_pos (* 2 j))) 533 (set! j (+ 1 j))) 534 (track.set_time param_track 535 (+ i t_start) (* (+ i t_start) frame_advance)) 536 (set! i (+ 1 i)) 537 ) 538 ) 539) 540 541;; Sort of historical it should be set in INST_LANG_VOX_cg.scm 542;; but maybe not in old instantiations 543(defvar cluster_synth_method 544 (if (boundp 'mlsa_resynthesis) 545 cg_wave_synth 546 cg_wave_synth_external )) 547 548(provide 'clustergen) 549