1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2;;; ;;; 3;;; Carnegie Mellon University ;;; 4;;; and Alan W Black and Kevin Lenzo ;;; 5;;; Copyright (c) 1998-2000 ;;; 6;;; All Rights Reserved. ;;; 7;;; ;;; 8;;; Permission is hereby granted, free of charge, to use and distribute ;;; 9;;; this software and its documentation without restriction, including ;;; 10;;; without limitation the rights to use, copy, modify, merge, publish, ;;; 11;;; distribute, sublicense, and/or sell copies of this work, and to ;;; 12;;; permit persons to whom this work is furnished to do so, subject to ;;; 13;;; the following conditions: ;;; 14;;; 1. The code must retain the above copyright notice, this list of ;;; 15;;; conditions and the following disclaimer. ;;; 16;;; 2. Any modifications must be clearly marked as such. ;;; 17;;; 3. Original authors' names are not deleted. ;;; 18;;; 4. The authors' names are not used to endorse or promote products ;;; 19;;; derived from this software without specific prior written ;;; 20;;; permission. ;;; 21;;; ;;; 22;;; CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK ;;; 23;;; DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ;;; 24;;; ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT ;;; 25;;; SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE ;;; 26;;; FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;;; 27;;; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN ;;; 28;;; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ;;; 29;;; ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ;;; 30;;; THIS SOFTWARE. ;;; 31;;; ;;; 32;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 33;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 34;;; ;;; 35;;; Code for building data for prompts, aligning and unit selection ;;; 36;;; synthesizer ;;; 37;;; ;;; 38;;; This file is only used at database build time ;;; 39;;; ;;; 40;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 41 42(defvar cmu_us_rms_arctic::clunits_dir ".") 43 44(require 'clunits_build) 45 46;;; Basic voice definition file with voice defines and clunit 47;;; parameter definition for run time. 48(load "festvox/cmu_us_rms_arctic_clunits.scm") 49 50;;; Add Build time parameters 51(set! cmu_us_rms_arctic::dt_params 52 (cons 53 ;; in case cmu_us_rms_arctic_clunits defines this too, put this at start 54 (list 'db_dir (string-append cmu_us_rms_arctic::clunits_dir "/")) 55 (append 56 cmu_us_rms_arctic::dt_params 57 (list 58 ;;; In cmu_us_rms_arctic_clunits.scm 59 ;;'(coeffs_dir "lpc/") 60 ;;'(coeffs_ext ".lpc") 61 '(disttabs_dir "festival/disttabs/") 62 '(utts_dir "festival/utts/") 63 '(utts_ext ".utt") 64 '(dur_pen_weight 0.0) 65 '(f0_pen_weight 0.0) 66 '(get_stds_per_unit t) 67 '(ac_left_context 0.8) 68 '(ac_weights 69 (0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5)) 70 ;; Join weights in cmu_us_rms_arctic_clunits.scm 71 ;; Features for extraction 72 '(feats_dir "festival/feats/") 73 '(feats 74 (occurid 75 p.name p.ph_vc p.ph_ctype 76 p.ph_vheight p.ph_vlng 77 p.ph_vfront p.ph_vrnd 78 p.ph_cplace p.ph_cvox 79 n.name n.ph_vc n.ph_ctype 80 n.ph_vheight n.ph_vlng 81 n.ph_vfront n.ph_vrnd 82 n.ph_cplace n.ph_cvox 83 segment_duration 84 seg_pitch p.seg_pitch n.seg_pitch 85 R:SylStructure.parent.stress 86 seg_onsetcoda n.seg_onsetcoda p.seg_onsetcoda 87 R:SylStructure.parent.accented 88 pos_in_syl 89 syl_initial 90 syl_final 91 R:SylStructure.parent.syl_break 92 R:SylStructure.parent.R:Syllable.p.syl_break 93 R:SylStructure.parent.position_type 94 pp.name pp.ph_vc pp.ph_ctype 95 pp.ph_vheight pp.ph_vlng 96 pp.ph_vfront pp.ph_vrnd 97 pp.ph_cplace pp.ph_cvox 98 n.lisp_is_pau 99 p.lisp_is_pau 100 R:SylStructure.parent.parent.gpos 101 R:SylStructure.parent.parent.R:Word.p.gpos 102 R:SylStructure.parent.parent.R:Word.n.gpos 103 )) 104 ;; Wagon tree building params 105; (trees_dir "festvox/") ;; in cmu_us_rms_arctic_clunits.scm 106 '(wagon_field_desc "festival/clunits/all.desc") 107 '(wagon_progname "$ESTDIR/bin/wagon") 108 '(wagon_cluster_size 20) 109 '(prune_reduce 0) 110 '(cluster_prune_limit 40) 111 ;; The dictionary of units used at run time 112; (catalogue_dir "festvox/") ;; in cmu_us_rms_arctic_clunits.scm 113 ;; Run time parameters 114 ;; all in cmu_us_rms_arctic_clunits.scm 115 ;; Files in db, filled in at build_clunits time 116 ;; (files ("time0001" "time0002" ....)) 117)))) 118 119(define (build_clunits file) 120 "(build_clunits file) 121Build cluster synthesizer for the given recorded data and domain." 122 (build_clunits_init file) 123 (do_all) ;; someday I'll change the name of this function 124) 125 126(define (build_clunits_init file) 127 "(build_clunits_init file) 128Get setup ready for (do_all) (or (do_init))." 129 (eval (list cmu_us_rms_arctic::closest_voice)) 130 131 ;; Add specific fileids to the list for this run 132 (set! cmu_us_rms_arctic::dt_params 133 (append 134 cmu_us_rms_arctic::dt_params 135 (list 136 (list 137 'files 138 (mapcar car (load file t)))))) 139 140 (set! dt_params cmu_us_rms_arctic::dt_params) 141 (set! clunits_params cmu_us_rms_arctic::dt_params) 142) 143 144(define (do_prompt name text) 145 "(do_prompt name text) 146Synthesize given text and save waveform and labels for prompts." 147 (let ((utt1 (utt.synth (eval (list 'Utterance 'Text text))))) 148 (utt.save utt1 (format nil "prompt-utt/%s.utt" name)) 149 (utt.save.segs utt1 (format nil "prompt-lab/%s.lab" name)) 150 (utt.save.wave utt1 (format nil "prompt-wav/%s.wav" name)) 151 t)) 152 153(define (build_prompts file) 154 "(build_prompt file) 155For each utterances in prompt file, synth and save waveform and 156labels for prompts and aligning." 157 (set! cmu_us_rms_arctic::clunits_prompting_stage t) 158 (voice_cmu_us_rms_arctic_clunits) 159 (let ((p (load file t))) 160 (mapcar 161 (lambda (l) 162 (format t "%s PROMPTS\n" (car l)) 163 (do_prompt (car l) (cadr l)) 164 t) 165 p) 166 t)) 167 168(define (build_utts file) 169 "(build_utts file) 170For each utterances in prompt file, synthesize and merge aligned labels 171to predicted labels building a new utetrances and saving it." 172 (set! cmu_us_rms_arctic::clunits_prompting_stage t) 173 (voice_cmu_us_rms_arctic_clunits) 174 (let ((p (load file t))) 175 (mapcar 176 (lambda (l) 177 (format t "%s UTTS\n" (car l)) 178 (align_utt (car l) (cadr l)) 179 t) 180 p) 181 t)) 182 183(define (align_utt name text) 184 "(align_utts file) 185Synth an utterance and load in the actualed aligned segments and merge 186them into the synthesizer utterance." 187 (let (; (utt1 (utt.load nil (format nil "prompt-utt/%s.utt" name))) 188 (utt1 (utt.synth (eval (list 'Utterance 'Text text)))) 189 (silence (car (cadr (car (PhoneSet.description '(silences)))))) 190 segments actual-segments) 191 192 (utt.relation.load utt1 'actual-segment 193 (format nil "lab/%s.lab" name)) 194 (set! segments (utt.relation.items utt1 'Segment)) 195 (set! actual-segments (utt.relation.items utt1 'actual-segment)) 196 197 ;; These should align, but if the labels had to be hand edited 198 ;; then they may not, we cater here for insertions and deletions 199 ;; of silences int he corrected hand labelled files (actual-segments) 200 ;; If you need to something more elaborate you'll have to change the 201 ;; code below. 202 (while (and segments actual-segments) 203 (cond 204 ((string-equal (string-append "#" (item.name (car segments))) 205 (item.name (car actual-segments))) 206 ;; junk unit that is to be ignored 207 (item.set_feat (car segments) "end" 208 (item.feat (car actual-segments) "end")) 209 (item.set_feat (car segments) "ignore" "1") 210 (set! segments (cdr segments)) 211 (set! actual-segments (cdr actual-segments))) 212 ((and (not (string-equal (item.name (car segments)) 213 (item.name (car actual-segments)))) 214 (or (string-equal (item.name (car actual-segments)) silence) 215 (string-equal (item.name (car actual-segments)) "H#") 216 (string-equal (item.name (car actual-segments)) "h#"))) 217 (item.insert 218 (car segments) 219 (list silence (list (list "end" (item.feat 220 (car actual-segments) "end")))) 221 'before) 222 (set! actual-segments (cdr actual-segments))) 223 ((and (not (string-equal (item.name (car segments)) 224 (item.name (car actual-segments)))) 225 (string-equal (item.name (car segments)) silence)) 226 (item.delete (car segments)) 227 (set! segments (cdr segments))) 228 ((string-equal (item.name (car segments)) 229 (item.name (car actual-segments))) 230 (item.set_feat (car segments) "end" 231 (item.feat (car actual-segments) "end")) 232 (set! segments (cdr segments)) 233 (set! actual-segments (cdr actual-segments))) 234 (t 235 (format stderr 236 "align missmatch at %s (%f) %s (%f)\n" 237 (item.name (car segments)) 238 (item.feat (car segments) "end") 239 (item.name (car actual-segments)) 240 (item.feat (car actual-segments) "end")) 241 (error))) 242 ) 243 244 (mapcar 245 (lambda (a) 246 ;; shorten and split sliences 247 (while (and (string-equal (item.name a) silence) 248 (> (item.feat a "segment_duration") 0.300)) 249; (format t "splitting %s silence of %f at %f\n" 250; (item.name a) 251; (item.feat a "segment_duration") 252; (item.feat a "end")) 253 (cond 254 ((string-equal "h#" (item.feat a "p.name")) 255 (item.set_feat (item.prev a) "end" 256 (+ 0.150 (item.feat a "p.end")))) 257 ((and (string-equal silence (item.feat a "p.name")) 258 (string-equal silence (item.feat a "p.p.name"))) 259 (item.set_feat (item.prev a) "end" 260 (+ 0.150 (item.feat a "p.end"))) 261 (item.set_feat (item.prev a) "name" silence)) 262 (t 263 (item.insert a 264 (list silence 265 (list 266 (list "end" 267 (+ 0.150 268 (item.feat a "p.end"))))) 269 'before))))) 270 (utt.relation.items utt1 'Segment)) 271 272 (utt.relation.delete utt1 'actual-segment) 273 (utt.set_feat utt1 "fileid" name) 274 ;; If we have an F0 add in targets too 275 (if (probe_file (format nil "f0/%s.f0" name)) 276 (build::add_targets utt1)) 277 (utt.save utt1 (format nil "festival/utts/%s.utt" name)) 278 t)) 279 280;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 281;;; Some prosody modelling code 282;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 283 284(define (build::add_targets utt) 285 "(build::add_targets utt) 286Adds targets based on the F0 in f0/*.f0. Adds a point to each vowel." 287 (let ((fileid (utt.feat utt "fileid")) 288 (f0_points)) 289 (set! f0_points (build::load_f0_points fileid)) 290 (set! awb_f0_points f0_points) 291 ;; Get rid of the old one 292 (utt.relation.delete utt 'Target) 293 ;; Create a new one 294 (utt.relation.create utt 'Target) 295 (build::add_target 296 utt 297 f0_points) 298 utt)) 299 300(define (build::add_target utt f0_points) 301 "(build::add_target utt f0_points) 302Add F0 points at start or syllable, mid point of each vowel, and 303last segment before silence. The F0 continued over non-voiced 304periods is such a naive and hopless way its embarrassing." 305 (let ((s (utt.relation.first utt 'Segment)) 306 (f0s f0_points) 307 targ) 308 (while s 309 (if (and (not (member_string 310 (item.name s) 311 (cadr (car (PhoneSet.description '(silences)))))) 312 (or (string-equal "1" (item.feat s "syl_initial")) 313 (string-equal "+" (item.feat s "ph_vc")) 314 (member_string 315 (item.feat s "n.name") 316 (cadr (car (PhoneSet.description '(silences))))))) 317 (begin 318 (set! targ (utt.relation.append utt 'Target s)) 319 (if (string-equal "1" (item.feat s "syl_initial")) 320 (item.relation.append_daughter 321 targ 322 'Target 323 (list 324 "0" 325 (list 326 (list 'f0 (build::get_f0_at f0s (item.feat s "segment_start"))) 327 (list 'pos (item.feat s "segment_start")))))) 328 (if (string-equal "+" (item.feat s "ph_vc")) 329 (item.relation.append_daughter 330 targ 331 'Target 332 (list 333 "0" 334 (list 335 (list 'f0 (build::get_f0_at f0s (item.feat s "segment_mid"))) 336 (list 'pos (item.feat s "segment_mid")))))) 337 (if (member_string 338 (item.feat s "n.name") 339 (cadr (car (PhoneSet.description '(silences))))) 340 (item.relation.append_daughter 341 targ 342 'Target 343 (list 344 "0" 345 (list 346 (list 'f0 (build::get_f0_at f0s (item.feat s "segment_end"))) 347 (list 'pos (item.feat s "segment_end")))))))) 348 (set! s (item.next s)) 349 )) 350) 351 352(define (build::get_f0_at f0s position) 353 "(build::get_f0_at f0s position) 354Returns the non-zero F0 nearest to position." 355 (build::get_f0_at_2 356 -1 357 f0s 358 position)) 359 360(define (build::get_f0_at_2 f0 f0s position) 361 "(build::get_f0_at f0 f0s position) 362Returns the non-zero F0 nearest to position." 363 (cond 364 ((null f0s) 365 (if (> f0 0) 366 f0 367 110 ;; aint nothing there at all at all 368 )) 369 (t 370 (if (> 0 (cadr (car f0s))) 371 (set! f0 (cadr (car f0s)))) 372 (cond 373 ((and (>= position (car (car f0s))) 374 (<= position (car (cadr f0s)))) 375 (if (< f0 1) 376 (build::find_first_f0 f0s) 377 f0)) 378 (t 379 (build::get_f0_at_2 f0 (cdr f0s) position)))))) 380 381(define (build::find_first_f0 f0s) 382 (cond 383 ((null f0s) 384 110 ;; last resort 385 ) 386 ((> (cadr (car f0s)) 0) 387 (cadr (car f0s))) 388 (t 389 (build::find_first_f0 (cdr f0s))))) 390 391(define (build::load_f0_points fileid) 392 "(build::load_f0_points fileid) 393Extract F0 as ascii times and values from the F0 file and load 394it as a simple assoc list." 395 (let ((f0asciifile (make_tmp_filename)) 396 f0fd point points 397 (time 0)) 398 (system 399 (format nil "$EST%s/bin/ch_track -otype ascii -o %s f0/%s.f0" 400 "DIR" ;; to stop that var name being mapped. 401 f0asciifile 402 fileid)) 403 (set! f0fd (fopen f0asciifile "r")) 404 (while (not (equal? (set! point (readfp f0fd)) (eof-val))) 405 (set! points 406 (cons 407 (list time point) points)) 408 (set! time (+ 0.005 time)) 409 ;; skip the second field. 410 (readfp f0fd)) 411 (fclose f0fd) 412 (delete-file f0asciifile) 413 (reverse points))) 414 415;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 416;;; Code to try to find bad labelling by looking at duration distribution 417;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 418 419;;; A simple sufficient statistics class 420(define (suffstats.new) 421 (list 422 0 ;; n 423 0 ;; sum 424 0 ;; sumx 425 )) 426 427(define (suffstats.set_n x n) 428 (set-car! x n)) 429(define (suffstats.set_sum x sum) 430 (set-car! (cdr x) sum)) 431(define (suffstats.set_sumx x sumx) 432 (set-car! (cdr (cdr x)) sumx)) 433(define (suffstats.n x) 434 (car x)) 435(define (suffstats.sum x) 436 (car (cdr x))) 437(define (suffstats.sumx x) 438 (car (cdr (cdr x)))) 439(define (suffstats.reset x) 440 (suffstats.set_n x 0) 441 (suffstats.set_sum x 0) 442 (suffstats.set_sumx x 0)) 443(define (suffstats.add x d) 444 (suffstats.set_n x (+ (suffstats.n x) 1)) 445 (suffstats.set_sum x (+ (suffstats.sum x) d)) 446 (suffstats.set_sumx x (+ (suffstats.sumx x) (* d d))) 447) 448 449(define (suffstats.mean x) 450 (/ (suffstats.sum x) (suffstats.n x))) 451(define (suffstats.variance x) 452 (/ (- (* (suffstats.n x) (suffstats.sumx x)) 453 (* (suffstats.sum x) (suffstats.sum x))) 454 (* (suffstats.n x) (- (suffstats.n x) 1)))) 455(define (suffstats.stddev x) 456 (sqrt (suffstats.variance x))) 457 458(define (cummulate_stats stats phone duration) 459 (let ((pstat (car (cdr (assoc_string phone stats)))) 460 (newstats stats)) 461 (if (null pstat) 462 (begin 463 (set! pstat (suffstats.new)) 464 (set! newstats (cons (list phone pstat) stats)))) 465 (suffstats.add pstat duration) 466 newstats)) 467 468(define (collect_dur_stats utts) 469 (let ((stats nil)) 470 (mapcar 471 (lambda (u) 472 (mapcar 473 (lambda (s) 474 (set! stats (cummulate_stats 475 stats 476 (item.name s) 477 (item.feat s "segment_duration")))) 478 (utt.relation.items u 'Segment))) 479 utts) 480 stats)) 481 482(define (score_utts utts durstats ofile) 483 (let ((ofd (fopen ofile "w"))) 484 (mapcar 485 (lambda (u) 486 (let ((score 0) (tot 0)) 487 (format ofd "%s " (utt.feat u "fileid")) 488 (mapcar 489 (lambda (s) 490 (let ((stats (car (cdr (assoc_string (item.name s) durstats)))) 491 (dur (item.feat s "segment_duration")) 492 (zscore)) 493 (set! tot (+ 1 tot)) 494 (set! zscore (/ (- dur (suffstats.mean stats)) 495 (suffstats.stddev stats))) 496 (if (< zscore 0) 497 (set! zscore (* -1 zscore))) 498 (if (or (< dur 0.011) 499 (> zscore 3)) 500 (set! score (+ 1 score))))) 501 (utt.relation.items u 'Segment)) 502 (format ofd "%0.4f %d %d\n" 503 (/ score tot) 504 score 505 tot))) 506 utts))) 507 508(define (make_simple_utt fileid) 509 (let ((utt (Utterance Text ""))) 510 (utt.relation.load utt 'Segment 511 (format nil "lab/%s.lab" fileid)) 512 (utt.set_feat utt "fileid" fileid) 513 utt)) 514 515(define (find_outlier_utts file ofile) 516 (voice_kal_diphone) 517 (let ((p (load file t)) 518 utts dur_states) 519 (set! utts (mapcar (lambda (l) (make_simple_utt (car l))) p)) 520 (set! dur_stats (collect_dur_stats utts)) 521 (score_utts utts dur_stats ofile) 522 t)) 523 524(provide 'build_clunits) 525 526