festlex-czech/festival-czech-0.2.1/czech.scm.in

26 (define (czech-min x y)
29 (define (czech-max x y)
32 (define (czech-item.has-feat item feat)
35 (define (czech-item.feat? item feat value)
38 (define (czech-item.feat*? item feat value)
41 (define (czech-all-same lst)
44            (czech-all-same (cdr lst)))))
46 (define (czech-suffix string i)
49 (defvar czech-randomize t)
51 (defvar czech-rand-range nil)
53 (defvar czech-moravian t)
55 (defvar czech-insert-filling-vowels t)
57 (defvar czech-group-digits 3)
59 (define (czech-rand)
60   (if czech-randomize
62         (if (not czech-rand-range)
70               (set! czech-rand-range 1)
71               (while (> max czech-rand-range)
72                 (set! czech-rand-range (* 2 czech-rand-range)))))
73         (/ (rand) czech-rand-range))
76 (define (czech-random-choice lst)
78     (let ((n (* (czech-rand) max)))
81 (define (czech-next-token-punc word)
86 (define (czech-next-punc word)
87   (let ((token (item.next (item.parent (item.relation word 'Token)))))
88     (while (and token (not (string-matches (item.feat token 'punc) "[^0]+")))
89       (set! token (item.next token)))
90     (if token
91         (item.feat token 'punc)
94 (define (czech-prev-punc word)
95   (let ((token (item.prev (item.parent (item.relation word 'Token)))))
96     (while (and token (not (string-matches (item.feat token 'punc) "[^0]+")))
97       (set! token (item.prev token)))
98     (if token
99         (item.feat token 'punc)
102 (define (czech-word-stress-unit word)
107 (define (czech-stress-unit-punc unit)
113 (defPhoneSet czech  unknown
174 (defvar czech-phoneset-translation '())
175 (defvar czech-phoneset-translation* nil)
180  czech-normalize
287  czech-orthography
444 (defvar czech-unknown-symbol-word "nezn�m�")
446 (defvar czech-lts-extra-rules '())
448 (define (czech-basic-lts word)
449   (let ((word (if (lts.in.alphabet word 'czech-normalize)
451                   czech-unknown-symbol-word)))
455                               (lts.apply word 'czech-normalize)
456                               'czech-orthography))
460 (define (czech-syllabify-phstress phones)
465 (define (czech-lts word features)
469                                 (czech-basic-lts word))))
471               (czech-syllabify-phstress
472                (let ((rules czech-lts-extra-rules*))
479 (define (czech-downcase word)
480   (if (lts.in.alphabet word 'czech-normalize)
481       (apply string-append (lts.apply word 'czech-normalize))
486 (defvar czech-token.unknown-word-name "nezn�m�")
487 (defvar czech-token.separator-word-name "odd�lova�") ; our own variable
488 (defvar czech-token.garbage-word-name "smet�")       ; our own variable
489 (defvar czech-token.whitespace " �\t\n\r")
490 (defvar czech-token.punctuation "\"'`.,:;!?-(){}[]<>")
491 (defvar czech-token.prepunctuation "\"'`({[<")
495 (defvar czech-chars "a-zA-Z����������������������������ة����ݮ")
496 (defvar czech-char-regexp (string-append "[" czech-chars "]"))
498 (defvar czech-multiword-abbrevs
524 (define (czech-remove element list)
527    ((equal? element (car list)) (czech-remove element (cdr list)))
528    (t (cons (car list) (czech-remove element (cdr list))))))
530 (define (czech-number name)
534           (czech-number (czech-suffix name 1))))
537       (append (czech-number (string-before name comma))
539               (czech-number (string-after name comma)))))
543     (cons "nula" (czech-number (czech-suffix name 1))))
545     (czech-number-from-digits (czech-remove (car (symbolexplode " "))
548 (define (czech-digits-1 digits)
549   (if czech-group-digits
551         (while (> (- n czech-group-digits) 0)
552           (set! n (- n czech-group-digits)))
553         (append (czech-number (substring digits 0 n))
554                 (if (> (length digits) czech-group-digits)
555                     (czech-digits (czech-suffix digits n))
557       (czech-number digits)))
559 (define (czech-digits digits)
564     (append (czech-number "0") (czech-digits (czech-suffix digits 1))))
566     (czech-digits-1 digits))))
568 (define (czech-prepend-numprefix token name)
569   (if (czech-item.has-feat token 'numprefix)
570       (string-append (item.feat token 'numprefix) name)
573 (define (czech-number* token name)
574   (czech-number (czech-prepend-numprefix token name)))
576 (define (czech-number@ name)
583     (cons "nula" (czech-number (string-after name "0"))))
585     (czech-number name))))
587 (define (czech-number-from-digits digits)
629 	   (czech-number-from-digits (cdr digits)))))
644        (czech-number-from-digits (cdr digits))))
684 	      (czech-number-from-digits head-digits)
686 	   (czech-number-from-digits tail-digits)))))
688       (if czech-group-digits
689           (czech-digits (apply string-append digits))
690           (apply append (mapcar czech-number digits)))))))
692 (define (czech-tokenize-on-nonalphas string)
696    ((string-matches string (string-append "^" czech-char-regexp "*$"))
702       (while (string-matches (substring string i 1) czech-char-regexp)
714               (czech-tokenize-on-nonalphas
715                (czech-suffix string (+ i 1))))))))
717 (define (czech-token-to-words token name)
720    ((assoc_string (czech-downcase name) czech-multiword-abbrevs)
721     (apply append (mapcar (lambda (w) (czech-token-to-words token w))
722                           (cadr (assoc_string (czech-downcase name)
723                                               czech-multiword-abbrevs)))))
725          (item.prev token)
726          (czech-item.feat*? token "p.name" "[-+]?[0-9]+[.,]?[0-9]*"))
731              (czech-item.has-feat token 'numprefix))
732          (not (czech-item.has-feat token 'punc))
733          (item.feat token "n.whitespace" " ")
734          (string-matches (item.feat token "n.name") "^[0-9][0-9][0-9]$"))
735     (item.set_feat (item.next token) 'numprefix
736                    (czech-prepend-numprefix token name))
740          (czech-item.feat? token 'punc ".")
741          (item.next token)
742          (not (string-matches (item.feat token "n.whitespace") "  +")))
743     (if (not (czech-item.has-feat token 'punctype))
744         (item.set_feat token 'punctype 'num))
745     (append (czech-number* token name)
749          (not (czech-item.has-feat token 'numprefix)))
750     (czech-digits name))
752    ((let ((nname (czech-prepend-numprefix token name)))
756     (if (not (czech-item.has-feat token 'punctype))
757         (item.set_feat token 'punctype 'num))
758     (let ((nname (czech-prepend-numprefix token name)))
759       (if (and (czech-item.feat? token "n.name" "K�")
762            (czech-number (string-before nname ","))
767                   (czech-number hellers)
769           (czech-number nname))))
772          (string-matches (item.feat token "p.name") "^[-+]?[0-9]+,[-0-9]+$"))
778            (not (string-matches (item.feat token "p.name") capitals))
779            (not (string-matches (item.feat token "p.next") capitals))
786          (let ((expansion (cadr (assoc_string (czech-downcase phoneme)
787                                               czech-multiword-abbrevs))))
795        (lts.apply name 'czech-normalize))
803             (lts.apply name 'czech-normalize)))
805    ((and (string-matches name (string-append "^[^" czech-chars "0-9]+$"))
807          (czech-all-same (symbolexplode name)))
808     (list czech-token.separator-word-name))
809    ((and (string-matches name (string-append "^[^" czech-chars "0-9]$"))
810          (eqv? (length (item.daughters token)) 0)
811          (let ((punc (item.feat token 'punc)))
814                 (czech-all-same (symbolexplode punc)))))
815     (item.set_feat token 'punc 0)
816     (list czech-token.separator-word-name))
820          (not (string-matches (item.feat token "p.name")
822          (not (string-matches (item.feat token "p.name")
824          (not (string-matches (item.feat token "p.name")
826     (append (czech-number@ (string-before name ":"))
827             (czech-number@ (string-after name ":"))))
829     (append (czech-number@ (string-before name ":"))
830             (czech-number@ (string-before (string-after name ":") ":"))
831             (czech-number@ (string-after (string-after name ":") ":"))))
834     (append (czech-number (string-before name ":"))
836             (czech-number (string-after name ":"))))
842      (czech-token-to-words token (string-append
846      (czech-token-to-words token (string-after (substring name 1 1000) "-"))))
848    ((string-matches name (string-append "^" czech-char-regexp "+$"))
849     (if (string-equal (czech-downcase name) "�") ; Festival bug workaround
852    ((string-matches name (string-append "^[^" czech-chars "0-9]+$"))
855       (list czech-token.garbage-word-name))
857            (string-equal (item.name token) name)
858            (or (not (string-matches (item.feat token 'prepunctuation) "0?"))
859                (not (string-matches (item.feat token 'punctuation) "0?"))))
860       ;; This handles the case when the whole token consists of two or more
865      ((assoc_string name czech-multiword-abbrevs)
866       (cadr (assoc_string name czech-multiword-abbrevs)))
870    ((string-matches name (string-append "^" czech-char-regexp "+-$"))
871     (czech-token-to-words token (string-before name "-")))
873       (string-append "^[" czech-chars "0-9]+-[-" czech-chars "0-9]+$"))
875      (czech-token-to-words token (string-before name "-"))
877      (czech-token-to-words token (string-after name "-"))))
884       (append (czech-digits (substring name 0 i))
885               (czech-token-to-words token (czech-suffix name i)))))
896       (append (czech-token-to-words token (substring name 0 i))
897               (czech-digits (substring name i (- j i)))
898               (czech-token-to-words token (czech-suffix name j)))))
906     (if (not (string-matches name (string-append "^[-" czech-chars "]+$")))
907         (item.set_feat token 'punctype nil))
910      (mapcar (lambda (name) (czech-token-to-words token name))
911              (czech-tokenize-on-nonalphas name))))))
915 (defvar czech-lexicon-file "czech-lexicon.out")
917 (lex.create "czech")
918 (lex.set.phoneset "czech")
919 (lex.select "czech")
923     (let ((file (path-append (car dirs) czech-lexicon-file)))
932 (lex.set.lts.method 'czech-lts)
937 (defvar czech-guess-pos
967 (define (czech-word-pos? word pos)
969           (apply append (mapcar (lambda (p) (cdr (assoc p czech-guess-pos)))
972 (define (czech-pos-in-phrase-from word)
976                 (or (not (czech-item.feat*? w "R:Token.p.name" "0?"))
977                     (and (czech-item.feat*? w "p.R:Token.parent.punc" "0?")
978                          (czech-item.feat*? w "R:Token.parent.prepunctuation"
980                          (not (czech-item.feat*?
982                                (string-append "^[^" czech-chars "0-9]+$"))))))
987 (define (czech-pos-in-phrase-to word)
991                 (or (czech-item.feat*? w "R:Token.n.name" "0?")
992                     (and (czech-item.feat*? w "R:Token.parent.punc" "0?")
993                          (czech-item.feat*?
995                          (not (czech-item.feat*?
997                                (string-append "^[^" czech-chars "0-9]+$"))))))
1002 (define (czech-pos-last-in-phrase? word)
1003   (<= (czech-pos-in-phrase-to word) 1))
1005 (define (czech-pos utt)
1008      (let ((name (czech-downcase (item.name w)))
1009            (token (item.parent (item.relation w 'Token))))
1012         ((czech-item.has-feat w 'pos)
1015         ((and (czech-item.has-feat token 'punctype)
1016               (string-matches name (string-append "^[^" czech-chars "0-9]+$")))
1017          (item.set_feat w 'pos (item.feat token 'punctype)))
1028               (czech-pos-last-in-phrase? w))
1033               (or (czech-pos-last-in-phrase? w) ; final word
1034                   (czech-word-pos? (item.next w) '(prep0 prep))
1041                       (cdr (assoc 'question czech-guess-pos))))
1043         ;; Nothing special: check the czech-guess-pos tree
1045          (let ((pos-sets czech-guess-pos))
1055   (mapcar (lambda (token)
1056             (if (and (czech-item.feat*? token 'punc "0?")
1057                      (czech-item.feat? token "daughtern.R:Word.n.gpos" 'conj))
1058                 (item.set_feat token 'punc ",")))
1064 (define (czech-next-simple-punc word)
1065   (let ((unit (item.next (czech-word-stress-unit word))))
1069      ((string-matches (czech-stress-unit-punc unit) ".*[.?!;:,-]")
1070       (czech-stress-unit-punc unit))
1071      ((czech-item.feat? unit 'preelement 1)
1072       (czech-next-punc word))
1076 (define (czech-prev-simple-punc word)
1077   (let ((unit (item.prev (czech-word-stress-unit word))))
1081      ((string-matches (czech-stress-unit-punc unit) ".*[.?!;:,-]")
1082       (czech-stress-unit-punc unit))
1084       (let ((token (item.prev (item.parent (item.relation word 'Token)))))
1085         (while (and token (not (string-matches (item.feat token 'punc) ".+")))
1086           (set! token (item.prev token)))
1087         (let ((pword (and token
1088                           (item.next token)
1089                           (item.daughter1 (item.next token)))))
1091                    (czech-item.feat? (czech-word-stress-unit pword)
1093               (item.feat token 'punc)
1096 (defvar czech-phrase-cart-tree
1097   ;; Note: Additional corrections are applied in czech-adjust-phrase-breaks
1119          ((lisp_czech-next-token-punc matches "\".*")
1136 (define (czech-adjust-phrase-breaks utt)
1140              ((czech-item.feat? w 'pbreak 'XB1) ; "big" punctuations
1145                 ((czech-item.feat? w "R:SylStructure.name" 0)
1148                 ((or (czech-item.feat*? (czech-word-stress-unit w)
1151                      (czech-item.feat*? (czech-word-stress-unit w)
1157              ((czech-item.feat? w 'pbreak 'XB2) ; "comma" punctuations
1164                 ((czech-item.feat? w "R:SylStructure.name" 0)
1167                 ((czech-item.feat*? w "lisp_czech-next-simple-punc" ".*,")
1169                 ((czech-item.feat*? w "lisp_czech-prev-simple-punc" ".*,")
1171                 ((czech-item.feat*? w "lisp_czech-prev-simple-punc"
1174                 ((czech-item.feat*? (czech-word-stress-unit w)
1184 (define (czech-adjust-segments segments)
1200                    (czech-item.feat? item2 "ph_postnas" '+)
1207               (if czech-moravian
1212                    (czech-item.feat? item1 "ph_cvox" '-)
1216           (if (and (czech-item.feat? item1 "ph_cvox" '+)
1217                    (not (czech-item.feat? item1 "ph_partner" 0))
1221                        (czech-item.feat? item2 "ph_cvox" '-)
1222                        (and (czech-item.feat? item2 "ph_cvox" 'u)
1228                                    czech-proper-single-syl-prepositions))))))
1231           (if (and (czech-item.feat? item1 "ph_cvox" '-)
1232                    (not (czech-item.feat? item1 "ph_partner" 0))
1234                    (czech-item.feat? item2 "ph_cvox" '+)
1238         (czech-adjust-segments (cdr segments)))))
1240 (define (czech-adjust-phonetic-form utt)
1245         (czech-adjust-segments items)
1250 (define (czech-intonation-units utt)
1252   (let ((token (utt.relation utt 'Token)))
1253     (while token
1254       (if (or (czech-item.feat*? token "daughtern.pbreak" "[SBX]?B[12]?")
1255               (czech-item.feat*? token "daughtern.p.pbreak" "[SBX]?B[12]?"))
1256           (let ((w (item.daughtern token)))
1263       (set! token (item.next token))))
1275         (if (or (czech-item.feat*? w "sentence_break" 1)
1277                 ;; last token generates no words for whatever reason)
1291 (define (czech-yes-no-question int-unit)
1296        (not (czech-item.feat? int-unit
1299        (not (czech-item.feat? int-unit
1303 (defvar czech-proper-single-syl-prepositions
1306 (defvar czech-special-final-words
1309 (define (czech-syllable-kernels phonemes)
1313       (if (and (czech-item.feat? (car phonemes) 'ph_vc '-)
1314                (czech-item.feat? (car phonemes) 'ph_syl '+))
1317       (while (and phonemes (czech-item.feat? (car phonemes) 'ph_syl '-))
1323             (if (czech-item.feat? (car phonemes) 'ph_vc '-)
1325                             (czech-item.feat? (car phonemes) 'ph_vc '-)
1326                             (czech-item.feat? (car phonemes) 'ph_syl '+))
1330                         (czech-item.feat? (car phonemes) 'ph_vc '+)
1331                         (czech-item.feat? (car phonemes) 'ph_syl '+))
1336                 (while (and seg (or (czech-item.feat? seg 'ph_cvox '+)
1337                                     (czech-item.feat? seg 'ph_cvox 'u)))
1343 (define (czech-syllable-count phonemes)
1344   (length (czech-syllable-kernels phonemes)))
1346 (define (czech-stress-unit-phonemes unit)
1355 (define (czech-unit-syllable-count unit)
1356   (czech-syllable-count (czech-stress-unit-phonemes unit)))
1358 (define (czech-identify-stress-units sylwords)
1380                      (czech-item.feat? w 'pos 'prep0)
1382                      (and (member (czech-downcase (item.name w))
1383                                   czech-proper-single-syl-prepositions)
1384                           (not (czech-item.feat? w "pos" "se"))))
1392                 (if (and (<= (czech-unit-syllable-count last-unit) 1)
1394                                       czech-special-final-words)))
1401                               (<= (czech-unit-syllable-count (car units*)) 1))
1434                       (let ((len (czech-unit-syllable-count u))
1439                                     (<= (czech-unit-syllable-count
1459                                      (czech-random-choice '(t nil)))
1471                                    (czech-random-choice '(t nil)))
1492                              ((czech-random-choice '(t nil))
1513 (define (czech-stress-units utt)
1519       (let ((stress-units (czech-identify-stress-units
1553            (if (czech-yes-no-question int-unit) "FF-IT" "FF-KKL")
1565 (define (czech-word utt)
1567   (czech-intonation-units utt)
1568   (czech-stress-units utt)
1569   (czech-adjust-phrase-breaks utt)
1574 (define (czech-add-strokes utt)
1579       (if (and (czech-item.feat? i "daughter1.daughter1.ph_vc" '+)
1581                (not (czech-item.feat? i "daughter1.daughter1.R:Segment.p.name"
1588 (define (czech-pause-breaks utt)
1594        (if (czech-item.feat? w "pbreak" 'SB)
1598 (define (czech-pause utt)
1599   (czech-pause-breaks utt)
1600   (czech-add-strokes utt)
1601   (czech-adjust-phonetic-form utt)
1606 (defvar czech-accent-cart-tree '(NONE))
1610 (defvar czech-int-contours
1661 (defvar czech-int-contour-tree
1679 (define (czech-int-select-contours utt)
1685         (let ((contourtype (wagon_predict unit czech-int-contour-tree)))
1688           (let ((nsyls (czech-unit-syllable-count unit)))
1689             (let ((contour (czech-random-choice
1692                                         czech-int-contours)))))
1709               (if (czech-item.feat? unit 'preelement 1)
1733             (kernels (czech-syllable-kernels
1734                       (czech-stress-unit-phonemes unit))))
1750                       (while (czech-item.feat? (cadr k*) 'ph_vc '+)
1763                   (while (or (czech-item.feat? next-k 'ph_cvox '+)
1764                              (czech-item.feat? next-k 'ph_cvox 'u))
1777 (defvar czech-int-simple-params '((f0_mean 100) (f0_std 10)))
1779 (define (czech-int-targets utt syl)
1790           (let ((contourval (and (czech-item.has-feat s 'contourval)
1815           (set! times-values (cons (list (czech-max (- (item.feat last-seg 'end) 0.01)
1823 (defvar czech-phoneme-durations
1867 (defvar czech-silence-durations
1870 (defvar czech-stress-duration-factors
1884 (defvar czech-duration-random-factor 0.2)
1886 (define (czech-duration-pauses utt)
1891                                    czech-silence-durations)))
1900                    (* 10 (+ min (* (- max min) (czech-rand)))))))))
1903 (define (czech-duration-factors utt)
1906       (let ((nphones (length (czech-stress-unit-phonemes sunit))))
1912         (let ((factor (cadr (assoc nphones czech-stress-duration-factors))))
1933       (if (eqv? (czech-syllable-count phonemes) 1)
1934           (let ((durfact (cadr (assoc (czech-min (length phonemes) 12)
1935                                       czech-stress-duration-factors))))
1939 (define (czech-duration-compute utt)
1948                                                 czech-phoneme-durations*)))))))
1951 (define (czech-duration utt)
1952   (czech-duration-pauses utt)
1953   (czech-duration-factors utt)
1954   (czech-duration-compute utt)
1959 (defvar czech-volume-scale 1.8)
1960 (defvar czech-volume-scale* nil)
1962 (define (czech-adjust-volume utt)
1963   (utt.wave.rescale utt czech-volume-scale*))
1967 (define (czech-translate-add-vowels utt)
1968   (if (and (string-equal (Param.get 'Language) 'czech)
1969            czech-insert-filling-vowels)
1988             (vowel? (lambda (ph) (czech-item.feat? ph 'ph_vc '+)))
2000 (define (czech-translate-phonemes utt)
2001   (if (and (string-equal (Param.get 'Language) 'czech)
2002            czech-phoneset-translation*)
2005          (let ((tr (assoc (item.name item) czech-phoneset-translation*)))
2010 (defvar czech-after-analysis-hooks
2011   (list czech-translate-add-vowels czech-translate-phonemes))
2015 (define (czech-reset-parameters)
2016   (set! czech-lts-extra-rules* czech-lts-extra-rules)
2017   (set! czech-int-simple-params* czech-int-simple-params)
2018   (set! czech-phoneme-durations* czech-phoneme-durations)
2019   (set! czech-volume-scale* czech-volume-scale)
2020   (set! czech-phoneset-translation* czech-phoneset-translation)
2021   (set! czech-after-analysis-hooks* czech-after-analysis-hooks)
2024 (define (voice-czech-common)
2026   (Param.set 'Language 'czech)
2028   (Param.set 'PhoneSet 'czech)
2029   (PhoneSet.select 'czech)
2032   (set! token.unknown_word_name czech-token.unknown-word-name)
2033   (set! token.whitespace czech-token.whitespace)
2034   (set! token.punctuation czech-token.punctuation)
2035   (set! token.prepunctuation czech-token.prepunctuation)
2036   (set! token_to_words czech-token-to-words)
2039   (lex.select "czech")
2041   (Param.set 'Word_Method 'czech-word)
2043   (set! guess_pos czech-guess-pos)      ; not actually used
2044   (Param.set 'POS_Method czech-pos)
2047   (set! phrase_cart_tree czech-phrase-cart-tree)
2051   (Param.set 'Pause_Method czech-pause)
2053   (set! int_accent_cart_tree czech-accent-cart-tree)
2054   (Param.set 'Int_Method czech-int-select-contours)
2055   (set! int_general_params (cons (list 'targ_func czech-int-targets)
2056                                  czech-int-simple-params*))
2059   (Param.set 'Duration_Method czech-duration)
2062   (set! after_analysis_hooks czech-after-analysis-hooks*)
2064   (set! after_synth_hooks (list czech-adjust-volume))
2067   (set! current-voice 'czech))
2069 (defmac (czech-proclaim-voice form)
2079     (let ((parameters `((language czech)
2086            (czech-reset-parameters)
2088            (voice-czech-common)
2094 (provide 'czech)