1 // Copyright 2016 Emilie Gillet.
2 //
3 // Author: Emilie Gillet (emilie.o.gillet@gmail.com)
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in
13 // all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 // THE SOFTWARE.
22 //
23 // See http://creativecommons.org/licenses/MIT/ for more information.
24 //
25 // -----------------------------------------------------------------------------
26 //
27 // SAM-inspired speech synth (as used in Shruthi/Ambika/Braids).
28 
29 #include "plaits/dsp/speech/sam_speech_synth.h"
30 
31 #include <algorithm>
32 
33 #include "stmlib/dsp/dsp.h"
34 #include "stmlib/dsp/parameter_interpolator.h"
35 
36 #include "plaits/dsp/oscillator/oscillator.h"
37 #include "plaits/resources.h"
38 
39 namespace plaits {
40 
41 using namespace std;
42 using namespace stmlib;
43 
Init()44 void SAMSpeechSynth::Init() {
45   phase_ = 0.0f;
46   frequency_ = 0.0f;
47   pulse_next_sample_ = 0.0f;
48   pulse_lp_ = 0.0f;
49 
50   fill(&formant_phase_[0], &formant_phase_[3], 0);
51   consonant_samples_ = 0;
52   consonant_index_ = 0.0f;
53 }
54 
// Phoneme data.
//
// One row per phoneme. Each row holds three formants, each written as a
// { frequency, amplitude } pair:
//  - frequency is a raw table value; InterpolatePhonemeData() multiplies it
//    by 8 * (1 + 2.5 * formant_shift) * 2^32 / kSampleRate to obtain the
//    phase increment of the formant oscillator.
//  - amplitude is an index into formant_amplitude_lut, so it has to stay
//    within the bounds of that table (16 entries).
//
// Render() addresses the first rows through the `vowel` parameter, which it
// scales by (kSAMNumVowels - 1.0001f) and interpolates between neighbouring
// rows, and addresses consonants at row kSAMNumVowels + consonant index.
// NOTE(review): the 17 rows presumably split into kSAMNumVowels vowels
// followed by kSAMNumConsonants consonants -- confirm against the header.

/* static */
SAMSpeechSynth::Phoneme SAMSpeechSynth::phonemes_[] = {
  { { { 60, 15 }, { 90, 13 }, { 200, 1 } } },
  { { { 40, 13 }, { 114, 12 }, { 139, 6 } } },
  { { { 33, 14 }, { 155, 12 }, { 209, 7 } } },
  { { { 22, 13 }, { 189, 10 }, { 247, 8 } } },
  { { { 51, 15 }, { 99, 12 }, { 195, 1 } } },
  { { { 29, 13 }, { 65, 8 }, { 180, 0 } } },
  { { { 13, 12 }, { 103, 3 }, { 182, 0 } } },
  { { { 20, 15 }, { 114, 3 }, { 213, 0 } } },
  { { { 13, 7 }, { 164, 3 }, { 222, 14 } } },
  { { { 13, 9 }, { 121, 9 }, { 254, 0 } } },
  { { { 40, 12 }, { 112, 10 }, { 114, 5 } } },
  { { { 24, 13 }, { 54, 8 }, { 157, 0 } } },
  { { { 33, 14 }, { 155, 12 }, { 166, 7 } } },
  { { { 36, 14 }, { 83, 8 }, { 249, 1 } } },
  { { { 40, 14 }, { 114, 12 }, { 139, 6 } } },
  { { { 13, 5 }, { 58, 5 }, { 182, 5 } } },
  { { { 13, 7 }, { 164, 10 }, { 222, 14 } } }
};
77 
// Linear gain for each of the 16 possible formant amplitude codes stored in
// phonemes_ (code 0 -> 0.03125, code 15 -> ~0.494). Successive entries grow
// by a roughly constant factor of about 1.2, i.e. the codes are spaced
// geometrically rather than linearly.
/* static */
float SAMSpeechSynth::formant_amplitude_lut[] = {
  0.03125000f,  0.03756299f,  0.04515131f,  0.05427259f,  0.06523652f,
  0.07841532f,  0.09425646f,  0.11329776f,  0.13618570f,  0.16369736f,
  0.19676682f,  0.23651683f,  0.28429697f,  0.34172946f,  0.41076422f,
  0.49374509f
};
85 
InterpolatePhonemeData(float phoneme,float formant_shift,uint32_t * formant_frequency,float * formant_amplitude)86 void SAMSpeechSynth::InterpolatePhonemeData(
87   float phoneme,
88   float formant_shift,
89   uint32_t* formant_frequency,
90   float* formant_amplitude) {
91   MAKE_INTEGRAL_FRACTIONAL(phoneme);
92 
93   const Phoneme& p_1 = phonemes_[phoneme_integral];
94   const Phoneme& p_2 = phonemes_[phoneme_integral + 1];
95 
96   formant_shift = 1.0f + formant_shift * 2.5f;
97   for (int i = 0; i < kSAMNumFormants; ++i) {
98     float f_1 = p_1.formant[i].frequency;
99     float f_2 = p_2.formant[i].frequency;
100     float f = f_1 + (f_2 - f_1) * phoneme_fractional;
101     f *= 8.0f * formant_shift * 4294967296.0f / kSampleRate;
102     formant_frequency[i] = static_cast<uint32_t>(f);
103 
104     float a_1 = formant_amplitude_lut[p_1.formant[i].amplitude];
105     float a_2 = formant_amplitude_lut[p_2.formant[i].amplitude];
106     formant_amplitude[i] = a_1 + (a_2 - a_1) * phoneme_fractional;
107   }
108 }
109 
Render(bool consonant,float frequency,float vowel,float formant_shift,float * excitation,float * output,size_t size)110 void SAMSpeechSynth::Render(
111     bool consonant,
112     float frequency,
113     float vowel,
114     float formant_shift,
115     float* excitation,
116     float* output,
117     size_t size) {
118   if (frequency >= 0.0625f) {
119     frequency = 0.0625f;
120   }
121 
122   if (consonant) {
123     consonant_samples_ = kSampleRate * 0.05f;
124     int r = (vowel + 3.0f * frequency + 7.0f * formant_shift) * 8.0f;
125     consonant_index_ = (r % kSAMNumConsonants);
126   }
127   consonant_samples_ -= min(consonant_samples_, size);
128 
129   float phoneme = consonant_samples_
130       ? (consonant_index_ + kSAMNumVowels)
131       : vowel * (kSAMNumVowels - 1.0001f);
132 
133   uint32_t formant_frequency[kSAMNumFormants];
134   float formant_amplitude[kSAMNumFormants];
135 
136   InterpolatePhonemeData(
137       phoneme,
138       formant_shift,
139       formant_frequency,
140       formant_amplitude);
141 
142   ParameterInterpolator fm(&frequency_, frequency, size);
143   float pulse_next_sample = pulse_next_sample_;
144 
145   while (size--) {
146     float pulse_this_sample = pulse_next_sample;
147     pulse_next_sample = 0.0f;
148     const float frequency = fm.Next();
149     phase_ += frequency;
150 
151     if (phase_ >= 1.0f) {
152       phase_ -= 1.0f;
153       float t = phase_ / frequency;
154       formant_phase_[0] = static_cast<uint32_t>(
155           t * static_cast<float>(formant_frequency[0]));
156       formant_phase_[1] = static_cast<uint32_t>(
157           t * static_cast<float>(formant_frequency[1]));
158       formant_phase_[2] = static_cast<uint32_t>(
159           t * static_cast<float>(formant_frequency[2]));
160       pulse_this_sample -= ThisBlepSample(t);
161       pulse_next_sample -= NextBlepSample(t);
162     } else {
163       formant_phase_[0] += formant_frequency[0];
164       formant_phase_[1] += formant_frequency[1];
165       formant_phase_[2] += formant_frequency[2];
166     }
167     pulse_next_sample += phase_;
168 
169     float d = pulse_this_sample - 0.5f - pulse_lp_;
170     pulse_lp_ += min(16.0f * frequency, 1.0f) * d;
171     *excitation++ = d;
172 
173     float s = 0;
174     s += lut_sine[formant_phase_[0] >> 22] * formant_amplitude[0];
175     s += lut_sine[formant_phase_[1] >> 22] * formant_amplitude[1];
176     s += lut_sine[formant_phase_[2] >> 22] * formant_amplitude[2];
177     s *= (1.0f - phase_);
178     *output++ = s;
179   }
180   pulse_next_sample_ = pulse_next_sample;
181 }
182 
183 }  // namespace plaits
184