1 /*
2  * Reverb.cpp
3  * ----------
4  * Purpose: Mixing code for reverb.
5  * Notes  : Ugh... This should really be removed at some point.
6  * Authors: Olivier Lapicque
7  *          OpenMPT Devs
8  * The OpenMPT source code is released under the BSD license. Read LICENSE for more details.
9  */
10 
11 
12 #include "stdafx.h"
13 
14 #ifndef NO_REVERB
15 #include "Reverb.h"
16 #include "../soundlib/MixerLoops.h"
17 #include "mpt/base/numbers.hpp"
18 
19 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
20 #include <emmintrin.h>
21 #endif
22 
23 #endif // NO_REVERB
24 
25 
26 OPENMPT_NAMESPACE_BEGIN
27 
28 
29 #ifndef NO_REVERB
30 
31 
32 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
33 // Load two 32-bit values
Load64SSE(const int32 * x)34 static MPT_FORCEINLINE __m128i Load64SSE(const int32 *x) { return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(x)); }
35 // Load four 16-bit values
Load64SSE(const LR16 (& x)[2])36 static MPT_FORCEINLINE __m128i Load64SSE(const LR16 (&x)[2]) { return _mm_loadl_epi64(&reinterpret_cast<const __m128i &>(x)); }
37 // Store two 32-bit or four 16-bit values from register
Store64SSE(int32 * dst,__m128i src)38 static MPT_FORCEINLINE void Store64SSE(int32 *dst, __m128i src) { return _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), src); }
Store64SSE(LR16 (& dst)[2],__m128i src)39 static MPT_FORCEINLINE void Store64SSE(LR16 (&dst)[2], __m128i src) { return _mm_storel_epi64(&reinterpret_cast<__m128i &>(dst), src); }
40 #endif
41 
42 
CReverb()43 CReverb::CReverb()
44 {
45 	// Reverb mix buffers
46 	MemsetZero(g_RefDelay);
47 	MemsetZero(g_LateReverb);
48 }
49 
50 
OnePoleLowPassCoef(int32 scale,float g,float F_c,float F_s)51 static int32 OnePoleLowPassCoef(int32 scale, float g, float F_c, float F_s)
52 {
53 	if(g > 0.999999f) return 0;
54 
55 	g *= g;
56 	double scale_over_1mg = scale / (1.0 - g);
57 	double cosw = std::cos((2.0 * mpt::numbers::pi) * F_c / F_s);
58 	return mpt::saturate_round<int32>((1.0 - (std::sqrt((g + g) * (1.0 - cosw) - g * g * (1.0 - cosw * cosw)) + g * cosw)) * scale_over_1mg);
59 }
60 
mBToLinear(int32 value_mB)61 static float mBToLinear(int32 value_mB)
62 {
63 	if(!value_mB) return 1;
64 	if(value_mB <= -100000) return 0;
65 
66 	const double val = value_mB * 3.321928094887362304 / (100.0 * 20.0);	// log2(10)/(100*20)
67 	return static_cast<float>(std::pow(2.0, val - static_cast<int32>(0.5 + val)));
68 }
69 
mBToLinear(int32 scale,int32 value_mB)70 static int32 mBToLinear(int32 scale, int32 value_mB)
71 {
72 	return mpt::saturate_round<int32>(mBToLinear(value_mB) * scale);
73 }
74 
75 static constexpr std::pair<SNDMIX_REVERB_PROPERTIES, const char *> ReverbPresets[NUM_REVERBTYPES] =
76 {
77 	// Examples simulating General MIDI 2'musical' reverb presets
78 	// Name  (Decay time)  Description
79 	// Plate       (1.3s)  A plate reverb simulation.
80 	{{ -1000, -200, 1.30f,0.90f,     0,0.002f,     0,0.010f,100.0f, 75.0f }, "GM Plate"},
81 	// Small Room  (1.1s)  A small size room with a length of 5m or so.
82 	{{ -1000, -600, 1.10f,0.83f,  -400,0.005f,   500,0.010f,100.0f,100.0f }, "GM Small Room"},
83 	// Medium Room (1.3s)  A medium size room with a length of 10m or so.
84 	{{ -1000, -600, 1.30f,0.83f, -1000,0.010f,  -200,0.020f,100.0f,100.0f }, "GM Medium Room"},
85 	// Large Room  (1.5s)  A large size room suitable for live performances.
86 	{{ -1000, -600, 1.50f,0.83f, -1600,0.020f, -1000,0.040f,100.0f,100.0f }, "GM Large Room"},
87 	// Medium Hall (1.8s)  A medium size concert hall.
88 	{{ -1000, -600, 1.80f,0.70f, -1300,0.015f,  -800,0.030f,100.0f,100.0f }, "GM Medium Hall"},
89 	// Large Hall  (1.8s)  A large size concert hall suitable for a full orchestra.
90 	{{ -1000, -600, 1.80f,0.70f, -2000,0.030f, -1400,0.060f,100.0f,100.0f }, "GM Large Hall"},
91 
92 	{{ -1000, -100, 1.49f,0.83f, -2602,0.007f,   200,0.011f,100.0f,100.0f }, "Generic"},
93 	{{ -1000,-6000, 0.17f,0.10f, -1204,0.001f,   207,0.002f,100.0f,100.0f }, "Padded Cell"},
94 	{{ -1000, -454, 0.40f,0.83f, -1646,0.002f,    53,0.003f,100.0f,100.0f }, "Room"},
95 	{{ -1000,-1200, 1.49f,0.54f,  -370,0.007f,  1030,0.011f,100.0f, 60.0f }, "Bathroom"},
96 	{{ -1000,-6000, 0.50f,0.10f, -1376,0.003f, -1104,0.004f,100.0f,100.0f }, "Living Room"},
97 	{{ -1000, -300, 2.31f,0.64f,  -711,0.012f,    83,0.017f,100.0f,100.0f }, "Stone Room"},
98 	{{ -1000, -476, 4.32f,0.59f,  -789,0.020f,  -289,0.030f,100.0f,100.0f }, "Auditorium"},
99 	{{ -1000, -500, 3.92f,0.70f, -1230,0.020f,    -2,0.029f,100.0f,100.0f }, "Concert Hall"},
100 	{{ -1000,    0, 2.91f,1.30f,  -602,0.015f,  -302,0.022f,100.0f,100.0f }, "Cave"},
101 	{{ -1000, -698, 7.24f,0.33f, -1166,0.020f,    16,0.030f,100.0f,100.0f }, "Arena"},
102 	{{ -1000,-1000,10.05f,0.23f,  -602,0.020f,   198,0.030f,100.0f,100.0f }, "Hangar"},
103 	{{ -1000,-4000, 0.30f,0.10f, -1831,0.002f, -1630,0.030f,100.0f,100.0f }, "Carpeted Hallway"},
104 	{{ -1000, -300, 1.49f,0.59f, -1219,0.007f,   441,0.011f,100.0f,100.0f }, "Hallway"},
105 	{{ -1000, -237, 2.70f,0.79f, -1214,0.013f,   395,0.020f,100.0f,100.0f }, "Stone Corridor"},
106 	{{ -1000, -270, 1.49f,0.86f, -1204,0.007f,    -4,0.011f,100.0f,100.0f }, "Alley"},
107 	{{ -1000,-3300, 1.49f,0.54f, -2560,0.162f,  -613,0.088f, 79.0f,100.0f }, "Forest"},
108 	{{ -1000, -800, 1.49f,0.67f, -2273,0.007f, -2217,0.011f, 50.0f,100.0f }, "City"},
109 	{{ -1000,-2500, 1.49f,0.21f, -2780,0.300f, -2014,0.100f, 27.0f,100.0f }, "Mountains"},
110 	{{ -1000,-1000, 1.49f,0.83f,-10000,0.061f,   500,0.025f,100.0f,100.0f }, "Quarry"},
111 	{{ -1000,-2000, 1.49f,0.50f, -2466,0.179f, -2514,0.100f, 21.0f,100.0f }, "Plain"},
112 	{{ -1000,    0, 1.65f,1.50f, -1363,0.008f, -1153,0.012f,100.0f,100.0f }, "Parking Lot"},
113 	{{ -1000,-1000, 2.81f,0.14f,   429,0.014f,   648,0.021f, 80.0f, 60.0f }, "Sewer Pipe"},
114 	{{ -1000,-4000, 1.49f,0.10f,  -449,0.007f,  1700,0.011f,100.0f,100.0f }, "Underwater"},
115 };
116 
GetReverbPresetName(uint32 preset)117 mpt::ustring GetReverbPresetName(uint32 preset)
118 {
119 	return (preset < NUM_REVERBTYPES) ? mpt::ToUnicode(mpt::Charset::ASCII, ReverbPresets[preset].second) : mpt::ustring{};
120 }
121 
GetReverbPreset(uint32 preset)122 const SNDMIX_REVERB_PROPERTIES *GetReverbPreset(uint32 preset)
123 {
124 	return (preset < NUM_REVERBTYPES) ? &ReverbPresets[preset].first : nullptr;
125 }
126 
127 //////////////////////////////////////////////////////////////////////////
128 //
129 // I3DL2 environmental reverb support
130 //
131 
132 struct REFLECTIONPRESET
133 {
134 	int32 lDelayFactor;
135 	int16 sGainLL, sGainRR, sGainLR, sGainRL;
136 };
137 
138 const REFLECTIONPRESET gReflectionsPreset[ENVIRONMENT_NUMREFLECTIONS] =
139 {
140 	// %Delay, ll,    rr,   lr,    rl
141 	{0,    9830,   6554,	  0,     0},
142 	{10,   6554,  13107,	  0,     0},
143 	{24,  -9830,  13107,	  0,     0},
144 	{36,  13107,  -6554,      0,     0},
145 	{54,  16384,  16384,  -1638, -1638},
146 	{61, -13107,   8192,   -328,  -328},
147 	{73, -11468, -11468,  -3277,  3277},
148 	{87,  13107,  -9830,   4916, -4916}
149 };
150 
151 ////////////////////////////////////////////////////////////////////////////////////
152 //
153 // Implementation
154 //
155 
ftol(float f)156 MPT_FORCEINLINE int32 ftol(float f) { return static_cast<int32>(f); }
157 
I3dl2_to_Generic(const SNDMIX_REVERB_PROPERTIES * pReverb,EnvironmentReverb * pRvb,float flOutputFreq,int32 lMinRefDelay,int32 lMaxRefDelay,int32 lMinRvbDelay,int32 lMaxRvbDelay,int32 lTankLength)158 static void I3dl2_to_Generic(
159 				const SNDMIX_REVERB_PROPERTIES *pReverb,
160 				EnvironmentReverb *pRvb,
161 				float flOutputFreq,
162 				int32 lMinRefDelay,
163 				int32 lMaxRefDelay,
164 				int32 lMinRvbDelay,
165 				int32 lMaxRvbDelay,
166 				int32 lTankLength)
167 {
168 	float flDelayFactor, flDelayFactorHF, flDecayTimeHF;
169 	int32 lDensity, lTailDiffusion;
170 
171 	// Common parameters
172 	pRvb->ReverbLevel = pReverb->lReverb;
173 	pRvb->ReflectionsLevel = pReverb->lReflections;
174 	pRvb->RoomHF = pReverb->lRoomHF;
175 
176 	// HACK: Somewhat normalize the reverb output level
177 	int32 lMaxLevel = (pRvb->ReverbLevel > pRvb->ReflectionsLevel) ? pRvb->ReverbLevel : pRvb->ReflectionsLevel;
178 	if (lMaxLevel < -600)
179 	{
180 		lMaxLevel += 600;
181 		pRvb->ReverbLevel -= lMaxLevel;
182 		pRvb->ReflectionsLevel -= lMaxLevel;
183 	}
184 
185 	// Pre-Diffusion factor (for both reflections and late reverb)
186 	lDensity = 8192 + ftol(79.31f * pReverb->flDensity);
187 	pRvb->PreDiffusion = lDensity;
188 
189 	// Late reverb diffusion
190 	lTailDiffusion = ftol((0.15f + pReverb->flDiffusion * (0.36f*0.01f)) * 32767.0f);
191 	if (lTailDiffusion > 0x7f00) lTailDiffusion = 0x7f00;
192 	pRvb->TankDiffusion = lTailDiffusion;
193 
194 	// Verify reflections and reverb delay parameters
195 	float flRefDelay = pReverb->flReflectionsDelay;
196 	if (flRefDelay > 0.100f) flRefDelay = 0.100f;
197 	int32 lReverbDelay = ftol(pReverb->flReverbDelay * flOutputFreq);
198 	int32 lReflectionsDelay = ftol(flRefDelay * flOutputFreq);
199 	int32 lReverbDecayTime = ftol(pReverb->flDecayTime * flOutputFreq);
200 	if (lReflectionsDelay < lMinRefDelay)
201 	{
202 		lReverbDelay -= (lMinRefDelay - lReflectionsDelay);
203 		lReflectionsDelay = lMinRefDelay;
204 	}
205 	if (lReflectionsDelay > lMaxRefDelay)
206 	{
207 		lReverbDelay += (lReflectionsDelay - lMaxRefDelay);
208 		lReflectionsDelay = lMaxRefDelay;
209 	}
210 	// Adjust decay time when adjusting reverb delay
211 	if (lReverbDelay < lMinRvbDelay)
212 	{
213 		lReverbDecayTime -= (lMinRvbDelay - lReverbDelay);
214 		lReverbDelay = lMinRvbDelay;
215 	}
216 	if (lReverbDelay > lMaxRvbDelay)
217 	{
218 		lReverbDecayTime += (lReverbDelay - lMaxRvbDelay);
219 		lReverbDelay = lMaxRvbDelay;
220 	}
221 	pRvb->ReverbDelay = lReverbDelay;
222 	pRvb->ReverbDecaySamples = lReverbDecayTime;
223 	// Setup individual reflections delay and gains
224 	for (uint32 iRef=0; iRef<ENVIRONMENT_NUMREFLECTIONS; iRef++)
225 	{
226 		EnvironmentReflection &ref = pRvb->Reflections[iRef];
227 		ref.Delay = lReflectionsDelay + (gReflectionsPreset[iRef].lDelayFactor * lReverbDelay + 50)/100;
228 		ref.GainLL = gReflectionsPreset[iRef].sGainLL;
229 		ref.GainRL = gReflectionsPreset[iRef].sGainRL;
230 		ref.GainLR = gReflectionsPreset[iRef].sGainLR;
231 		ref.GainRR = gReflectionsPreset[iRef].sGainRR;
232 	}
233 
234 	// Late reverb decay time
235 	if (lTankLength < 10) lTankLength = 10;
236 	flDelayFactor = (lReverbDecayTime <= lTankLength) ? 1.0f : ((float)lTankLength / (float)lReverbDecayTime);
237 	pRvb->ReverbDecay = ftol(std::pow(0.001f, flDelayFactor) * 32768.0f);
238 
239 	// Late Reverb Decay HF
240 	flDecayTimeHF = (float)lReverbDecayTime * pReverb->flDecayHFRatio;
241 	flDelayFactorHF = (flDecayTimeHF <= (float)lTankLength) ? 1.0f : ((float)lTankLength / flDecayTimeHF);
242 	pRvb->flReverbDamping = std::pow(0.001f, flDelayFactorHF);
243 }
244 
245 
Shutdown(MixSampleInt & gnRvbROfsVol,MixSampleInt & gnRvbLOfsVol)246 void CReverb::Shutdown(MixSampleInt &gnRvbROfsVol, MixSampleInt &gnRvbLOfsVol)
247 {
248 	gnReverbSend = false;
249 
250 	gnRvbLOfsVol = 0;
251 	gnRvbROfsVol = 0;
252 
253 	// Clear out all reverb state
254 	g_bLastInPresent = false;
255 	g_bLastOutPresent = false;
256 	g_nLastRvbIn_xl = g_nLastRvbIn_xr = 0;
257 	g_nLastRvbIn_yl = g_nLastRvbIn_yr = 0;
258 	g_nLastRvbOut_xl = g_nLastRvbOut_xr = 0;
259 	MemsetZero(gnDCRRvb_X1);
260 	MemsetZero(gnDCRRvb_Y1);
261 
262 	// Zero internal buffers
263 	MemsetZero(g_LateReverb.Diffusion1);
264 	MemsetZero(g_LateReverb.Diffusion2);
265 	MemsetZero(g_LateReverb.Delay1);
266 	MemsetZero(g_LateReverb.Delay2);
267 	MemsetZero(g_RefDelay.RefDelayBuffer);
268 	MemsetZero(g_RefDelay.PreDifBuffer);
269 	MemsetZero(g_RefDelay.RefOut);
270 }
271 
272 
Initialize(bool bReset,MixSampleInt & gnRvbROfsVol,MixSampleInt & gnRvbLOfsVol,uint32 MixingFreq)273 void CReverb::Initialize(bool bReset, MixSampleInt &gnRvbROfsVol, MixSampleInt &gnRvbLOfsVol, uint32 MixingFreq)
274 {
275 	if (m_Settings.m_nReverbType >= NUM_REVERBTYPES) m_Settings.m_nReverbType = 0;
276 	const SNDMIX_REVERB_PROPERTIES *rvbPreset = &ReverbPresets[m_Settings.m_nReverbType].first;
277 
278 	if ((rvbPreset != m_currentPreset) || (bReset))
279 	{
280 		// Reverb output frequency is half of the dry output rate
281 		float flOutputFrequency = (float)MixingFreq;
282 		EnvironmentReverb rvb;
283 
284 		// Reset reverb parameters
285 		m_currentPreset = rvbPreset;
286 		I3dl2_to_Generic(rvbPreset, &rvb, flOutputFrequency,
287 							RVBMINREFDELAY, RVBMAXREFDELAY,
288 							RVBMINRVBDELAY, RVBMAXRVBDELAY,
289 							( RVBDIF1L_LEN + RVBDIF1R_LEN
290 							+ RVBDIF2L_LEN + RVBDIF2R_LEN
291 							+ RVBDLY1L_LEN + RVBDLY1R_LEN
292 							+ RVBDLY2L_LEN + RVBDLY2R_LEN) / 2);
293 
294 		// Store reverb decay time (in samples) for reverb auto-shutdown
295 		gnReverbDecaySamples = rvb.ReverbDecaySamples;
296 
297 		// Room attenuation at high frequencies
298 		int32 nRoomLP;
299 		nRoomLP = OnePoleLowPassCoef(32768, mBToLinear(rvb.RoomHF), 5000, flOutputFrequency);
300 		g_RefDelay.nCoeffs.c.l = (int16)nRoomLP;
301 		g_RefDelay.nCoeffs.c.r = (int16)nRoomLP;
302 
303 		// Pre-Diffusion factor (for both reflections and late reverb)
304 		g_RefDelay.nPreDifCoeffs.c.l = (int16)(rvb.PreDiffusion*2);
305 		g_RefDelay.nPreDifCoeffs.c.r = (int16)(rvb.PreDiffusion*2);
306 
307 		// Setup individual reflections delay and gains
308 		for (uint32 iRef=0; iRef<8; iRef++)
309 		{
310 			SWRvbReflection &ref = g_RefDelay.Reflections[iRef];
311 			ref.DelayDest = rvb.Reflections[iRef].Delay;
312 			ref.Delay = ref.DelayDest;
313 			ref.Gains[0].c.l = rvb.Reflections[iRef].GainLL;
314 			ref.Gains[0].c.r = rvb.Reflections[iRef].GainRL;
315 			ref.Gains[1].c.l = rvb.Reflections[iRef].GainLR;
316 			ref.Gains[1].c.r = rvb.Reflections[iRef].GainRR;
317 		}
318 		g_LateReverb.nReverbDelay = rvb.ReverbDelay;
319 
320 		// Reflections Master Gain
321 		uint32 lReflectionsGain = 0;
322 		if (rvb.ReflectionsLevel > -9000)
323 		{
324 			lReflectionsGain = mBToLinear(32768, rvb.ReflectionsLevel);
325 		}
326 		g_RefDelay.lMasterGain = lReflectionsGain;
327 
328 		// Late reverb master gain
329 		uint32 lReverbGain = 0;
330 		if (rvb.ReverbLevel > -9000)
331 		{
332 			lReverbGain = mBToLinear(32768, rvb.ReverbLevel);
333 		}
334 		g_LateReverb.lMasterGain = lReverbGain;
335 
336 		// Late reverb diffusion
337 		uint32 nTailDiffusion = rvb.TankDiffusion;
338 		if (nTailDiffusion > 0x7f00) nTailDiffusion = 0x7f00;
339 		g_LateReverb.nDifCoeffs[0].c.l = (int16)nTailDiffusion;
340 		g_LateReverb.nDifCoeffs[0].c.r = (int16)nTailDiffusion;
341 		g_LateReverb.nDifCoeffs[1].c.l = (int16)nTailDiffusion;
342 		g_LateReverb.nDifCoeffs[1].c.r = (int16)nTailDiffusion;
343 		g_LateReverb.Dif2InGains[0].c.l = 0x7000;
344 		g_LateReverb.Dif2InGains[0].c.r = 0x1000;
345 		g_LateReverb.Dif2InGains[1].c.l = 0x1000;
346 		g_LateReverb.Dif2InGains[1].c.r = 0x7000;
347 
348 		// Late reverb decay time
349 		int32 nReverbDecay = rvb.ReverbDecay;
350 		Limit(nReverbDecay, 0, 0x7ff0);
351 		g_LateReverb.nDecayDC[0].c.l = (int16)nReverbDecay;
352 		g_LateReverb.nDecayDC[0].c.r = 0;
353 		g_LateReverb.nDecayDC[1].c.l = 0;
354 		g_LateReverb.nDecayDC[1].c.r = (int16)nReverbDecay;
355 
356 		// Late Reverb Decay HF
357 		float fReverbDamping = rvb.flReverbDamping * rvb.flReverbDamping;
358 		int32 nDampingLowPass;
359 
360 		nDampingLowPass = OnePoleLowPassCoef(32768, fReverbDamping, 5000, flOutputFrequency);
361 		Limit(nDampingLowPass, 0x100, 0x7f00);
362 
363 		g_LateReverb.nDecayLP[0].c.l = (int16)nDampingLowPass;
364 		g_LateReverb.nDecayLP[0].c.r = 0;
365 		g_LateReverb.nDecayLP[1].c.l = 0;
366 		g_LateReverb.nDecayLP[1].c.r = (int16)nDampingLowPass;
367 	}
368 	if (bReset)
369 	{
370 		gnReverbSamples = 0;
371 		Shutdown(gnRvbROfsVol, gnRvbLOfsVol);
372 	}
373 	// Wait at least 5 seconds before shutting down the reverb
374 	if (gnReverbDecaySamples < MixingFreq*5)
375 	{
376 		gnReverbDecaySamples = MixingFreq*5;
377 	}
378 }
379 
380 
TouchReverbSendBuffer(MixSampleInt * MixReverbBuffer,MixSampleInt & gnRvbROfsVol,MixSampleInt & gnRvbLOfsVol,uint32 nSamples)381 void CReverb::TouchReverbSendBuffer(MixSampleInt *MixReverbBuffer, MixSampleInt &gnRvbROfsVol, MixSampleInt &gnRvbLOfsVol, uint32 nSamples)
382 {
383 	if(!gnReverbSend)
384 	{ // and we did not clear the buffer yet, do it now because we will get new data
385 		StereoFill(MixReverbBuffer, nSamples, gnRvbROfsVol, gnRvbLOfsVol);
386 	}
387 	gnReverbSend = true; // we will have to process reverb
388 }
389 
390 
391 // Reverb
Process(MixSampleInt * MixSoundBuffer,MixSampleInt * MixReverbBuffer,MixSampleInt & gnRvbROfsVol,MixSampleInt & gnRvbLOfsVol,uint32 nSamples)392 void CReverb::Process(MixSampleInt *MixSoundBuffer, MixSampleInt *MixReverbBuffer, MixSampleInt &gnRvbROfsVol, MixSampleInt &gnRvbLOfsVol, uint32 nSamples)
393 {
394 	if((!gnReverbSend) && (!gnReverbSamples))
395 	{ // no data is sent to reverb and reverb decayed completely
396 		return;
397 	}
398 	if(!gnReverbSend)
399 	{ // no input data in MixReverbBuffer, so the buffer got not cleared in TouchReverbSendBuffer(), do it now for decay
400 		StereoFill(MixReverbBuffer, nSamples, gnRvbROfsVol, gnRvbLOfsVol);
401 	}
402 
403 	uint32 nIn, nOut;
404 	// Dynamically adjust reverb master gains
405 	int32 lMasterGain;
406 	lMasterGain = ((g_RefDelay.lMasterGain * m_Settings.m_nReverbDepth) >> 4);
407 	if (lMasterGain > 0x7fff) lMasterGain = 0x7fff;
408 	g_RefDelay.ReflectionsGain.c.l = (int16)lMasterGain;
409 	g_RefDelay.ReflectionsGain.c.r = (int16)lMasterGain;
410 	lMasterGain = ((g_LateReverb.lMasterGain * m_Settings.m_nReverbDepth) >> 4);
411 	if (lMasterGain > 0x10000) lMasterGain = 0x10000;
412 	g_LateReverb.RvbOutGains[0].c.l = (int16)((lMasterGain+0x7f) >> 3);	// l->l
413 	g_LateReverb.RvbOutGains[0].c.r = (int16)((lMasterGain+0xff) >> 4);	// r->l
414 	g_LateReverb.RvbOutGains[1].c.l = (int16)((lMasterGain+0xff) >> 4);	// l->r
415 	g_LateReverb.RvbOutGains[1].c.r = (int16)((lMasterGain+0x7f) >> 3);	// r->r
416 	// Process Dry/Wet Mix
417 	int32 lMaxRvbGain = (g_RefDelay.lMasterGain > g_LateReverb.lMasterGain) ? g_RefDelay.lMasterGain : g_LateReverb.lMasterGain;
418 	if (lMaxRvbGain > 32768) lMaxRvbGain = 32768;
419 	int32 lDryVol = (36 - m_Settings.m_nReverbDepth)>>1;
420 	if (lDryVol < 8) lDryVol = 8;
421 	if (lDryVol > 16) lDryVol = 16;
422 	lDryVol = 16 - (((16-lDryVol) * lMaxRvbGain) >> 15);
423 	ReverbDryMix(MixSoundBuffer, MixReverbBuffer, lDryVol, nSamples);
424 	// Downsample 2x + 1st stage of lowpass filter
425 	nIn = ReverbProcessPreFiltering1x(MixReverbBuffer, nSamples);
426 	nOut = nIn;
427 	// Main reverb processing: split into small chunks (needed for short reverb delays)
428 	// Reverb Input + Low-Pass stage #2 + Pre-diffusion
429 	if (nIn > 0) ProcessPreDelay(&g_RefDelay, MixReverbBuffer, nIn);
430 	// Process Reverb Reflections and Late Reverberation
431 	int32 *pRvbOut = MixReverbBuffer;
432 	uint32 nRvbSamples = nOut;
433 	while (nRvbSamples > 0)
434 	{
435 		uint32 nPosRef = g_RefDelay.nRefOutPos & SNDMIX_REVERB_DELAY_MASK;
436 		uint32 nPosRvb = (nPosRef - g_LateReverb.nReverbDelay) & SNDMIX_REVERB_DELAY_MASK;
437 		uint32 nmax1 = (SNDMIX_REVERB_DELAY_MASK+1) - nPosRef;
438 		uint32 nmax2 = (SNDMIX_REVERB_DELAY_MASK+1) - nPosRvb;
439 		nmax1 = (nmax1 < nmax2) ? nmax1 : nmax2;
440 		uint32 n = nRvbSamples;
441 		if (n > nmax1) n = nmax1;
442 		if (n > 64) n = 64;
443 		// Reflections output + late reverb delay
444 		ProcessReflections(&g_RefDelay, &g_RefDelay.RefOut[nPosRef], pRvbOut, n);
445 		// Late Reverberation
446 		ProcessLateReverb(&g_LateReverb, &g_RefDelay.RefOut[nPosRvb], pRvbOut, n);
447 		// Update delay positions
448 		g_RefDelay.nRefOutPos = (g_RefDelay.nRefOutPos + n) & SNDMIX_REVERB_DELAY_MASK;
449 		g_RefDelay.nDelayPos = (g_RefDelay.nDelayPos + n) & SNDMIX_REFLECTIONS_DELAY_MASK;
450 		pRvbOut += n*2;
451 		nRvbSamples -= n;
452 	}
453 	// Adjust nDelayPos, in case nIn != nOut
454 	g_RefDelay.nDelayPos = (g_RefDelay.nDelayPos - nOut + nIn) & SNDMIX_REFLECTIONS_DELAY_MASK;
455 	// Upsample 2x
456 	ReverbProcessPostFiltering1x(MixReverbBuffer, MixSoundBuffer, nSamples);
457 	// Automatically shut down if needed
458 	if(gnReverbSend) gnReverbSamples = gnReverbDecaySamples; // reset decay counter
459 	else if(gnReverbSamples > nSamples) gnReverbSamples -= nSamples; // decay
460 	else // decayed
461 	{
462 		Shutdown(gnRvbROfsVol, gnRvbLOfsVol);
463 		gnReverbSamples = 0;
464 	}
465 	gnReverbSend = false; // no input data in MixReverbBuffer
466 }
467 
468 
ReverbDryMix(int32 * MPT_RESTRICT pDry,int32 * MPT_RESTRICT pWet,int lDryVol,uint32 nSamples)469 void CReverb::ReverbDryMix(int32 * MPT_RESTRICT pDry, int32 * MPT_RESTRICT pWet, int lDryVol, uint32 nSamples)
470 {
471 	for (uint32 i=0; i<nSamples; i++)
472 	{
473 		pDry[i*2] += (pWet[i*2]>>4) * lDryVol;
474 		pDry[i*2+1] += (pWet[i*2+1]>>4) * lDryVol;
475 	}
476 }
477 
478 
ReverbProcessPreFiltering2x(int32 * MPT_RESTRICT pWet,uint32 nSamples)479 uint32 CReverb::ReverbProcessPreFiltering2x(int32 * MPT_RESTRICT pWet, uint32 nSamples)
480 {
481 	uint32 nOutSamples = 0;
482 	int lowpass = g_RefDelay.nCoeffs.c.l;
483 	int y1_l = g_nLastRvbIn_yl, y1_r = g_nLastRvbIn_yr;
484 	uint32 n = nSamples;
485 
486 	if (g_bLastInPresent)
487 	{
488 		int x1_l = g_nLastRvbIn_xl, x1_r = g_nLastRvbIn_xr;
489 		int x2_l = pWet[0], x2_r = pWet[1];
490 		x1_l = (x1_l+x2_l)>>13;
491 		x1_r = (x1_r+x2_r)>>13;
492 		y1_l = x1_l + (((x1_l - y1_l)*lowpass)>>15);
493 		y1_r = x1_r + (((x1_r - y1_r)*lowpass)>>15);
494 		pWet[0] = y1_l;
495 		pWet[1] = y1_r;
496 		pWet+=2;
497 		n--;
498 		nOutSamples = 1;
499 		g_bLastInPresent = false;
500 	}
501 	if (n & 1)
502 	{
503 		n--;
504 		g_nLastRvbIn_xl = pWet[n*2];
505 		g_nLastRvbIn_xr = pWet[n*2+1];
506 		g_bLastInPresent = true;
507 	}
508 	n >>= 1;
509 	for (uint32 i=0; i<n; i++)
510 	{
511 		int x1_l = pWet[i*4];
512 		int x2_l = pWet[i*4+2];
513 		x1_l = (x1_l+x2_l)>>13;
514 		int x1_r = pWet[i*4+1];
515 		int x2_r = pWet[i*4+3];
516 		x1_r = (x1_r+x2_r)>>13;
517 		y1_l = x1_l + (((x1_l - y1_l)*lowpass)>>15);
518 		y1_r = x1_r + (((x1_r - y1_r)*lowpass)>>15);
519 		pWet[i*2] = y1_l;
520 		pWet[i*2+1] = y1_r;
521 	}
522 	g_nLastRvbIn_yl = y1_l;
523 	g_nLastRvbIn_yr = y1_r;
524 	return nOutSamples + n;
525 }
526 
527 
ReverbProcessPreFiltering1x(int32 * MPT_RESTRICT pWet,uint32 nSamples)528 uint32 CReverb::ReverbProcessPreFiltering1x(int32 * MPT_RESTRICT pWet, uint32 nSamples)
529 {
530 	int lowpass = g_RefDelay.nCoeffs.c.l;
531 	int y1_l = g_nLastRvbIn_yl, y1_r = g_nLastRvbIn_yr;
532 
533 	for (uint32 i=0; i<nSamples; i++)
534 	{
535 		int x_l = pWet[i*2] >> 12;
536 		int x_r = pWet[i*2+1] >> 12;
537 		y1_l = x_l + (((x_l - y1_l)*lowpass)>>15);
538 		y1_r = x_r + (((x_r - y1_r)*lowpass)>>15);
539 		pWet[i*2] = y1_l;
540 		pWet[i*2+1] = y1_r;
541 	}
542 	g_nLastRvbIn_yl = y1_l;
543 	g_nLastRvbIn_yr = y1_r;
544 	return nSamples;
545 }
546 
547 
ReverbProcessPostFiltering2x(const int32 * MPT_RESTRICT pRvb,int32 * MPT_RESTRICT pDry,uint32 nSamples)548 void CReverb::ReverbProcessPostFiltering2x(const int32 * MPT_RESTRICT pRvb, int32 * MPT_RESTRICT pDry, uint32 nSamples)
549 {
550 	uint32 n0 = nSamples, n;
551 	int x1_l = g_nLastRvbOut_xl, x1_r = g_nLastRvbOut_xr;
552 
553 	if (g_bLastOutPresent)
554 	{
555 		pDry[0] += x1_l;
556 		pDry[1] += x1_r;
557 		pDry += 2;
558 		n0--;
559 		g_bLastOutPresent = false;
560 	}
561 	n  = n0 >> 1;
562 	for (uint32 i=0; i<n; i++)
563 	{
564 		int x_l = pRvb[i*2], x_r = pRvb[i*2+1];
565 		pDry[i*4] += (x_l + x1_l)>>1;
566 		pDry[i*4+1] += (x_r + x1_r)>>1;
567 		pDry[i*4+2] += x_l;
568 		pDry[i*4+3] += x_r;
569 		x1_l = x_l;
570 		x1_r = x_r;
571 	}
572 	if (n0 & 1)
573 	{
574 		int x_l = pRvb[n*2], x_r = pRvb[n*2+1];
575 		pDry[n*4] += (x_l + x1_l)>>1;
576 		pDry[n*4+1] += (x_r + x1_r)>>1;
577 		x1_l = x_l;
578 		x1_r = x_r;
579 		g_bLastOutPresent = true;
580 	}
581 	g_nLastRvbOut_xl = x1_l;
582 	g_nLastRvbOut_xr = x1_r;
583 }
584 
585 
586 #define DCR_AMOUNT		9
587 
588 // Stereo Add + DC removal
ReverbProcessPostFiltering1x(const int32 * MPT_RESTRICT pRvb,int32 * MPT_RESTRICT pDry,uint32 nSamples)589 void CReverb::ReverbProcessPostFiltering1x(const int32 * MPT_RESTRICT pRvb, int32 * MPT_RESTRICT pDry, uint32 nSamples)
590 {
591 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
592 	if(CPU::HasFeatureSet(CPU::feature::sse2))
593 	{
594 		__m128i nDCRRvb_Y1 = Load64SSE(gnDCRRvb_Y1);
595 		__m128i nDCRRvb_X1 = Load64SSE(gnDCRRvb_X1);
596 		__m128i in = _mm_set1_epi32(0);
597 		while(nSamples--)
598 		{
599 			in = Load64SSE(pRvb);
600 			pRvb += 2;
601 			// x(n-1) - x(n)
602 			__m128i diff = _mm_sub_epi32(nDCRRvb_X1, in);
603 			nDCRRvb_X1 = _mm_add_epi32(nDCRRvb_Y1, _mm_sub_epi32(_mm_srai_epi32(diff, DCR_AMOUNT + 1), diff));
604 			__m128i out = _mm_add_epi32(Load64SSE(pDry), nDCRRvb_X1);
605 			nDCRRvb_Y1 = _mm_sub_epi32(nDCRRvb_X1, _mm_srai_epi32(nDCRRvb_X1, DCR_AMOUNT));
606 			nDCRRvb_X1 = in;
607 			Store64SSE(pDry, out);
608 			pDry += 2;
609 		}
610 		Store64SSE(gnDCRRvb_X1, in);
611 		Store64SSE(gnDCRRvb_Y1, nDCRRvb_Y1);
612 		return;
613 	}
614 #endif
615 	int32 X1L = gnDCRRvb_X1[0], X1R = gnDCRRvb_X1[1];
616 	int32 Y1L = gnDCRRvb_Y1[0], Y1R = gnDCRRvb_Y1[1];
617 	int32 inL = 0, inR = 0;
618 	while(nSamples--)
619 	{
620 		inL = pRvb[0];
621 		inR = pRvb[1];
622 		pRvb += 2;
623 		int32 outL = pDry[0], outR = pDry[1];
624 
625 		// x(n-1) - x(n)
626 		X1L -= inL;
627 		X1R -= inR;
628 		X1L = X1L / (1 << (DCR_AMOUNT + 1)) - X1L;
629 		X1R = X1R / (1 << (DCR_AMOUNT + 1)) - X1R;
630 		Y1L += X1L;
631 		Y1R += X1R;
632 		// add to dry mix
633 		outL += Y1L;
634 		outR += Y1R;
635 		Y1L -= Y1L / (1 << DCR_AMOUNT);
636 		Y1R -= Y1R / (1 << DCR_AMOUNT);
637 		X1L = inL;
638 		X1R = inR;
639 
640 		pDry[0] = outL;
641 		pDry[1] = outR;
642 		pDry += 2;
643 	}
644 	gnDCRRvb_Y1[0] = Y1L;
645 	gnDCRRvb_Y1[1] = Y1R;
646 	gnDCRRvb_X1[0] = inL;
647 	gnDCRRvb_X1[1] = inR;
648 }
649 
650 
ReverbDCRemoval(int32 * MPT_RESTRICT pBuffer,uint32 nSamples)651 void CReverb::ReverbDCRemoval(int32 * MPT_RESTRICT pBuffer, uint32 nSamples)
652 {
653 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
654 	if(CPU::HasFeatureSet(CPU::feature::sse2))
655 	{
656 		__m128i nDCRRvb_Y1 = Load64SSE(gnDCRRvb_Y1);
657 		__m128i nDCRRvb_X1 = Load64SSE(gnDCRRvb_X1);
658 		while(nSamples--)
659 		{
660 			__m128i in = Load64SSE(pBuffer);
661 			__m128i diff = _mm_sub_epi32(nDCRRvb_X1, in);
662 			__m128i out = _mm_add_epi32(nDCRRvb_Y1, _mm_sub_epi32(_mm_srai_epi32(diff, DCR_AMOUNT + 1), diff));
663 			Store64SSE(pBuffer, out);
664 			pBuffer += 2;
665 			nDCRRvb_Y1 = _mm_sub_epi32(out, _mm_srai_epi32(out, DCR_AMOUNT));
666 			nDCRRvb_X1 = in;
667 		}
668 		Store64SSE(gnDCRRvb_X1, nDCRRvb_X1);
669 		Store64SSE(gnDCRRvb_Y1, nDCRRvb_Y1);
670 		return;
671 	}
672 #endif
673 	int32 X1L = gnDCRRvb_X1[0], X1R = gnDCRRvb_X1[1];
674 	int32 Y1L = gnDCRRvb_Y1[0], Y1R = gnDCRRvb_Y1[1];
675 	int32 inL = 0, inR = 0;
676 	while(nSamples--)
677 	{
678 		inL = pBuffer[0];
679 		inR = pBuffer[1];
680 		// x(n-1) - x(n)
681 		X1L -= inL;
682 		X1R -= inR;
683 		X1L = X1L / (1 << (DCR_AMOUNT + 1)) - X1L;
684 		X1R = X1R / (1 << (DCR_AMOUNT + 1)) - X1R;
685 		Y1L += X1L;
686 		Y1R += X1R;
687 		pBuffer[0] = Y1L;
688 		pBuffer[1] = Y1R;
689 		pBuffer += 2;
690 		Y1L -= Y1L / (1 << DCR_AMOUNT);
691 		Y1R -= Y1R / (1 << DCR_AMOUNT);
692 		X1L = inL;
693 		X1R = inR;
694 	}
695 	gnDCRRvb_Y1[0] = Y1L;
696 	gnDCRRvb_Y1[1] = Y1R;
697 	gnDCRRvb_X1[0] = inL;
698 	gnDCRRvb_X1[1] = inR;
699 }
700 
701 
702 //////////////////////////////////////////////////////////////////////////
703 //
704 // Pre-Delay:
705 //
706 // 1. Saturate and low-pass the reverb input (stage 2 of roomHF)
707 // 2. Process pre-diffusion
708 // 3. Insert the result in the reflections delay buffer
709 //
710 
711 // Save some typing
Clamp16(int32 x)712 static MPT_FORCEINLINE int32 Clamp16(int32 x) { return Clamp(x, std::numeric_limits<int16>::min(), std::numeric_limits<int16>::max()); }
713 
ProcessPreDelay(SWRvbRefDelay * MPT_RESTRICT pPreDelay,const int32 * MPT_RESTRICT pIn,uint32 nSamples)714 void CReverb::ProcessPreDelay(SWRvbRefDelay * MPT_RESTRICT pPreDelay, const int32 * MPT_RESTRICT pIn, uint32 nSamples)
715 {
716 	uint32 preDifPos = pPreDelay->nPreDifPos;
717 	uint32 delayPos = pPreDelay->nDelayPos - 1;
718 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
719 	if(CPU::HasFeatureSet(CPU::feature::sse2))
720 	{
721 		__m128i coeffs = _mm_cvtsi32_si128(pPreDelay->nCoeffs.lr);
722 		__m128i history = _mm_cvtsi32_si128(pPreDelay->History.lr);
723 		__m128i preDifCoeffs = _mm_cvtsi32_si128(pPreDelay->nPreDifCoeffs.lr);
724 		while(nSamples--)
725 		{
726 			__m128i in32 = Load64SSE(pIn);					// 16-bit unsaturated reverb input [  r  |  l  ]
727 			__m128i inSat = _mm_packs_epi32(in32, in32);	// [ r | l | r | l ] (16-bit saturated)
728 			pIn += 2;
729 			// Low-pass
730 			__m128i lp = _mm_mulhi_epi16(_mm_subs_epi16(history, inSat), coeffs);
731 			__m128i preDif = _mm_cvtsi32_si128(pPreDelay->PreDifBuffer[preDifPos].lr);
732 			history = _mm_adds_epi16(_mm_adds_epi16(lp, lp), inSat);
733 			// Pre-Diffusion
734 			preDifPos = (preDifPos + 1) & SNDMIX_PREDIFFUSION_DELAY_MASK;
735 			delayPos = (delayPos + 1) & SNDMIX_REFLECTIONS_DELAY_MASK;
736 			__m128i preDif2 = _mm_subs_epi16(history, _mm_mulhi_epi16(preDif, preDifCoeffs));
737 			pPreDelay->PreDifBuffer[preDifPos].lr = _mm_cvtsi128_si32(preDif2);
738 			pPreDelay->RefDelayBuffer[delayPos].lr = _mm_cvtsi128_si32(_mm_adds_epi16(_mm_mulhi_epi16(preDifCoeffs, preDif2), preDif));
739 		}
740 		pPreDelay->nPreDifPos = preDifPos;
741 		pPreDelay->History.lr = _mm_cvtsi128_si32(history);
742 		return;
743 	}
744 #endif
745 	const int32 coeffsL = pPreDelay->nCoeffs.c.l, coeffsR = pPreDelay->nCoeffs.c.r;
746 	const int32 preDifCoeffsL = pPreDelay->nPreDifCoeffs.c.l, preDifCoeffsR = pPreDelay->nPreDifCoeffs.c.r;
747 	int16 historyL = pPreDelay->History.c.l, historyR = pPreDelay->History.c.r;
748 	while(nSamples--)
749 	{
750 		int32 inL = Clamp16(pIn[0]);
751 		int32 inR = Clamp16(pIn[1]);
752 		pIn += 2;
753 		// Low-pass
754 		int32 lpL = (Clamp16(historyL - inL) * coeffsL) / 65536;
755 		int32 lpR = (Clamp16(historyR - inR) * coeffsR) / 65536;
756 		historyL = mpt::saturate_cast<int16>(Clamp16(lpL + lpL) + inL);
757 		historyR = mpt::saturate_cast<int16>(Clamp16(lpR + lpR) + inR);
758 		// Pre-Diffusion
759 		int32 preDifL = pPreDelay->PreDifBuffer[preDifPos].c.l;
760 		int32 preDifR = pPreDelay->PreDifBuffer[preDifPos].c.r;
761 		preDifPos = (preDifPos + 1) & SNDMIX_PREDIFFUSION_DELAY_MASK;
762 		delayPos = (delayPos + 1) & SNDMIX_REFLECTIONS_DELAY_MASK;
763 		int16 preDif2L = mpt::saturate_cast<int16>(historyL - preDifL * preDifCoeffsL / 65536);
764 		int16 preDif2R = mpt::saturate_cast<int16>(historyR - preDifR * preDifCoeffsR / 65536);
765 		pPreDelay->PreDifBuffer[preDifPos].c.l = preDif2L;
766 		pPreDelay->PreDifBuffer[preDifPos].c.r = preDif2R;
767 		pPreDelay->RefDelayBuffer[delayPos].c.l = mpt::saturate_cast<int16>(preDifCoeffsL * preDif2L / 65536 + preDifL);
768 		pPreDelay->RefDelayBuffer[delayPos].c.r = mpt::saturate_cast<int16>(preDifCoeffsR * preDif2R / 65536 + preDifR);
769 	}
770 	pPreDelay->nPreDifPos = preDifPos;
771 	pPreDelay->History.c.l = historyL;
772 	pPreDelay->History.c.r = historyR;
773 }
774 
775 
776 ////////////////////////////////////////////////////////////////////
777 //
778 // ProcessReflections:
779 // First stage:
780 //	- process 4 reflections, output to pRefOut
781 //	- output results to pRefOut
782 // Second stage:
783 //	- process another 3 reflections
784 //	- sum with pRefOut
785 //	- apply reflections master gain and accumulate in the given output
786 //
787 
ProcessReflections(SWRvbRefDelay * MPT_RESTRICT pPreDelay,LR16 * MPT_RESTRICT pRefOut,int32 * MPT_RESTRICT pOut,uint32 nSamples)788 void CReverb::ProcessReflections(SWRvbRefDelay * MPT_RESTRICT pPreDelay, LR16 * MPT_RESTRICT pRefOut, int32 * MPT_RESTRICT pOut, uint32 nSamples)
789 {
790 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
791 	if(CPU::HasFeatureSet(CPU::feature::sse2))
792 	{
793 		union
794 		{
795 			__m128i xmm;
796 			int16 i[8];
797 		} pos;
798 		const LR16 *refDelayBuffer = pPreDelay->RefDelayBuffer;
799 #define GETDELAY(x) static_cast<int16>(pPreDelay->Reflections[x].Delay)
800 		__m128i delayPos = _mm_set_epi16(GETDELAY(7), GETDELAY(6), GETDELAY(5), GETDELAY(4), GETDELAY(3), GETDELAY(2), GETDELAY(1), GETDELAY(0));
801 #undef GETDELAY
802 		delayPos = _mm_sub_epi16(_mm_set1_epi16(static_cast<int16>(pPreDelay->nDelayPos - 1)), delayPos);
803 		__m128i gain12 = _mm_unpacklo_epi64(Load64SSE(pPreDelay->Reflections[0].Gains), Load64SSE(pPreDelay->Reflections[1].Gains));
804 		__m128i gain34 = _mm_unpacklo_epi64(Load64SSE(pPreDelay->Reflections[2].Gains), Load64SSE(pPreDelay->Reflections[3].Gains));
805 		__m128i gain56 = _mm_unpacklo_epi64(Load64SSE(pPreDelay->Reflections[4].Gains), Load64SSE(pPreDelay->Reflections[5].Gains));
806 		__m128i gain78 = _mm_unpacklo_epi64(Load64SSE(pPreDelay->Reflections[6].Gains), Load64SSE(pPreDelay->Reflections[7].Gains));
807 		// For 28-bit final output: 16+15-3 = 28
808 		__m128i refGain = _mm_srai_epi32(_mm_set_epi32(0, 0, pPreDelay->ReflectionsGain.c.r, pPreDelay->ReflectionsGain.c.l), 3);
809 		__m128i delayInc = _mm_set1_epi16(1), delayMask = _mm_set1_epi16(SNDMIX_REFLECTIONS_DELAY_MASK);
810 		while(nSamples--)
811 		{
812 			delayPos = _mm_and_si128(_mm_add_epi16(delayInc, delayPos), delayMask);
813 			_mm_storeu_si128(&pos.xmm, delayPos);
814 			__m128i ref12 = _mm_set_epi32(refDelayBuffer[pos.i[1]].lr, refDelayBuffer[pos.i[1]].lr, refDelayBuffer[pos.i[0]].lr, refDelayBuffer[pos.i[0]].lr);
815 			__m128i ref34 = _mm_set_epi32(refDelayBuffer[pos.i[3]].lr, refDelayBuffer[pos.i[3]].lr, refDelayBuffer[pos.i[2]].lr, refDelayBuffer[pos.i[2]].lr);
816 			__m128i ref56 = _mm_set_epi32(refDelayBuffer[pos.i[5]].lr, refDelayBuffer[pos.i[5]].lr, refDelayBuffer[pos.i[4]].lr, refDelayBuffer[pos.i[4]].lr);
817 			__m128i ref78 = _mm_set_epi32(0,                           0,                           refDelayBuffer[pos.i[6]].lr, refDelayBuffer[pos.i[6]].lr);
818 			// First stage
819 			__m128i refOut1 = _mm_add_epi32(_mm_madd_epi16(ref12, gain12), _mm_madd_epi16(ref34, gain34));
820 			refOut1 = _mm_srai_epi32(_mm_add_epi32(refOut1, _mm_shuffle_epi32(refOut1, _MM_SHUFFLE(1, 0, 3, 2))), 15);
821 
822 			// Second stage
823 			__m128i refOut2 = _mm_add_epi32(_mm_madd_epi16(ref56, gain56), _mm_madd_epi16(ref78, gain78));
824 			refOut2 = _mm_srai_epi32(_mm_add_epi32(refOut2, _mm_shuffle_epi32(refOut2, _MM_SHUFFLE(1, 0, 3, 2))), 15);
825 
826 			// Saturate to 16-bit and sum stages
827 			__m128i refOut = _mm_adds_epi16(_mm_packs_epi32(refOut1, refOut1), _mm_packs_epi32(refOut2, refOut2));
828 			pRefOut->lr = _mm_cvtsi128_si32(refOut);
829 			pRefOut++;
830 
831 			__m128i out = _mm_madd_epi16(_mm_unpacklo_epi16(refOut, refOut), refGain);	// Apply reflections gain
832 			// At this, point, this is the only output of the reverb
833 			Store64SSE(pOut, out);
834 			pOut += 2;
835 		}
836 		return;
837 	}
838 #endif
839 	int pos[7];
840 	for(int i = 0; i < 7; i++)
841 		pos[i] = pPreDelay->nDelayPos - pPreDelay->Reflections[i].Delay - 1;
842 	// For 28-bit final output: 16+15-3 = 28
843 	int16 refGain = pPreDelay->ReflectionsGain.c.l / (1 << 3);
844 	while(nSamples--)
845 	{
846 		// First stage
847 		int32 refOutL = 0, refOutR = 0;
848 		for(int i = 0; i < 4; i++)
849 		{
850 			pos[i] = (pos[i] + 1) & SNDMIX_REFLECTIONS_DELAY_MASK;
851 			int16 refL = pPreDelay->RefDelayBuffer[pos[i]].c.l, refR = pPreDelay->RefDelayBuffer[pos[i]].c.r;
852 			refOutL += refL * pPreDelay->Reflections[i].Gains[0].c.l + refR * pPreDelay->Reflections[i].Gains[0].c.r;
853 			refOutR += refL * pPreDelay->Reflections[i].Gains[1].c.l + refR * pPreDelay->Reflections[i].Gains[1].c.r;
854 		}
855 		int16 stage1l = mpt::saturate_cast<int16>(refOutL / (1 << 15));
856 		int16 stage1r = mpt::saturate_cast<int16>(refOutR / (1 << 15));
857 		// Second stage
858 		refOutL = 0;
859 		refOutR = 0;
860 		for(int i = 4; i < 7; i++)
861 		{
862 			pos[i] = (pos[i] + 1) & SNDMIX_REFLECTIONS_DELAY_MASK;
863 			int16 refL = pPreDelay->RefDelayBuffer[pos[i]].c.l, refR = pPreDelay->RefDelayBuffer[pos[i]].c.r;
864 			refOutL += refL * pPreDelay->Reflections[i].Gains[0].c.l + refR * pPreDelay->Reflections[i].Gains[0].c.r;
865 			refOutR += refL * pPreDelay->Reflections[i].Gains[1].c.l + refR * pPreDelay->Reflections[i].Gains[1].c.r;
866 		}
867 		pOut[0] = (pRefOut->c.l = mpt::saturate_cast<int16>(stage1l + refOutL / (1 << 15))) * refGain;
868 		pOut[1] = (pRefOut->c.r = mpt::saturate_cast<int16>(stage1r + refOutR / (1 << 15))) * refGain;
869 		pRefOut++;
870 		pOut += 2;
871 	}
872 }
873 
874 
875 //////////////////////////////////////////////////////////////////////////
876 //
877 // Late reverberation (with SW reflections)
878 //
879 
ProcessLateReverb(SWLateReverb * MPT_RESTRICT pReverb,LR16 * MPT_RESTRICT pRefOut,int32 * MPT_RESTRICT pMixOut,uint32 nSamples)880 void CReverb::ProcessLateReverb(SWLateReverb * MPT_RESTRICT pReverb, LR16 * MPT_RESTRICT pRefOut, int32 * MPT_RESTRICT pMixOut, uint32 nSamples)
881 {
882 	// Calculate delay line offset from current delay position
883 	#define DELAY_OFFSET(x) ((delayPos - (x)) & RVBDLY_MASK)
884 
885 #if defined(MPT_ENABLE_ARCH_INTRINSICS_SSE2)
886 	if(CPU::HasFeatureSet(CPU::feature::sse2))
887 	{
888 		int delayPos = pReverb->nDelayPos & RVBDLY_MASK;
889 		__m128i rvbOutGains = Load64SSE(pReverb->RvbOutGains);
890 		__m128i difCoeffs = Load64SSE(pReverb->nDifCoeffs);
891 		__m128i decayLP = Load64SSE(pReverb->nDecayLP);
892 		__m128i lpHistory = Load64SSE(pReverb->LPHistory);
893 		while(nSamples--)
894 		{
895 			__m128i refIn = _mm_cvtsi32_si128(pRefOut->lr);	// 16-bit stereo input
896 			pRefOut++;
897 
898 			__m128i delay2 = _mm_unpacklo_epi32(
899 				_mm_cvtsi32_si128(pReverb->Delay2[DELAY_OFFSET(RVBDLY2L_LEN)].lr),
900 				_mm_cvtsi32_si128(pReverb->Delay2[DELAY_OFFSET(RVBDLY2R_LEN)].lr));
901 
902 			// Unsigned to avoid sign extension
903 			uint16 diff1L = pReverb->Diffusion1[DELAY_OFFSET(RVBDIF1L_LEN)].c.l;
904 			uint16 diff1R = pReverb->Diffusion1[DELAY_OFFSET(RVBDIF1R_LEN)].c.r;
905 			int32 diffusion1 = diff1L | (diff1R << 16);	// diffusion1 history
906 
907 			uint16 diff2L = pReverb->Diffusion2[DELAY_OFFSET(RVBDIF2L_LEN)].c.l;
908 			uint16 diff2R = pReverb->Diffusion2[DELAY_OFFSET(RVBDIF2R_LEN)].c.r;
909 			int32 diffusion2 = diff2L | (diff2R << 16);	// diffusion2 history
910 
911 			__m128i lpDecay = _mm_mulhi_epi16(_mm_subs_epi16(lpHistory, delay2), decayLP);
912 			lpHistory = _mm_adds_epi16(_mm_adds_epi16(lpDecay, lpDecay), delay2);	// Low-passed decay
913 
914 			// Apply decay gain
915 			__m128i histDecay = _mm_srai_epi32(_mm_madd_epi16(Load64SSE(pReverb->nDecayDC), lpHistory), 15);
916 			__m128i histDecayPacked = _mm_shuffle_epi32(_mm_packs_epi32(histDecay, histDecay), _MM_SHUFFLE(2, 0, 2, 0));
917 			__m128i histDecayIn = _mm_adds_epi16(_mm_shuffle_epi32(_mm_packs_epi32(histDecay, histDecay), _MM_SHUFFLE(2, 0, 2, 0)), _mm_srai_epi16(_mm_unpacklo_epi32(refIn, refIn), 2));
918 			__m128i histDecayInDiff = _mm_subs_epi16(histDecayIn, _mm_mulhi_epi16(_mm_cvtsi32_si128(diffusion1), difCoeffs));
919 			pReverb->Diffusion1[delayPos].lr = _mm_cvtsi128_si32(histDecayInDiff);
920 
921 			__m128i delay1Out = _mm_adds_epi16(_mm_mulhi_epi16(difCoeffs, histDecayInDiff), _mm_cvtsi32_si128(diffusion1));
922 			// Insert the diffusion output in the reverb delay line
923 			pReverb->Delay1[delayPos].lr = _mm_cvtsi128_si32(delay1Out);
924 			__m128i histDecayInDelay = _mm_adds_epi16(histDecayIn, _mm_unpacklo_epi32(delay1Out, delay1Out));
925 
926 			// Input to second diffuser
927 			__m128i delay1 = _mm_unpacklo_epi32(
928 				_mm_cvtsi32_si128(pReverb->Delay1[DELAY_OFFSET(RVBDLY1L_LEN)].lr),
929 				_mm_cvtsi32_si128(pReverb->Delay1[DELAY_OFFSET(RVBDLY1R_LEN)].lr));
930 
931 			__m128i delay1Gains = _mm_srai_epi32(_mm_madd_epi16(delay1, Load64SSE(pReverb->Dif2InGains)), 15);
932 			__m128i delay1GainsSat = _mm_shuffle_epi32(_mm_packs_epi32(delay1Gains, delay1Gains), _MM_SHUFFLE(2, 0, 2, 0));
933 			__m128i histDelay1 = _mm_subs_epi16(_mm_adds_epi16(histDecayInDelay, delay1), delay1GainsSat);	// accumulate with reverb output
934 			__m128i diff2out = _mm_subs_epi16(delay1GainsSat, _mm_mulhi_epi16(_mm_cvtsi32_si128(diffusion2), difCoeffs));
935 			__m128i diff2outCoeffs = _mm_mulhi_epi16(difCoeffs, diff2out);
936 			pReverb->Diffusion2[delayPos].lr = _mm_cvtsi128_si32(diff2out);
937 
938 			__m128i mixOut = Load64SSE(pMixOut);
939 			__m128i delay2out = _mm_adds_epi16(diff2outCoeffs, _mm_cvtsi32_si128(diffusion2));
940 			pReverb->Delay2[delayPos].lr = _mm_cvtsi128_si32(delay2out);
941 			delayPos = (delayPos + 1) & RVBDLY_MASK;
942 			// Accumulate with reverb output
943 			__m128i out = _mm_add_epi32(_mm_madd_epi16(_mm_adds_epi16(histDelay1, delay2out), rvbOutGains), mixOut);
944 			Store64SSE(pMixOut, out);
945 			pMixOut += 2;
946 		}
947 		Store64SSE(pReverb->LPHistory, lpHistory);
948 		pReverb->nDelayPos = delayPos;
949 		return;
950 	}
951 #endif
952 	int delayPos = pReverb->nDelayPos & RVBDLY_MASK;
953 	while(nSamples--)
954 	{
955 		int16 refInL = pRefOut->c.l, refInR = pRefOut->c.r;
956 		pRefOut++;
957 
958 		int32 delay2LL = pReverb->Delay2[DELAY_OFFSET(RVBDLY2L_LEN)].c.l, delay2LR = pReverb->Delay2[DELAY_OFFSET(RVBDLY2L_LEN)].c.r;
959 		int32 delay2RL = pReverb->Delay2[DELAY_OFFSET(RVBDLY2R_LEN)].c.l, delay2RR = pReverb->Delay2[DELAY_OFFSET(RVBDLY2R_LEN)].c.r;
960 
961 		int32 diff1L = pReverb->Diffusion1[DELAY_OFFSET(RVBDIF1L_LEN)].c.l;
962 		int32 diff1R = pReverb->Diffusion1[DELAY_OFFSET(RVBDIF1R_LEN)].c.r;
963 
964 		int32 diff2L = pReverb->Diffusion2[DELAY_OFFSET(RVBDIF2L_LEN)].c.l;
965 		int32 diff2R = pReverb->Diffusion2[DELAY_OFFSET(RVBDIF2R_LEN)].c.r;
966 
967 		int32 lpDecayLL = Clamp16(pReverb->LPHistory[0].c.l - delay2LL) * pReverb->nDecayLP[0].c.l / 65536;
968 		int32 lpDecayLR = Clamp16(pReverb->LPHistory[0].c.r - delay2LR) * pReverb->nDecayLP[0].c.r / 65536;
969 		int32 lpDecayRL = Clamp16(pReverb->LPHistory[1].c.l - delay2RL) * pReverb->nDecayLP[1].c.l / 65536;
970 		int32 lpDecayRR = Clamp16(pReverb->LPHistory[1].c.r - delay2RR) * pReverb->nDecayLP[1].c.r / 65536;
971 		// Low-passed decay
972 		pReverb->LPHistory[0].c.l = mpt::saturate_cast<int16>(Clamp16(lpDecayLL + lpDecayLL) + delay2LL);
973 		pReverb->LPHistory[0].c.r = mpt::saturate_cast<int16>(Clamp16(lpDecayLR + lpDecayLR) + delay2LR);
974 		pReverb->LPHistory[1].c.l = mpt::saturate_cast<int16>(Clamp16(lpDecayRL + lpDecayRL) + delay2RL);
975 		pReverb->LPHistory[1].c.r = mpt::saturate_cast<int16>(Clamp16(lpDecayRR + lpDecayRR) + delay2RR);
976 
977 		// Apply decay gain
978 		int32 histDecayL = Clamp16((int32)pReverb->nDecayDC[0].c.l * pReverb->LPHistory[0].c.l / (1 << 15));
979 		int32 histDecayR = Clamp16((int32)pReverb->nDecayDC[1].c.r * pReverb->LPHistory[1].c.r / (1 << 15));
980 		int32 histDecayInL = Clamp16(histDecayL + refInL / 4);
981 		int32 histDecayInR = Clamp16(histDecayR + refInR / 4);
982 		int32 histDecayInDiffL = Clamp16(histDecayInL - diff1L * pReverb->nDifCoeffs[0].c.l / 65536);
983 		int32 histDecayInDiffR = Clamp16(histDecayInR - diff1R * pReverb->nDifCoeffs[0].c.r / 65536);
984 		pReverb->Diffusion1[delayPos].c.l = static_cast<int16>(histDecayInDiffL);
985 		pReverb->Diffusion1[delayPos].c.r = static_cast<int16>(histDecayInDiffR);
986 
987 		int32 delay1L = Clamp16(pReverb->nDifCoeffs[0].c.l * histDecayInDiffL / 65536 + diff1L);
988 		int32 delay1R = Clamp16(pReverb->nDifCoeffs[0].c.r * histDecayInDiffR / 65536 + diff1R);
989 		// Insert the diffusion output in the reverb delay line
990 		pReverb->Delay1[delayPos].c.l = static_cast<int16>(delay1L);
991 		pReverb->Delay1[delayPos].c.r = static_cast<int16>(delay1R);
992 		int32 histDecayInDelayL = Clamp16(histDecayInL + delay1L);
993 		int32 histDecayInDelayR = Clamp16(histDecayInR + delay1R);
994 
995 		// Input to second diffuser
996 		int32 delay1LL = pReverb->Delay1[DELAY_OFFSET(RVBDLY1L_LEN)].c.l, delay1LR = pReverb->Delay1[DELAY_OFFSET(RVBDLY1L_LEN)].c.r;
997 		int32 delay1RL = pReverb->Delay1[DELAY_OFFSET(RVBDLY1R_LEN)].c.l, delay1RR = pReverb->Delay1[DELAY_OFFSET(RVBDLY1R_LEN)].c.r;
998 
999 		int32 delay1GainsL = Clamp16((delay1LL * pReverb->Dif2InGains[0].c.l + delay1LR * pReverb->Dif2InGains[0].c.r) / (1 << 15));
1000 		int32 delay1GainsR = Clamp16((delay1RL * pReverb->Dif2InGains[1].c.l + delay1RR * pReverb->Dif2InGains[1].c.r) / (1 << 15));
1001 
1002 		// accumulate with reverb output
1003 		int32 histDelay1LL = Clamp16(Clamp16(histDecayInDelayL + delay1LL) - delay1GainsL);
1004 		int32 histDelay1LR = Clamp16(Clamp16(histDecayInDelayR + delay1LR) - delay1GainsR);
1005 		int32 histDelay1RL = Clamp16(Clamp16(histDecayInDelayL + delay1RL) - delay1GainsL);
1006 		int32 histDelay1RR = Clamp16(Clamp16(histDecayInDelayR + delay1RR) - delay1GainsR);
1007 		int32 diff2outL = Clamp16(delay1GainsL - diff2L * pReverb->nDifCoeffs[0].c.l / 65536);
1008 		int32 diff2outR = Clamp16(delay1GainsR - diff2R * pReverb->nDifCoeffs[0].c.r / 65536);
1009 		int32 diff2outCoeffsL = pReverb->nDifCoeffs[0].c.l * diff2outL / 65536;
1010 		int32 diff2outCoeffsR = pReverb->nDifCoeffs[0].c.r * diff2outR / 65536;
1011 		pReverb->Diffusion2[delayPos].c.l = static_cast<int16>(diff2outL);
1012 		pReverb->Diffusion2[delayPos].c.r = static_cast<int16>(diff2outR);
1013 
1014 		int32 delay2outL = Clamp16(diff2outCoeffsL + diff2L);
1015 		int32 delay2outR = Clamp16(diff2outCoeffsR + diff2R);
1016 		pReverb->Delay2[delayPos].c.l = static_cast<int16>(delay2outL);
1017 		pReverb->Delay2[delayPos].c.r = static_cast<int16>(delay2outR);
1018 		delayPos = (delayPos + 1) & RVBDLY_MASK;
1019 		// Accumulate with reverb output
1020 		pMixOut[0] += Clamp16(histDelay1LL + delay2outL) * pReverb->RvbOutGains[0].c.l + Clamp16(histDelay1LR + delay2outR) * pReverb->RvbOutGains[0].c.r;
1021 		pMixOut[1] += Clamp16(histDelay1RL + Clamp16(diff2outCoeffsL)) * pReverb->RvbOutGains[1].c.l + Clamp16(histDelay1RR + Clamp16(diff2outCoeffsR)) * pReverb->RvbOutGains[1].c.r;
1022 		pMixOut += 2;
1023 	}
1024 	pReverb->nDelayPos = delayPos;
1025 
1026 	#undef DELAY_OFFSET
1027 }
1028 
1029 
1030 #else
1031 
1032 
1033 MPT_MSVC_WORKAROUND_LNK4221(Reverb)
1034 
1035 
1036 #endif // NO_REVERB
1037 
1038 
1039 OPENMPT_NAMESPACE_END
1040 
1041