1 /*
2  * Copyright (C) 2021 Robin Gareus <robin@gareus.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef _GNU_SOURCE
19 #define _GNU_SOURCE // needed for M_PI
20 #endif
21 
22 #include <cmath>
23 #include <cstdio>
24 #include <cstdlib>
25 #include <cstring>
26 #include <getopt.h>
27 #include <inttypes.h>
28 #include <limits>
29 #include <map>
30 #include <thread>
31 #include <vector>
32 
33 #include <fftw3.h>
34 #include <sndfile.h>
35 
36 #include "dsp_peak_calc.h"
37 
38 #define SUBSAMPLE 2
39 #define MAXSAMPLE (180 * SUBSAMPLE)
40 
41 class SinCosLut
42 {
43 public:
SinCosLut()44 	SinCosLut ()
45 	{
46 		float mp = 2.f * M_PI / SUBSAMPLE / -360.0;
47 		for (int i = 0; i < MAXSAMPLE; ++i) {
48 #ifdef __APPLE__
49 			_sin[i] = sinf (mp * i);
50 			_cos[i] = cosf (mp * i);
51 #else
52 			sincosf (mp * i, &_sin[i], &_cos[i]);
53 #endif
54 		}
55 	}
56 
57 	void
sincos(int a,float * s,float * c) const58 	sincos (int a, float* s, float* c) const
59 	{
60 #if 0
61 		static float mp = 2.f * M_PI / SUBSAMPLE / -360.0;
62 		sincosf (a * mp, s, c);
63 #else
64 		*s = _sin[a];
65 		*c = _cos[a];
66 #endif
67 	}
68 
69 private:
70 	float _sin[MAXSAMPLE];
71 	float _cos[MAXSAMPLE];
72 };
73 
74 SinCosLut scl;
75 
76 static float
coeff_to_dB(float coeff)77 coeff_to_dB (float coeff)
78 {
79 	if (coeff < 1e-15) {
80 		return -std::numeric_limits<float>::infinity ();
81 	}
82 	return 20.0f * log10f (coeff);
83 }
84 
85 static inline float
calc_peak(float * buf,uint32_t n_samples,float pk)86 calc_peak (float* buf, uint32_t n_samples, float pk)
87 {
88 #ifdef _HAVE_DSP_COMPUTE_PEAK
89 	return dsp_compute_peak (buf, n_samples, pk);
90 #else
91 	for (uint32_t i = 0; i < n_samples; ++i) {
92 		pk = std::max (pk, fabsf (buf[i]));
93 	}
94 #endif
95 	return pk;
96 }
97 
98 static inline float
calc_rotated_peak(float const * b0,float const * b1,uint32_t n_samples,float pk,int angle)99 calc_rotated_peak (float const* b0, float const* b1, uint32_t n_samples, float pk, int angle)
100 {
101 	float sa, ca;
102 	scl.sincos (angle, &sa, &ca);
103 
104 #ifdef _HAVE_DSP_COMPUTE_PEAK
105 	float x[n_samples];
106 
107 #pragma GCC ivdep
108 	for (uint32_t i = 0; i < n_samples; ++i) {
109 		x[i] = ca * b0[i] + sa * b1[i];
110 	}
111 	return calc_peak (x, n_samples, pk);
112 
113 #else
114 
115 	for (uint32_t i = 0; i < n_samples; ++i) {
116 		const float x = ca * b0[i] + sa * b1[i];
117 		pk = std::max (pk, fabsf (x));
118 	}
119 	return pk;
120 #endif
121 }
122 
123 /* ****************************************************************************/
124 
125 class PhaseRotateProc
126 {
127 public:
PhaseRotateProc(uint32_t blksiz)128 	PhaseRotateProc (uint32_t blksiz)
129 		: _fftlen (blksiz * 2)
130 		, _parsiz (blksiz)
131 		, _firlen (blksiz / 2)
132 		, _time_data (0)
133 		, _freq_data (0)
134 		, _plan_r2c (0)
135 		, _plan_c2r (0)
136 	{
137 		_time_data = (float*)fftwf_malloc (_fftlen * sizeof (float));
138 		_freq_data = (fftwf_complex*)fftwf_malloc ((_parsiz + 1) * sizeof (fftwf_complex));
139 		_ffir_data = (fftwf_complex*)fftwf_malloc ((_parsiz + 1) * sizeof (fftwf_complex));
140 		_plan_r2c  = fftwf_plan_dft_r2c_1d (_fftlen, _time_data, _freq_data, FFTW_ESTIMATE);
141 		_plan_c2r  = fftwf_plan_dft_c2r_1d (_fftlen, _freq_data, _time_data, FFTW_ESTIMATE);
142 		_norm      = 0.5 / _parsiz;
143 
144 		/* generate hilbert FIR */
145 		float         fir[_fftlen];
146 		fftwf_complex freq_fir[_firlen + 1];
147 
148 		for (uint32_t i = 0; i <= _firlen; ++i) {
149 			const float re = i & 1 ? -1 : 1;
150 			freq_fir[i][0] = 0;
151 			freq_fir[i][1] = re;
152 		}
153 
154 		fftwf_plan plan_fir_c2r = fftwf_plan_dft_c2r_1d (_parsiz, freq_fir, fir, FFTW_ESTIMATE);
155 		fftwf_execute_dft_c2r (plan_fir_c2r, freq_fir, fir);
156 		fftwf_destroy_plan (plan_fir_c2r);
157 
158 		const double flen = 1.0 / _parsiz;
159 		for (uint32_t i = 0; i < _parsiz; ++i) {
160 			fir[i] *= _norm * (1 - cos (2.0 * M_PI * i * flen));
161 		}
162 
163 		memset (fir + _parsiz, 0, _parsiz * sizeof (float));
164 		fftwf_execute_dft_r2c (_plan_r2c, fir, _ffir_data);
165 	}
166 
~PhaseRotateProc()167 	~PhaseRotateProc ()
168 	{
169 		fftwf_free (_time_data);
170 		fftwf_free (_freq_data);
171 		fftwf_destroy_plan (_plan_r2c);
172 		fftwf_destroy_plan (_plan_c2r);
173 	}
174 
175 	uint32_t
parsiz() const176 	parsiz () const
177 	{
178 		return _parsiz;
179 	}
180 
181 	void
hilbert(float const * in,float * out,float * o_out)182 	hilbert (float const* in, float* out, float* o_out)
183 	{
184 		const uint32_t parsiz = _parsiz;
185 
186 		/* copy end/overlap of prev. iFFT */
187 		memcpy (out, o_out, sizeof (float) * parsiz);
188 
189 		/* fill fft buffer & zero pad */
190 		memcpy (_time_data, in + parsiz, sizeof (float) * parsiz);
191 		memset (_time_data + parsiz, 0, sizeof (float) * parsiz);
192 
193 		fftwf_execute_dft_r2c (_plan_r2c, _time_data, _freq_data);
194 
195 		/* FIR convolution, 90deg phase shift */
196 		const float          norm     = _norm;
197 		const fftwf_complex* freq_fir = _ffir_data;
198 #pragma GCC ivdep /* do not check for aliasing, buffers do not overlap */
199 		for (uint32_t i = 0; i <= parsiz; ++i) {
200 			const float re   = _freq_data[i][0];
201 			_freq_data[i][0] = norm * (re * freq_fir[i][0] - _freq_data[i][1] * freq_fir[i][1]);
202 			_freq_data[i][1] = norm * (re * freq_fir[i][1] + _freq_data[i][1] * freq_fir[i][0]);
203 		}
204 
205 		fftwf_execute_dft_c2r (_plan_c2r, _freq_data, _time_data);
206 
207 #pragma GCC ivdep /* do not check for aliasing, buffers do not overlap */
208 		for (uint32_t i = 0; i < parsiz; ++i) {
209 			out[i] += _time_data[i];
210 		}
211 		memcpy (o_out, _time_data + parsiz, sizeof (float) * parsiz);
212 	}
213 
214 	void
rotate(float const * in,float * out,int angle)215 	rotate (float const* in, float* out, int angle)
216 	{
217 		float          sa, ca;
218 		const uint32_t parsiz = _parsiz;
219 		scl.sincos (angle, &sa, &ca);
220 		const float* fin = &in[_firlen];
221 #pragma GCC ivdep
222 		for (uint32_t i = 0; i < parsiz; ++i) {
223 			out[i] = ca * fin[i] + sa * out[i];
224 		}
225 	}
226 
227 	void
process(float const * in,float * out,float * o_out,int angle)228 	process (float const* in, float* out, float* o_out, int angle)
229 	{
230 		hilbert (in, out, o_out);
231 		rotate (in, out, angle);
232 	}
233 
234 private:
235 	PhaseRotateProc (PhaseRotateProc const&) = delete;
236 	uint32_t       _fftlen;
237 	uint32_t       _parsiz;
238 	uint32_t       _firlen;
239 	float          _norm;
240 	float*         _time_data;
241 	fftwf_complex* _freq_data;
242 	fftwf_complex* _ffir_data;
243 	fftwf_plan     _plan_r2c;
244 	fftwf_plan     _plan_c2r;
245 };
246 
247 /* ****************************************************************************/
248 
249 typedef std::vector<std::unique_ptr<PhaseRotateProc>> PRPVec;
250 
251 class PhaseRotate
252 {
253 public:
254 	PhaseRotate (PRPVec&, uint32_t);
255 	~PhaseRotate ();
256 
257 	void reset ();
258 	void ensure_thread_buffers ();
259 
260 	void analyze (float const* in, int ang_start = 0, int ang_end = 180, int ang_stride = 1, int chn = -1, bool start = false);
261 	void apply (float* buf, std::vector<int> const& angles);
262 
263 	int
n_channels() const264 	n_channels () const
265 	{
266 		return _n_chn;
267 	}
268 
269 	int
blksiz() const270 	blksiz () const
271 	{
272 		return _proc.front ()->parsiz ();
273 	}
274 
275 	float
peak(int c,int a) const276 	peak (int c, int a) const
277 	{
278 		if (c < 0 || (uint32_t)c > _n_chn) {
279 			return peak_all (a);
280 		}
281 		if (a < 0) {
282 			a += MAXSAMPLE;
283 		}
284 		return _peak[c][a % MAXSAMPLE];
285 	}
286 
287 	float
peak_all(int a) const288 	peak_all (int a) const
289 	{
290 		float p = 0;
291 		if (a < 0) {
292 			a += MAXSAMPLE;
293 		}
294 		a = a % MAXSAMPLE;
295 		for (uint32_t c = 0; c < _n_chn; ++c) {
296 			p = std::max (p, _peak[c][a]);
297 		}
298 		return p;
299 	}
300 
301 private:
302 	void thr_process (float const* in, int ang_start = 0, int ang_end = 180, int ang_stride = 1, int chn = -1, bool start = false);
303 	void thr_apply (float const* buf, int c, int a);
304 
305 	PRPVec&  _proc;
306 	uint32_t _n_chn;
307 	size_t   _n_threads;
308 	float**  _buf_out;
309 	float**  _buf_old;
310 	float**  _buf_olp;
311 	float**  _peak;
312 };
313 
PhaseRotate(PRPVec & p,uint32_t n_chn)314 PhaseRotate::PhaseRotate (PRPVec& p, uint32_t n_chn)
315 	: _proc (p)
316 	, _n_chn (n_chn)
317 	, _n_threads (_proc.size ())
318 {
319 	uint32_t parsiz = _proc.front ()->parsiz ();
320 
321 	_buf_old = (float**)calloc (n_chn, sizeof (float*));
322 	_buf_olp = (float**)calloc (n_chn, sizeof (float*));
323 	_peak    = (float**)calloc (n_chn, sizeof (float*));
324 
325 	_buf_out = (float**)calloc (_n_threads, sizeof (float*));
326 	for (size_t t = 0; t < _n_threads; ++t) {
327 		_buf_out[t] = (float*)calloc (parsiz, sizeof (float));
328 	}
329 
330 	for (uint32_t c = 0; c < n_chn; ++c) {
331 		_buf_old[c] = (float*)calloc (parsiz, sizeof (float));    // input per channel
332 		_buf_olp[c] = (float*)calloc (parsiz, sizeof (float));    // overlap per channel
333 		_peak[c]    = (float*)calloc (MAXSAMPLE, sizeof (float)); // peak per channel per angle
334 	}
335 }
336 
~PhaseRotate()337 PhaseRotate::~PhaseRotate ()
338 {
339 	for (size_t t = 0; t < _n_threads; ++t) {
340 		free (_buf_out[t]);
341 	}
342 
343 	for (uint32_t c = 0; c < _n_chn; ++c) {
344 		free (_buf_old[c]);
345 		free (_buf_olp[c]);
346 		free (_peak[c]);
347 	}
348 
349 	free (_buf_out);
350 	free (_buf_old);
351 	free (_buf_olp);
352 	free (_peak);
353 }
354 
355 void
reset()356 PhaseRotate::reset ()
357 {
358 	uint32_t parsiz = _proc.front ()->parsiz ();
359 	for (uint32_t c = 0; c < _n_chn; ++c) {
360 		memset (_buf_old[c], 0, parsiz * sizeof (float));
361 		memset (_buf_olp[c], 0, parsiz * sizeof (float));
362 		for (uint32_t a = 0; a < MAXSAMPLE; ++a) {
363 			_peak[c][a] = 0;
364 		}
365 	}
366 }
367 
368 void
ensure_thread_buffers()369 PhaseRotate::ensure_thread_buffers ()
370 {
371 	if (_n_threads >= _proc.size ()) {
372 		return;
373 	}
374 	for (size_t t = 0; t < _n_threads; ++t) {
375 		free (_buf_out[t]);
376 	}
377 	free (_buf_out);
378 
379 	uint32_t parsiz = _proc.front ()->parsiz ();
380 
381 	_n_threads = _proc.size ();
382 	_buf_out   = (float**)calloc (_n_threads, sizeof (float*));
383 	for (size_t t = 0; t < _n_threads; ++t) {
384 		_buf_out[t] = (float*)calloc (parsiz, sizeof (float));
385 	}
386 }
387 
388 void
thr_process(float const * p_in,int ang_start,int ang_end,int ang_stride,int c,bool start)389 PhaseRotate::thr_process (float const* p_in, int ang_start, int ang_end, int ang_stride, int c, bool start)
390 {
391 	uint32_t parsiz = _proc[c]->parsiz ();
392 	uint32_t fftlen = 2 * parsiz;
393 	uint32_t firlen = parsiz / 2;
394 
395 	float tdc[fftlen];
396 	float hil[parsiz];
397 
398 	memcpy (tdc, _buf_old[c], parsiz * sizeof (float));
399 	for (uint32_t i = 0; i < parsiz; ++i) {
400 		tdc[i + parsiz] = p_in[c + i * _n_chn];
401 	}
402 
403 	/* remember overlap */
404 	memcpy (_buf_old[c], &tdc[parsiz], parsiz * sizeof (float));
405 
406 	/* apply hilbert FIR */
407 	_proc[c]->hilbert (tdc, hil, _buf_olp[c]);
408 
409 	int angle = ang_start;
410 	while (angle <= ang_end) {
411 		int a = (angle + MAXSAMPLE) % MAXSAMPLE;
412 		/* calc peak / angle */
413 		if (angle == 0) {
414 			_peak[c][a] = calc_peak (_buf_old[c], parsiz, _peak[c][a]);
415 		} else {
416 			memcpy (_buf_out[c], hil, parsiz * sizeof (float));
417 
418 			if (start) {
419 				_peak[c][a] = calc_rotated_peak (&tdc[firlen], &_buf_out[c][firlen], firlen, _peak[c][a], a);
420 			} else {
421 				_peak[c][a] = calc_rotated_peak (&tdc[firlen], _buf_out[c], parsiz, _peak[c][a], a);
422 			}
423 		}
424 		angle += ang_stride;
425 		if (angle >= ang_end) {
426 			break;
427 		}
428 	}
429 }
430 
431 void
analyze(float const * p_in,int ang_start,int ang_end,int ang_stride,int chn,bool start)432 PhaseRotate::analyze (float const* p_in, int ang_start, int ang_end, int ang_stride, int chn, bool start)
433 {
434 	uint32_t c0 = (chn < 0) ? 0 : chn;
435 	uint32_t c1 = (chn < 0) ? _n_chn : chn + 1;
436 
437 	std::vector<std::thread> threads;
438 	for (uint32_t c = c0; c < c1; ++c) {
439 		threads.push_back (std::thread (&PhaseRotate::thr_process, this, p_in, ang_start, ang_end, ang_stride, c, start));
440 	}
441 	for (auto& th : threads) {
442 		th.join ();
443 	}
444 }
445 
446 void
thr_apply(float const * buf,int c,int a)447 PhaseRotate::thr_apply (float const* buf, int c, int a)
448 {
449 	uint32_t parsiz = _proc[c]->parsiz ();
450 	uint32_t fftlen = 2 * parsiz;
451 
452 	float tdc[fftlen];
453 
454 	/* copy overlap */
455 	memcpy (tdc, _buf_old[c], parsiz * sizeof (float));
456 	/* de-interleave channel */
457 	for (uint32_t i = 0; i < parsiz; ++i) {
458 		tdc[i + parsiz] = buf[c + i * _n_chn];
459 	}
460 	/* remember overlap */
461 	memcpy (_buf_old[c], &tdc[parsiz], parsiz * sizeof (float));
462 
463 	a = (a + MAXSAMPLE) % MAXSAMPLE;
464 	_proc[c]->process (tdc, _buf_out[c], _buf_olp[c], a);
465 }
466 
467 void
apply(float * buf,std::vector<int> const & angles)468 PhaseRotate::apply (float* buf, std::vector<int> const& angles)
469 {
470 	std::vector<std::thread> threads;
471 	for (uint32_t c = 0; c < _n_chn; ++c) {
472 		threads.push_back (std::thread (&PhaseRotate::thr_apply, this, buf, c, angles[c]));
473 	}
474 	for (auto& th : threads) {
475 		th.join ();
476 	}
477 
478 	/* interleave */
479 	const uint32_t parsiz = _proc.front ()->parsiz ();
480 	for (uint32_t c = 0; c < _n_chn; ++c) {
481 		for (uint32_t i = 0; i < parsiz; ++i) {
482 			buf[c + i * _n_chn] = _buf_out[c][i];
483 		}
484 	}
485 }
486 
487 /* ****************************************************************************/
488 
489 static void
usage()490 usage ()
491 {
492 	// help2man compatible format (standard GNU help-text)
493 	printf ("phase-rotate - Audio File Phase Rotation Util.\n\n");
494 	printf ("Usage: phase-rotate [ OPTIONS ] <file> [out-file]\n\n");
495 
496 	/* **** "---------|---------|---------|---------|---------|---------|---------|---------|" */
497 	printf ("Options:\n"
498 	        "  -a, --angle <n>[,<n>]*     specify phase angle to apply\n"
499 	        "  -f, --fftlen <num>         process-block size, freq. resolution\n"
500 	        "  -h, --help                 display this help and exit\n"
501 	        "  -l, --link-channels        use downmixed mono peak for analysis\n"
502 	        "  -s, --stride <num>         analysis step-size\n"
503 	        "  -v, --verbose              show processing information\n"
504 	        "  -V, --version              print version information and exit\n"
505 	        "\n");
506 
507 	printf ("\n"
508 	        "This utility analyzes the given audio file to find a phase-rotation\n"
509 	        "angle that results in minimal digital-peak, while retaining overall\n"
510 	        "sound and loudness.\n"
511 	        "\n"
512 	        "If both input and output file are given, the analysis results applied, and\n"
513 	        "a new file with optimized phase is written. Otherwise the analysis results\n"
514 	        "are only printed to standard output.\n"
515 	        "\n"
516 	        "Analysis is performed in two steps, first a coarse analysis is performed,\n"
517 	        "calculating peak for angles distanced `stride' degrees apart. Then local\n"
518 	        "minimums are explored in a second step.\n"
519 	        "\n"
520 	        "Verbose analysis allows to plot the digital peak vs phase-rotation.\n"
521 	        "The output is in gnuplot(1) data file format.\n"
522 	        "\n"
523 	        "If the -a option is specified, no analysis is performed but the given,\n"
524 	        "phase-angle(s) are directly applied. This requires both input and output\n"
525 	        "files to be given. If a single angle is given it is applied to all channels\n"
526 	        "of the file. Otherwise one has to specify the same number of phase-angles as\n"
527 	        "there are channels in the file.\n"
528 	        "\n");
529 
530 	printf ("\n"
531 	        "Examples:\n"
532 	        "phase-rotate -l my-music.wav out-file.wav\n\n"
533 	        "phase-rotate -vv -s 3 my-music.wav\n\n"
534 	        "phase-rotate -a 10,20 in.wav out.wav\n\n");
535 
536 	printf ("Report bugs to <https://github.com/x42/phaserotate.lv2/issues>\n"
537 	        "Website: <https://github.com/x42/phaserotate.lv2/>\n");
538 	::exit (EXIT_SUCCESS);
539 }
540 
541 static void
copy_metadata(SNDFILE * infile,SNDFILE * outfile)542 copy_metadata (SNDFILE* infile, SNDFILE* outfile)
543 {
544 	SF_CUES           cues;
545 	SF_BROADCAST_INFO binfo;
546 
547 	memset (&cues, 0, sizeof (cues));
548 	memset (&binfo, 0, sizeof (binfo));
549 
550 	for (int k = SF_STR_FIRST; k <= SF_STR_LAST; ++k) {
551 		const char* str = sf_get_string (infile, k);
552 		if (str != NULL) {
553 			sf_set_string (outfile, k, str);
554 		}
555 	}
556 
557 	if (sf_command (infile, SFC_GET_CUE, &cues, sizeof (cues)) == SF_TRUE)
558 		sf_command (outfile, SFC_SET_CUE, &cues, sizeof (cues));
559 
560 	if (sf_command (infile, SFC_GET_BROADCAST_INFO, &binfo, sizeof (binfo)) == SF_TRUE) {
561 		sf_command (outfile, SFC_SET_BROADCAST_INFO, &binfo, sizeof (binfo));
562 	}
563 }
564 
565 static void
analyze_file(PhaseRotate & pr,SNDFILE * infile,float * buf,int ang_start,int ang_end,int ang_stride,int chn=-1)566 analyze_file (PhaseRotate& pr, SNDFILE* infile, float* buf, int ang_start, int ang_end, int ang_stride, int chn = -1)
567 {
568 	int n_channels = pr.n_channels ();
569 	int blksiz     = pr.blksiz ();
570 
571 	bool start = true;
572 	do {
573 		sf_count_t n = sf_readf_float (infile, buf, blksiz);
574 		if (n <= 0) {
575 			break;
576 		}
577 		if (n < blksiz) {
578 			/* silence pad */
579 			memset (&buf[n_channels * n], 0, sizeof (float) * n_channels * (blksiz - n));
580 		}
581 		pr.analyze (buf, ang_start, ang_end, ang_stride, chn, start);
582 		start = false;
583 	} while (1);
584 
585 	memset (buf, 0, blksiz * n_channels * sizeof (float));
586 	pr.analyze (buf, ang_start, ang_end, ang_stride, chn, false);
587 }
588 
589 int
main(int argc,char ** argv)590 main (int argc, char** argv)
591 {
592 	SF_INFO      nfo;
593 	SNDFILE*     infile     = NULL;
594 	SNDFILE*     outfile    = NULL;
595 	float*       buf        = NULL;
596 	char*        angles_opt = NULL;
597 	int          stride     = 12 * SUBSAMPLE;
598 	int          verbose    = 0;
599 	FILE*        verbose_fd = stdout;
600 	bool         find_min   = true;
601 	bool         link_chn   = false;
602 	unsigned int blksiz     = 0;
603 
604 	std::vector<int> angles;
605 
606 	// TODO pick stride to optimize processor usage
607 
608 	const char* optstring = "a:f:hls:Vv";
609 
610 	/* clang-format off */
611 	const struct option longopts[] = {
612 		{ "angle",         required_argument, 0, 'a' },
613 		{ "fftlen",        required_argument, 0, 'f' },
614 		{ "stride",        required_argument, 0, 's' },
615 		{ "help",          no_argument,       0, 'h' },
616 		{ "link-channels", no_argument,       0, 'l' },
617 		{ "version",       no_argument,       0, 'V' },
618 		{ "verbose",       no_argument,       0, 'v' },
619 	};
620 	/* clang-format on */
621 
622 	int c = 0;
623 	while (EOF != (c = getopt_long (argc, argv,
624 	                                optstring, longopts, (int*)0))) {
625 		switch (c) {
626 			case 'a':
627 				angles_opt = optarg;
628 				break;
629 
630 			case 'f':
631 				blksiz = atoi (optarg);
632 				break;
633 
634 			case 'h':
635 				usage ();
636 				break;
637 
638 			case 'l':
639 				link_chn = true;
640 				break;
641 
642 			case 's':
643 				stride = atoi (optarg);
644 				break;
645 
646 			case 'V':
647 				printf ("phase-rotate version %s\n\n", VERSION);
648 				printf ("Copyright (C) GPL 2021 Robin Gareus <robin@gareus.org>\n");
649 				exit (EXIT_SUCCESS);
650 				break;
651 
652 			case 'v':
653 				++verbose;
654 				break;
655 
656 			default:
657 				fprintf (stderr, "Error: unrecognized option. See --help for usage information.\n");
658 				::exit (EXIT_FAILURE);
659 				break;
660 		}
661 	}
662 
663 	if (optind + 1 > argc) {
664 		fprintf (stderr, "Error: Missing parameter. See --help for usage information.\n");
665 		::exit (EXIT_FAILURE);
666 	}
667 
668 	if (stride < 1 || stride > 45 * SUBSAMPLE || (MAXSAMPLE % stride) != 0) {
669 		fprintf (stderr, "Error: 180 deg is not evenly dividable by given stride.\n");
670 		::exit (EXIT_FAILURE);
671 	}
672 
673 	if (blksiz != 0 && (blksiz < 1024 || blksiz > 32768)) {
674 		fprintf (stderr, "Error: fft-len is out of bounds; valid range 1024..32768\n");
675 		::exit (EXIT_FAILURE);
676 	}
677 
678 	if (angles_opt && optind + 2 > argc) {
679 		fprintf (stderr, "Error: -a, --angle option requires an output file to be given.\n");
680 		::exit (EXIT_FAILURE);
681 	}
682 
683 	memset (&nfo, 0, sizeof (SF_INFO));
684 
685 	if ((infile = sf_open (argv[optind], SFM_READ, &nfo)) == 0) {
686 		fprintf (stderr, "Cannot open '%s' for reading: ", argv[optind]);
687 		fputs (sf_strerror (NULL), stderr);
688 		::exit (EXIT_FAILURE);
689 	}
690 
691 	if (find_min && !nfo.seekable) {
692 		fprintf (stderr, "File '%s' is not seekable. ", argv[optind]);
693 		::exit (EXIT_FAILURE);
694 	}
695 
696 	if (optind + 1 < argc) {
697 		if ((outfile = sf_open (argv[optind + 1], SFM_WRITE, &nfo)) == 0) {
698 			fprintf (stderr, "Cannot open '%s' for writing: ", argv[optind + 1]);
699 			fputs (sf_strerror (NULL), stderr);
700 			::exit (EXIT_FAILURE);
701 		}
702 	}
703 
704 	if (verbose > 1) {
705 		verbose_fd = stderr;
706 	}
707 
708 	if (verbose > 2) {
709 		char strbuffer[65536];
710 		sf_command (infile, SFC_GET_LOG_INFO, strbuffer, 65536);
711 		fputs (strbuffer, verbose_fd);
712 	} else if (verbose) {
713 		fprintf (verbose_fd, "Input File      : %s\n", argv[optind]);
714 		fprintf (verbose_fd, "Sample Rate     : %d Hz\n", nfo.samplerate);
715 		fprintf (verbose_fd, "Channels        : %d\n", nfo.channels);
716 	}
717 
718 	if (angles_opt) {
719 		find_min  = false;
720 		char* ang = angles_opt;
721 		char* saveptr;
722 		while ((ang = strtok_r (angles_opt, ",", &saveptr)) != NULL) {
723 			char*  ep;
724 			double a = strtod (ang, &ep);
725 			if (*ep != '\0' || a < -180 || a > 180) {
726 				fprintf (stderr, "Error: Invalid angle speficied, value needs to be -180 .. +180.\n");
727 				::exit (EXIT_FAILURE);
728 			}
729 			angles_opt = NULL;
730 			angles.push_back (round (a * (float)SUBSAMPLE));
731 		}
732 		if (angles.size () == 1) {
733 			while (angles.size () < (size_t)nfo.channels) {
734 				angles.push_back (angles.back ());
735 			}
736 		}
737 		if (angles.size () < (size_t)nfo.channels) {
738 			fprintf (stderr, "Error: file has more channels than angles were specified.\n");
739 			::exit (EXIT_FAILURE);
740 		}
741 		if (verbose) {
742 			fprintf (verbose_fd, "# Apply phase-shift\n");
743 			for (int c = 0; c < nfo.channels; ++c) {
744 				fprintf (verbose_fd, "Channel: %2d Phase: %5.2f deg\n", c + 1, angles[c] / (float)SUBSAMPLE);
745 			}
746 		}
747 	}
748 
749 	if (blksiz == 0 || blksiz > 32768) {
750 		blksiz = nfo.samplerate / 8;
751 	}
752 
753 	unsigned power_of_two;
754 	for (power_of_two = 1; 1U << power_of_two < blksiz; ++power_of_two);
755 	blksiz = std::min (32768, std::max (1024, 1 << power_of_two));
756 
757 	if (verbose > 1) {
758 		fprintf (verbose_fd, "Process block-size %d\n", blksiz);
759 	}
760 
761 	buf = (float*)malloc (blksiz * nfo.channels * sizeof (float));
762 
763 	if (!buf) {
764 		fprintf (stderr, "Out of memory\n");
765 		::exit (EXIT_FAILURE);
766 	}
767 
768 	{
769 		PRPVec prp;
770 		int    n_threads = nfo.channels;
771 
772 		for (int i = 0; i < n_threads; ++i) {
773 			auto p = std::unique_ptr<PhaseRotateProc> (new PhaseRotateProc (blksiz));
774 			prp.push_back (std::move (p));
775 		}
776 
777 		PhaseRotate pr (prp, nfo.channels);
778 
779 		if (find_min) {
780 			if (verbose > 1) {
781 				fprintf (verbose_fd, "Analyzing using %d process threads, stride = %d\n", n_threads, stride);
782 			}
783 
784 			analyze_file (pr, infile, buf, 0, MAXSAMPLE, stride);
785 
786 			/*
787 			 * Consider writing gnuplot file
788 			 * ```
789 			 * #!/usr/bin/gnuplot -persist
790 			 * set terminal qt
791 			 * set xlabel "Phase Rotation [deg]"
792 			 * set ylabel "Peak [dBFS]"
793 			 * set datafile missing "-inf"
794 			 * plot '-' u 1:3 t "chn-1", '' u 1:4 t "chn-2"
795 			 *  ... # [data, terminate with 'e\n']
796 			 * e
797 			 * ```
798 			 */
799 
800 			if (verbose > 1) {
801 				printf ("# Angle mono-peak");
802 				for (int c = 0; c < nfo.channels; ++c) {
803 					printf (" chn-%d", c + 1);
804 				}
805 				printf ("\n");
806 				for (uint32_t a = 0; a < MAXSAMPLE; a += stride) {
807 					printf ("%.2f %.4f", a / (float)SUBSAMPLE, coeff_to_dB (pr.peak_all (a)));
808 					for (int c = 0; c < nfo.channels; ++c) {
809 						printf (" %.4f", coeff_to_dB (pr.peak (c, a)));
810 					}
811 					printf ("\n");
812 				}
813 			}
814 
815 			std::map<int, std::vector<int>> mins;
816 			angles.clear ();
817 
818 			int   min_angle[(int)nfo.channels];
819 			float p_min[nfo.channels];
820 			float r_zro[nfo.channels];
821 			float r_min[nfo.channels];
822 
823 			for (int c = 0; c < nfo.channels; ++c) {
824 				float c_min = std::numeric_limits<float>::infinity ();
825 				float c_max = 0;
826 				float range;
827 
828 				r_zro[c] = pr.peak (c, 0);
829 
830 				for (uint32_t a = 0; a < MAXSAMPLE; a += stride) {
831 					c_min = std::min (c_min, pr.peak (link_chn ? -1 : c, a));
832 					c_max = std::max (c_max, pr.peak (link_chn ? -1 : c, a));
833 				}
834 
835 				range = c_max - c_min;
836 				if (range == 0) {
837 					mins[0].push_back (c);
838 					continue;
839 				}
840 				if (stride > 1) {
841 					range *= .07;
842 					p_min[c] = std::numeric_limits<float>::infinity ();
843 				} else {
844 					range    = 0;
845 					p_min[c] = c_min;
846 				}
847 
848 				for (uint32_t a = 0; a < MAXSAMPLE; a += stride) {
849 					float p = pr.peak (link_chn ? -1 : c, a);
850 					if (p <= c_min + range) {
851 						mins[a].push_back (c);
852 						if (verbose > 1) {
853 							fprintf (verbose_fd, "Consider min: %f (< %f) chn: %d @ %.2f deg\n", p, c_min + range, c, a / (float)SUBSAMPLE);
854 						}
855 					}
856 				}
857 			}
858 
859 			if (stride == 1) {
860 				for (auto& mp : mins) {
861 					for (auto& cn : mp.second) {
862 						min_angle[cn] = mp.first;
863 						r_min[cn]     = pr.peak (cn, mp.first);
864 					}
865 				}
866 			} else {
867 				/* search local min */
868 				int stride_2 = (stride + 1) / 2;
869 
870 				// TODO: binary search, but use all CPU cores..
871 				for (auto& mp : mins) {
872 					if (0 != sf_seek (infile, 0, SEEK_SET)) {
873 						fprintf (stderr, "Failed to rewind input file\n");
874 						::exit (EXIT_FAILURE);
875 					}
876 					pr.reset ();
877 
878 					int ma = mp.first;
879 
880 					analyze_file (pr, infile, buf, ma - stride_2, ma + stride_2 + 1, 1, mp.second.size () > 1 ? -1 : mp.second.front ());
881 
882 					for (auto& cn : mp.second) {
883 						for (int a = ma - stride_2; a < ma + stride_2 + 1; ++a) {
884 							float p = pr.peak (link_chn ? -1 : cn, a);
885 							if (p <= p_min[cn]) {
886 								p_min[cn]     = p;
887 								r_min[cn]     = pr.peak (cn, a);
888 								min_angle[cn] = (a + MAXSAMPLE) % MAXSAMPLE;
889 							}
890 
891 							if (verbose > 1) {
892 								printf ("%.2f %.4f",
893 								        ((a + MAXSAMPLE) % MAXSAMPLE) / (float)SUBSAMPLE,
894 								        coeff_to_dB (pr.peak_all (a)));
895 								for (int c = 0; c < nfo.channels; ++c) {
896 									printf (" %.4f", coeff_to_dB (pr.peak (c, a)));
897 								}
898 								printf ("\n");
899 							}
900 						}
901 					}
902 				}
903 			}
904 
905 			/* collect results */
906 			float avg_rotate = 0;
907 			int   avg_count  = 0;
908 			for (int c = 0; c < nfo.channels; ++c) {
909 				if (p_min[c] != std::numeric_limits<float>::infinity ()) {
910 					avg_rotate += min_angle[c];
911 					++avg_count;
912 				}
913 			}
914 			avg_rotate /= avg_count;
915 			float avg_dist = MAXSAMPLE / (float)avg_count;
916 
917 			/* minimize channel phase distance */
918 			for (int c = 0; c < nfo.channels; ++c) {
919 				if (p_min[c] == std::numeric_limits<float>::infinity ()) {
920 					angles.push_back (0);
921 				} else {
922 					if (min_angle[c] > 90 * SUBSAMPLE && fabsf (min_angle[c] - avg_rotate) > avg_dist) {
923 						min_angle[c] -= MAXSAMPLE;
924 					} else if (avg_rotate > 90 * SUBSAMPLE) {
925 						min_angle[c] -= MAXSAMPLE;
926 					}
927 					angles.push_back (min_angle[c]);
928 				}
929 			}
930 
931 			/* print results */
932 			if (!outfile || verbose) {
933 				fprintf (verbose_fd, "# Result -- Minimize digital peak\n");
934 				for (int c = 0; c < nfo.channels; ++c) {
935 					if (p_min[c] == std::numeric_limits<float>::infinity ()) {
936 						fprintf (verbose_fd, "Channel: %2d Phase:   0 deg # cannot find min.\n", c + 1);
937 					} else {
938 						fprintf (verbose_fd, "Channel: %2d Phase: %5.2f deg", c + 1, min_angle[c] / (float)SUBSAMPLE);
939 						if (min_angle[c] != 0) {
940 							fprintf (verbose_fd, ", gain: %5.2f dB (att. %4.2f to %4.2f dBFS)",
941 							         coeff_to_dB (r_zro[c]) - coeff_to_dB (r_min[c]),
942 							         coeff_to_dB (r_zro[c]), coeff_to_dB (r_min[c]));
943 						}
944 						fprintf (verbose_fd, "\n");
945 					}
946 				}
947 			}
948 		} // find min
949 
950 		if (outfile) {
951 			copy_metadata (infile, outfile);
952 			int n_channels = nfo.channels;
953 
954 			if (find_min) {
955 				if (0 != sf_seek (infile, 0, SEEK_SET)) {
956 					fprintf (stderr, "Failed to rewind input file\n");
957 					::exit (EXIT_FAILURE);
958 				}
959 
960 				pr.reset ();
961 			}
962 
963 			const uint32_t latency = blksiz / 2;
964 
965 			uint32_t pad = 0;
966 			uint32_t off = latency;
967 			do {
968 				sf_count_t n = sf_readf_float (infile, buf, blksiz);
969 				if (n <= 0) {
970 					break;
971 				}
972 
973 				if (n < latency) {
974 					pad = blksiz - n;
975 					/* silence remaining buffer */
976 					memset (&buf[n_channels * n], 0, sizeof (float) * n_channels * pad);
977 					pad = latency - n;
978 					n += pad;
979 				}
980 
981 				pr.apply (buf, angles);
982 
983 				n -= off;
984 
985 				if (n != sf_writef_float (outfile, &buf[off], n)) {
986 					fprintf (stderr, "Error writing to output file.\n");
987 					pad = latency;
988 					break;
989 				}
990 				off = 0;
991 			} while (1);
992 
993 			sf_count_t n = (sf_count_t)latency - pad;
994 			if (n > 0) {
995 				memset (buf, 0, blksiz * n_channels * sizeof (float));
996 				pr.apply (buf, angles);
997 
998 				if (n != sf_writef_float (outfile, buf, n)) {
999 					fprintf (stderr, "Error writing to output file.\n");
1000 				}
1001 			}
1002 			sf_close (outfile);
1003 		} // write file
1004 
1005 	} // scope
1006 
1007 	sf_close (infile);
1008 	free (buf);
1009 	fftwf_cleanup ();
1010 	return 0;
1011 }
1012