Beast - Music Synthesizer and Composer  0.11.1+10.g2da35
bseresamplerimpl.hh
Go to the documentation of this file.
1  // Licensed GNU LGPL v2.1 or later: http://www.gnu.org/licenses/lgpl.html
2 #ifndef __BSE_RESAMPLER_TCC__
3 #define __BSE_RESAMPLER_TCC__
4 
5 #include <vector>
6 #include <bse/bseresampler.hh>
7 #include <sfi/sfi.hh>
8 #include <math.h>
9 #include <string.h>
10 #include <stdlib.h>
11 #include <stdio.h>
12 #ifdef __SSE__
13 #include <xmmintrin.h>
14 #endif
15 
16 namespace Bse {
17 namespace Resampler {
18 using std::vector;
19 using std::min;
20 using std::max;
21 using std::copy;
22 
23 /* see: http://ds9a.nl/gcc-simd/ */
24 union F4Vector
25 {
26  float f[4];
27 #ifdef __SSE__
28  __m128 v; // vector of four single floats
29 #endif
30 };
31 
47 template<class Accumulator> static inline Accumulator
48 fir_process_one_sample (const float *input,
49  const float *taps, /* [0..order-1] */
50  const guint order)
51 {
52  Accumulator out = 0;
53  for (guint i = 0; i < order; i++)
54  out += input[i] * taps[i];
55  return out;
56 }
57 
68 static inline void
69 fir_process_4samples_sse (const float *input,
70  const float *sse_taps,
71  const guint order,
72  float *out0,
73  float *out1,
74  float *out2,
75  float *out3)
76 {
77 #ifdef __SSE__
78  /* input and taps must be 16-byte aligned */
79  const F4Vector *input_v = reinterpret_cast<const F4Vector *> (input);
80  const F4Vector *sse_taps_v = reinterpret_cast<const F4Vector *> (sse_taps);
81  F4Vector out0_v, out1_v, out2_v, out3_v;
82 
83  out0_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[0].v);
84  out1_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[1].v);
85  out2_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[2].v);
86  out3_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[3].v);
87 
88  for (guint i = 1; i < (order + 6) / 4; i++)
89  {
90  out0_v.v = _mm_add_ps (out0_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 0].v));
91  out1_v.v = _mm_add_ps (out1_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 1].v));
92  out2_v.v = _mm_add_ps (out2_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 2].v));
93  out3_v.v = _mm_add_ps (out3_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 3].v));
94  }
95 
96  *out0 = out0_v.f[0] + out0_v.f[1] + out0_v.f[2] + out0_v.f[3];
97  *out1 = out1_v.f[0] + out1_v.f[1] + out1_v.f[2] + out1_v.f[3];
98  *out2 = out2_v.f[0] + out2_v.f[1] + out2_v.f[2] + out2_v.f[3];
99  *out3 = out3_v.f[0] + out3_v.f[1] + out3_v.f[2] + out3_v.f[3];
100 #else
102 #endif
103 }
104 
105 
124 static inline vector<float>
125 fir_compute_sse_taps (const vector<float>& taps)
126 {
127  const int order = taps.size();
128  vector<float> sse_taps ((order + 6) / 4 * 16);
129 
130  for (int j = 0; j < 4; j++)
131  for (int i = 0; i < order; i++)
132  {
133  int k = i + j;
134  sse_taps[(k / 4) * 16 + (k % 4) + j * 4] = taps[i];
135  }
136 
137  return sse_taps;
138 }
139 
149 static inline bool
150 fir_test_filter_sse (bool verbose,
151  const guint max_order = 64)
152 {
153  int errors = 0;
154  if (verbose)
155  printf ("testing SSE filter implementation:\n\n");
156 
157  for (guint order = 0; order < max_order; order++)
158  {
159  vector<float> taps (order);
160  for (guint i = 0; i < order; i++)
161  taps[i] = i + 1;
162 
163  AlignedArray<float,16> sse_taps (fir_compute_sse_taps (taps));
164  if (verbose)
165  {
166  for (uint i = 0; i < sse_taps.size(); i++)
167  {
168  printf ("%3d", (int) (sse_taps[i] + 0.5));
169  if (i % 4 == 3)
170  printf (" |");
171  if (i % 16 == 15)
172  printf (" ||| upper bound = %d\n", (order + 6) / 4);
173  }
174  printf ("\n\n");
175  }
176 
177  AlignedArray<float,16> random_mem (order + 4);
178  for (guint i = 0; i < order + 4; i++)
179  random_mem[i] = 1.0 - rand() / (0.5 * RAND_MAX);
180 
181  /* FIXME: the problem with this test is that we explicitely test SSE code
182  * here, but the test case is not compiled with -msse within the BEAST tree
183  */
184  float out[4];
185  fir_process_4samples_sse (&random_mem[0], &sse_taps[0], order,
186  &out[0], &out[1], &out[2], &out[3]);
187 
188  double avg_diff = 0.0;
189  for (int i = 0; i < 4; i++)
190  {
191  double diff = fir_process_one_sample<double> (&random_mem[i], &taps[0], order) - out[i];
192  avg_diff += fabs (diff);
193  }
194  avg_diff /= (order + 1);
195  bool is_error = (avg_diff > 0.00001);
196  if (is_error || verbose)
197  printf ("*** order = %d, avg_diff = %g\n", order, avg_diff);
198  if (is_error)
199  errors++;
200  }
201  if (errors)
202  printf ("*** %d errors detected\n", errors);
203  else
204  printf ("filter implementation ok.\n");
205 
206  return (errors == 0);
207 }
208 
216 template<guint ORDER, bool USE_SSE>
217 class Upsampler2 : public Resampler2 {
218  vector<float> taps;
219  AlignedArray<float,16> history;
220  AlignedArray<float,16> sse_taps;
221 protected:
222  /* fast SSE optimized convolution */
223  void
224  process_4samples_aligned (const float *input /* aligned */,
225  float *output)
226  {
227  const guint H = (ORDER / 2); /* half the filter length */
228 
229  output[1] = input[H];
230  output[3] = input[H + 1];
231  output[5] = input[H + 2];
232  output[7] = input[H + 3];
233 
234  fir_process_4samples_sse (input, &sse_taps[0], ORDER, &output[0], &output[2], &output[4], &output[6]);
235  }
236  /* slow convolution */
237  void
238  process_sample_unaligned (const float *input,
239  float *output)
240  {
241  const guint H = (ORDER / 2); /* half the filter length */
242  output[0] = fir_process_one_sample<float> (&input[0], &taps[0], ORDER);
243  output[1] = input[H];
244  }
245  void
246  process_block_aligned (const float *input,
247  guint n_input_samples,
248  float *output)
249  {
250  uint i = 0;
251  if (USE_SSE)
252  {
253  while (i + 3 < n_input_samples)
254  {
255  process_4samples_aligned (&input[i], &output[i*2]);
256  i += 4;
257  }
258  }
259  while (i < n_input_samples)
260  {
261  process_sample_unaligned (&input[i], &output[2*i]);
262  i++;
263  }
264  }
265  void
266  process_block_unaligned (const float *input,
267  guint n_input_samples,
268  float *output)
269  {
270  uint i = 0;
271  if (USE_SSE)
272  {
273  while ((reinterpret_cast<ptrdiff_t> (&input[i]) & 15) && i < n_input_samples)
274  {
275  process_sample_unaligned (&input[i], &output[2 * i]);
276  i++;
277  }
278  }
279  process_block_aligned (&input[i], n_input_samples - i, &output[2 * i]);
280  }
281 public:
287  Upsampler2 (float *init_taps) :
288  taps (init_taps, init_taps + ORDER),
289  history (2 * ORDER),
290  sse_taps (fir_compute_sse_taps (taps))
291  {
292  RAPICORN_ASSERT ((ORDER & 1) == 0); /* even order filter */
293  }
298  void
299  process_block (const float *input,
300  guint n_input_samples,
301  float *output)
302  {
303  const uint history_todo = min (n_input_samples, ORDER - 1);
304 
305  copy (input, input + history_todo, &history[ORDER - 1]);
306  process_block_aligned (&history[0], history_todo, output);
307  if (n_input_samples > history_todo)
308  {
309  process_block_unaligned (input, n_input_samples - history_todo, &output [2 * history_todo]);
310 
311  // build new history from new input
312  copy (input + n_input_samples - history_todo, input + n_input_samples, &history[0]);
313  }
314  else
315  {
316  // build new history from end of old history
317  // (very expensive if n_input_samples tends to be a lot smaller than ORDER often)
318  memmove (&history[0], &history[n_input_samples], sizeof (history[0]) * (ORDER - 1));
319  }
320  }
324  guint
325  order() const
326  {
327  return ORDER;
328  }
329  double
330  delay() const
331  {
332  return order() - 1;
333  }
334 };
335 
343 template<guint ORDER, bool USE_SSE>
344 class Downsampler2 : public Resampler2 {
345  vector<float> taps;
346  AlignedArray<float,16> history_even;
347  AlignedArray<float,16> history_odd;
348  AlignedArray<float,16> sse_taps;
349  /* fast SSE optimized convolution */
350  template<int ODD_STEPPING> void
351  process_4samples_aligned (const float *input_even /* aligned */,
352  const float *input_odd,
353  float *output)
354  {
355  const guint H = (ORDER / 2) - 1; /* half the filter length */
356 
357  fir_process_4samples_sse (input_even, &sse_taps[0], ORDER, &output[0], &output[1], &output[2], &output[3]);
358 
359  output[0] += 0.5 * input_odd[H * ODD_STEPPING];
360  output[1] += 0.5 * input_odd[(H + 1) * ODD_STEPPING];
361  output[2] += 0.5 * input_odd[(H + 2) * ODD_STEPPING];
362  output[3] += 0.5 * input_odd[(H + 3) * ODD_STEPPING];
363  }
364  /* slow convolution */
365  template<int ODD_STEPPING> float
366  process_sample_unaligned (const float *input_even,
367  const float *input_odd)
368  {
369  const guint H = (ORDER / 2) - 1; /* half the filter length */
370 
371  return fir_process_one_sample<float> (&input_even[0], &taps[0], ORDER) + 0.5 * input_odd[H * ODD_STEPPING];
372  }
373  template<int ODD_STEPPING> void
374  process_block_aligned (const float *input_even,
375  const float *input_odd,
376  float *output,
377  guint n_output_samples)
378  {
379  uint i = 0;
380  if (USE_SSE)
381  {
382  while (i + 3 < n_output_samples)
383  {
384  process_4samples_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i]);
385  i += 4;
386  }
387  }
388  while (i < n_output_samples)
389  {
390  output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
391  i++;
392  }
393  }
394  template<int ODD_STEPPING> void
395  process_block_unaligned (const float *input_even,
396  const float *input_odd,
397  float *output,
398  guint n_output_samples)
399  {
400  uint i = 0;
401  if (USE_SSE)
402  {
403  while ((reinterpret_cast<ptrdiff_t> (&input_even[i]) & 15) && i < n_output_samples)
404  {
405  output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
406  i++;
407  }
408  }
409  process_block_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i], n_output_samples);
410  }
411  void
412  deinterleave2 (const float *data,
413  guint n_data_values,
414  float *output)
415  {
416  for (uint i = 0; i < n_data_values; i += 2)
417  output[i / 2] = data[i];
418  }
419 public:
425  Downsampler2 (float *init_taps) :
426  taps (init_taps, init_taps + ORDER),
427  history_even (2 * ORDER),
428  history_odd (2 * ORDER),
429  sse_taps (fir_compute_sse_taps (taps))
430  {
431  RAPICORN_ASSERT ((ORDER & 1) == 0); /* even order filter */
432  }
437  void
438  process_block (const float *input,
439  guint n_input_samples,
440  float *output)
441  {
442  RAPICORN_ASSERT ((n_input_samples & 1) == 0);
443 
444  const uint BLOCKSIZE = 1024;
445 
446  F4Vector block[BLOCKSIZE / 4]; /* using F4Vector ensures 16-byte alignment */
447  float *input_even = &block[0].f[0];
448 
449  while (n_input_samples)
450  {
451  uint n_input_todo = min (n_input_samples, BLOCKSIZE * 2);
452 
453  /* since the halfband filter contains zeros every other sample
454  * and since we're using SSE instructions, which expect the
455  * data to be consecutively represented in memory, we prepare
456  * a block of samples containing only even-indexed samples
457  *
458  * we keep the deinterleaved data on the stack (instead of per-class
459  * allocated memory), to ensure that even running a lot of these
460  * downsampler streams will not result in cache trashing
461  *
462  * FIXME: this implementation is suboptimal for non-SSE, because it
463  * performs an extra deinterleaving step in any case, but deinterleaving
464  * is only required for SSE instructions
465  */
466  deinterleave2 (input, n_input_todo, input_even);
467 
468  const float *input_odd = input + 1; /* we process this one with a stepping of 2 */
469 
470  const uint n_output_todo = n_input_todo / 2;
471  const uint history_todo = min (n_output_todo, ORDER - 1);
472 
473  copy (input_even, input_even + history_todo, &history_even[ORDER - 1]);
474  deinterleave2 (input_odd, history_todo * 2, &history_odd[ORDER - 1]);
475 
476  process_block_aligned <1> (&history_even[0], &history_odd[0], output, history_todo);
477  if (n_output_todo > history_todo)
478  {
479  process_block_unaligned<2> (input_even, input_odd, &output[history_todo], n_output_todo - history_todo);
480 
481  // build new history from new input (here: history_todo == ORDER - 1)
482  copy (input_even + n_output_todo - history_todo, input_even + n_output_todo, &history_even[0]);
483  deinterleave2 (input_odd + n_input_todo - history_todo * 2, history_todo * 2, &history_odd[0]); /* FIXME: can be optimized */
484  }
485  else
486  {
487  // build new history from end of old history
488  // (very expensive if n_output_todo tends to be a lot smaller than ORDER often)
489  memmove (&history_even[0], &history_even[n_output_todo], sizeof (history_even[0]) * (ORDER - 1));
490  memmove (&history_odd[0], &history_odd[n_output_todo], sizeof (history_odd[0]) * (ORDER - 1));
491  }
492 
493  n_input_samples -= n_input_todo;
494  input += n_input_todo;
495  output += n_output_todo;
496  }
497  }
501  guint
502  order() const
503  {
504  return ORDER;
505  }
506  double
507  delay() const
508  {
509  return order() / 2 - 0.5;
510  }
511 };
512 
513 template<bool USE_SSE> Resampler2*
514 Resampler2::create_impl (BseResampler2Mode mode,
515  BseResampler2Precision precision)
516 {
517  if (mode == BSE_RESAMPLER2_MODE_UPSAMPLE)
518  {
519  switch (precision)
520  {
521  case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Upsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 2.0);
522  case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Upsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 2.0);
523  case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Upsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 2.0);
524  case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Upsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 2.0);
525  case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Upsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 2.0);
526  case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Upsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 2.0);
527  }
528  }
529  else if (mode == BSE_RESAMPLER2_MODE_DOWNSAMPLE)
530  {
531  switch (precision)
532  {
533  case BSE_RESAMPLER2_PREC_LINEAR: return create_impl_with_coeffs <Downsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 1.0);
534  case BSE_RESAMPLER2_PREC_48DB: return create_impl_with_coeffs <Downsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 1.0);
535  case BSE_RESAMPLER2_PREC_72DB: return create_impl_with_coeffs <Downsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 1.0);
536  case BSE_RESAMPLER2_PREC_96DB: return create_impl_with_coeffs <Downsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 1.0);
537  case BSE_RESAMPLER2_PREC_120DB: return create_impl_with_coeffs <Downsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 1.0);
538  case BSE_RESAMPLER2_PREC_144DB: return create_impl_with_coeffs <Downsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 1.0);
539  }
540  }
541  return 0;
542 }
543 
544 } // Resampler
545 } // Bse
546 
547 #endif /* __BSE_RESAMPLER_TCC__ */
T copy(T...args)
The Bse namespace contains all functions of the synthesis engine.
Definition: bstbseutils.cc:91
rand
Definition: bseresamplerimpl.hh:24
printf
guint order() const
Returns the filter order.
Definition: bseresamplerimpl.hh:502
Downsampler2(float *init_taps)
Constructs a Downsampler2 class using a given set of filter coefficients.
Definition: bseresamplerimpl.hh:425
bool verbose()
void process_block(const float *input, guint n_input_samples, float *output)
The function process_block() takes a block of input samples and produces a block with half the length...
Definition: bseresamplerimpl.hh:438
T min(T...args)
#define RAPICORN_ASSERT_UNREACHED()
void process_block(const float *input, guint n_input_samples, float *output)
The function process_block() takes a block of input samples and produces a block with twice the lengt...
Definition: bseresamplerimpl.hh:299
fabs
T max(T...args)
T size(T...args)
STL class.
memmove
guint order() const
Returns the FIR filter order.
Definition: bseresamplerimpl.hh:325
Factor 2 upsampling of a data stream.
Definition: bseresamplerimpl.hh:217
Factor 2 downsampling of a data stream.
Definition: bseresamplerimpl.hh:344
#define RAPICORN_ASSERT(cond)
Upsampler2(float *init_taps)
Constructs an Upsampler2 object with a given set of filter coefficients.
Definition: bseresamplerimpl.hh:287