2 #ifndef __BSE_RESAMPLER_TCC__
3 #define __BSE_RESAMPLER_TCC__
13 #include <xmmintrin.h>
// Scalar FIR evaluation: adds input[i] * taps[i] to `out` for every i in [0, order).
// NOTE(review): this listing omits the remaining parameters and the declaration/return
// of `out`; presumably `out` has type Accumulator and is returned -- confirm in the full source.
47 template<
class Accumulator>
static inline Accumulator
48 fir_process_one_sample (
const float *input,
53 for (guint i = 0; i < order; i++)
54 out += input[i] * taps[i];
// Computes four FIR results at once with SSE intrinsics and stores them through
// out0..out3. The taps must be in the layout produced by fir_compute_sse_taps().
// NOTE(review): the listing omits the order/out0..out3 parameters and any preprocessor
// guards around the SSE code -- confirm against the full source.
69 fir_process_4samples_sse (
const float *input,
70 const float *sse_taps,
// Both buffers are accessed as F4Vector (four packed floats).
// NOTE(review): presumably both pointers must be 16-byte aligned -- callers pass AlignedArray<float,16> data; confirm.
79 const F4Vector *input_v =
reinterpret_cast<const F4Vector *
> (input);
80 const F4Vector *sse_taps_v =
reinterpret_cast<const F4Vector *
> (sse_taps);
81 F4Vector out0_v, out1_v, out2_v, out3_v;
// First step initialises the four accumulators: input vector 0 times tap vectors 0..3.
83 out0_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[0].v);
84 out1_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[1].v);
85 out2_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[2].v);
86 out3_v.v = _mm_mul_ps (input_v[0].v, sse_taps_v[3].v);
// Remaining steps: input vector i is multiplied with tap vectors i * 4 + 0..3 and accumulated.
88 for (guint i = 1; i < (order + 6) / 4; i++)
90 out0_v.v = _mm_add_ps (out0_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 0].v));
91 out1_v.v = _mm_add_ps (out1_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 1].v));
92 out2_v.v = _mm_add_ps (out2_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 2].v));
93 out3_v.v = _mm_add_ps (out3_v.v, _mm_mul_ps (input_v[i].v, sse_taps_v[i * 4 + 3].v));
// Horizontal sum: each result is the sum of the four lanes of its accumulator.
96 *out0 = out0_v.f[0] + out0_v.f[1] + out0_v.f[2] + out0_v.f[3];
97 *out1 = out1_v.f[0] + out1_v.f[1] + out1_v.f[2] + out1_v.f[3];
98 *out2 = out2_v.f[0] + out2_v.f[1] + out2_v.f[2] + out2_v.f[3];
99 *out3 = out3_v.f[0] + out3_v.f[1] + out3_v.f[2] + out3_v.f[3];
// NOTE(review): presumably the fallback branch used when SSE support is compiled out -- the guarding lines are not in this listing.
101 g_assert_not_reached();
// Builds the tap table consumed by fir_process_4samples_sse() from a plain tap vector.
// The table holds (order + 6) / 4 groups of 16 floats (four 4-float vectors per group).
124 static inline vector<float>
125 fir_compute_sse_taps (
const vector<float>& taps)
127 const int order = taps.
size();
128 vector<float> sse_taps ((order + 6) / 4 * 16);
// j selects one of the four 4-float vectors inside a group; i walks the original taps.
130 for (
int j = 0; j < 4; j++)
131 for (
int i = 0; i < order; i++)
// NOTE(review): the definition of k is missing from this listing; presumably k = i + j
// (tap i shifted by j positions) -- confirm against the full source.
134 sse_taps[(k / 4) * 16 + (k % 4) + j * 4] = taps[i];
// Self test: for every order in [0, max_order) compares fir_process_4samples_sse()
// with the scalar fir_process_one_sample<double>() reference on pseudo-random input.
// Returns true when no errors were counted.
// NOTE(review): the declarations of `errors` and `out`, the tap initialisation and the
// `verbose` guards around the printf calls are missing from this listing.
150 fir_test_filter_sse (
bool verbose,
151 const guint max_order = 64)
155 printf (
"testing SSE filter implementation:\n\n");
157 for (guint order = 0; order < max_order; order++)
159 vector<float> taps (order);
160 for (guint i = 0; i < order; i++)
163 AlignedArray<float,16> sse_taps (fir_compute_sse_taps (taps));
// Dump of the SSE tap table, each value rounded to an int by adding 0.5 and truncating.
166 for (uint i = 0; i < sse_taps.size(); i++)
168 printf (
"%3d", (
int) (sse_taps[i] + 0.5));
172 printf (
" ||| upper bound = %d\n", (order + 6) / 4);
// order + 4 input samples: the four outputs start at input offsets 0..3.
177 AlignedArray<float,16> random_mem (order + 4);
178 for (guint i = 0; i < order + 4; i++)
// rand() / (0.5 * RAND_MAX) lies in [0, 2], so the stored values lie in [-1, 1].
179 random_mem[i] = 1.0 -
rand() / (0.5 * RAND_MAX);
185 fir_process_4samples_sse (&random_mem[0], &sse_taps[0], order,
186 &out[0], &out[1], &out[2], &out[3]);
// Sum of absolute deviations between the scalar reference and the four SSE outputs.
188 double avg_diff = 0.0;
189 for (
int i = 0; i < 4; i++)
191 double diff = fir_process_one_sample<double> (&random_mem[i], &taps[0], order) - out[i];
192 avg_diff +=
fabs (diff);
// The sum is scaled by (order + 1) before being compared with the 0.00001 threshold.
194 avg_diff /= (order + 1);
195 bool is_error = (avg_diff > 0.00001);
196 if (is_error || verbose)
197 printf (
"*** order = %d, avg_diff = %g\n", order, avg_diff);
202 printf (
"*** %d errors detected\n", errors);
204 printf (
"filter implementation ok.\n");
206 return (errors == 0);
216 template<gu
int ORDER,
bool USE_SSE>
219 AlignedArray<float,16> history;
220 AlignedArray<float,16> sse_taps;
// Upsampler fast path: produces 8 interleaved output samples from 4 input positions.
// Odd output slots are plain copies of input[H .. H + 3]; even output slots are the
// FIR results computed by fir_process_4samples_sse().
// NOTE(review): the `output` parameter is missing from this listing; `input` is presumably
// required to be 16-byte aligned because it is handed to the SSE routine -- confirm.
224 process_4samples_aligned (
const float *input ,
// H is half the filter order: offset of the input sample that is copied through.
227 const guint H = (ORDER / 2);
229 output[1] = input[H];
230 output[3] = input[H + 1];
231 output[5] = input[H + 2];
232 output[7] = input[H + 3];
234 fir_process_4samples_sse (input, &sse_taps[0], ORDER, &output[0], &output[2], &output[4], &output[6]);
// Upsampler scalar path: produces 2 output samples from one input position.
// output[0] is the FIR result over input[0 .. ORDER - 1]; output[1] copies input[H].
// NOTE(review): the `output` parameter is missing from this listing.
238 process_sample_unaligned (
const float *input,
241 const guint H = (ORDER / 2);
242 output[0] = fir_process_one_sample<float> (&input[0], &taps[0], ORDER);
243 output[1] = input[H];
// Upsamples n_input_samples input positions, writing two output samples per position.
// Groups of four positions go through process_4samples_aligned(); the rest is handled
// one position at a time.
// NOTE(review): the `output` parameter, the declaration and increments of `i`, and any
// USE_SSE condition around the first loop are missing from this listing.
246 process_block_aligned (
const float *input,
247 guint n_input_samples,
// Vector path while at least four positions remain.
253 while (i + 3 < n_input_samples)
255 process_4samples_aligned (&input[i], &output[i*2]);
// Scalar path for the remaining positions.
259 while (i < n_input_samples)
261 process_sample_unaligned (&input[i], &output[2*i]);
// Upsamples a block whose input pointer need not be 16-byte aligned: positions are
// processed one by one until &input[i] is a multiple of 16 bytes, then the remaining
// n_input_samples - i positions are passed to process_block_aligned().
// NOTE(review): the `output` parameter and the declaration/increment of `i` are missing from this listing.
266 process_block_unaligned (
const float *input,
267 guint n_input_samples,
// Low four address bits non-zero => not yet on a 16-byte boundary.
273 while ((reinterpret_cast<ptrdiff_t> (&input[i]) & 15) && i < n_input_samples)
275 process_sample_unaligned (&input[i], &output[2 * i]);
279 process_block_aligned (&input[i], n_input_samples - i, &output[2 * i]);
288 taps (init_taps, init_taps + ORDER),
290 sse_taps (fir_compute_sse_taps (taps))
292 g_assert ((ORDER & 1) == 0);
// Upsampler2::process_block body (the first line of the signature is missing from this listing).
// The first history_todo output pairs are computed from the `history` buffer, which is
// extended with the start of the new input; the remainder is computed from `input` directly.
300 guint n_input_samples,
// At most ORDER - 1 positions depend on samples from the previous call.
303 const uint history_todo = min (n_input_samples, ORDER - 1);
// Append the new samples after the ORDER - 1 retained ones.
305 copy (input, input + history_todo, &history[ORDER - 1]);
306 process_block_aligned (&history[0], history_todo, output);
307 if (n_input_samples > history_todo)
309 process_block_unaligned (input, n_input_samples - history_todo, &output [2 * history_todo]);
// Retain the last history_todo input samples for the next call.
312 copy (input + n_input_samples - history_todo, input + n_input_samples, &history[0]);
// NOTE(review): presumably the else branch (short input): shifts the history buffer by
// n_input_samples -- the branch structure is not visible in this listing.
318 memmove (&history[0], &history[n_input_samples],
sizeof (history[0]) * (ORDER - 1));
343 template<gu
int ORDER,
bool USE_SSE>
346 AlignedArray<float,16> history_even;
347 AlignedArray<float,16> history_odd;
348 AlignedArray<float,16> sse_taps;
// Downsampler fast path: computes four output samples. Each output is the FIR result
// over the even input samples plus 0.5 times one odd input sample.
// ODD_STEPPING is the distance between consecutive odd samples inside input_odd.
// NOTE(review): the `output` parameter is missing from this listing; input_even is
// presumably required to be 16-byte aligned for the SSE routine -- confirm.
350 template<
int ODD_STEPPING>
void
351 process_4samples_aligned (
const float *input_even ,
352 const float *input_odd,
// Index (in odd samples) of the odd sample that is added to output[0].
355 const guint H = (ORDER / 2) - 1;
357 fir_process_4samples_sse (input_even, &sse_taps[0], ORDER, &output[0], &output[1], &output[2], &output[3]);
359 output[0] += 0.5 * input_odd[H * ODD_STEPPING];
360 output[1] += 0.5 * input_odd[(H + 1) * ODD_STEPPING];
361 output[2] += 0.5 * input_odd[(H + 2) * ODD_STEPPING];
362 output[3] += 0.5 * input_odd[(H + 3) * ODD_STEPPING];
// Downsampler scalar path: returns one output sample, i.e. the FIR result over
// input_even[0 .. ORDER - 1] plus 0.5 times the odd sample at index H * ODD_STEPPING.
365 template<
int ODD_STEPPING>
float
366 process_sample_unaligned (
const float *input_even,
367 const float *input_odd)
369 const guint H = (ORDER / 2) - 1;
371 return fir_process_one_sample<float> (&input_even[0], &taps[0], ORDER) + 0.5 * input_odd[H * ODD_STEPPING];
// Computes n_output_samples downsampled outputs. Groups of four go through
// process_4samples_aligned(); the remaining outputs are computed one at a time.
// NOTE(review): the `output` parameter, the declaration and increments of `i`, and any
// USE_SSE condition around the first loop are missing from this listing.
373 template<
int ODD_STEPPING>
void
374 process_block_aligned (
const float *input_even,
375 const float *input_odd,
377 guint n_output_samples)
// Vector path while at least four outputs remain.
382 while (i + 3 < n_output_samples)
384 process_4samples_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i]);
// Scalar path for the remaining outputs.
388 while (i < n_output_samples)
390 output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
// Computes n_output_samples downsampled outputs from an input_even pointer that need
// not be 16-byte aligned: outputs are produced one by one until &input_even[i] is a
// multiple of 16 bytes, then the remaining outputs are passed to process_block_aligned().
// NOTE(review): the `output` parameter and the declaration/increment of `i` are missing from this listing.
394 template<
int ODD_STEPPING>
void
395 process_block_unaligned (
const float *input_even,
396 const float *input_odd,
398 guint n_output_samples)
// Low four address bits non-zero => not yet on a 16-byte boundary.
403 while ((reinterpret_cast<ptrdiff_t> (&input_even[i]) & 15) && i < n_output_samples)
405 output[i] = process_sample_unaligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING]);
// Only n_output_samples - i outputs are left: all three pointers are already advanced
// by i, so passing the full count would read and write i samples past the end
// (same convention as the upsampler's process_block_unaligned).
409 process_block_aligned<ODD_STEPPING> (&input_even[i], &input_odd[i * ODD_STEPPING], &output[i], n_output_samples - i);
// Copies every second value of `data` (indices 0, 2, 4, ... below n_data_values)
// to consecutive positions of `output`.
// NOTE(review): the n_data_values and output parameters are missing from this listing.
412 deinterleave2 (
const float *data,
416 for (uint i = 0; i < n_data_values; i += 2)
417 output[i / 2] = data[i];
426 taps (init_taps, init_taps + ORDER),
427 history_even (2 * ORDER),
428 history_odd (2 * ORDER),
429 sse_taps (fir_compute_sse_taps (taps))
431 g_assert ((ORDER & 1) == 0);
// Downsampler2::process_block body (the first line of the signature is missing from this listing).
// Consumes the input in chunks of at most BLOCKSIZE * 2 samples and writes half as many
// output samples per chunk.
439 guint n_input_samples,
// The input must contain an even number of samples.
442 g_assert ((n_input_samples & 1) == 0);
444 const uint BLOCKSIZE = 1024;
// NOTE(review): the declaration of `block` is missing from this listing; presumably an
// array of BLOCKSIZE-sized F4Vector storage used as aligned scratch space -- confirm.
447 float *input_even = &block[0].f[0];
449 while (n_input_samples)
451 uint n_input_todo = min (n_input_samples, BLOCKSIZE * 2);
// Even samples are copied into the scratch buffer; odd samples are read in place
// from input + 1 with a stride of 2.
466 deinterleave2 (input, n_input_todo, input_even);
468 const float *input_odd = input + 1;
470 const uint n_output_todo = n_input_todo / 2;
// At most ORDER - 1 outputs depend on samples from the previous chunk or call.
471 const uint history_todo = min (n_output_todo, ORDER - 1);
// Append the start of the new data after the ORDER - 1 retained samples.
473 copy (input_even, input_even + history_todo, &history_even[ORDER - 1]);
474 deinterleave2 (input_odd, history_todo * 2, &history_odd[ORDER - 1]);
// The history buffers hold contiguous samples (odd stepping 1); the live input keeps
// its odd samples interleaved (odd stepping 2).
476 process_block_aligned <1> (&history_even[0], &history_odd[0], output, history_todo);
477 if (n_output_todo > history_todo)
479 process_block_unaligned<2> (input_even, input_odd, &output[history_todo], n_output_todo - history_todo);
// Retain the last history_todo even and odd samples for the next chunk.
482 copy (input_even + n_output_todo - history_todo, input_even + n_output_todo, &history_even[0]);
483 deinterleave2 (input_odd + n_input_todo - history_todo * 2, history_todo * 2, &history_odd[0]);
// NOTE(review): presumably the else branch (short chunk): shifts both history buffers
// by n_output_todo -- the branch structure is not visible in this listing.
489 memmove (&history_even[0], &history_even[n_output_todo],
sizeof (history_even[0]) * (ORDER - 1));
490 memmove (&history_odd[0], &history_odd[n_output_todo],
sizeof (history_odd[0]) * (ORDER - 1));
// Advance to the next chunk.
493 n_input_samples -= n_input_todo;
494 input += n_input_todo;
495 output += n_output_todo;
509 return order() / 2 - 0.5;
// Factory: selects an Upsampler2 or Downsampler2 instantiation from `mode` and picks
// the filter order and coefficient table from `precision`.
// NOTE(review): the switch statements and the fall-through return are missing from this
// listing. The last argument (2.0 when upsampling, 1.0 when downsampling) is presumably
// a scaling factor applied to the coefficients -- confirm in create_impl_with_coeffs().
513 template<
bool USE_SSE> Resampler2*
514 Resampler2::create_impl (BseResampler2Mode mode,
515 BseResampler2Precision precision)
// Upsampling: filter orders 2, 16, 24, 32, 42, 52 for increasing precision.
517 if (mode == BSE_RESAMPLER2_MODE_UPSAMPLE)
521 case BSE_RESAMPLER2_PREC_LINEAR:
return create_impl_with_coeffs <Upsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 2.0);
522 case BSE_RESAMPLER2_PREC_48DB:
return create_impl_with_coeffs <Upsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 2.0);
523 case BSE_RESAMPLER2_PREC_72DB:
return create_impl_with_coeffs <Upsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 2.0);
524 case BSE_RESAMPLER2_PREC_96DB:
return create_impl_with_coeffs <Upsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 2.0);
525 case BSE_RESAMPLER2_PREC_120DB:
return create_impl_with_coeffs <Upsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 2.0);
526 case BSE_RESAMPLER2_PREC_144DB:
return create_impl_with_coeffs <Upsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 2.0);
// Downsampling: same coefficient tables and orders as the upsampling cases.
529 else if (mode == BSE_RESAMPLER2_MODE_DOWNSAMPLE)
533 case BSE_RESAMPLER2_PREC_LINEAR:
return create_impl_with_coeffs <Downsampler2<2, USE_SSE> > (halfband_fir_linear_coeffs, 2, 1.0);
534 case BSE_RESAMPLER2_PREC_48DB:
return create_impl_with_coeffs <Downsampler2<16, USE_SSE> > (halfband_fir_48db_coeffs, 16, 1.0);
535 case BSE_RESAMPLER2_PREC_72DB:
return create_impl_with_coeffs <Downsampler2<24, USE_SSE> > (halfband_fir_72db_coeffs, 24, 1.0);
536 case BSE_RESAMPLER2_PREC_96DB:
return create_impl_with_coeffs <Downsampler2<32, USE_SSE> > (halfband_fir_96db_coeffs, 32, 1.0);
537 case BSE_RESAMPLER2_PREC_120DB:
return create_impl_with_coeffs <Downsampler2<42, USE_SSE> > (halfband_fir_120db_coeffs, 42, 1.0);
538 case BSE_RESAMPLER2_PREC_144DB:
return create_impl_with_coeffs <Downsampler2<52, USE_SSE> > (halfband_fir_144db_coeffs, 52, 1.0);
The Bse namespace contains all functions of the synthesis engine.
Definition: bstbseutils.cc:67
Definition: bseresamplerimpl.hh:24
guint order() const
Returns the filter order.
Definition: bseresamplerimpl.hh:502
Downsampler2(float *init_taps)
Constructs a Downsampler2 class using a given set of filter coefficients.
Definition: bseresamplerimpl.hh:425
void process_block(const float *input, guint n_input_samples, float *output)
The function process_block() takes a block of input samples and produces a block with half the length.
Definition: bseresamplerimpl.hh:438
void process_block(const float *input, guint n_input_samples, float *output)
The function process_block() takes a block of input samples and produces a block with twice the length.
Definition: bseresamplerimpl.hh:299
guint order() const
Returns the FIR filter order.
Definition: bseresamplerimpl.hh:325
Factor 2 upsampling of a data stream.
Definition: bseresamplerimpl.hh:217
Factor 2 downsampling of a data stream.
Definition: bseresamplerimpl.hh:344
Upsampler2(float *init_taps)
Constructs an Upsampler2 object with a given set of filter coefficients.
Definition: bseresamplerimpl.hh:287