/*
    Calf Box, an open source musical instrument.
    Copyright (C) 2010-2013 Krzysztof Foltman

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include "config-api.h"
#include "dspmath.h"
#include "errors.h"
#include "midi.h"
#include "module.h"
#include "rt.h"
#include "sampler.h"
#include "sfzloader.h"
/*
 * NOTE(review): the operands of the system #include directives were missing
 * in the copy of this file that was reviewed. The list below is reconstructed
 * from what this file visibly uses (assert, fabs, memcpy/memset, fixed-width
 * integer types, NULL) - confirm against the project's canonical version.
 * TRUE/FALSE are assumed to be provided through the project headers above.
 */
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Set to 1 to make the scalar (non-SIMD) resamplers use 2-point linear
 * interpolation instead of the 4-point cubic. */
#define LOW_QUALITY_INTERPOLATION 0

/*
 * Per-call output state shared by all resampling loops in this file.
 *
 * leftright    - output buffer, interleaved: frame i is written to
 *                leftright[2 * i] (left) and leftright[2 * i + 1] (right)
 * offset       - index of the next output frame to be written
 * lgain, rgain - gains applied to the next output frame
 * *_delta      - amount added to the corresponding gain after every frame
 */
struct resampler_state
{
    float *leftright;
    int offset;
    float lgain, rgain, lgain_delta, rgain_delta;
};

/*
 * The process_voice_{mono,stereo}_noloop functions below exist in three
 * interchangeable variants (NEON, SSE, plain C). All of them:
 *
 *  - treat v->bigpos as a fixed-point read position whose upper 32 bits are
 *    the source frame index and whose lower 32 bits are the fraction t,
 *  - render output frames rs->offset .. endpos - 1, advancing the position
 *    by v->bigdelta per output frame,
 *  - read 4 consecutive source frames p[0..3] starting at the integer
 *    position and combine them with the weights
 *        w0 = -t (t-1)(t-2) / 6        w1 =  3 (t+1)(t-1)(t-2) / 6
 *        w2 = -3 (t+1) t (t-2) / 6     w3 =    (t+1) t (t-1) / 6
 *  - multiply the result by the current left/right gains and step the gains
 *    by their deltas,
 *  - store the final position, gains and rs->offset = endpos back.
 *
 * No bounds checking is done here; the caller must make sure that all
 * positions visited have 4 readable source frames.
 */
#if USE_NEON

#include <arm_neon.h>

/* NEON variant, mono source: srcdata holds one int16_t per frame. */
static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    // Per-tap additive offsets and scale factors; the 'a' vectors cover
    // taps 0 and 1, the 'b' vectors cover taps 2 and 3.
    static const float32x2_t shift1a = {0.f, 1.f}, shift1b = {1.f, 1.f};
    static const float32x2_t shift2a = {-1.f, -1.f}, shift2b = {0.f, 0.f};
    static const float32x2_t shift3a = {-2.f, -2.f}, shift3b = {-2.f, -1.f};
    static const float32x2_t scalinga = {-1 / 6.0, 3 / 6.0}, scalingb = {-3 / 6.0, 1 / 6.0};
    uint64x1_t pos = v->bigpos, delta = v->bigdelta;
    float32x2_t gains = {rs->lgain, rs->rgain};
    const float32x2_t gaindeltas = {rs->lgain_delta, rs->rgain_delta};
    for (uint32_t i = rs->offset; i < endpos; i++)
    {
        // Convert both 32-bit halves of pos as 0.32 fixed point; lane 0 is
        // used as the interpolation fraction t.
        float32x2_t posposf = vcvt_n_f32_u32(vreinterpret_u32_u64(pos), 32);
        // Load 4 consecutive samples and widen them to 32 bits
        int32x4_t smp = vmovl_s16(vld1_s16(&srcdata[pos >> 32]));
        pos = vadd_u64(pos, delta);
        float32x2_t t2 = vdup_n_f32(posposf[0]);
        float32x2_t samplesa = vcvt_f32_s32(vget_low_s32(smp)), samplesb = vcvt_f32_s32(vget_high_s32(smp));
        float32x2_t mula = vmul_f32(vmul_f32(vadd_f32(t2, shift1a), vadd_f32(t2, shift2a)), vmul_f32(vadd_f32(t2, shift3a), scalinga));
        float32x2_t mulb = vmul_f32(vmul_f32(vadd_f32(t2, shift1b), vadd_f32(t2, shift2b)), vmul_f32(vadd_f32(t2, shift3b), scalingb));
        // Note: this local 'v' shadows the function parameter inside the loop body
        float32x2_t v = vmla_f32(vmul_f32(samplesa, mula), samplesb, mulb);
        // Adding the lane-swapped vector puts the full 4-tap sum in both
        // lanes, which are then scaled by the left and right gain.
        float32x2_t result = vmul_f32(gains, vadd_f32(v, vrev64_f32(v)));
        gains = vadd_f32(gains, gaindeltas);
        rs->leftright[2 * i] = result[0];
        rs->leftright[2 * i + 1] = result[1];
    }
    rs->lgain = gains[0];
    rs->rgain = gains[1];
    v->bigpos = pos;
    rs->offset = endpos;
}

/* NEON variant, stereo source: srcdata holds interleaved left/right int16_t pairs. */
static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    // Per-tap additive offsets and scale factors; the 'a' vectors cover
    // taps 0 and 1, the 'b' vectors cover taps 2 and 3.
    static const float32x2_t shift1a = {0.f, 1.f}, shift1b = {1.f, 1.f};
    static const float32x2_t shift2a = {-1.f, -1.f}, shift2b = {0.f, 0.f};
    static const float32x2_t shift3a = {-2.f, -2.f}, shift3b = {-2.f, -1.f};
    static const float32x2_t scalinga = {-1 / 6.0, 3 / 6.0}, scalingb = {-3 / 6.0, 1 / 6.0};
    uint64x1_t pos = v->bigpos, delta = v->bigdelta;
    float32x2_t gains = {rs->lgain, rs->rgain};
    const float32x2_t gaindeltas = {rs->lgain_delta, rs->rgain_delta};
    for (uint32_t i = rs->offset; i < endpos; i++)
    {
        // Convert both 32-bit halves of pos as 0.32 fixed point; lane 0 is
        // used as the interpolation fraction t.
        float32x2_t posposf = vcvt_n_f32_u32(vreinterpret_u32_u64(pos), 32);
        // (pos >> 31) & ~1 is twice the integer frame index, i.e. the index
        // of the left sample of that frame; the de-interleaving load splits
        // 4 frames into left samples (val[0]) and right samples (val[1]).
        int16x4x2_t pp = vld2_s16(&srcdata[(pos >> 31) &~ 1]);
        pos = vadd_u64(pos, delta);
        int32x4_t smp_left = vmovl_s16(pp.val[0]), smp_right = vmovl_s16(pp.val[1]);
        float32x2_t t2 = vdup_n_f32(posposf[0]);
        float32x2_t samplesLa = vcvt_f32_s32(vget_low_s32(smp_left)), samplesLb = vcvt_f32_s32(vget_high_s32(smp_left));
        float32x2_t samplesRa = vcvt_f32_s32(vget_low_s32(smp_right)), samplesRb = vcvt_f32_s32(vget_high_s32(smp_right));
        float32x2_t mula = vmul_f32(vmul_f32(vadd_f32(t2, shift1a), vadd_f32(t2, shift2a)), vmul_f32(vadd_f32(t2, shift3a), scalinga));
        float32x2_t mulb = vmul_f32(vmul_f32(vadd_f32(t2, shift1b), vadd_f32(t2, shift2b)), vmul_f32(vadd_f32(t2, shift3b), scalingb));
        float32x2_t vL = vmla_f32(vmul_f32(samplesLa, mula), samplesLb, mulb);
        float32x2_t vR = vmla_f32(vmul_f32(samplesRa, mula), samplesRb, mulb);
        // Transpose so that adding the two vectors yields
        // {left partial sums added, right partial sums added}
        float32x2x2_t transposed = vtrn_f32(vL, vR);
        float32x2_t result = vmul_f32(gains, vadd_f32(transposed.val[0], transposed.val[1]));
        gains = vadd_f32(gains, gaindeltas);
        rs->leftright[2 * i] = result[0];
        rs->leftright[2 * i + 1] = result[1];
    }
    rs->lgain = gains[0];
    rs->rgain = gains[1];
    v->bigpos = pos;
    rs->offset = endpos;
}

#elif USE_SSE

#include <xmmintrin.h>

typedef __m128 V4SF;

// Per-tap additive offsets and scale factors of the 4-point interpolation
// polynomial, one lane per tap. The common 1/6 factor is applied separately.
static const V4SF shift1 = {0, 1, 1, 1};
static const V4SF shift2 = {-1, -1, 0, 0};
static const V4SF shift3 = {-2, -2, -2, -1};
static const V4SF scaling = {-1, 3, -3, 1};
static const V4SF zero = {0, 0, 0, 0};

/* SSE variant, mono source: srcdata holds one int16_t per frame. */
static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    uint64_t pos = v->bigpos;
    const float ffrac = 1.0f / 6.0f;
    // 2^-31: the fraction is shifted right by one bit before the signed
    // integer-to-float conversion below, leaving 31 fractional bits.
    const float _scaler = 1.f / (128.f * 16777216.f);

    for (int i = rs->offset; i < endpos; i++)
    {
        const int16_t *p = &srcdata[pos >> 32];
        // Interpolation fraction t in lane 0
        V4SF t2 = __builtin_ia32_cvtsi2ss(zero, (pos & 0xFFFFFFFF) >> 1) * _scaler;
        pos += v->bigdelta;
        // Broadcast t to all four lanes
        V4SF t4 = __builtin_ia32_shufps(t2, t2, 0);
        V4SF v4mul = (t4 + shift1) * (t4 + shift2) * (t4 + shift3) * scaling;
        V4SF samples = {p[0], p[1], p[2], p[3]};
        v4mul = __builtin_ia32_mulps(samples, v4mul);
        float c = (v4mul[0] + v4mul[1] + v4mul[2] + v4mul[3]) * ffrac;
        rs->leftright[2 * i] = rs->lgain * c;
        rs->leftright[2 * i + 1] = rs->rgain * c;
        rs->lgain += rs->lgain_delta;
        rs->rgain += rs->rgain_delta;
    }
    v->bigpos = pos;
    rs->offset = endpos;
}

/* SSE variant, stereo source: srcdata holds interleaved left/right int16_t pairs. */
static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    uint64_t pos = v->bigpos;
    const float ffrac = 1.0f / 6.0f;
    // 2^-31: the fraction is shifted right by one bit before the signed
    // integer-to-float conversion below, leaving 31 fractional bits.
    const float _scaler = 1.f / (128.f * 16777216.f);

    for (int i = rs->offset; i < endpos; i++)
    {
        // (pos >> 31) & ~1 is twice the integer frame index, i.e. the index
        // of the left sample of that frame.
        const int16_t *p = &srcdata[(pos >> 31) & ~1];
        // Interpolation fraction t in lane 0
        V4SF t2 = __builtin_ia32_cvtsi2ss(zero, (pos & 0xFFFFFFFF) >> 1) * _scaler;
        pos += v->bigdelta;
        // Broadcast t to all four lanes
        V4SF t4 = __builtin_ia32_shufps(t2, t2, 0);
        V4SF v4mul = (t4 + shift1) * (t4 + shift2) * (t4 + shift3) * scaling;
        V4SF samples_left = {p[0], p[2], p[4], p[6]};
        samples_left = __builtin_ia32_mulps(samples_left, v4mul);
        V4SF samples_right = {p[1], p[3], p[5], p[7]};
        samples_right = __builtin_ia32_mulps(samples_right, v4mul);
        float cl = (samples_left[0] + samples_left[1] + samples_left[2] + samples_left[3]) * ffrac;
        float cr = (samples_right[0] + samples_right[1] + samples_right[2] + samples_right[3]) * ffrac;
        rs->leftright[2 * i] = rs->lgain * cl;
        rs->leftright[2 * i + 1] = rs->rgain * cr;
        rs->lgain += rs->lgain_delta;
        rs->rgain += rs->rgain_delta;
    }
    v->bigpos = pos;
    rs->offset = endpos;
}

#else

/* Plain C variant, mono source: srcdata holds one int16_t per frame. */
static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    const float ffrac = 1.0f / 6.0f;
    const float scaler = 1.f / 16777216.f;

    for (int i = rs->offset; i < endpos; i++)
    {
        // Top 24 bits of the 32-bit fraction, scaled by 2^-24
        float t = ((v->bigpos >> 8) & 0x00FFFFFF) * scaler;
        const int16_t *p = &srcdata[v->bigpos >> 32];
#if LOW_QUALITY_INTERPOLATION
        // Linear interpolation between the two middle taps
        float c = (1.f - t) * p[1] + t * p[2];
#else
        float b0 = -t*(t-1.f)*(t-2.f);
        float b1 = 3.f*(t+1.f)*(t-1.f)*(t-2.f);
        float c = (b0 * p[0] + b1 * p[1] - 3.f*(t+1.f)*t*(t-2.f) * p[2] + (t+1.f)*t*(t-1.f) * p[3]) * ffrac;
#endif
        rs->leftright[2 * i] = rs->lgain * c;
        rs->leftright[2 * i + 1] = rs->rgain * c;
        rs->lgain += rs->lgain_delta;
        rs->rgain += rs->rgain_delta;
        v->bigpos += v->bigdelta;
    }
    rs->offset = endpos;
}

/* Plain C variant, stereo source: srcdata holds interleaved left/right int16_t pairs. */
static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    const float ffrac = 1.0f / 6.0f;
    const float scaler = 1.f / 16777216.f;

    for (int i = rs->offset; i < endpos; i++)
    {
        // Top 24 bits of the 32-bit fraction, scaled by 2^-24
        float t = ((v->bigpos >> 8) & 0x00FFFFFF) * scaler;
        // (bigpos >> 31) & ~1 is twice the integer frame index, i.e. the
        // index of the left sample of that frame.
        const int16_t *p = &srcdata[(v->bigpos >> 31) & ~1];
#if LOW_QUALITY_INTERPOLATION
        // Linear interpolation between the two middle taps of each channel
        float c0 = (1.f - t) * p[2] + t * p[4];
        float c1 = (1.f - t) * p[3] + t * p[5];
#else
        float b0 = -t*(t-1.f)*(t-2.f);
        float b1 = 3.f*(t+1.f)*(t-1.f)*(t-2.f);
        float c0 = (b0 * p[0] + b1 * p[2] - 3.f*(t+1.f)*t*(t-2.f) * p[4] + (t+1.f)*t*(t-1.f) * p[6]) * ffrac;
        float c1 = (b0 * p[1] + b1 * p[3] - 3.f*(t+1.f)*t*(t-2.f) * p[5] + (t+1.f)*t*(t-1.f) * p[7]) * ffrac;
#endif
        rs->leftright[2 * i] = rs->lgain * c0;
        rs->leftright[2 * i + 1] = rs->rgain * c1;
        rs->lgain += rs->lgain_delta;
        rs->rgain += rs->rgain_delta;
        v->bigpos += v->bigdelta;
    }
    rs->offset = endpos;
}

#endif

/*
 * Render as many output frames as fit both in the remainder of the output
 * block (CBOX_BLOCK_SIZE - rs->offset) and in the source range that ends at
 * usable_sample_end, without any loop handling.
 *
 * srcdata           - start of the source data to read from
 * pos_offset        - frame position that srcdata[0] corresponds to; the
 *                     pointer is moved back by this many frames before
 *                     indexing with the absolute position
 * usable_sample_end - first frame position that must NOT be rendered
 *
 * The caller must guarantee (v->bigpos >> 32) < usable_sample_end on entry;
 * both callers in this file check this before calling.
 *
 * Returns the number of whole source frames the position advanced by.
 */
static inline uint32_t process_voice_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, uint32_t pos_offset, uint32_t usable_sample_end)
{
    uint32_t out_frames = CBOX_BLOCK_SIZE - rs->offset;

    uint64_t sample_end64 = ((uint64_t)usable_sample_end) << 32;
    // Check how many frames can be written to output buffer without going
    // past usable_sample_end.
    if (__builtin_expect(v->bigpos + (out_frames - 1) * v->bigdelta >= sample_end64, 0))
    {
        // Number of positions bigpos + k * bigdelta (k >= 0) that are
        // strictly below sample_end64, i.e. ceil((end - pos) / delta).
        // The "- 1" matters when the distance is an exact multiple of
        // bigdelta: without it one extra frame would be rendered exactly at
        // usable_sample_end, reading past the end of the usable data.
        out_frames = (sample_end64 - v->bigpos - 1) / v->bigdelta + 1;
    }

    assert(out_frames > 0 && out_frames <= (uint32_t)(CBOX_BLOCK_SIZE - rs->offset));
    uint32_t oldpos = v->bigpos >> 32;
    if (v->mode == spt_stereo16)
        process_voice_stereo_noloop(v, rs, srcdata - (pos_offset << 1), rs->offset + out_frames);
    else
        process_voice_mono_noloop(v, rs, srcdata - pos_offset, rs->offset + out_frames);
    return (v->bigpos >> 32) - oldpos;
}

/*
 * Fill the rest of the output block from v->sample_data, handling the loop
 * (or the end of the sample when v->loop_start is (uint32_t)-1).
 *
 * Positions within MAX_INTERPOLATION_ORDER frames of v->loop_end are read
 * from v->scratch instead of the sample itself.
 * NOTE(review): v->scratch is presumably pre-filled elsewhere with the
 * frames before loop end followed by the frames at loop start - confirm.
 *
 * Sets v->mode to spt_inactive and returns early when the sample ends or
 * when v->loop_count is non-zero and has been reached.
 */
static void process_voice_withloop(struct sampler_gen *v, struct resampler_state *rs)
{
    // This is the first frame where interpolation will cross the loop boundary
    uint32_t loop_end = v->loop_end;
    uint32_t loop_edge = loop_end - MAX_INTERPOLATION_ORDER;

    while ( rs->offset < CBOX_BLOCK_SIZE ) {
        uint64_t startframe = v->bigpos >> 32;

        int16_t *source_data = v->sample_data;
        uint32_t source_offset = 0;
        uint32_t usable_sample_end = loop_edge;
        // if the first frame to play is already within 3 frames of loop end
        // (we need consecutive 4 frames for cubic interpolation) then
        // "straighten out" the area around the loop, and play that
        if (__builtin_expect(startframe >= loop_edge, 0))
        {
            // if fully past the loop end, then it's normal wraparound
            // (or end of the sample if not looping)
            if (startframe >= loop_end)
            {
                if (v->loop_start == (uint32_t)-1)
                {
                    v->mode = spt_inactive;
                    return;
                }
                v->play_count++;
                if (v->loop_count && v->play_count >= v->loop_count)
                {
                    v->mode = spt_inactive;
                    return;
                }
                // Wrap back by one loop length and re-evaluate
                v->bigpos -= (uint64_t)(loop_end - v->loop_start) << 32;
                continue;
            }

            usable_sample_end = loop_end;
            source_data = v->scratch;
            source_offset = loop_edge;
        }

        process_voice_noloop(v, rs, source_data, source_offset, usable_sample_end);
    }
}

/*
 * Streaming counterpart of process_voice_withloop, used when
 * v->streaming_buffer is set. Reads from v->sample_data until the first
 * wraparound and from v->streaming_buffer afterwards (unless
 * v->prefetch_only_loop is set), consuming at most 'limit' source frames;
 * a limit of (uint32_t)-1 disables the per-call clamp of the source range.
 *
 * v->consumed is increased by the number of source frames used. Frames
 * consumed in excess of 'limit' are remembered in v->consumed_credit and
 * deducted from the limit of subsequent calls.
 */
static void process_voice_streaming(struct sampler_gen *v, struct resampler_state *rs, uint32_t limit)
{
    // Pay off over-consumption left over from an earlier call first
    if (v->consumed_credit > 0)
    {
        if (v->consumed_credit >= limit)
        {
            v->consumed_credit -= limit;
            return;
        }
        limit -= v->consumed_credit;
        v->consumed_credit = 0;
    }
    // Room for MAX_INTERPOLATION_ORDER frames before the loop end plus
    // MAX_INTERPOLATION_ORDER frames after it, with up to 2 channels
    int16_t scratch[2 * MAX_INTERPOLATION_ORDER * 2];
    while ( limit && rs->offset < CBOX_BLOCK_SIZE ) {
        uint64_t startframe = v->bigpos >> 32;

        int16_t *source_data = v->in_streaming_buffer ? v->streaming_buffer : v->sample_data;
        uint32_t loop_start = v->in_streaming_buffer ? 0 : v->loop_start;
        uint32_t loop_end = v->in_streaming_buffer ? v->streaming_buffer_frames : v->loop_end;
        // This is the first frame where interpolation will cross the loop boundary
        uint32_t loop_edge = loop_end - MAX_INTERPOLATION_ORDER;
        uint32_t source_offset = 0;
        uint32_t usable_sample_end = loop_edge;
        // if the first frame to play is already within 3 frames of loop end
        // (we need consecutive 4 frames for cubic interpolation) then
        // "straighten out" the area around the loop, and play that
        if (startframe >= loop_edge)
        {
            // if fully past the loop end, then it's normal wraparound
            // (or end of the sample if not looping)
            if (startframe >= loop_end)
            {
                if (v->loop_start == (uint32_t)-1)
                {
                    v->mode = spt_inactive;
                    return;
                }
                v->bigpos -= (uint64_t)(loop_end - loop_start) << 32;
                if (v->prefetch_only_loop)
                    v->consumed -= (loop_end - loop_start);
                else
                    v->in_streaming_buffer = TRUE;
                continue;
            }

            // Samples per frame expressed as a shift: 2 for stereo, 1 otherwise
            int shift = (v->mode == spt_stereo16) ? 1 : 0;

            // 'linearize' the virtual circular buffer - write 3 (or N) frames before end of the loop
            // and 3 (N) frames at the start of the loop, and play it; in rare cases this will need to be
            // repeated twice if output write pointer is close to CBOX_BLOCK_SIZE or playback rate is very low,
            // but that's OK.
            uint32_t halfscratch = MAX_INTERPOLATION_ORDER << shift;
            memcpy(&scratch[0], &source_data[loop_edge << shift], halfscratch * sizeof(int16_t) );
            if (v->loop_start == (uint32_t)-1)
                memset(scratch + halfscratch, 0, halfscratch * sizeof(int16_t));
            else
                memcpy(scratch + halfscratch, &v->streaming_buffer[v->loop_start << shift], halfscratch * sizeof(int16_t));

            usable_sample_end = loop_end;
            source_data = scratch;
            source_offset = loop_edge;
        }
        // Do not let a single pass run further than 'limit' source frames
        if (limit != (uint32_t)-1 && usable_sample_end - startframe > limit)
            usable_sample_end = startframe + limit;

        uint32_t consumed = process_voice_noloop(v, rs, source_data, source_offset, usable_sample_end);
        if (consumed > limit)
        {
            // The number of frames 'consumed' may be greater than the amount
            // available because of sample-skipping (at least that's the only
            // *legitimate* reason). This should be accounted for in the
            // consumed sample counter (hence temporary storage of the
            // 'buffer overconsumption' in the consumed_credit field), but is not
            // actually causing any use of missing data, as the missing samples
            // have been skipped.
            assert(v->consumed_credit == 0);
            v->consumed_credit = consumed - limit;
            assert (v->consumed_credit <= 1 + (v->bigdelta >> 32));
            consumed = limit;
        }
        v->consumed += consumed;
        if (consumed < limit)
            limit -= consumed;
        else
            break;
    }
}

/*
 * Put the generator into its idle state: inactive mode, position 0, zero
 * previous gains, cleared play/consumption counters, no streaming buffer
 * and no crossfade in progress (fadein_counter == -1).
 */
void sampler_gen_reset(struct sampler_gen *v)
{
    v->mode = spt_inactive;
    v->bigpos = 0;
    v->last_lgain = 0.f;
    v->last_rgain = 0.f;
    v->play_count = 0;
    v->consumed = 0;
    v->consumed_credit = 0;
    v->streaming_buffer = NULL;
    v->in_streaming_buffer = FALSE;
    v->prefetch_only_loop = FALSE;
    v->fadein_counter = -1.f;
}

/*
 * Render up to CBOX_BLOCK_SIZE interleaved stereo frames of the voice into
 * 'leftright', ramping the gains linearly from v->last_lgain/last_rgain to
 * v->lgain/rgain over one full block.
 *
 * v         - generator state; position, gain history and crossfade state
 *             are updated
 * leftright - output buffer; written as interleaved left/right floats
 * limit     - maximum number of source frames to consume; only used when
 *             v->streaming_buffer is set
 *
 * For non-streaming voices a second, "virtual" position (virtpos, advanced
 * by virtdelta per output frame) is tracked as well. When the real position
 * drifts from it by more than v->stretching_jump frames, a second read
 * position (fadein_pos) is started and crossfaded in over the following
 * calls; once the crossfade completes, playback continues from fadein_pos.
 *
 * Returns the number of output frames written.
 */
uint32_t sampler_gen_sample_playback(struct sampler_gen *v, float *leftright, uint32_t limit)
{
    struct resampler_state rs;
    rs.leftright = leftright;
    rs.offset = 0;
    rs.lgain = v->last_lgain;
    rs.rgain = v->last_rgain;
    rs.lgain_delta = (v->lgain - v->last_lgain) * (1.f / CBOX_BLOCK_SIZE);
    rs.rgain_delta = (v->rgain - v->last_rgain) * (1.f / CBOX_BLOCK_SIZE);
    if (v->streaming_buffer)
        process_voice_streaming(v, &rs, limit);
    else
    {
        process_voice_withloop(v, &rs);
    }
    uint32_t written = rs.offset;
    if (!v->streaming_buffer)
    {
        v->virtpos += written * v->virtdelta;
        if (v->virtpos != v->bigpos)
        {
            // Apply loop wraparound to the virtual position as well
            while ((v->virtpos >> 32) >= v->loop_end && v->loop_start != SAMPLER_NO_LOOP)
                v->virtpos -= ((uint64_t)(v->loop_end - v->loop_start)) << 32;
        }
        // XXXKF looping
        // NOTE(review): if bigpos and virtpos are unsigned 64-bit (their
        // declarations are not visible here), the difference below wraps to
        // a huge value whenever virtpos is ahead of bigpos - confirm that
        // this is the intended way to trigger the jump in that direction.
        if (v->fadein_counter == -1 && fabs((v->bigpos - v->virtpos) / (65536.0 * 65536.0)) > v->stretching_jump)
        {
            // Start a crossfade towards a position one "jump" closer to virtpos
            int64_t jump = (int64_t)(v->stretching_jump * 65536.0 * 65536.0);
            int64_t newpos = v->bigpos > v->virtpos ? v->bigpos - jump : v->bigpos + jump;
            if (newpos < 0)
                newpos = 0;
            // XXXKF beware of extremely short loops
            while ((newpos >> 32) >= v->loop_end && v->loop_start != SAMPLER_NO_LOOP)
                newpos -= ((uint64_t)(v->loop_end - v->loop_start)) << 32;
            // Keep the new position at least 4 frames before the end of the sample
            if ((newpos >> 32) >= v->cur_sample_end - 4)
                newpos = ((uint64_t)v->cur_sample_end - 4)<< 32;
            v->fadein_pos = newpos;
            v->fadein_counter = 0;
        }
        else if (v->fadein_counter != -1)
        {
            // Crossfade in progress: render the same block a second time
            // from fadein_pos into a temporary buffer. The gain deltas in rs
            // are reused from the first pass.
            float leftright_fadein[2 * CBOX_BLOCK_SIZE];

            rs.offset = 0;
            rs.leftright = leftright_fadein;
            rs.lgain = v->last_lgain;
            rs.rgain = v->last_rgain;
            // Temporarily swap the read position for the second pass
            uint64_t oldpos = v->bigpos;
            v->bigpos = v->fadein_pos;
            process_voice_withloop(v, &rs);
            v->fadein_pos = v->bigpos;
            v->bigpos = oldpos;
            uint32_t written2 = rs.offset;
            // XXXKF not the best set of special cases
            // Pad the shorter of the two renderings with silence so that
            // both cover the same number of frames
            uint32_t i;
            if (written2 > written)
            {
                for (i = 2 * written; i < 2 * written2; i += 2)
                    leftright[i] = leftright[i + 1] = 0.f;
                written = written2;
            }
            if (written2 < written)
            {
                for (i = 2 * written2; i < 2 * written; i += 2)
                    leftright_fadein[i] = leftright_fadein[i + 1] = 0.f;
                written2 = written;
            }
            float cnt = v->fadein_counter;
            // Crossfade progress added per output frame
            float scl = v->bigdelta / (v->stretching_crossfade * v->virtdelta);
            for (i = 0; i < 2 * written2; i += 2)
            {
                // Linear blend: cnt == 0 keeps the old signal, cnt == 1 is
                // entirely the new one
                leftright[i] += (leftright_fadein[i] - leftright[i]) * cnt;
                leftright[i + 1] += (leftright_fadein[i + 1] - leftright[i + 1]) * cnt;
                cnt += scl;
                if (cnt > 1.f)
                    cnt = 1.f;
            }
            if (cnt >= 1.f)
            {
                // Crossfade finished: continue from the faded-in position
                cnt = -1.f;
                v->bigpos = v->fadein_pos;
            }
            v->fadein_counter = cnt;
        }
    }
    v->last_lgain = v->lgain;
    v->last_rgain = v->rgain;
    return written;
}