X-Git-Url: https://git.sesse.net/?p=c64tapwav;a=blobdiff_plain;f=decode.cpp;h=3662566737d4d4fc415a525d2071f9185ce0e8b0;hp=839c2619d91824b0fc49417942c20f6e038690a9;hb=22be96e6cc5a38d73af6fc65bc8802b05cac8325;hpb=a04e2b22877ea003dbb8495373497ce0a24a8595 diff --git a/decode.cpp b/decode.cpp index 839c261..3662566 100644 --- a/decode.cpp +++ b/decode.cpp @@ -7,6 +7,9 @@ #include #include #include +#ifdef __AVX__ +#include +#endif #include #include @@ -25,6 +28,7 @@ // SPSA options #define NUM_FILTER_COEFF 32 +#define NUM_SPSA_VALS (NUM_FILTER_COEFF + 2) #define NUM_ITER 5000 #define A NUM_ITER/10 // approx #define INITIAL_A 0.005 // A bit of trial and error... @@ -327,17 +331,30 @@ std::vector crop(const std::vector& pcm, float crop_start, float c return std::vector(pcm.begin() + start_sample, pcm.begin() + end_sample); } -// TODO: Support AVX here. std::vector do_fir_filter(const std::vector& pcm, const float* filter) { std::vector filtered_pcm; - filtered_pcm.reserve(pcm.size()); - for (unsigned i = NUM_FILTER_COEFF; i < pcm.size(); ++i) { + filtered_pcm.resize(pcm.size()); + unsigned i = NUM_FILTER_COEFF; +#ifdef __AVX__ + unsigned avx_end = i + ((pcm.size() - i) & ~7); + for ( ; i < avx_end; i += 8) { + __m256 s = _mm256_setzero_ps(); + for (int j = 0; j < NUM_FILTER_COEFF; ++j) { + __m256 f = _mm256_set1_ps(filter[j]); + s = _mm256_fmadd_ps(f, _mm256_load_ps(&pcm[i - j]), s); + } + _mm256_storeu_ps(&filtered_pcm[i], s); + } +#endif + // Do what we couldn't do with AVX (which is everything for non-AVX machines) + // as scalar code. + for (; i < pcm.size(); ++i) { float s = 0.0f; for (int j = 0; j < NUM_FILTER_COEFF; ++j) { s += filter[j] * pcm[i - j]; } - filtered_pcm.push_back(s); + filtered_pcm[i] = s; } if (output_filtered) { @@ -379,7 +396,7 @@ std::vector do_rc_filter(const std::vector& pcm, float freq, int s return filtered_pcm; } -std::vector detect_pulses(const std::vector &pcm, int sample_rate) +std::vector detect_pulses(const std::vector &pcm, float hysteresis_upper_limit, float hysteresis_lower_limit, int sample_rate) { std::vector pulses; @@ -491,7 +508,7 @@ void find_kmeans(const std::vector &pulses, double calibration_factor, co void spsa_train(const std::vector &pcm, int sample_rate) { - float filter[NUM_FILTER_COEFF] = { 1.0f }; // The rest is filled with 0. + float vals[NUM_SPSA_VALS] = { hysteresis_upper_limit, hysteresis_lower_limit, 1.0f }; // The rest is filled with 0. float start_c = INITIAL_C; double best_badness = HUGE_VAL; @@ -501,38 +518,38 @@ void spsa_train(const std::vector &pcm, int sample_rate) float c = start_c * pow(n, -GAMMA); // find a random perturbation - float p[NUM_FILTER_COEFF]; - float filter1[NUM_FILTER_COEFF], filter2[NUM_FILTER_COEFF]; - for (int i = 0; i < NUM_FILTER_COEFF; ++i) { + float p[NUM_SPSA_VALS]; + float vals1[NUM_SPSA_VALS], vals2[NUM_SPSA_VALS]; + for (int i = 0; i < NUM_SPSA_VALS; ++i) { p[i] = (rand() % 2) ? 1.0 : -1.0; - filter1[i] = std::max(std::min(filter[i] - c * p[i], 1.0f), -1.0f); - filter2[i] = std::max(std::min(filter[i] + c * p[i], 1.0f), -1.0f); + vals1[i] = std::max(std::min(vals[i] - c * p[i], 1.0f), -1.0f); + vals2[i] = std::max(std::min(vals[i] + c * p[i], 1.0f), -1.0f); } - std::vector pulses1 = detect_pulses(do_fir_filter(pcm, filter1), sample_rate); - std::vector pulses2 = detect_pulses(do_fir_filter(pcm, filter2), sample_rate); + std::vector pulses1 = detect_pulses(do_fir_filter(pcm, vals1 + 2), vals1[0], vals1[1], sample_rate); + std::vector pulses2 = detect_pulses(do_fir_filter(pcm, vals2 + 2), vals2[0], vals2[1], sample_rate); float badness1 = eval_badness(pulses1, 1.0); float badness2 = eval_badness(pulses2, 1.0); // Find the gradient estimator - float g[NUM_FILTER_COEFF]; - for (int i = 0; i < NUM_FILTER_COEFF; ++i) { + float g[NUM_SPSA_VALS]; + for (int i = 0; i < NUM_SPSA_VALS; ++i) { g[i] = (badness2 - badness1) / (2.0 * c * p[i]); - filter[i] -= a * g[i]; - filter[i] = std::max(std::min(filter[i], 1.0f), -1.0f); + vals[i] -= a * g[i]; + vals[i] = std::max(std::min(vals[i], 1.0f), -1.0f); } if (badness2 < badness1) { std::swap(badness1, badness2); - std::swap(filter1, filter2); + std::swap(vals1, vals2); std::swap(pulses1, pulses2); } if (badness1 < best_badness) { - printf("\nNew best filter (badness=%f):", badness1); + fprintf(stderr, "\nNew best filter (badness=%f):", badness1); for (int i = 0; i < NUM_FILTER_COEFF; ++i) { - printf(" %.5f", filter1[i]); + fprintf(stderr, " %.5f", vals1[i + 2]); } + fprintf(stderr, ", hysteresis limits = %f %f\n", vals1[0], vals1[1]); best_badness = badness1; - printf("\n"); find_kmeans(pulses1, 1.0, train_snap_points); @@ -540,8 +557,8 @@ void spsa_train(const std::vector &pcm, int sample_rate) output_cycle_plot(pulses1, 1.0); } } - printf("%d ", n); - fflush(stdout); + fprintf(stderr, "%d ", n); + fflush(stderr); } } @@ -594,7 +611,7 @@ int main(int argc, char **argv) exit(0); } - std::vector pulses = detect_pulses(pcm, sample_rate); + std::vector pulses = detect_pulses(pcm, hysteresis_upper_limit, hysteresis_lower_limit, sample_rate); double calibration_factor = 1.0; if (do_calibrate) {