+
+ }
+
+ // Now make use of the bilinear filtering in the GPU to reduce the number of samples
+ // we need to make. This is a bit more complex than BlurEffect since we cannot combine
+ // two neighboring samples if their weights have differing signs, so we first need to
+ // figure out the maximum number of samples needed by any destination row. Then, we
+ // reduce every row of weights to exactly that number -- we could have gone for a
+ // variable-length system, but this is simpler, and the gains would probably be offset
+ // by the extra cost of checking when to stop.
+ //
+ // The greedy strategy for combining samples is optimal.
+ src_bilinear_samples = 0;
+ for (unsigned y = 0; y < dst_samples; ++y) {
+ unsigned num_samples_saved = combine_samples(weights + (y * src_samples) * 2, NULL, src_samples, UINT_MAX);
+ src_bilinear_samples = max<int>(src_bilinear_samples, src_samples - num_samples_saved);
+ }
+
+ // Now that we know the right width, actually combine the samples.
+ float *bilinear_weights = new float[dst_samples * src_bilinear_samples * 2];
+ fp16_int_t *bilinear_weights_fp16 = new fp16_int_t[dst_samples * src_bilinear_samples * 2];
+ for (unsigned y = 0; y < dst_samples; ++y) {
+ float *bilinear_weights_ptr = bilinear_weights + (y * src_bilinear_samples) * 2;
+ fp16_int_t *bilinear_weights_fp16_ptr = bilinear_weights_fp16 + (y * src_bilinear_samples) * 2;
+ unsigned num_samples_saved = combine_samples(
+ weights + (y * src_samples) * 2,
+ bilinear_weights_ptr,
+ src_samples,
+ src_samples - src_bilinear_samples);
+ assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
+
+ // Convert to fp16.
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ bilinear_weights_fp16_ptr[i * 2 + 0] = fp64_to_fp16(bilinear_weights_ptr[i * 2 + 0]);
+ bilinear_weights_fp16_ptr[i * 2 + 1] = fp64_to_fp16(bilinear_weights_ptr[i * 2 + 1]);
+ }
+
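+ // Normalize so that the sum of the weights becomes one. Note that we do it twice:
+ // each pass rounds the rescaled weights back to fp16, which can leave the sum slightly
+ // off, so a second pass sometimes helps a little when we have many samples.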
+ for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {
+ double sum = 0.0;
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ sum += fp16_to_fp64(bilinear_weights_fp16_ptr[i * 2 + 0]);
+ }
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ bilinear_weights_fp16_ptr[i * 2 + 0] = fp64_to_fp16(
+ fp16_to_fp64(bilinear_weights_fp16_ptr[i * 2 + 0]) / sum);
+ }
+ }