+ float weight = lanczos_weight(radius_scaling_factor * (src_y - center_src_y - subpixel_offset), LANCZOS_RADIUS);
+ weights[y * src_samples + i].weight = weight * radius_scaling_factor;
+ weights[y * src_samples + i].pos = (src_y + 0.5) / float(src_size);
+ }
+ }
+
+ // Now make use of the bilinear filtering in the GPU to reduce the number of samples
+ // we need to make. This is a bit more complex than BlurEffect since we cannot combine
+ // two neighboring samples if their weights have differing signs, so we first need to
+ // figure out the maximum number of samples. Then, we downconvert all the weights to
+ // that number -- we could have gone for a variable-length system, but this is simpler,
+ // and the gains would probably be offset by the extra cost of checking when to stop.
+ //
+ // The greedy strategy for combining samples is optimal.
+ src_bilinear_samples = 0;
+ for (unsigned y = 0; y < dst_samples; ++y) {
+ unsigned num_samples_saved = combine_samples(weights + y * src_samples, NULL, src_size, src_samples, UINT_MAX);
+ src_bilinear_samples = max<int>(src_bilinear_samples, src_samples - num_samples_saved);
+ }
+
+ // Now that we know the right width, actually combine the samples.
+ Tap<float> *bilinear_weights = new Tap<float>[dst_samples * src_bilinear_samples];
+ Tap<fp16_int_t> *bilinear_weights_fp16 = new Tap<fp16_int_t>[dst_samples * src_bilinear_samples];
+ for (unsigned y = 0; y < dst_samples; ++y) {
+ Tap<float> *bilinear_weights_ptr = bilinear_weights + y * src_bilinear_samples;
+ Tap<fp16_int_t> *bilinear_weights_fp16_ptr = bilinear_weights_fp16 + y * src_bilinear_samples;
+ unsigned num_samples_saved = combine_samples(
+ weights + y * src_samples,
+ bilinear_weights_ptr,
+ src_size,
+ src_samples,
+ src_samples - src_bilinear_samples);
+ assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
+
+ // Convert to fp16.
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ bilinear_weights_fp16_ptr[i].weight = fp64_to_fp16(bilinear_weights_ptr[i].weight);
+ bilinear_weights_fp16_ptr[i].pos = fp64_to_fp16(bilinear_weights_ptr[i].pos);
+ }
+
+ // Normalize so that the sum becomes one. Each quotient is rounded back to
+ // fp16, so the rounded weights may no longer sum to exactly one; we therefore
+ // normalize twice, which sometimes helps a tiny bit when we have many samples.
+ for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {
+ double sum = 0.0;
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ sum += fp16_to_fp64(bilinear_weights_fp16_ptr[i].weight);
+ }
+ for (int i = 0; i < src_bilinear_samples; ++i) {
+ bilinear_weights_fp16_ptr[i].weight = fp64_to_fp16(
+ fp16_to_fp64(bilinear_weights_fp16_ptr[i].weight) / sum);
+ }