git.sesse.net Git - ffmpeg/blob - libavfilter/vf_deshake_opencl.c

   1 /*
   2  * This file is part of FFmpeg.
   3  *
   4  * FFmpeg is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * FFmpeg is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with FFmpeg; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  *
  18  * Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  19  * Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  20  * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  21  * Third party copyrights are property of their respective owners.
  22  *
  23  * Redistribution and use in source and binary forms, with or without modification,
  24  * are permitted provided that the following conditions are met:
  25  *
  26  *   * Redistribution's of source code must retain the above copyright notice,
  27  *     this list of conditions and the following disclaimer.
  28  *
  29  *   * Redistribution's in binary form must reproduce the above copyright notice,
  30  *     this list of conditions and the following disclaimer in the documentation
  31  *     and/or other materials provided with the distribution.
  32  *
  33  *   * The name of the copyright holders may not be used to endorse or promote products
  34  *     derived from this software without specific prior written permission.
  35  *
  36  * This software is provided by the copyright holders and contributors "as is" and
  37  * any express or implied warranties, including, but not limited to, the implied
  38  * warranties of merchantability and fitness for a particular purpose are disclaimed.
  39  * In no event shall the Intel Corporation or contributors be liable for any direct,
  40  * indirect, incidental, special, exemplary, or consequential damages
  41  * (including, but not limited to, procurement of substitute goods or services;
  42  * loss of use, data, or profits; or business interruption) however caused
  43  * and on any theory of liability, whether in contract, strict liability,
  44  * or tort (including negligence or otherwise) arising in any way out of
  45  * the use of this software, even if advised of the possibility of such damage.
  46  */
  47
  48 #include <stdbool.h>
  49 #include <float.h>
  50 #include <libavutil/lfg.h>
  51 #include "libavutil/opt.h"
  52 #include "libavutil/imgutils.h"
  53 #include "libavutil/mem.h"
  54 #include "libavutil/fifo.h"
  55 #include "libavutil/common.h"
  56 #include "libavutil/avassert.h"
  57 #include "libavutil/pixfmt.h"
  58 #include "avfilter.h"
  59 #include "framequeue.h"
  60 #include "filters.h"
  61 #include "transform.h"
  62 #include "formats.h"
  63 #include "internal.h"
  64 #include "opencl.h"
  65 #include "opencl_source.h"
  66 #include "video.h"
  67
  68 /*
  69 This filter matches feature points between frames (dealing with outliers) and then
  70 uses the matches to estimate an affine transform between frames. This transform is
  71 decomposed into various values (translation, scale, rotation) and the values are
  72 summed relative to the start of the video to obtain on absolute camera position
  73 for each frame. This "camera path" is then smoothed via a gaussian filter, resulting
  74 in a new path that is turned back into an affine transform and applied to each
  75 frame to render it.
  76
  77 High-level overview:
  78
  79 All of the work to extract motion data from frames occurs in queue_frame. Motion data
  80 is buffered in a smoothing window, so queue_frame simply computes the absolute camera
  81 positions and places them in ringbuffers.
  82
  83 filter_frame is responsible for looking at the absolute camera positions currently
  84 in the ringbuffers, applying the gaussian filter, and then transforming the frames.
  85 */
  86
  87 // Number of bits for BRIEF descriptors
  88 #define BREIFN 512
  89 // Size of the patch from which a BRIEF descriptor is extracted
  90 // This is the size used in OpenCV
  91 #define BRIEF_PATCH_SIZE 31
  92 #define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2)
  93
  94 #define MATCHES_CONTIG_SIZE 2000
  95
  96 #define ROUNDED_UP_DIV(a, b) ((a + (b - 1)) / b)
  97
  98 typedef struct PointPair {
  99     // Previous frame
 100     cl_float2 p1;
 101     // Current frame
 102     cl_float2 p2;
 103 } PointPair;
 104
 105 typedef struct MotionVector {
 106     PointPair p;
 107     // Used to mark vectors as potential outliers
 108     cl_int should_consider;
 109 } MotionVector;
 110
 111 // Denotes the indices for the different types of motion in the ringbuffers array
 112 enum RingbufferIndices {
 113     RingbufX,
 114     RingbufY,
 115     RingbufRot,
 116     RingbufScaleX,
 117     RingbufScaleY,
 118
 119     // Should always be last
 120     RingbufCount
 121 };
 122
 123 // Struct that holds data for drawing point match debug data
 124 typedef struct DebugMatches {
 125     MotionVector *matches;
 126     // The points used to calculate the affine transform for a frame
 127     MotionVector model_matches[3];
 128
 129     int num_matches;
 130     // For cases where we couldn't calculate a model
 131     int num_model_matches;
 132 } DebugMatches;
 133
 134 // Groups together the ringbuffers that store absolute distortion / position values
 135 // for each frame
 136 typedef struct AbsoluteFrameMotion {
 137     // Array with the various ringbuffers, indexed via the RingbufferIndices enum
 138     AVFifoBuffer *ringbuffers[RingbufCount];
 139
 140     // Offset to get to the current frame being processed
 141     // (not in bytes)
 142     int curr_frame_offset;
 143     // Keeps track of where the start and end of contiguous motion data is (to
 144     // deal with cases where no motion data is found between two frames)
 145     int data_start_offset;
 146     int data_end_offset;
 147
 148     AVFifoBuffer *debug_matches;
 149 } AbsoluteFrameMotion;
 150
 151 // Takes care of freeing the arrays within the DebugMatches inside of the
 152 // debug_matches ringbuffer and then freeing the buffer itself.
 153 static void free_debug_matches(AbsoluteFrameMotion *afm) {
 154     DebugMatches dm;
 155
 156     if (!afm->debug_matches) {
 157         return;
 158     }
 159
 160     while (av_fifo_size(afm->debug_matches) > 0) {
 161         av_fifo_generic_read(
 162             afm->debug_matches,
 163             &dm,
 164             sizeof(DebugMatches),
 165             NULL
 166         );
 167
 168         av_freep(&dm.matches);
 169     }
 170
 171     av_fifo_freep(&afm->debug_matches);
 172 }
 173
 174 // Stores the translation, scale, rotation, and skew deltas between two frames
 175 typedef struct FrameDelta {
 176     cl_float2 translation;
 177     float rotation;
 178     cl_float2 scale;
 179     cl_float2 skew;
 180 } FrameDelta;
 181
 182 typedef struct SimilarityMatrix {
 183     // The 2x3 similarity matrix
 184     double matrix[6];
 185 } SimilarityMatrix;
 186
 187 typedef struct CropInfo {
 188     // The top left corner of the bounding box for the crop
 189     cl_float2 top_left;
 190     // The bottom right corner of the bounding box for the crop
 191     cl_float2 bottom_right;
 192 } CropInfo;
 193
 194 // Returned from function that determines start and end values for iteration
 195 // around the current frame in a ringbuffer
 196 typedef struct IterIndices {
 197     int start;
 198     int end;
 199 } IterIndices;
 200
 201 typedef struct DeshakeOpenCLContext {
 202     OpenCLFilterContext ocf;
 203     // Whether or not the above `OpenCLFilterContext` has been initialized
 204     int initialized;
 205
 206     // These variables are used in the activate callback
 207     int64_t duration;
 208     bool eof;
 209
 210     // State for random number generation
 211     AVLFG alfg;
 212
 213     // FIFO frame queue used to buffer future frames for processing
 214     FFFrameQueue fq;
 215     // Ringbuffers for frame positions
 216     AbsoluteFrameMotion abs_motion;
 217
 218     // The number of frames' motion to consider before and after the frame we are
 219     // smoothing
 220     int smooth_window;
 221     // The number of the frame we are currently processing
 222     int curr_frame;
 223
 224     // Stores a 1d array of normalised gaussian kernel values for convolution
 225     float *gauss_kernel;
 226
 227     // Buffer for error values used in RANSAC code
 228     float *ransac_err;
 229
 230     // Information regarding how to crop the smoothed luminance (or RGB) planes
 231     CropInfo crop_y;
 232     // Information regarding how to crop the smoothed chroma planes
 233     CropInfo crop_uv;
 234
 235     // Whether or not we are processing YUV input (as oppposed to RGB)
 236     bool is_yuv;
 237     // The underlying format of the hardware surfaces
 238     int sw_format;
 239
 240     // Buffer to copy `matches` into for the CPU to work with
 241     MotionVector *matches_host;
 242     MotionVector *matches_contig_host;
 243
 244     MotionVector *inliers;
 245
 246     cl_command_queue command_queue;
 247     cl_kernel kernel_grayscale;
 248     cl_kernel kernel_harris_response;
 249     cl_kernel kernel_refine_features;
 250     cl_kernel kernel_brief_descriptors;
 251     cl_kernel kernel_match_descriptors;
 252     cl_kernel kernel_transform;
 253     cl_kernel kernel_crop_upscale;
 254
 255     // Stores a frame converted to grayscale
 256     cl_mem grayscale;
 257     // Stores the harris response for a frame (measure of "cornerness" for each pixel)
 258     cl_mem harris_buf;
 259
 260     // Detected features after non-maximum suppression and sub-pixel refinement
 261     cl_mem refined_features;
 262     // Saved from the previous frame
 263     cl_mem prev_refined_features;
 264
 265     // BRIEF sampling pattern that is randomly initialized
 266     cl_mem brief_pattern;
 267     // Feature point descriptors for the current frame
 268     cl_mem descriptors;
 269     // Feature point descriptors for the previous frame
 270     cl_mem prev_descriptors;
 271     // Vectors between points in current and previous frame
 272     cl_mem matches;
 273     cl_mem matches_contig;
 274     // Holds the matrix to transform luminance (or RGB) with
 275     cl_mem transform_y;
 276     // Holds the matrix to transform chroma with
 277     cl_mem transform_uv;
 278
 279     // Configurable options
 280
 281     int tripod_mode;
 282     int debug_on;
 283     int should_crop;
 284
 285     // Whether or not feature points should be refined at a sub-pixel level
 286     cl_int refine_features;
 287     // If the user sets a value other than the default, 0, this percentage is
 288     // translated into a sigma value ranging from 0.5 to 40.0
 289     float smooth_percent;
 290     // This number is multiplied by the video frame rate to determine the size
 291     // of the smooth window
 292     float smooth_window_multiplier;
 293
 294     // Debug stuff
 295
 296     cl_kernel kernel_draw_debug_info;
 297     cl_mem debug_matches;
 298     cl_mem debug_model_matches;
 299
 300     // These store the total time spent executing the different kernels in nanoseconds
 301     unsigned long long grayscale_time;
 302     unsigned long long harris_response_time;
 303     unsigned long long refine_features_time;
 304     unsigned long long brief_descriptors_time;
 305     unsigned long long match_descriptors_time;
 306     unsigned long long transform_time;
 307     unsigned long long crop_upscale_time;
 308
 309     // Time spent copying matched features from the device to the host
 310     unsigned long long read_buf_time;
 311 } DeshakeOpenCLContext;
 312
 313 // Returns a random uniformly-distributed number in [low, high]
 314 static int rand_in(int low, int high, AVLFG *alfg) {
 315     return (av_lfg_get(alfg) % (high - low)) + low;
 316 }
 317
 318 // Returns the average execution time for an event given the total time and the
 319 // number of frames processed.
 320 static double averaged_event_time_ms(unsigned long long total_time, int num_frames) {
 321     return (double)total_time / (double)num_frames / 1000000.0;
 322 }
 323
 324 // The following code is loosely ported from OpenCV
 325
 326 // Estimates affine transform from 3 point pairs
 327 // model is a 2x3 matrix:
 328 //      a b c
 329 //      d e f
 330 static void run_estimate_kernel(const MotionVector *point_pairs, double *model)
 331 {
 332     // src points
 333     double x1 = point_pairs[0].p.p1.s[0];
 334     double y1 = point_pairs[0].p.p1.s[1];
 335     double x2 = point_pairs[1].p.p1.s[0];
 336     double y2 = point_pairs[1].p.p1.s[1];
 337     double x3 = point_pairs[2].p.p1.s[0];
 338     double y3 = point_pairs[2].p.p1.s[1];
 339
 340     // dest points
 341     double X1 = point_pairs[0].p.p2.s[0];
 342     double Y1 = point_pairs[0].p.p2.s[1];
 343     double X2 = point_pairs[1].p.p2.s[0];
 344     double Y2 = point_pairs[1].p.p2.s[1];
 345     double X3 = point_pairs[2].p.p2.s[0];
 346     double Y3 = point_pairs[2].p.p2.s[1];
 347
 348     double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) );
 349
 350     model[0] = d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) );
 351     model[1] = d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) );
 352     model[2] = d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) );
 353
 354     model[3] = d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) );
 355     model[4] = d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) );
 356     model[5] = d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) );
 357 }
 358
 359 // Checks that the 3 points in the given array are not collinear
 360 static bool points_not_collinear(const cl_float2 **points)
 361 {
 362     int j, k, i = 2;
 363
 364     for (j = 0; j < i; j++) {
 365         double dx1 = points[j]->s[0] - points[i]->s[0];
 366         double dy1 = points[j]->s[1] - points[i]->s[1];
 367
 368         for (k = 0; k < j; k++) {
 369             double dx2 = points[k]->s[0] - points[i]->s[0];
 370             double dy2 = points[k]->s[1] - points[i]->s[1];
 371
 372             // Assuming a 3840 x 2160 video with a point at (0, 0) and one at
 373             // (3839, 2159), this prevents a third point from being within roughly
 374             // 0.5 of a pixel of the line connecting the two on both axes
 375             if (fabs(dx2*dy1 - dy2*dx1) <= 1.0) {
 376                 return false;
 377             }
 378         }
 379     }
 380
 381     return true;
 382 }
 383
 384 // Checks a subset of 3 point pairs to make sure that the points are not collinear
 385 // and not too close to each other
 386 static bool check_subset(const MotionVector *pairs_subset)
 387 {
 388     const cl_float2 *prev_points[] = {
 389         &pairs_subset[0].p.p1,
 390         &pairs_subset[1].p.p1,
 391         &pairs_subset[2].p.p1
 392     };
 393
 394     const cl_float2 *curr_points[] = {
 395         &pairs_subset[0].p.p2,
 396         &pairs_subset[1].p.p2,
 397         &pairs_subset[2].p.p2
 398     };
 399
 400     return points_not_collinear(prev_points) && points_not_collinear(curr_points);
 401 }
 402
 403 // Selects a random subset of 3 points from point_pairs and places them in pairs_subset
 404 static bool get_subset(
 405     AVLFG *alfg,
 406     const MotionVector *point_pairs,
 407     const int num_point_pairs,
 408     MotionVector *pairs_subset,
 409     int max_attempts
 410 ) {
 411     int idx[3];
 412     int i = 0, j, iters = 0;
 413
 414     for (; iters < max_attempts; iters++) {
 415         for (i = 0; i < 3 && iters < max_attempts;) {
 416             int idx_i = 0;
 417
 418             for (;;) {
 419                 idx_i = idx[i] = rand_in(0, num_point_pairs, alfg);
 420
 421                 for (j = 0; j < i; j++) {
 422                     if (idx_i == idx[j]) {
 423                         break;
 424                     }
 425                 }
 426
 427                 if (j == i) {
 428                     break;
 429                 }
 430             }
 431
 432             pairs_subset[i] = point_pairs[idx[i]];
 433             i++;
 434         }
 435
 436         if (i == 3 && !check_subset(pairs_subset)) {
 437             continue;
 438         }
 439         break;
 440     }
 441
 442     return i == 3 && iters < max_attempts;
 443 }
 444
 445 // Computes the error for each of the given points based on the given model.
 446 static void compute_error(
 447     const MotionVector *point_pairs,
 448     const int num_point_pairs,
 449     const double *model,
 450     float *err
 451 ) {
 452     double F0 = model[0], F1 = model[1], F2 = model[2];
 453     double F3 = model[3], F4 = model[4], F5 = model[5];
 454
 455     for (int i = 0; i < num_point_pairs; i++) {
 456         const cl_float2 *f = &point_pairs[i].p.p1;
 457         const cl_float2 *t = &point_pairs[i].p.p2;
 458
 459         double a = F0*f->s[0] + F1*f->s[1] + F2 - t->s[0];
 460         double b = F3*f->s[0] + F4*f->s[1] + F5 - t->s[1];
 461
 462         err[i] = a*a + b*b;
 463     }
 464 }
 465
 466 // Determines which of the given point matches are inliers for the given model
 467 // based on the specified threshold.
 468 //
 469 // err must be an array of num_point_pairs length
 470 static int find_inliers(
 471     MotionVector *point_pairs,
 472     const int num_point_pairs,
 473     const double *model,
 474     float *err,
 475     double thresh
 476 ) {
 477     float t = (float)(thresh * thresh);
 478     int i, n = num_point_pairs, num_inliers = 0;
 479
 480     compute_error(point_pairs, num_point_pairs, model, err);
 481
 482     for (i = 0; i < n; i++) {
 483         if (err[i] <= t) {
 484             // This is an inlier
 485             point_pairs[i].should_consider = true;
 486             num_inliers += 1;
 487         } else {
 488             point_pairs[i].should_consider = false;
 489         }
 490     }
 491
 492     return num_inliers;
 493 }
 494
 495 // Determines the number of iterations required to achieve the desired confidence level.
 496 //
 497 // The equation used to determine the number of iterations to do is:
 498 // 1 - confidence = (1 - inlier_probability^num_points)^num_iters
 499 //
 500 // Solving for num_iters:
 501 //
 502 // num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points)
 503 //
 504 // A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus
 505 // under the 'Parameters' heading
 506 static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters)
 507 {
 508     double num, denom;
 509
 510     confidence   = av_clipd(confidence, 0.0, 1.0);
 511     num_outliers = av_clipd(num_outliers, 0.0, 1.0);
 512
 513     // avoid inf's & nan's
 514     num = FFMAX(1.0 - confidence, DBL_MIN);
 515     denom = 1.0 - pow(1.0 - num_outliers, 3);
 516     if (denom < DBL_MIN) {
 517         return 0;
 518     }
 519
 520     num = log(num);
 521     denom = log(denom);
 522
 523     return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom);
 524 }
 525
 526 // Estimates an affine transform between the given pairs of points using RANdom
 527 // SAmple Consensus
 528 static bool estimate_affine_2d(
 529     DeshakeOpenCLContext *deshake_ctx,
 530     MotionVector *point_pairs,
 531     DebugMatches *debug_matches,
 532     const int num_point_pairs,
 533     double *model_out,
 534     const double threshold,
 535     const int max_iters,
 536     const double confidence
 537 ) {
 538     bool result = false;
 539     double best_model[6], model[6];
 540     MotionVector pairs_subset[3], best_pairs[3];
 541
 542     int iter, niters = FFMAX(max_iters, 1);
 543     int good_count, max_good_count = 0;
 544
 545     // We need at least 3 points to build a model from
 546     if (num_point_pairs < 3) {
 547         return false;
 548     } else if (num_point_pairs == 3) {
 549         // There are only 3 points, so RANSAC doesn't apply here
 550         run_estimate_kernel(point_pairs, model_out);
 551
 552         for (int i = 0; i < 3; ++i) {
 553             point_pairs[i].should_consider = true;
 554         }
 555
 556         return true;
 557     }
 558
 559     for (iter = 0; iter < niters; ++iter) {
 560         bool found = get_subset(&deshake_ctx->alfg, point_pairs, num_point_pairs, pairs_subset, 10000);
 561
 562         if (!found) {
 563             if (iter == 0) {
 564                 return false;
 565             }
 566
 567             break;
 568         }
 569
 570         run_estimate_kernel(pairs_subset, model);
 571         good_count = find_inliers(point_pairs, num_point_pairs, model, deshake_ctx->ransac_err, threshold);
 572
 573         if (good_count > FFMAX(max_good_count, 2)) {
 574             for (int mi = 0; mi < 6; ++mi) {
 575                 best_model[mi] = model[mi];
 576             }
 577
 578             for (int pi = 0; pi < 3; pi++) {
 579                 best_pairs[pi] = pairs_subset[pi];
 580             }
 581
 582             max_good_count = good_count;
 583             niters = ransac_update_num_iters(
 584                 confidence,
 585                 (double)(num_point_pairs - good_count) / num_point_pairs,
 586                 niters
 587             );
 588         }
 589     }
 590
 591     if (max_good_count > 0) {
 592         for (int mi = 0; mi < 6; ++mi) {
 593             model_out[mi] = best_model[mi];
 594         }
 595
 596         for (int pi = 0; pi < 3; ++pi) {
 597             debug_matches->model_matches[pi] = best_pairs[pi];
 598         }
 599         debug_matches->num_model_matches = 3;
 600
 601         // Find the inliers again for the best model for debugging
 602         find_inliers(point_pairs, num_point_pairs, best_model, deshake_ctx->ransac_err, threshold);
 603         result = true;
 604     }
 605
 606     return result;
 607 }
 608
 609 // "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the
 610 // total error
 611 static void optimize_model(
 612     DeshakeOpenCLContext *deshake_ctx,
 613     MotionVector *best_pairs,
 614     MotionVector *inliers,
 615     const int num_inliers,
 616     float best_err,
 617     double *model_out
 618 ) {
 619     float move_x_val = 0.01;
 620     float move_y_val = 0.01;
 621     bool move_x = true;
 622     float old_move_x_val = 0;
 623     double model[6];
 624     int last_changed = 0;
 625
 626     for (int iters = 0; iters < 200; iters++) {
 627         float total_err = 0;
 628
 629         if (move_x) {
 630             best_pairs[0].p.p2.s[0] += move_x_val;
 631         } else {
 632             best_pairs[0].p.p2.s[0] += move_y_val;
 633         }
 634
 635         run_estimate_kernel(best_pairs, model);
 636         compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);
 637
 638         for (int j = 0; j < num_inliers; j++) {
 639             total_err += deshake_ctx->ransac_err[j];
 640         }
 641
 642         if (total_err < best_err) {
 643             for (int mi = 0; mi < 6; ++mi) {
 644                 model_out[mi] = model[mi];
 645             }
 646
 647             best_err = total_err;
 648             last_changed = iters;
 649         } else {
 650             // Undo the change
 651             if (move_x) {
 652                 best_pairs[0].p.p2.s[0] -= move_x_val;
 653             } else {
 654                 best_pairs[0].p.p2.s[0] -= move_y_val;
 655             }
 656
 657             if (iters - last_changed > 4) {
 658                 // We've already improved the model as much as we can
 659                 break;
 660             }
 661
 662             old_move_x_val = move_x_val;
 663
 664             if (move_x) {
 665                 move_x_val *= -1;
 666             } else {
 667                 move_y_val *= -1;
 668             }
 669
 670             if (old_move_x_val < 0) {
 671                 move_x = false;
 672             } else {
 673                 move_x = true;
 674             }
 675         }
 676     }
 677 }
 678
 679 // Uses a process similar to that of RANSAC to find a transform that minimizes
 680 // the total error for a set of point matches determined to be inliers
 681 //
 682 // (Pick random subsets, compute model, find total error, iterate until error
 683 // is minimized.)
 684 static bool minimize_error(
 685     DeshakeOpenCLContext *deshake_ctx,
 686     MotionVector *inliers,
 687     DebugMatches *debug_matches,
 688     const int num_inliers,
 689     double *model_out,
 690     const int max_iters
 691 ) {
 692     bool result = false;
 693     float best_err = FLT_MAX;
 694     double best_model[6], model[6];
 695     MotionVector pairs_subset[3], best_pairs[3];
 696
 697     for (int i = 0; i < max_iters; i++) {
 698         float total_err = 0;
 699         bool found = get_subset(&deshake_ctx->alfg, inliers, num_inliers, pairs_subset, 10000);
 700
 701         if (!found) {
 702             if (i == 0) {
 703                 return false;
 704             }
 705
 706             break;
 707         }
 708
 709         run_estimate_kernel(pairs_subset, model);
 710         compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);
 711
 712         for (int j = 0; j < num_inliers; j++) {
 713             total_err += deshake_ctx->ransac_err[j];
 714         }
 715
 716         if (total_err < best_err) {
 717             for (int mi = 0; mi < 6; ++mi) {
 718                 best_model[mi] = model[mi];
 719             }
 720
 721             for (int pi = 0; pi < 3; pi++) {
 722                 best_pairs[pi] = pairs_subset[pi];
 723             }
 724
 725             best_err = total_err;
 726         }
 727     }
 728
 729     for (int mi = 0; mi < 6; ++mi) {
 730         model_out[mi] = best_model[mi];
 731     }
 732
 733     for (int pi = 0; pi < 3; ++pi) {
 734         debug_matches->model_matches[pi] = best_pairs[pi];
 735     }
 736     debug_matches->num_model_matches = 3;
 737     result = true;
 738
 739     optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out);
 740     return result;
 741 }
 742
 743 // End code from OpenCV
 744
 745 // Decomposes a similarity matrix into translation, rotation, scale, and skew
 746 //
 747 // See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html
 748 static FrameDelta decompose_transform(double *model)
 749 {
 750     FrameDelta ret;
 751
 752     double a = model[0];
 753     double c = model[1];
 754     double e = model[2];
 755     double b = model[3];
 756     double d = model[4];
 757     double f = model[5];
 758     double delta = a * d - b * c;
 759
 760     memset(&ret, 0, sizeof(ret));
 761
 762     ret.translation.s[0] = e;
 763     ret.translation.s[1] = f;
 764
 765     // This is the QR method
 766     if (a != 0 || b != 0) {
 767         double r = hypot(a, b);
 768
 769         ret.rotation = FFSIGN(b) * acos(a / r);
 770         ret.scale.s[0] = r;
 771         ret.scale.s[1] = delta / r;
 772         ret.skew.s[0] = atan((a * c + b * d) / (r * r));
 773         ret.skew.s[1] = 0;
 774     } else if (c != 0 || d != 0) {
 775         double s = sqrt(c * c + d * d);
 776
 777         ret.rotation = M_PI / 2 - FFSIGN(d) * acos(-c / s);
 778         ret.scale.s[0] = delta / s;
 779         ret.scale.s[1] = s;
 780         ret.skew.s[0] = 0;
 781         ret.skew.s[1] = atan((a * c + b * d) / (s * s));
 782     } // otherwise there is only translation
 783
 784     return ret;
 785 }
 786
 787 // Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous
 788 static int make_vectors_contig(
 789     DeshakeOpenCLContext *deshake_ctx,
 790     int size_y,
 791     int size_x
 792 ) {
 793     int num_vectors = 0;
 794
 795     for (int i = 0; i < size_y; ++i) {
 796         for (int j = 0; j < size_x; ++j) {
 797             MotionVector v = deshake_ctx->matches_host[j + i * size_x];
 798
 799             if (v.should_consider) {
 800                 deshake_ctx->matches_contig_host[num_vectors] = v;
 801                 ++num_vectors;
 802             }
 803
 804             // Make sure we do not exceed the amount of space we allocated for these vectors
 805             if (num_vectors == MATCHES_CONTIG_SIZE - 1) {
 806                 return num_vectors;
 807             }
 808         }
 809     }
 810     return num_vectors;
 811 }
 812
 813 // Returns the gaussian kernel value for the given x coordinate and sigma value
 814 static float gaussian_for(int x, float sigma) {
 815     return 1.0f / expf(((float)x * (float)x) / (2.0f * sigma * sigma));
 816 }
 817
 818 // Makes a normalized gaussian kernel of the given length for the given sigma
 819 // and places it in gauss_kernel
 820 static void make_gauss_kernel(float *gauss_kernel, float length, float sigma)
 821 {
 822     float gauss_sum = 0;
 823     int window_half = length / 2;
 824
 825     for (int i = 0; i < length; ++i) {
 826         float val = gaussian_for(i - window_half, sigma);
 827
 828         gauss_sum += val;
 829         gauss_kernel[i] = val;
 830     }
 831
 832     // Normalize the gaussian values
 833     for (int i = 0; i < length; ++i) {
 834         gauss_kernel[i] /= gauss_sum;
 835     }
 836 }
 837
 838 // Returns indices to start and end iteration at in order to iterate over a window
 839 // of length size centered at the current frame in a ringbuffer
 840 //
 841 // Always returns numbers that result in a window of length size, even if that
 842 // means specifying negative indices or indices past the end of the values in the
 843 // ringbuffers. Make sure you clip indices appropriately within your loop.
 844 static IterIndices start_end_for(DeshakeOpenCLContext *deshake_ctx, int length) {
 845     IterIndices indices;
 846
 847     indices.start = deshake_ctx->abs_motion.curr_frame_offset - (length / 2);
 848     indices.end = deshake_ctx->abs_motion.curr_frame_offset + (length / 2) + (length % 2);
 849
 850     return indices;
 851 }
 852
 853 // Sets val to the value in the given ringbuffer at the given offset, taking care of
 854 // clipping the offset into the appropriate range
 855 static void ringbuf_float_at(
 856     DeshakeOpenCLContext *deshake_ctx,
 857     AVFifoBuffer *values,
 858     float *val,
 859     int offset
 860 ) {
 861     int clip_start, clip_end, offset_clipped;
 862     if (deshake_ctx->abs_motion.data_end_offset != -1) {
 863         clip_end = deshake_ctx->abs_motion.data_end_offset;
 864     } else {
 865         // This expression represents the last valid index in the buffer,
 866         // which we use repeatedly at the end of the video.
 867         clip_end = deshake_ctx->smooth_window - (av_fifo_space(values) / sizeof(float)) - 1;
 868     }
 869
 870     if (deshake_ctx->abs_motion.data_start_offset != -1) {
 871         clip_start = deshake_ctx->abs_motion.data_start_offset;
 872     } else {
 873         // Negative indices will occur at the start of the video, and we want
 874         // them to be clipped to 0 in order to repeatedly use the position of
 875         // the first frame.
 876         clip_start = 0;
 877     }
 878
 879     offset_clipped = av_clip(
 880         offset,
 881         clip_start,
 882         clip_end
 883     );
 884
 885     av_fifo_generic_peek_at(
 886         values,
 887         val,
 888         offset_clipped * sizeof(float),
 889         sizeof(float),
 890         NULL
 891     );
 892 }
 893
 894 // Returns smoothed current frame value of the given buffer of floats based on the
 895 // given Gaussian kernel and its length (also the window length, centered around the
 896 // current frame) and the "maximum value" of the motion.
 897 //
 898 // This "maximum value" should be the width / height of the image in the case of
 899 // translation and an empirically chosen constant for rotation / scale.
 900 //
 901 // The sigma chosen to generate the final gaussian kernel with used to smooth the
 902 // camera path is either hardcoded (set by user, deshake_ctx->smooth_percent) or
 903 // adaptively chosen.
 904 static float smooth(
 905     DeshakeOpenCLContext *deshake_ctx,
 906     float *gauss_kernel,
 907     int length,
 908     float max_val,
 909     AVFifoBuffer *values
 910 ) {
 911     float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between,
 912           percent_of_max, inverted_percent;
 913     IterIndices indices = start_end_for(deshake_ctx, length);
 914     float large_sigma = 40.0f;
 915     float small_sigma = 2.0f;
 916     float best_sigma;
 917
 918     if (deshake_ctx->smooth_percent) {
 919         best_sigma = (large_sigma - 0.5f) * deshake_ctx->smooth_percent + 0.5f;
 920     } else {
 921         // Strategy to adaptively smooth trajectory:
 922         //
 923         // 1. Smooth path with large and small sigma values
 924         // 2. Take the absolute value of the difference between them
 925         // 3. Get a percentage by putting the difference over the "max value"
 926         // 4, Invert the percentage
 927         // 5. Calculate a new sigma value weighted towards the larger sigma value
 928         // 6. Determine final smoothed trajectory value using that sigma
 929
 930         make_gauss_kernel(gauss_kernel, length, large_sigma);
 931         for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 932             ringbuf_float_at(deshake_ctx, values, &old, i);
 933             new_large_s += old * gauss_kernel[j];
 934         }
 935
 936         make_gauss_kernel(gauss_kernel, length, small_sigma);
 937         for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 938             ringbuf_float_at(deshake_ctx, values, &old, i);
 939             new_small_s += old * gauss_kernel[j];
 940         }
 941
 942         diff_between = fabsf(new_large_s - new_small_s);
 943         percent_of_max = diff_between / max_val;
 944         inverted_percent = 1 - percent_of_max;
 945         best_sigma = large_sigma * powf(inverted_percent, 40);
 946     }
 947
 948     make_gauss_kernel(gauss_kernel, length, best_sigma);
 949     for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 950         ringbuf_float_at(deshake_ctx, values, &old, i);
 951         new_best += old * gauss_kernel[j];
 952     }
 953
 954     return new_best;
 955 }
 956
 957 // Returns the position of the given point after the transform is applied
 958 static cl_float2 transformed_point(float x, float y, float *transform) {
 959     cl_float2 ret;
 960
 961     ret.s[0] = x * transform[0] + y * transform[1] + transform[2];
 962     ret.s[1] = x * transform[3] + y * transform[4] + transform[5];
 963
 964     return ret;
 965 }
 966
 967 // Creates an affine transform that scales from the center of a frame
 968 static void transform_center_scale(
 969     float x_shift,
 970     float y_shift,
 971     float angle,
 972     float scale_x,
 973     float scale_y,
 974     float center_w,
 975     float center_h,
 976     float *matrix
 977 ) {
 978     cl_float2 center_s;
 979     float center_s_w, center_s_h;
 980
 981     ff_get_matrix(
 982         0,
 983         0,
 984         0,
 985         scale_x,
 986         scale_y,
 987         matrix
 988     );
 989
 990     center_s = transformed_point(center_w, center_h, matrix);
 991     center_s_w = center_w - center_s.s[0];
 992     center_s_h = center_h - center_s.s[1];
 993
 994     ff_get_matrix(
 995         x_shift + center_s_w,
 996         y_shift + center_s_h,
 997         angle,
 998         scale_x,
 999         scale_y,
1000         matrix
1001     );
1002 }
1003
1004 // Determines the crop necessary to eliminate black borders from a smoothed frame
1005 // and updates target crop accordingly
1006 static void update_needed_crop(
1007     CropInfo* crop,
1008     float *transform,
1009     float frame_width,
1010     float frame_height
1011 ) {
1012     float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y;
1013
1014     cl_float2 top_left = transformed_point(0, 0, transform);
1015     cl_float2 top_right = transformed_point(frame_width, 0, transform);
1016     cl_float2 bottom_left = transformed_point(0, frame_height, transform);
1017     cl_float2 bottom_right = transformed_point(frame_width, frame_height, transform);
1018     float ar_h = frame_height / frame_width;
1019     float ar_w = frame_width / frame_height;
1020
1021     if (crop->bottom_right.s[0] == 0) {
1022         // The crop hasn't been set to the original size of the plane
1023         crop->bottom_right.s[0] = frame_width;
1024         crop->bottom_right.s[1] = frame_height;
1025     }
1026
1027     crop->top_left.s[0] = FFMAX3(
1028         crop->top_left.s[0],
1029         top_left.s[0],
1030         bottom_left.s[0]
1031     );
1032
1033     crop->top_left.s[1] = FFMAX3(
1034         crop->top_left.s[1],
1035         top_left.s[1],
1036         top_right.s[1]
1037     );
1038
1039     crop->bottom_right.s[0] = FFMIN3(
1040         crop->bottom_right.s[0],
1041         bottom_right.s[0],
1042         top_right.s[0]
1043     );
1044
1045     crop->bottom_right.s[1] = FFMIN3(
1046         crop->bottom_right.s[1],
1047         bottom_right.s[1],
1048         bottom_left.s[1]
1049     );
1050
1051     // Make sure our potentially new bounding box has the same aspect ratio
1052     new_height = crop->bottom_right.s[1] - crop->top_left.s[1];
1053     new_width = crop->bottom_right.s[0] - crop->top_left.s[0];
1054
1055     adjusted_width = new_height * ar_w;
1056     adjusted_x = crop->bottom_right.s[0] - adjusted_width;
1057
1058     if (adjusted_x >= crop->top_left.s[0]) {
1059         crop->top_left.s[0] = adjusted_x;
1060     } else {
1061         adjusted_height = new_width * ar_h;
1062         adjusted_y = crop->bottom_right.s[1] - adjusted_height;
1063         crop->top_left.s[1] = adjusted_y;
1064     }
1065 }
1066
1067 static av_cold void deshake_opencl_uninit(AVFilterContext *avctx)
1068 {
1069     DeshakeOpenCLContext *ctx = avctx->priv;
1070     cl_int cle;
1071
1072     for (int i = 0; i < RingbufCount; i++)
1073         av_fifo_freep(&ctx->abs_motion.ringbuffers[i]);
1074
1075     if (ctx->debug_on)
1076         free_debug_matches(&ctx->abs_motion);
1077
1078     if (ctx->gauss_kernel)
1079         av_freep(&ctx->gauss_kernel);
1080
1081     if (ctx->ransac_err)
1082         av_freep(&ctx->ransac_err);
1083
1084     if (ctx->matches_host)
1085         av_freep(&ctx->matches_host);
1086
1087     if (ctx->matches_contig_host)
1088         av_freep(&ctx->matches_contig_host);
1089
1090     if (ctx->inliers)
1091         av_freep(&ctx->inliers);
1092
1093     ff_framequeue_free(&ctx->fq);
1094
1095     CL_RELEASE_KERNEL(ctx->kernel_grayscale);
1096     CL_RELEASE_KERNEL(ctx->kernel_harris_response);
1097     CL_RELEASE_KERNEL(ctx->kernel_refine_features);
1098     CL_RELEASE_KERNEL(ctx->kernel_brief_descriptors);
1099     CL_RELEASE_KERNEL(ctx->kernel_match_descriptors);
1100     CL_RELEASE_KERNEL(ctx->kernel_crop_upscale);
1101     if (ctx->debug_on)
1102         CL_RELEASE_KERNEL(ctx->kernel_draw_debug_info);
1103
1104     CL_RELEASE_QUEUE(ctx->command_queue);
1105
1106     if (!ctx->is_yuv)
1107         CL_RELEASE_MEMORY(ctx->grayscale);
1108     CL_RELEASE_MEMORY(ctx->harris_buf);
1109     CL_RELEASE_MEMORY(ctx->refined_features);
1110     CL_RELEASE_MEMORY(ctx->prev_refined_features);
1111     CL_RELEASE_MEMORY(ctx->brief_pattern);
1112     CL_RELEASE_MEMORY(ctx->descriptors);
1113     CL_RELEASE_MEMORY(ctx->prev_descriptors);
1114     CL_RELEASE_MEMORY(ctx->matches);
1115     CL_RELEASE_MEMORY(ctx->matches_contig);
1116     CL_RELEASE_MEMORY(ctx->transform_y);
1117     CL_RELEASE_MEMORY(ctx->transform_uv);
1118     if (ctx->debug_on) {
1119         CL_RELEASE_MEMORY(ctx->debug_matches);
1120         CL_RELEASE_MEMORY(ctx->debug_model_matches);
1121     }
1122
1123     ff_opencl_filter_uninit(avctx);
1124 }
1125
1126 static int deshake_opencl_init(AVFilterContext *avctx)
1127 {
1128     DeshakeOpenCLContext *ctx = avctx->priv;
1129     AVFilterLink *outlink = avctx->outputs[0];
1130     AVFilterLink *inlink = avctx->inputs[0];
1131     // Pointer to the host-side pattern buffer to be initialized and then copied
1132     // to the GPU
1133     PointPair *pattern_host = NULL;
1134     cl_int cle;
1135     int err;
1136     cl_ulong8 zeroed_ulong8;
1137     FFFrameQueueGlobal fqg;
1138     cl_image_format grayscale_format;
1139     cl_image_desc grayscale_desc;
1140     cl_command_queue_properties queue_props;
1141
1142     const enum AVPixelFormat disallowed_formats[14] = {
1143         AV_PIX_FMT_GBRP,
1144         AV_PIX_FMT_GBRP9BE,
1145         AV_PIX_FMT_GBRP9LE,
1146         AV_PIX_FMT_GBRP10BE,
1147         AV_PIX_FMT_GBRP10LE,
1148         AV_PIX_FMT_GBRP16BE,
1149         AV_PIX_FMT_GBRP16LE,
1150         AV_PIX_FMT_GBRAP,
1151         AV_PIX_FMT_GBRAP16BE,
1152         AV_PIX_FMT_GBRAP16LE,
1153         AV_PIX_FMT_GBRAP12BE,
1154         AV_PIX_FMT_GBRAP12LE,
1155         AV_PIX_FMT_GBRAP10BE,
1156         AV_PIX_FMT_GBRAP10LE
1157     };
1158
1159     // Number of elements for an array
1160     const int image_grid_32 = ROUNDED_UP_DIV(outlink->h, 32) * ROUNDED_UP_DIV(outlink->w, 32);
1161
1162     const int descriptor_buf_size = image_grid_32 * (BREIFN / 8);
1163     const int features_buf_size = image_grid_32 * sizeof(cl_float2);
1164
1165     const AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
1166     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hw_frames_ctx->sw_format);
1167
1168     av_assert0(hw_frames_ctx);
1169     av_assert0(desc);
1170
1171     ff_framequeue_global_init(&fqg);
1172     ff_framequeue_init(&ctx->fq, &fqg);
1173     ctx->eof = false;
1174     ctx->smooth_window = (int)(av_q2d(avctx->inputs[0]->frame_rate) * ctx->smooth_window_multiplier);
1175     ctx->curr_frame = 0;
1176
1177     memset(&zeroed_ulong8, 0, sizeof(cl_ulong8));
1178
1179     ctx->gauss_kernel = av_malloc_array(ctx->smooth_window, sizeof(float));
1180     if (!ctx->gauss_kernel) {
1181         err = AVERROR(ENOMEM);
1182         goto fail;
1183     }
1184
1185     ctx->ransac_err = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(float));
1186     if (!ctx->ransac_err) {
1187         err = AVERROR(ENOMEM);
1188         goto fail;
1189     }
1190
1191     for (int i = 0; i < RingbufCount; i++) {
1192         ctx->abs_motion.ringbuffers[i] = av_fifo_alloc_array(
1193             ctx->smooth_window,
1194             sizeof(float)
1195         );
1196
1197         if (!ctx->abs_motion.ringbuffers[i]) {
1198             err = AVERROR(ENOMEM);
1199             goto fail;
1200         }
1201     }
1202
1203     if (ctx->debug_on) {
1204         ctx->abs_motion.debug_matches = av_fifo_alloc_array(
1205             ctx->smooth_window / 2,
1206             sizeof(DebugMatches)
1207         );
1208
1209         if (!ctx->abs_motion.debug_matches) {
1210             err = AVERROR(ENOMEM);
1211             goto fail;
1212         }
1213     }
1214
1215     ctx->abs_motion.curr_frame_offset = 0;
1216     ctx->abs_motion.data_start_offset = -1;
1217     ctx->abs_motion.data_end_offset = -1;
1218
1219     pattern_host = av_malloc_array(BREIFN, sizeof(PointPair));
1220     if (!pattern_host) {
1221         err = AVERROR(ENOMEM);
1222         goto fail;
1223     }
1224
1225     ctx->matches_host = av_malloc_array(image_grid_32, sizeof(MotionVector));
1226     if (!ctx->matches_host) {
1227         err = AVERROR(ENOMEM);
1228         goto fail;
1229     }
1230
1231     ctx->matches_contig_host = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
1232     if (!ctx->matches_contig_host) {
1233         err = AVERROR(ENOMEM);
1234         goto fail;
1235     }
1236
1237     ctx->inliers = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
1238     if (!ctx->inliers) {
1239         err = AVERROR(ENOMEM);
1240         goto fail;
1241     }
1242
1243     // Initializing the patch pattern for building BREIF descriptors with
1244     av_lfg_init(&ctx->alfg, 234342424);
1245     for (int i = 0; i < BREIFN; ++i) {
1246         PointPair pair;
1247
1248         for (int j = 0; j < 2; ++j) {
1249             pair.p1.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
1250             pair.p2.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
1251         }
1252
1253         pattern_host[i] = pair;
1254     }
1255
1256     for (int i = 0; i < 14; i++) {
1257         if (ctx->sw_format == disallowed_formats[i]) {
1258             av_log(avctx, AV_LOG_ERROR, "unsupported format in deshake_opencl.\n");
1259             err = AVERROR(ENOSYS);
1260             goto fail;
1261         }
1262     }
1263
1264     if (desc->flags & AV_PIX_FMT_FLAG_RGB) {
1265         ctx->is_yuv = false;
1266     } else {
1267         ctx->is_yuv = true;
1268     }
1269     ctx->sw_format = hw_frames_ctx->sw_format;
1270
1271     err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_deshake, 1);
1272     if (err < 0)
1273         goto fail;
1274
1275     if (ctx->debug_on) {
1276         queue_props = CL_QUEUE_PROFILING_ENABLE;
1277     } else {
1278         queue_props = 0;
1279     }
1280     ctx->command_queue = clCreateCommandQueue(
1281         ctx->ocf.hwctx->context,
1282         ctx->ocf.hwctx->device_id,
1283         queue_props,
1284         &cle
1285     );
1286     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL command queue %d.\n", cle);
1287
1288     CL_CREATE_KERNEL(ctx, grayscale);
1289     CL_CREATE_KERNEL(ctx, harris_response);
1290     CL_CREATE_KERNEL(ctx, refine_features);
1291     CL_CREATE_KERNEL(ctx, brief_descriptors);
1292     CL_CREATE_KERNEL(ctx, match_descriptors);
1293     CL_CREATE_KERNEL(ctx, transform);
1294     CL_CREATE_KERNEL(ctx, crop_upscale);
1295     if (ctx->debug_on)
1296         CL_CREATE_KERNEL(ctx, draw_debug_info);
1297
1298     if (!ctx->is_yuv) {
1299         grayscale_format.image_channel_order = CL_R;
1300         grayscale_format.image_channel_data_type = CL_FLOAT;
1301
1302         grayscale_desc = (cl_image_desc) {
1303             .image_type = CL_MEM_OBJECT_IMAGE2D,
1304             .image_width = outlink->w,
1305             .image_height = outlink->h,
1306             .image_depth = 0,
1307             .image_array_size = 0,
1308             .image_row_pitch = 0,
1309             .image_slice_pitch = 0,
1310             .num_mip_levels = 0,
1311             .num_samples = 0,
1312             .buffer = NULL,
1313         };
1314
1315         ctx->grayscale = clCreateImage(
1316             ctx->ocf.hwctx->context,
1317             0,
1318             &grayscale_format,
1319             &grayscale_desc,
1320             NULL,
1321             &cle
1322         );
1323         CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create grayscale image: %d.\n", cle);
1324     }
1325
1326     CL_CREATE_BUFFER(ctx, harris_buf, outlink->h * outlink->w * sizeof(float));
1327     CL_CREATE_BUFFER(ctx, refined_features, features_buf_size);
1328     CL_CREATE_BUFFER(ctx, prev_refined_features, features_buf_size);
1329     CL_CREATE_BUFFER_FLAGS(
1330         ctx,
1331         brief_pattern,
1332         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1333         BREIFN * sizeof(PointPair),
1334         pattern_host
1335     );
1336     CL_CREATE_BUFFER(ctx, descriptors, descriptor_buf_size);
1337     CL_CREATE_BUFFER(ctx, prev_descriptors, descriptor_buf_size);
1338     CL_CREATE_BUFFER(ctx, matches, image_grid_32 * sizeof(MotionVector));
1339     CL_CREATE_BUFFER(ctx, matches_contig, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
1340     CL_CREATE_BUFFER(ctx, transform_y, 9 * sizeof(float));
1341     CL_CREATE_BUFFER(ctx, transform_uv, 9 * sizeof(float));
1342     if (ctx->debug_on) {
1343         CL_CREATE_BUFFER(ctx, debug_matches, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
1344         CL_CREATE_BUFFER(ctx, debug_model_matches, 3 * sizeof(MotionVector));
1345     }
1346
1347     ctx->initialized = 1;
1348     av_freep(&pattern_host);
1349
1350     return 0;
1351
1352 fail:
1353     av_freep(&pattern_host);
1354     return err;
1355 }
1356
1357 // Logs debug information about the transform data
1358 static void transform_debug(AVFilterContext *avctx, float *new_vals, float *old_vals, int curr_frame) {
1359     av_log(avctx, AV_LOG_VERBOSE,
1360         "Frame %d:\n"
1361         "\tframe moved from: %f x, %f y\n"
1362         "\t              to: %f x, %f y\n"
1363         "\t    rotated from: %f degrees\n"
1364         "\t              to: %f degrees\n"
1365         "\t     scaled from: %f x, %f y\n"
1366         "\t              to: %f x, %f y\n"
1367         "\n"
1368         "\tframe moved by: %f x, %f y\n"
1369         "\t    rotated by: %f degrees\n"
1370         "\t     scaled by: %f x, %f y\n",
1371         curr_frame,
1372         old_vals[RingbufX], old_vals[RingbufY],
1373         new_vals[RingbufX], new_vals[RingbufY],
1374         old_vals[RingbufRot] * (180.0 / M_PI),
1375         new_vals[RingbufRot] * (180.0 / M_PI),
1376         old_vals[RingbufScaleX], old_vals[RingbufScaleY],
1377         new_vals[RingbufScaleX], new_vals[RingbufScaleY],
1378         old_vals[RingbufX] - new_vals[RingbufX], old_vals[RingbufY] - new_vals[RingbufY],
1379         old_vals[RingbufRot] * (180.0 / M_PI) - new_vals[RingbufRot] * (180.0 / M_PI),
1380         new_vals[RingbufScaleX] / old_vals[RingbufScaleX], new_vals[RingbufScaleY] / old_vals[RingbufScaleY]
1381     );
1382 }
1383
1384 // Uses the buffered motion information to determine a transform that smooths the
1385 // given frame and applies it
1386 static int filter_frame(AVFilterLink *link, AVFrame *input_frame)
1387 {
1388     AVFilterContext *avctx = link->dst;
1389     AVFilterLink *outlink = avctx->outputs[0];
1390     DeshakeOpenCLContext *deshake_ctx = avctx->priv;
1391     AVFrame *cropped_frame = NULL, *transformed_frame = NULL;
1392     int err;
1393     cl_int cle;
1394     float new_vals[RingbufCount];
1395     float old_vals[RingbufCount];
1396     // Luma (in the case of YUV) transform, or just the transform in the case of RGB
1397     float transform_y[9];
1398     // Chroma transform
1399     float transform_uv[9];
1400     // Luma crop transform (or RGB)
1401     float transform_crop_y[9];
1402     // Chroma crop transform
1403     float transform_crop_uv[9];
1404     float transform_debug_rgb[9];
1405     size_t global_work[2];
1406     int64_t duration;
1407     cl_mem src, transformed, dst;
1408     cl_mem transforms[3];
1409     CropInfo crops[3];
1410     cl_event transform_event, crop_upscale_event;
1411     DebugMatches debug_matches;
1412     cl_int num_model_matches;
1413
1414     const float center_w = (float)input_frame->width / 2;
1415     const float center_h = (float)input_frame->height / 2;
1416
1417     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(deshake_ctx->sw_format);
1418     const int chroma_width  = AV_CEIL_RSHIFT(input_frame->width, desc->log2_chroma_w);
1419     const int chroma_height = AV_CEIL_RSHIFT(input_frame->height, desc->log2_chroma_h);
1420
1421     const float center_w_chroma = (float)chroma_width / 2;
1422     const float center_h_chroma = (float)chroma_height / 2;
1423
1424     const float luma_w_over_chroma_w = ((float)input_frame->width / (float)chroma_width);
1425     const float luma_h_over_chroma_h = ((float)input_frame->height / (float)chroma_height);
1426
1427     if (deshake_ctx->debug_on) {
1428         av_fifo_generic_read(
1429             deshake_ctx->abs_motion.debug_matches,
1430             &debug_matches,
1431             sizeof(DebugMatches),
1432             NULL
1433         );
1434     }
1435
1436     if (input_frame->pkt_duration) {
1437         duration = input_frame->pkt_duration;
1438     } else {
1439         duration = av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
1440     }
1441     deshake_ctx->duration = input_frame->pts + duration;
1442
1443     // Get the absolute transform data for this frame
1444     for (int i = 0; i < RingbufCount; i++) {
1445         av_fifo_generic_peek_at(
1446             deshake_ctx->abs_motion.ringbuffers[i],
1447             &old_vals[i],
1448             deshake_ctx->abs_motion.curr_frame_offset * sizeof(float),
1449             sizeof(float),
1450             NULL
1451         );
1452     }
1453
1454     if (deshake_ctx->tripod_mode) {
1455         // If tripod mode is turned on we simply undo all motion relative to the
1456         // first frame
1457
1458         new_vals[RingbufX] = 0.0f;
1459         new_vals[RingbufY] = 0.0f;
1460         new_vals[RingbufRot] = 0.0f;
1461         new_vals[RingbufScaleX] = 1.0f;
1462         new_vals[RingbufScaleY] = 1.0f;
1463     } else {
1464         // Tripod mode is off and we need to smooth a moving camera
1465
1466         new_vals[RingbufX] = smooth(
1467             deshake_ctx,
1468             deshake_ctx->gauss_kernel,
1469             deshake_ctx->smooth_window,
1470             input_frame->width,
1471             deshake_ctx->abs_motion.ringbuffers[RingbufX]
1472         );
1473         new_vals[RingbufY] = smooth(
1474             deshake_ctx,
1475             deshake_ctx->gauss_kernel,
1476             deshake_ctx->smooth_window,
1477             input_frame->height,
1478             deshake_ctx->abs_motion.ringbuffers[RingbufY]
1479         );
1480         new_vals[RingbufRot] = smooth(
1481             deshake_ctx,
1482             deshake_ctx->gauss_kernel,
1483             deshake_ctx->smooth_window,
1484             M_PI / 4,
1485             deshake_ctx->abs_motion.ringbuffers[RingbufRot]
1486         );
1487         new_vals[RingbufScaleX] = smooth(
1488             deshake_ctx,
1489             deshake_ctx->gauss_kernel,
1490             deshake_ctx->smooth_window,
1491             2.0f,
1492             deshake_ctx->abs_motion.ringbuffers[RingbufScaleX]
1493         );
1494         new_vals[RingbufScaleY] = smooth(
1495             deshake_ctx,
1496             deshake_ctx->gauss_kernel,
1497             deshake_ctx->smooth_window,
1498             2.0f,
1499             deshake_ctx->abs_motion.ringbuffers[RingbufScaleY]
1500         );
1501     }
1502
1503     transform_center_scale(
1504         old_vals[RingbufX] - new_vals[RingbufX],
1505         old_vals[RingbufY] - new_vals[RingbufY],
1506         old_vals[RingbufRot] - new_vals[RingbufRot],
1507         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1508         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1509         center_w,
1510         center_h,
1511         transform_y
1512     );
1513
1514     transform_center_scale(
1515         (old_vals[RingbufX] - new_vals[RingbufX]) / luma_w_over_chroma_w,
1516         (old_vals[RingbufY] - new_vals[RingbufY]) / luma_h_over_chroma_h,
1517         old_vals[RingbufRot] - new_vals[RingbufRot],
1518         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1519         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1520         center_w_chroma,
1521         center_h_chroma,
1522         transform_uv
1523     );
1524
1525     CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_y, NULL);
1526     CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_uv, 9 * sizeof(float), transform_uv, NULL);
1527
1528     if (deshake_ctx->debug_on)
1529         transform_debug(avctx, new_vals, old_vals, deshake_ctx->curr_frame);
1530
1531     cropped_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
1532     if (!cropped_frame) {
1533         err = AVERROR(ENOMEM);
1534         goto fail;
1535     }
1536
1537     transformed_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
1538     if (!transformed_frame) {
1539         err = AVERROR(ENOMEM);
1540         goto fail;
1541     }
1542
1543     transforms[0] = deshake_ctx->transform_y;
1544     transforms[1] = transforms[2] = deshake_ctx->transform_uv;
1545
1546     for (int p = 0; p < FF_ARRAY_ELEMS(transformed_frame->data); p++) {
1547         // Transform all of the planes appropriately
1548         src = (cl_mem)input_frame->data[p];
1549         transformed = (cl_mem)transformed_frame->data[p];
1550
1551         if (!transformed)
1552             break;
1553
1554         err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
1555         if (err < 0)
1556             goto fail;
1557
1558         CL_RUN_KERNEL_WITH_ARGS(
1559             deshake_ctx->command_queue,
1560             deshake_ctx->kernel_transform,
1561             global_work,
1562             NULL,
1563             &transform_event,
1564             { sizeof(cl_mem), &src },
1565             { sizeof(cl_mem), &transformed },
1566             { sizeof(cl_mem), &transforms[p] },
1567         );
1568     }
1569
1570     if (deshake_ctx->debug_on && !deshake_ctx->is_yuv && debug_matches.num_matches > 0) {
1571         CL_BLOCKING_WRITE_BUFFER(
1572             deshake_ctx->command_queue,
1573             deshake_ctx->debug_matches,
1574             debug_matches.num_matches * sizeof(MotionVector),
1575             debug_matches.matches,
1576             NULL
1577         );
1578
1579         CL_BLOCKING_WRITE_BUFFER(
1580             deshake_ctx->command_queue,
1581             deshake_ctx->debug_model_matches,
1582             debug_matches.num_model_matches * sizeof(MotionVector),
1583             debug_matches.model_matches,
1584             NULL
1585         );
1586
1587         num_model_matches = debug_matches.num_model_matches;
1588
1589         // Invert the transform
1590         transform_center_scale(
1591             new_vals[RingbufX] - old_vals[RingbufX],
1592             new_vals[RingbufY] - old_vals[RingbufY],
1593             new_vals[RingbufRot] - old_vals[RingbufRot],
1594             old_vals[RingbufScaleX] / new_vals[RingbufScaleX],
1595             old_vals[RingbufScaleY] / new_vals[RingbufScaleY],
1596             center_w,
1597             center_h,
1598             transform_debug_rgb
1599         );
1600
1601         CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_debug_rgb, NULL);
1602
1603         transformed = (cl_mem)transformed_frame->data[0];
1604         CL_RUN_KERNEL_WITH_ARGS(
1605             deshake_ctx->command_queue,
1606             deshake_ctx->kernel_draw_debug_info,
1607             (size_t[]){ debug_matches.num_matches },
1608             NULL,
1609             NULL,
1610             { sizeof(cl_mem), &transformed },
1611             { sizeof(cl_mem), &deshake_ctx->debug_matches },
1612             { sizeof(cl_mem), &deshake_ctx->debug_model_matches },
1613             { sizeof(cl_int), &num_model_matches },
1614             { sizeof(cl_mem), &deshake_ctx->transform_y }
1615         );
1616     }
1617
1618     if (deshake_ctx->should_crop) {
1619         // Generate transforms for cropping
1620         transform_center_scale(
1621             (old_vals[RingbufX] - new_vals[RingbufX]) / 5,
1622             (old_vals[RingbufY] - new_vals[RingbufY]) / 5,
1623             (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
1624             new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1625             new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1626             center_w,
1627             center_h,
1628             transform_crop_y
1629         );
1630         update_needed_crop(&deshake_ctx->crop_y, transform_crop_y, input_frame->width, input_frame->height);
1631
1632         transform_center_scale(
1633             (old_vals[RingbufX] - new_vals[RingbufX]) / (5 * luma_w_over_chroma_w),
1634             (old_vals[RingbufY] - new_vals[RingbufY]) / (5 * luma_h_over_chroma_h),
1635             (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
1636             new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1637             new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1638             center_w_chroma,
1639             center_h_chroma,
1640             transform_crop_uv
1641         );
1642         update_needed_crop(&deshake_ctx->crop_uv, transform_crop_uv, chroma_width, chroma_height);
1643
1644         crops[0] = deshake_ctx->crop_y;
1645         crops[1] = crops[2] = deshake_ctx->crop_uv;
1646
1647         for (int p = 0; p < FF_ARRAY_ELEMS(cropped_frame->data); p++) {
1648             // Crop all of the planes appropriately
1649             dst = (cl_mem)cropped_frame->data[p];
1650             transformed = (cl_mem)transformed_frame->data[p];
1651
1652             if (!dst)
1653                 break;
1654
1655             err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
1656             if (err < 0)
1657                 goto fail;
1658
1659             CL_RUN_KERNEL_WITH_ARGS(
1660                 deshake_ctx->command_queue,
1661                 deshake_ctx->kernel_crop_upscale,
1662                 global_work,
1663                 NULL,
1664                 &crop_upscale_event,
1665                 { sizeof(cl_mem), &transformed },
1666                 { sizeof(cl_mem), &dst },
1667                 { sizeof(cl_float2), &crops[p].top_left },
1668                 { sizeof(cl_float2), &crops[p].bottom_right },
1669             );
1670         }
1671     }
1672
1673     if (deshake_ctx->curr_frame < deshake_ctx->smooth_window / 2) {
1674         // This means we are somewhere at the start of the video. We need to
1675         // increment the current frame offset until it reaches the center of
1676         // the ringbuffers (as the current frame will be located there for
1677         // the rest of the video).
1678         //
1679         // The end of the video is taken care of by draining motion data
1680         // one-by-one out of the buffer, causing the (at that point fixed)
1681         // offset to move towards later frames' data.
1682         ++deshake_ctx->abs_motion.curr_frame_offset;
1683     }
1684
1685     if (deshake_ctx->abs_motion.data_end_offset != -1) {
1686         // Keep the end offset in sync with the frame it's supposed to be
1687         // positioned at
1688         --deshake_ctx->abs_motion.data_end_offset;
1689
1690         if (deshake_ctx->abs_motion.data_end_offset == deshake_ctx->abs_motion.curr_frame_offset - 1) {
1691             // The end offset would be the start of the new video sequence; flip to
1692             // start offset
1693             deshake_ctx->abs_motion.data_end_offset = -1;
1694             deshake_ctx->abs_motion.data_start_offset = deshake_ctx->abs_motion.curr_frame_offset;
1695         }
1696     } else if (deshake_ctx->abs_motion.data_start_offset != -1) {
1697         // Keep the start offset in sync with the frame it's supposed to be
1698         // positioned at
1699         --deshake_ctx->abs_motion.data_start_offset;
1700     }
1701
1702     if (deshake_ctx->debug_on) {
1703         deshake_ctx->transform_time += ff_opencl_get_event_time(transform_event);
1704         if (deshake_ctx->should_crop) {
1705             deshake_ctx->crop_upscale_time += ff_opencl_get_event_time(crop_upscale_event);
1706         }
1707     }
1708
1709     ++deshake_ctx->curr_frame;
1710
1711     if (deshake_ctx->debug_on)
1712         av_freep(&debug_matches.matches);
1713
1714     if (deshake_ctx->should_crop) {
1715         err = av_frame_copy_props(cropped_frame, input_frame);
1716         if (err < 0)
1717             goto fail;
1718
1719         av_frame_free(&transformed_frame);
1720         av_frame_free(&input_frame);
1721         return ff_filter_frame(outlink, cropped_frame);
1722
1723     } else {
1724         err = av_frame_copy_props(transformed_frame, input_frame);
1725         if (err < 0)
1726             goto fail;
1727
1728         av_frame_free(&cropped_frame);
1729         av_frame_free(&input_frame);
1730         return ff_filter_frame(outlink, transformed_frame);
1731     }
1732
1733 fail:
1734     clFinish(deshake_ctx->command_queue);
1735
1736     if (deshake_ctx->debug_on)
1737         if (debug_matches.matches)
1738             av_freep(&debug_matches.matches);
1739
1740     av_frame_free(&input_frame);
1741     av_frame_free(&transformed_frame);
1742     av_frame_free(&cropped_frame);
1743     return err;
1744 }
1745
1746 // Add the given frame to the frame queue to eventually be processed.
1747 //
1748 // Also determines the motion from the previous frame and updates the stored
1749 // motion information accordingly.
1750 static int queue_frame(AVFilterLink *link, AVFrame *input_frame)
1751 {
1752     AVFilterContext *avctx = link->dst;
1753     DeshakeOpenCLContext *deshake_ctx = avctx->priv;
1754     int err;
1755     int num_vectors;
1756     int num_inliers = 0;
1757     cl_int cle;
1758     FrameDelta relative;
1759     SimilarityMatrix model;
1760     size_t global_work[2];
1761     size_t harris_global_work[2];
1762     size_t grid_32_global_work[2];
1763     int grid_32_h, grid_32_w;
1764     size_t local_work[2];
1765     cl_mem src, temp;
1766     float prev_vals[5];
1767     float new_vals[5];
1768     cl_event grayscale_event, harris_response_event, refine_features_event,
1769              brief_event, match_descriptors_event, read_buf_event;
1770     DebugMatches debug_matches;
1771
1772     num_vectors = 0;
1773
1774     local_work[0] = 8;
1775     local_work[1] = 8;
1776
1777     err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, 0, 0);
1778     if (err < 0)
1779         goto fail;
1780
1781     err = ff_opencl_filter_work_size_from_image(avctx, harris_global_work, input_frame, 0, 8);
1782     if (err < 0)
1783         goto fail;
1784
1785     err = ff_opencl_filter_work_size_from_image(avctx, grid_32_global_work, input_frame, 0, 32);
1786     if (err < 0)
1787         goto fail;
1788
1789     // We want a single work-item for each 32x32 block of pixels in the input frame
1790     grid_32_global_work[0] /= 32;
1791     grid_32_global_work[1] /= 32;
1792
1793     grid_32_h = ROUNDED_UP_DIV(input_frame->height, 32);
1794     grid_32_w = ROUNDED_UP_DIV(input_frame->width, 32);
1795
1796     if (deshake_ctx->is_yuv) {
1797         deshake_ctx->grayscale = (cl_mem)input_frame->data[0];
1798     } else {
1799         src = (cl_mem)input_frame->data[0];
1800
1801         CL_RUN_KERNEL_WITH_ARGS(
1802             deshake_ctx->command_queue,
1803             deshake_ctx->kernel_grayscale,
1804             global_work,
1805             NULL,
1806             &grayscale_event,
1807             { sizeof(cl_mem), &src },
1808             { sizeof(cl_mem), &deshake_ctx->grayscale }
1809         );
1810     }
1811
1812     CL_RUN_KERNEL_WITH_ARGS(
1813         deshake_ctx->command_queue,
1814         deshake_ctx->kernel_harris_response,
1815         harris_global_work,
1816         local_work,
1817         &harris_response_event,
1818         { sizeof(cl_mem), &deshake_ctx->grayscale },
1819         { sizeof(cl_mem), &deshake_ctx->harris_buf }
1820     );
1821
1822     CL_RUN_KERNEL_WITH_ARGS(
1823         deshake_ctx->command_queue,
1824         deshake_ctx->kernel_refine_features,
1825         grid_32_global_work,
1826         NULL,
1827         &refine_features_event,
1828         { sizeof(cl_mem), &deshake_ctx->grayscale },
1829         { sizeof(cl_mem), &deshake_ctx->harris_buf },
1830         { sizeof(cl_mem), &deshake_ctx->refined_features },
1831         { sizeof(cl_int), &deshake_ctx->refine_features }
1832     );
1833
1834     CL_RUN_KERNEL_WITH_ARGS(
1835         deshake_ctx->command_queue,
1836         deshake_ctx->kernel_brief_descriptors,
1837         grid_32_global_work,
1838         NULL,
1839         &brief_event,
1840         { sizeof(cl_mem), &deshake_ctx->grayscale },
1841         { sizeof(cl_mem), &deshake_ctx->refined_features },
1842         { sizeof(cl_mem), &deshake_ctx->descriptors },
1843         { sizeof(cl_mem), &deshake_ctx->brief_pattern}
1844     );
1845
1846     if (av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) {
1847         // This is the first frame we've been given to queue, meaning there is
1848         // no previous frame to match descriptors to
1849
1850         goto no_motion_data;
1851     }
1852
1853     CL_RUN_KERNEL_WITH_ARGS(
1854         deshake_ctx->command_queue,
1855         deshake_ctx->kernel_match_descriptors,
1856         grid_32_global_work,
1857         NULL,
1858         &match_descriptors_event,
1859         { sizeof(cl_mem), &deshake_ctx->prev_refined_features },
1860         { sizeof(cl_mem), &deshake_ctx->refined_features },
1861         { sizeof(cl_mem), &deshake_ctx->descriptors },
1862         { sizeof(cl_mem), &deshake_ctx->prev_descriptors },
1863         { sizeof(cl_mem), &deshake_ctx->matches }
1864     );
1865
1866     cle = clEnqueueReadBuffer(
1867         deshake_ctx->command_queue,
1868         deshake_ctx->matches,
1869         CL_TRUE,
1870         0,
1871         grid_32_h * grid_32_w * sizeof(MotionVector),
1872         deshake_ctx->matches_host,
1873         0,
1874         NULL,
1875         &read_buf_event
1876     );
1877     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read matches to host: %d.\n", cle);
1878
1879     num_vectors = make_vectors_contig(deshake_ctx, grid_32_h, grid_32_w);
1880
1881     if (num_vectors < 10) {
1882         // Not enough matches to get reliable motion data for this frame
1883         //
1884         // From this point on all data is relative to this frame rather than the
1885         // original frame. We have to make sure that we don't mix values that were
1886         // relative to the original frame with the new values relative to this
1887         // frame when doing the gaussian smoothing. We keep track of where the old
1888         // values end using this data_end_offset field in order to accomplish
1889         // that goal.
1890         //
1891         // If no motion data is present for multiple frames in a short window of
1892         // time, we leave the end where it was to avoid mixing 0s in with the
1893         // old data (and just treat them all as part of the new values)
1894         if (deshake_ctx->abs_motion.data_end_offset == -1) {
1895             deshake_ctx->abs_motion.data_end_offset =
1896                 av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) / sizeof(float) - 1;
1897         }
1898
1899         goto no_motion_data;
1900     }
1901
1902     if (!estimate_affine_2d(
1903         deshake_ctx,
1904         deshake_ctx->matches_contig_host,
1905         &debug_matches,
1906         num_vectors,
1907         model.matrix,
1908         10.0,
1909         3000,
1910         0.999999999999
1911     )) {
1912         goto no_motion_data;
1913     }
1914
1915     for (int i = 0; i < num_vectors; i++) {
1916         if (deshake_ctx->matches_contig_host[i].should_consider) {
1917             deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[i];
1918             num_inliers++;
1919         }
1920     }
1921
1922     if (!minimize_error(
1923         deshake_ctx,
1924         deshake_ctx->inliers,
1925         &debug_matches,
1926         num_inliers,
1927         model.matrix,
1928         400
1929     )) {
1930         goto no_motion_data;
1931     }
1932
1933
1934     relative = decompose_transform(model.matrix);
1935
1936     // Get the absolute transform data for the previous frame
1937     for (int i = 0; i < RingbufCount; i++) {
1938         av_fifo_generic_peek_at(
1939             deshake_ctx->abs_motion.ringbuffers[i],
1940             &prev_vals[i],
1941             av_fifo_size(deshake_ctx->abs_motion.ringbuffers[i]) - sizeof(float),
1942             sizeof(float),
1943             NULL
1944         );
1945     }
1946
1947     new_vals[RingbufX]      = prev_vals[RingbufX] + relative.translation.s[0];
1948     new_vals[RingbufY]      = prev_vals[RingbufY] + relative.translation.s[1];
1949     new_vals[RingbufRot]    = prev_vals[RingbufRot] + relative.rotation;
1950     new_vals[RingbufScaleX] = prev_vals[RingbufScaleX] / relative.scale.s[0];
1951     new_vals[RingbufScaleY] = prev_vals[RingbufScaleY] / relative.scale.s[1];
1952
1953     if (deshake_ctx->debug_on) {
1954         if (!deshake_ctx->is_yuv) {
1955             deshake_ctx->grayscale_time     += ff_opencl_get_event_time(grayscale_event);
1956         }
1957         deshake_ctx->harris_response_time   += ff_opencl_get_event_time(harris_response_event);
1958         deshake_ctx->refine_features_time   += ff_opencl_get_event_time(refine_features_event);
1959         deshake_ctx->brief_descriptors_time += ff_opencl_get_event_time(brief_event);
1960         deshake_ctx->match_descriptors_time += ff_opencl_get_event_time(match_descriptors_event);
1961         deshake_ctx->read_buf_time          += ff_opencl_get_event_time(read_buf_event);
1962     }
1963
1964     goto end;
1965
1966 no_motion_data:
1967     new_vals[RingbufX]      = 0.0f;
1968     new_vals[RingbufY]      = 0.0f;
1969     new_vals[RingbufRot]    = 0.0f;
1970     new_vals[RingbufScaleX] = 1.0f;
1971     new_vals[RingbufScaleY] = 1.0f;
1972
1973     for (int i = 0; i < num_vectors; i++) {
1974         deshake_ctx->matches_contig_host[i].should_consider = false;
1975     }
1976     debug_matches.num_model_matches = 0;
1977
1978     if (deshake_ctx->debug_on) {
1979         av_log(avctx, AV_LOG_VERBOSE,
1980             "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n"
1981         );
1982     }
1983
1984     goto end;
1985
1986 end:
1987     // Swap the descriptor buffers (we don't need the previous frame's descriptors
1988     // again so we will use that space for the next frame's descriptors)
1989     temp = deshake_ctx->prev_descriptors;
1990     deshake_ctx->prev_descriptors = deshake_ctx->descriptors;
1991     deshake_ctx->descriptors = temp;
1992
1993     // Same for the refined features
1994     temp = deshake_ctx->prev_refined_features;
1995     deshake_ctx->prev_refined_features = deshake_ctx->refined_features;
1996     deshake_ctx->refined_features = temp;
1997
1998     if (deshake_ctx->debug_on) {
1999         if (num_vectors == 0) {
2000             debug_matches.matches = NULL;
2001         } else {
2002             debug_matches.matches = av_malloc_array(num_vectors, sizeof(MotionVector));
2003
2004             if (!debug_matches.matches) {
2005                 err = AVERROR(ENOMEM);
2006                 goto fail;
2007             }
2008         }
2009
2010         for (int i = 0; i < num_vectors; i++) {
2011             debug_matches.matches[i] = deshake_ctx->matches_contig_host[i];
2012         }
2013         debug_matches.num_matches = num_vectors;
2014
2015         av_fifo_generic_write(
2016             deshake_ctx->abs_motion.debug_matches,
2017             &debug_matches,
2018             sizeof(DebugMatches),
2019             NULL
2020         );
2021     }
2022
2023     for (int i = 0; i < RingbufCount; i++) {
2024         av_fifo_generic_write(
2025             deshake_ctx->abs_motion.ringbuffers[i],
2026             &new_vals[i],
2027             sizeof(float),
2028             NULL
2029         );
2030     }
2031
2032     return ff_framequeue_add(&deshake_ctx->fq, input_frame);
2033
2034 fail:
2035     clFinish(deshake_ctx->command_queue);
2036     av_frame_free(&input_frame);
2037     return err;
2038 }
2039
2040 static int activate(AVFilterContext *ctx)
2041 {
2042     AVFilterLink *inlink = ctx->inputs[0];
2043     AVFilterLink *outlink = ctx->outputs[0];
2044     DeshakeOpenCLContext *deshake_ctx = ctx->priv;
2045     AVFrame *frame = NULL;
2046     int ret, status;
2047     int64_t pts;
2048
2049     FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
2050
2051     if (!deshake_ctx->eof) {
2052         ret = ff_inlink_consume_frame(inlink, &frame);
2053         if (ret < 0)
2054             return ret;
2055         if (ret > 0) {
2056             if (!frame->hw_frames_ctx)
2057                 return AVERROR(EINVAL);
2058
2059             if (!deshake_ctx->initialized) {
2060                 ret = deshake_opencl_init(ctx);
2061                 if (ret < 0)
2062                     return ret;
2063             }
2064
2065             // If there is no more space in the ringbuffers, remove the oldest
2066             // values to make room for the new ones
2067             if (av_fifo_space(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) {
2068                 for (int i = 0; i < RingbufCount; i++) {
2069                     av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float));
2070                 }
2071             }
2072             ret = queue_frame(inlink, frame);
2073             if (ret < 0)
2074                 return ret;
2075             if (ret >= 0) {
2076                 // See if we have enough buffered frames to process one
2077                 //
2078                 // "enough" is half the smooth window of queued frames into the future
2079                 if (ff_framequeue_queued_frames(&deshake_ctx->fq) >= deshake_ctx->smooth_window / 2) {
2080                     return filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
2081                 }
2082             }
2083         }
2084     }
2085
2086     if (!deshake_ctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
2087         if (status == AVERROR_EOF) {
2088             deshake_ctx->eof = true;
2089         }
2090     }
2091
2092     if (deshake_ctx->eof) {
2093         // Finish processing the rest of the frames in the queue.
2094         while(ff_framequeue_queued_frames(&deshake_ctx->fq) != 0) {
2095             for (int i = 0; i < RingbufCount; i++) {
2096                 av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float));
2097             }
2098
2099             ret = filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
2100             if (ret < 0) {
2101                 return ret;
2102             }
2103         }
2104
2105         if (deshake_ctx->debug_on) {
2106             av_log(ctx, AV_LOG_VERBOSE,
2107                 "Average kernel execution times:\n"
2108                 "\t        grayscale: %0.3f ms\n"
2109                 "\t  harris_response: %0.3f ms\n"
2110                 "\t  refine_features: %0.3f ms\n"
2111                 "\tbrief_descriptors: %0.3f ms\n"
2112                 "\tmatch_descriptors: %0.3f ms\n"
2113                 "\t        transform: %0.3f ms\n"
2114                 "\t     crop_upscale: %0.3f ms\n"
2115                 "Average buffer read times:\n"
2116                 "\t     features buf: %0.3f ms\n",
2117                 averaged_event_time_ms(deshake_ctx->grayscale_time, deshake_ctx->curr_frame),
2118                 averaged_event_time_ms(deshake_ctx->harris_response_time, deshake_ctx->curr_frame),
2119                 averaged_event_time_ms(deshake_ctx->refine_features_time, deshake_ctx->curr_frame),
2120                 averaged_event_time_ms(deshake_ctx->brief_descriptors_time, deshake_ctx->curr_frame),
2121                 averaged_event_time_ms(deshake_ctx->match_descriptors_time, deshake_ctx->curr_frame),
2122                 averaged_event_time_ms(deshake_ctx->transform_time, deshake_ctx->curr_frame),
2123                 averaged_event_time_ms(deshake_ctx->crop_upscale_time, deshake_ctx->curr_frame),
2124                 averaged_event_time_ms(deshake_ctx->read_buf_time, deshake_ctx->curr_frame)
2125             );
2126         }
2127
2128         ff_outlink_set_status(outlink, AVERROR_EOF, deshake_ctx->duration);
2129         return 0;
2130     }
2131
2132     if (!deshake_ctx->eof) {
2133         FF_FILTER_FORWARD_WANTED(outlink, inlink);
2134     }
2135
2136     return FFERROR_NOT_READY;
2137 }
2138
2139 static const AVFilterPad deshake_opencl_inputs[] = {
2140     {
2141         .name = "default",
2142         .type = AVMEDIA_TYPE_VIDEO,
2143         .config_props = &ff_opencl_filter_config_input,
2144     },
2145     { NULL }
2146 };
2147
2148 static const AVFilterPad deshake_opencl_outputs[] = {
2149     {
2150         .name = "default",
2151         .type = AVMEDIA_TYPE_VIDEO,
2152         .config_props = &ff_opencl_filter_config_output,
2153     },
2154     { NULL }
2155 };
2156
2157 #define OFFSET(x) offsetof(DeshakeOpenCLContext, x)
2158 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
2159
2160 static const AVOption deshake_opencl_options[] = {
2161     {
2162         "tripod", "simulates a tripod by preventing any camera movement whatsoever "
2163         "from the original frame",
2164         OFFSET(tripod_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
2165     },
2166     {
2167         "debug", "turn on additional debugging information",
2168         OFFSET(debug_on), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
2169     },
2170     {
2171         "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content",
2172         OFFSET(should_crop), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
2173     },
2174     {
2175         "refine_features", "refine feature point locations at a sub-pixel level",
2176         OFFSET(refine_features), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
2177     },
2178     {
2179         "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)",
2180         OFFSET(smooth_percent), AV_OPT_TYPE_FLOAT, {.dbl = 0.0f}, 0.0f, 1.0f, FLAGS
2181     },
2182     {
2183         "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data",
2184         OFFSET(smooth_window_multiplier), AV_OPT_TYPE_FLOAT, {.dbl = 2.0}, 0.1, 10.0, FLAGS
2185     },
2186     { NULL }
2187 };
2188
2189 AVFILTER_DEFINE_CLASS(deshake_opencl);
2190
2191 AVFilter ff_vf_deshake_opencl = {
2192     .name           = "deshake_opencl",
2193     .description    = NULL_IF_CONFIG_SMALL("Feature-point based video stabilization filter"),
2194     .priv_size      = sizeof(DeshakeOpenCLContext),
2195     .priv_class     = &deshake_opencl_class,
2196     .init           = &ff_opencl_filter_init,
2197     .uninit         = &deshake_opencl_uninit,
2198     .query_formats  = &ff_opencl_filter_query_formats,
2199     .activate       = activate,
2200     .inputs         = deshake_opencl_inputs,
2201     .outputs        = deshake_opencl_outputs,
2202     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE
2203 };