git.sesse.net Git - ffmpeg/blob - libavfilter/vf_deshake_opencl.c

   1 /*
   2  * This file is part of FFmpeg.
   3  *
   4  * FFmpeg is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * FFmpeg is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with FFmpeg; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  *
  18  * Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  19  * Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  20  * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  21  * Third party copyrights are property of their respective owners.
  22  *
  23  * Redistribution and use in source and binary forms, with or without modification,
  24  * are permitted provided that the following conditions are met:
  25  *
  26  *   * Redistribution's of source code must retain the above copyright notice,
  27  *     this list of conditions and the following disclaimer.
  28  *
  29  *   * Redistribution's in binary form must reproduce the above copyright notice,
  30  *     this list of conditions and the following disclaimer in the documentation
  31  *     and/or other materials provided with the distribution.
  32  *
  33  *   * The name of the copyright holders may not be used to endorse or promote products
  34  *     derived from this software without specific prior written permission.
  35  *
  36  * This software is provided by the copyright holders and contributors "as is" and
  37  * any express or implied warranties, including, but not limited to, the implied
  38  * warranties of merchantability and fitness for a particular purpose are disclaimed.
  39  * In no event shall the Intel Corporation or contributors be liable for any direct,
  40  * indirect, incidental, special, exemplary, or consequential damages
  41  * (including, but not limited to, procurement of substitute goods or services;
  42  * loss of use, data, or profits; or business interruption) however caused
  43  * and on any theory of liability, whether in contract, strict liability,
  44  * or tort (including negligence or otherwise) arising in any way out of
  45  * the use of this software, even if advised of the possibility of such damage.
  46  */
  47
  48 #include <stdbool.h>
  49 #include <float.h>
  50 #include <libavutil/lfg.h>
  51 #include "libavutil/opt.h"
  52 #include "libavutil/imgutils.h"
  53 #include "libavutil/mem.h"
  54 #include "libavutil/fifo.h"
  55 #include "libavutil/common.h"
  56 #include "libavutil/avassert.h"
  57 #include "libavutil/pixfmt.h"
  58 #include "avfilter.h"
  59 #include "framequeue.h"
  60 #include "filters.h"
  61 #include "transform.h"
  62 #include "formats.h"
  63 #include "internal.h"
  64 #include "opencl.h"
  65 #include "opencl_source.h"
  66 #include "video.h"
  67
  68 /*
  69 This filter matches feature points between frames (dealing with outliers) and then
  70 uses the matches to estimate an affine transform between frames. This transform is
  71 decomposed into various values (translation, scale, rotation) and the values are
  72 summed relative to the start of the video to obtain an absolute camera position
  73 for each frame. This "camera path" is then smoothed via a gaussian filter, resulting
  74 in a new path that is turned back into an affine transform and applied to each
  75 frame to render it.
  76
  77 High-level overview:
  78
  79 All of the work to extract motion data from frames occurs in queue_frame. Motion data
  80 is buffered in a smoothing window, so queue_frame simply computes the absolute camera
  81 positions and places them in ringbuffers.
  82
  83 filter_frame is responsible for looking at the absolute camera positions currently
  84 in the ringbuffers, applying the gaussian filter, and then transforming the frames.
  85 */
  86
  87 // Number of bits for BRIEF descriptors
  88 #define BREIFN 512
  89 // Size of the patch from which a BRIEF descriptor is extracted
  90 // This is the size used in OpenCV
  91 #define BRIEF_PATCH_SIZE 31
  92 #define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2)
  93
  94 #define MATCHES_CONTIG_SIZE 2000
  95
  96 #define ROUNDED_UP_DIV(a, b) ((a + (b - 1)) / b)
  97
  98 typedef struct PointPair {
  99     // Previous frame
 100     cl_float2 p1;
 101     // Current frame
 102     cl_float2 p2;
 103 } PointPair;
 104
 105 typedef struct MotionVector {
 106     PointPair p;
 107     // Used to mark vectors as potential outliers
 108     cl_int should_consider;
 109 } MotionVector;
 110
 111 // Denotes the indices for the different types of motion in the ringbuffers array
 112 enum RingbufferIndices {
 113     RingbufX,
 114     RingbufY,
 115     RingbufRot,
 116     RingbufScaleX,
 117     RingbufScaleY,
 118
 119     // Should always be last
 120     RingbufCount
 121 };
 122
 123 // Struct that holds data for drawing point match debug data
 124 typedef struct DebugMatches {
 125     MotionVector *matches;
 126     // The points used to calculate the affine transform for a frame
 127     MotionVector model_matches[3];
 128
 129     int num_matches;
 130     // For cases where we couldn't calculate a model
 131     int num_model_matches;
 132 } DebugMatches;
 133
 134 // Groups together the ringbuffers that store absolute distortion / position values
 135 // for each frame
 136 typedef struct AbsoluteFrameMotion {
 137     // Array with the various ringbuffers, indexed via the RingbufferIndices enum
 138     AVFifoBuffer *ringbuffers[RingbufCount];
 139
 140     // Offset to get to the current frame being processed
 141     // (not in bytes)
 142     int curr_frame_offset;
 143     // Keeps track of where the start and end of contiguous motion data is (to
 144     // deal with cases where no motion data is found between two frames)
 145     int data_start_offset;
 146     int data_end_offset;
 147
 148     AVFifoBuffer *debug_matches;
 149 } AbsoluteFrameMotion;
 150
 151 // Takes care of freeing the arrays within the DebugMatches inside of the
 152 // debug_matches ringbuffer and then freeing the buffer itself.
 153 static void free_debug_matches(AbsoluteFrameMotion *afm) {
 154     DebugMatches dm;
 155
 156     if (!afm->debug_matches) {
 157         return;
 158     }
 159
 160     while (av_fifo_size(afm->debug_matches) > 0) {
 161         av_fifo_generic_read(
 162             afm->debug_matches,
 163             &dm,
 164             sizeof(DebugMatches),
 165             NULL
 166         );
 167
 168         av_freep(&dm.matches);
 169     }
 170
 171     av_fifo_freep(&afm->debug_matches);
 172 }
 173
 174 // Stores the translation, scale, rotation, and skew deltas between two frames
 175 typedef struct FrameDelta {
 176     cl_float2 translation;
 177     float rotation;
 178     cl_float2 scale;
 179     cl_float2 skew;
 180 } FrameDelta;
 181
 182 typedef struct SimilarityMatrix {
 183     // The 2x3 similarity matrix
 184     double matrix[6];
 185 } SimilarityMatrix;
 186
 187 typedef struct CropInfo {
 188     // The top left corner of the bounding box for the crop
 189     cl_float2 top_left;
 190     // The bottom right corner of the bounding box for the crop
 191     cl_float2 bottom_right;
 192 } CropInfo;
 193
 194 // Returned from function that determines start and end values for iteration
 195 // around the current frame in a ringbuffer
 196 typedef struct IterIndices {
 197     int start;
 198     int end;
 199 } IterIndices;
 200
 201 typedef struct DeshakeOpenCLContext {
 202     OpenCLFilterContext ocf;
 203     // Whether or not the above `OpenCLFilterContext` has been initialized
 204     int initialized;
 205
 206     // These variables are used in the activate callback
 207     int64_t duration;
 208     bool eof;
 209
 210     // State for random number generation
 211     AVLFG alfg;
 212
 213     // FIFO frame queue used to buffer future frames for processing
 214     FFFrameQueue fq;
 215     // Ringbuffers for frame positions
 216     AbsoluteFrameMotion abs_motion;
 217
 218     // The number of frames' motion to consider before and after the frame we are
 219     // smoothing
 220     int smooth_window;
 221     // The number of the frame we are currently processing
 222     int curr_frame;
 223
 224     // Stores a 1d array of normalised gaussian kernel values for convolution
 225     float *gauss_kernel;
 226
 227     // Buffer for error values used in RANSAC code
 228     float *ransac_err;
 229
 230     // Information regarding how to crop the smoothed luminance (or RGB) planes
 231     CropInfo crop_y;
 232     // Information regarding how to crop the smoothed chroma planes
 233     CropInfo crop_uv;
 234
 235     // Whether or not we are processing YUV input (as oppposed to RGB)
 236     bool is_yuv;
 237     // The underlying format of the hardware surfaces
 238     int sw_format;
 239
 240     // Buffer to copy `matches` into for the CPU to work with
 241     MotionVector *matches_host;
 242     MotionVector *matches_contig_host;
 243
 244     MotionVector *inliers;
 245
 246     cl_command_queue command_queue;
 247     cl_kernel kernel_grayscale;
 248     cl_kernel kernel_harris_response;
 249     cl_kernel kernel_refine_features;
 250     cl_kernel kernel_brief_descriptors;
 251     cl_kernel kernel_match_descriptors;
 252     cl_kernel kernel_transform;
 253     cl_kernel kernel_crop_upscale;
 254
 255     // Stores a frame converted to grayscale
 256     cl_mem grayscale;
 257     // Stores the harris response for a frame (measure of "cornerness" for each pixel)
 258     cl_mem harris_buf;
 259
 260     // Detected features after non-maximum suppression and sub-pixel refinement
 261     cl_mem refined_features;
 262     // Saved from the previous frame
 263     cl_mem prev_refined_features;
 264
 265     // BRIEF sampling pattern that is randomly initialized
 266     cl_mem brief_pattern;
 267     // Feature point descriptors for the current frame
 268     cl_mem descriptors;
 269     // Feature point descriptors for the previous frame
 270     cl_mem prev_descriptors;
 271     // Vectors between points in current and previous frame
 272     cl_mem matches;
 273     cl_mem matches_contig;
 274     // Holds the matrix to transform luminance (or RGB) with
 275     cl_mem transform_y;
 276     // Holds the matrix to transform chroma with
 277     cl_mem transform_uv;
 278
 279     // Configurable options
 280
 281     int tripod_mode;
 282     int debug_on;
 283     int should_crop;
 284
 285     // Whether or not feature points should be refined at a sub-pixel level
 286     cl_int refine_features;
 287     // If the user sets a value other than the default, 0, this percentage is
 288     // translated into a sigma value ranging from 0.5 to 40.0
 289     float smooth_percent;
 290     // This number is multiplied by the video frame rate to determine the size
 291     // of the smooth window
 292     float smooth_window_multiplier;
 293
 294     // Debug stuff
 295
 296     cl_kernel kernel_draw_debug_info;
 297     cl_mem debug_matches;
 298     cl_mem debug_model_matches;
 299
 300     // These store the total time spent executing the different kernels in nanoseconds
 301     unsigned long long grayscale_time;
 302     unsigned long long harris_response_time;
 303     unsigned long long refine_features_time;
 304     unsigned long long brief_descriptors_time;
 305     unsigned long long match_descriptors_time;
 306     unsigned long long transform_time;
 307     unsigned long long crop_upscale_time;
 308
 309     // Time spent copying matched features from the device to the host
 310     unsigned long long read_buf_time;
 311 } DeshakeOpenCLContext;
 312
 313 // Returns a random uniformly-distributed number in [low, high]
 314 static int rand_in(int low, int high, AVLFG *alfg) {
 315     return (av_lfg_get(alfg) % (high - low)) + low;
 316 }
 317
 318 // Returns the average execution time for an event given the total time and the
 319 // number of frames processed.
 320 static double averaged_event_time_ms(unsigned long long total_time, int num_frames) {
 321     return (double)total_time / (double)num_frames / 1000000.0;
 322 }
 323
 324 // The following code is loosely ported from OpenCV
 325
 326 // Estimates affine transform from 3 point pairs
 327 // model is a 2x3 matrix:
 328 //      a b c
 329 //      d e f
 330 static void run_estimate_kernel(const MotionVector *point_pairs, double *model)
 331 {
 332     // src points
 333     double x1 = point_pairs[0].p.p1.s[0];
 334     double y1 = point_pairs[0].p.p1.s[1];
 335     double x2 = point_pairs[1].p.p1.s[0];
 336     double y2 = point_pairs[1].p.p1.s[1];
 337     double x3 = point_pairs[2].p.p1.s[0];
 338     double y3 = point_pairs[2].p.p1.s[1];
 339
 340     // dest points
 341     double X1 = point_pairs[0].p.p2.s[0];
 342     double Y1 = point_pairs[0].p.p2.s[1];
 343     double X2 = point_pairs[1].p.p2.s[0];
 344     double Y2 = point_pairs[1].p.p2.s[1];
 345     double X3 = point_pairs[2].p.p2.s[0];
 346     double Y3 = point_pairs[2].p.p2.s[1];
 347
 348     double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) );
 349
 350     model[0] = d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) );
 351     model[1] = d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) );
 352     model[2] = d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) );
 353
 354     model[3] = d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) );
 355     model[4] = d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) );
 356     model[5] = d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) );
 357 }
 358
 359 // Checks that the 3 points in the given array are not collinear
 360 static bool points_not_collinear(const cl_float2 **points)
 361 {
 362     int j, k, i = 2;
 363
 364     for (j = 0; j < i; j++) {
 365         double dx1 = points[j]->s[0] - points[i]->s[0];
 366         double dy1 = points[j]->s[1] - points[i]->s[1];
 367
 368         for (k = 0; k < j; k++) {
 369             double dx2 = points[k]->s[0] - points[i]->s[0];
 370             double dy2 = points[k]->s[1] - points[i]->s[1];
 371
 372             // Assuming a 3840 x 2160 video with a point at (0, 0) and one at
 373             // (3839, 2159), this prevents a third point from being within roughly
 374             // 0.5 of a pixel of the line connecting the two on both axes
 375             if (fabs(dx2*dy1 - dy2*dx1) <= 1.0) {
 376                 return false;
 377             }
 378         }
 379     }
 380
 381     return true;
 382 }
 383
 384 // Checks a subset of 3 point pairs to make sure that the points are not collinear
 385 // and not too close to each other
 386 static bool check_subset(const MotionVector *pairs_subset)
 387 {
 388     const cl_float2 *prev_points[] = {
 389         &pairs_subset[0].p.p1,
 390         &pairs_subset[1].p.p1,
 391         &pairs_subset[2].p.p1
 392     };
 393
 394     const cl_float2 *curr_points[] = {
 395         &pairs_subset[0].p.p2,
 396         &pairs_subset[1].p.p2,
 397         &pairs_subset[2].p.p2
 398     };
 399
 400     return points_not_collinear(prev_points) && points_not_collinear(curr_points);
 401 }
 402
 403 // Selects a random subset of 3 points from point_pairs and places them in pairs_subset
 404 static bool get_subset(
 405     AVLFG *alfg,
 406     const MotionVector *point_pairs,
 407     const int num_point_pairs,
 408     MotionVector *pairs_subset,
 409     int max_attempts
 410 ) {
 411     int idx[3];
 412     int i = 0, j, iters = 0;
 413
 414     for (; iters < max_attempts; iters++) {
 415         for (i = 0; i < 3 && iters < max_attempts;) {
 416             int idx_i = 0;
 417
 418             for (;;) {
 419                 idx_i = idx[i] = rand_in(0, num_point_pairs, alfg);
 420
 421                 for (j = 0; j < i; j++) {
 422                     if (idx_i == idx[j]) {
 423                         break;
 424                     }
 425                 }
 426
 427                 if (j == i) {
 428                     break;
 429                 }
 430             }
 431
 432             pairs_subset[i] = point_pairs[idx[i]];
 433             i++;
 434         }
 435
 436         if (i == 3 && !check_subset(pairs_subset)) {
 437             continue;
 438         }
 439         break;
 440     }
 441
 442     return i == 3 && iters < max_attempts;
 443 }
 444
 445 // Computes the error for each of the given points based on the given model.
 446 static void compute_error(
 447     const MotionVector *point_pairs,
 448     const int num_point_pairs,
 449     const double *model,
 450     float *err
 451 ) {
 452     double F0 = model[0], F1 = model[1], F2 = model[2];
 453     double F3 = model[3], F4 = model[4], F5 = model[5];
 454
 455     for (int i = 0; i < num_point_pairs; i++) {
 456         const cl_float2 *f = &point_pairs[i].p.p1;
 457         const cl_float2 *t = &point_pairs[i].p.p2;
 458
 459         double a = F0*f->s[0] + F1*f->s[1] + F2 - t->s[0];
 460         double b = F3*f->s[0] + F4*f->s[1] + F5 - t->s[1];
 461
 462         err[i] = a*a + b*b;
 463     }
 464 }
 465
 466 // Determines which of the given point matches are inliers for the given model
 467 // based on the specified threshold.
 468 //
 469 // err must be an array of num_point_pairs length
 470 static int find_inliers(
 471     MotionVector *point_pairs,
 472     const int num_point_pairs,
 473     const double *model,
 474     float *err,
 475     double thresh
 476 ) {
 477     float t = (float)(thresh * thresh);
 478     int i, n = num_point_pairs, num_inliers = 0;
 479
 480     compute_error(point_pairs, num_point_pairs, model, err);
 481
 482     for (i = 0; i < n; i++) {
 483         if (err[i] <= t) {
 484             // This is an inlier
 485             point_pairs[i].should_consider = true;
 486             num_inliers += 1;
 487         } else {
 488             point_pairs[i].should_consider = false;
 489         }
 490     }
 491
 492     return num_inliers;
 493 }
 494
 495 // Determines the number of iterations required to achieve the desired confidence level.
 496 //
 497 // The equation used to determine the number of iterations to do is:
 498 // 1 - confidence = (1 - inlier_probability^num_points)^num_iters
 499 //
 500 // Solving for num_iters:
 501 //
 502 // num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points)
 503 //
 504 // A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus
 505 // under the 'Parameters' heading
 506 static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters)
 507 {
 508     double num, denom;
 509
 510     confidence   = av_clipd(confidence, 0.0, 1.0);
 511     num_outliers = av_clipd(num_outliers, 0.0, 1.0);
 512
 513     // avoid inf's & nan's
 514     num = FFMAX(1.0 - confidence, DBL_MIN);
 515     denom = 1.0 - pow(1.0 - num_outliers, 3);
 516     if (denom < DBL_MIN) {
 517         return 0;
 518     }
 519
 520     num = log(num);
 521     denom = log(denom);
 522
 523     return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom);
 524 }
 525
 526 // Estimates an affine transform between the given pairs of points using RANdom
 527 // SAmple Consensus
 528 static bool estimate_affine_2d(
 529     DeshakeOpenCLContext *deshake_ctx,
 530     MotionVector *point_pairs,
 531     DebugMatches *debug_matches,
 532     const int num_point_pairs,
 533     double *model_out,
 534     const double threshold,
 535     const int max_iters,
 536     const double confidence
 537 ) {
 538     bool result = false;
 539     double best_model[6], model[6];
 540     MotionVector pairs_subset[3], best_pairs[3];
 541
 542     int iter, niters = FFMAX(max_iters, 1);
 543     int good_count, max_good_count = 0;
 544
 545     // We need at least 3 points to build a model from
 546     if (num_point_pairs < 3) {
 547         return false;
 548     } else if (num_point_pairs == 3) {
 549         // There are only 3 points, so RANSAC doesn't apply here
 550         run_estimate_kernel(point_pairs, model_out);
 551
 552         for (int i = 0; i < 3; ++i) {
 553             point_pairs[i].should_consider = true;
 554         }
 555
 556         return true;
 557     }
 558
 559     for (iter = 0; iter < niters; ++iter) {
 560         bool found = get_subset(&deshake_ctx->alfg, point_pairs, num_point_pairs, pairs_subset, 10000);
 561
 562         if (!found) {
 563             if (iter == 0) {
 564                 return false;
 565             }
 566
 567             break;
 568         }
 569
 570         run_estimate_kernel(pairs_subset, model);
 571         good_count = find_inliers(point_pairs, num_point_pairs, model, deshake_ctx->ransac_err, threshold);
 572
 573         if (good_count > FFMAX(max_good_count, 2)) {
 574             for (int mi = 0; mi < 6; ++mi) {
 575                 best_model[mi] = model[mi];
 576             }
 577
 578             for (int pi = 0; pi < 3; pi++) {
 579                 best_pairs[pi] = pairs_subset[pi];
 580             }
 581
 582             max_good_count = good_count;
 583             niters = ransac_update_num_iters(
 584                 confidence,
 585                 (double)(num_point_pairs - good_count) / num_point_pairs,
 586                 niters
 587             );
 588         }
 589     }
 590
 591     if (max_good_count > 0) {
 592         for (int mi = 0; mi < 6; ++mi) {
 593             model_out[mi] = best_model[mi];
 594         }
 595
 596         for (int pi = 0; pi < 3; ++pi) {
 597             debug_matches->model_matches[pi] = best_pairs[pi];
 598         }
 599         debug_matches->num_model_matches = 3;
 600
 601         // Find the inliers again for the best model for debugging
 602         find_inliers(point_pairs, num_point_pairs, best_model, deshake_ctx->ransac_err, threshold);
 603         result = true;
 604     }
 605
 606     return result;
 607 }
 608
 609 // "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the
 610 // total error
 611 static void optimize_model(
 612     DeshakeOpenCLContext *deshake_ctx,
 613     MotionVector *best_pairs,
 614     MotionVector *inliers,
 615     const int num_inliers,
 616     float best_err,
 617     double *model_out
 618 ) {
 619     float move_x_val = 0.01;
 620     float move_y_val = 0.01;
 621     bool move_x = true;
 622     float old_move_x_val = 0;
 623     double model[6];
 624     int last_changed = 0;
 625
 626     for (int iters = 0; iters < 200; iters++) {
 627         float total_err = 0;
 628
 629         if (move_x) {
 630             best_pairs[0].p.p2.s[0] += move_x_val;
 631         } else {
 632             best_pairs[0].p.p2.s[0] += move_y_val;
 633         }
 634
 635         run_estimate_kernel(best_pairs, model);
 636         compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);
 637
 638         for (int j = 0; j < num_inliers; j++) {
 639             total_err += deshake_ctx->ransac_err[j];
 640         }
 641
 642         if (total_err < best_err) {
 643             for (int mi = 0; mi < 6; ++mi) {
 644                 model_out[mi] = model[mi];
 645             }
 646
 647             best_err = total_err;
 648             last_changed = iters;
 649         } else {
 650             // Undo the change
 651             if (move_x) {
 652                 best_pairs[0].p.p2.s[0] -= move_x_val;
 653             } else {
 654                 best_pairs[0].p.p2.s[0] -= move_y_val;
 655             }
 656
 657             if (iters - last_changed > 4) {
 658                 // We've already improved the model as much as we can
 659                 break;
 660             }
 661
 662             old_move_x_val = move_x_val;
 663
 664             if (move_x) {
 665                 move_x_val *= -1;
 666             } else {
 667                 move_y_val *= -1;
 668             }
 669
 670             if (old_move_x_val < 0) {
 671                 move_x = false;
 672             } else {
 673                 move_x = true;
 674             }
 675         }
 676     }
 677 }
 678
 679 // Uses a process similar to that of RANSAC to find a transform that minimizes
 680 // the total error for a set of point matches determined to be inliers
 681 //
 682 // (Pick random subsets, compute model, find total error, iterate until error
 683 // is minimized.)
 684 static bool minimize_error(
 685     DeshakeOpenCLContext *deshake_ctx,
 686     MotionVector *inliers,
 687     DebugMatches *debug_matches,
 688     const int num_inliers,
 689     double *model_out,
 690     const int max_iters
 691 ) {
 692     bool result = false;
 693     float best_err = FLT_MAX;
 694     double best_model[6], model[6];
 695     MotionVector pairs_subset[3], best_pairs[3];
 696
 697     for (int i = 0; i < max_iters; i++) {
 698         float total_err = 0;
 699         bool found = get_subset(&deshake_ctx->alfg, inliers, num_inliers, pairs_subset, 10000);
 700
 701         if (!found) {
 702             if (i == 0) {
 703                 return false;
 704             }
 705
 706             break;
 707         }
 708
 709         run_estimate_kernel(pairs_subset, model);
 710         compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);
 711
 712         for (int j = 0; j < num_inliers; j++) {
 713             total_err += deshake_ctx->ransac_err[j];
 714         }
 715
 716         if (total_err < best_err) {
 717             for (int mi = 0; mi < 6; ++mi) {
 718                 best_model[mi] = model[mi];
 719             }
 720
 721             for (int pi = 0; pi < 3; pi++) {
 722                 best_pairs[pi] = pairs_subset[pi];
 723             }
 724
 725             best_err = total_err;
 726         }
 727     }
 728
 729     for (int mi = 0; mi < 6; ++mi) {
 730         model_out[mi] = best_model[mi];
 731     }
 732
 733     for (int pi = 0; pi < 3; ++pi) {
 734         debug_matches->model_matches[pi] = best_pairs[pi];
 735     }
 736     debug_matches->num_model_matches = 3;
 737     result = true;
 738
 739     optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out);
 740     return result;
 741 }
 742
 743 // End code from OpenCV
 744
 745 // Decomposes a similarity matrix into translation, rotation, scale, and skew
 746 //
 747 // See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html
 748 static FrameDelta decompose_transform(double *model)
 749 {
 750     FrameDelta ret;
 751
 752     double a = model[0];
 753     double c = model[1];
 754     double e = model[2];
 755     double b = model[3];
 756     double d = model[4];
 757     double f = model[5];
 758     double delta = a * d - b * c;
 759
 760     ret.translation.s[0] = e;
 761     ret.translation.s[1] = f;
 762
 763     // This is the QR method
 764     if (a != 0 || b != 0) {
 765         double r = hypot(a, b);
 766
 767         ret.rotation = FFSIGN(b) * acos(a / r);
 768         ret.scale.s[0] = r;
 769         ret.scale.s[1] = delta / r;
 770         ret.skew.s[0] = atan((a * c + b * d) / (r * r));
 771         ret.skew.s[1] = 0;
 772     } else if (c != 0 || d != 0) {
 773         double s = sqrt(c * c + d * d);
 774
 775         ret.rotation = M_PI / 2 - FFSIGN(d) * acos(-c / s);
 776         ret.scale.s[0] = delta / s;
 777         ret.scale.s[1] = s;
 778         ret.skew.s[0] = 0;
 779         ret.skew.s[1] = atan((a * c + b * d) / (s * s));
 780     } // otherwise there is only translation
 781
 782     return ret;
 783 }
 784
 785 // Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous
 786 static int make_vectors_contig(
 787     DeshakeOpenCLContext *deshake_ctx,
 788     int size_y,
 789     int size_x
 790 ) {
 791     int num_vectors = 0;
 792
 793     for (int i = 0; i < size_y; ++i) {
 794         for (int j = 0; j < size_x; ++j) {
 795             MotionVector v = deshake_ctx->matches_host[j + i * size_x];
 796
 797             if (v.should_consider) {
 798                 deshake_ctx->matches_contig_host[num_vectors] = v;
 799                 ++num_vectors;
 800             }
 801
 802             // Make sure we do not exceed the amount of space we allocated for these vectors
 803             if (num_vectors == MATCHES_CONTIG_SIZE - 1) {
 804                 return num_vectors;
 805             }
 806         }
 807     }
 808     return num_vectors;
 809 }
 810
 811 // Returns the gaussian kernel value for the given x coordinate and sigma value
 812 static float gaussian_for(int x, float sigma) {
 813     return 1.0f / expf(((float)x * (float)x) / (2.0f * sigma * sigma));
 814 }
 815
 816 // Makes a normalized gaussian kernel of the given length for the given sigma
 817 // and places it in gauss_kernel
 818 static void make_gauss_kernel(float *gauss_kernel, float length, float sigma)
 819 {
 820     float gauss_sum = 0;
 821     int window_half = length / 2;
 822
 823     for (int i = 0; i < length; ++i) {
 824         float val = gaussian_for(i - window_half, sigma);
 825
 826         gauss_sum += val;
 827         gauss_kernel[i] = val;
 828     }
 829
 830     // Normalize the gaussian values
 831     for (int i = 0; i < length; ++i) {
 832         gauss_kernel[i] /= gauss_sum;
 833     }
 834 }
 835
 836 // Returns indices to start and end iteration at in order to iterate over a window
 837 // of length size centered at the current frame in a ringbuffer
 838 //
 839 // Always returns numbers that result in a window of length size, even if that
 840 // means specifying negative indices or indices past the end of the values in the
 841 // ringbuffers. Make sure you clip indices appropriately within your loop.
 842 static IterIndices start_end_for(DeshakeOpenCLContext *deshake_ctx, int length) {
 843     IterIndices indices;
 844
 845     indices.start = deshake_ctx->abs_motion.curr_frame_offset - (length / 2);
 846     indices.end = deshake_ctx->abs_motion.curr_frame_offset + (length / 2) + (length % 2);
 847
 848     return indices;
 849 }
 850
 851 // Sets val to the value in the given ringbuffer at the given offset, taking care of
 852 // clipping the offset into the appropriate range
 853 static void ringbuf_float_at(
 854     DeshakeOpenCLContext *deshake_ctx,
 855     AVFifoBuffer *values,
 856     float *val,
 857     int offset
 858 ) {
 859     int clip_start, clip_end, offset_clipped;
 860     if (deshake_ctx->abs_motion.data_end_offset != -1) {
 861         clip_end = deshake_ctx->abs_motion.data_end_offset;
 862     } else {
 863         // This expression represents the last valid index in the buffer,
 864         // which we use repeatedly at the end of the video.
 865         clip_end = deshake_ctx->smooth_window - (av_fifo_space(values) / sizeof(float)) - 1;
 866     }
 867
 868     if (deshake_ctx->abs_motion.data_start_offset != -1) {
 869         clip_start = deshake_ctx->abs_motion.data_start_offset;
 870     } else {
 871         // Negative indices will occur at the start of the video, and we want
 872         // them to be clipped to 0 in order to repeatedly use the position of
 873         // the first frame.
 874         clip_start = 0;
 875     }
 876
 877     offset_clipped = av_clip(
 878         offset,
 879         clip_start,
 880         clip_end
 881     );
 882
 883     av_fifo_generic_peek_at(
 884         values,
 885         val,
 886         offset_clipped * sizeof(float),
 887         sizeof(float),
 888         NULL
 889     );
 890 }
 891
 892 // Returns smoothed current frame value of the given buffer of floats based on the
 893 // given Gaussian kernel and its length (also the window length, centered around the
 894 // current frame) and the "maximum value" of the motion.
 895 //
 896 // This "maximum value" should be the width / height of the image in the case of
 897 // translation and an empirically chosen constant for rotation / scale.
 898 //
 899 // The sigma chosen to generate the final gaussian kernel with used to smooth the
 900 // camera path is either hardcoded (set by user, deshake_ctx->smooth_percent) or
 901 // adaptively chosen.
 902 static float smooth(
 903     DeshakeOpenCLContext *deshake_ctx,
 904     float *gauss_kernel,
 905     int length,
 906     float max_val,
 907     AVFifoBuffer *values
 908 ) {
 909     float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between,
 910           percent_of_max, inverted_percent;
 911     IterIndices indices = start_end_for(deshake_ctx, length);
 912     float large_sigma = 40.0f;
 913     float small_sigma = 2.0f;
 914     float best_sigma;
 915
 916     if (deshake_ctx->smooth_percent) {
 917         best_sigma = (large_sigma - 0.5f) * deshake_ctx->smooth_percent + 0.5f;
 918     } else {
 919         // Strategy to adaptively smooth trajectory:
 920         //
 921         // 1. Smooth path with large and small sigma values
 922         // 2. Take the absolute value of the difference between them
 923         // 3. Get a percentage by putting the difference over the "max value"
 924         // 4, Invert the percentage
 925         // 5. Calculate a new sigma value weighted towards the larger sigma value
 926         // 6. Determine final smoothed trajectory value using that sigma
 927
 928         make_gauss_kernel(gauss_kernel, length, large_sigma);
 929         for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 930             ringbuf_float_at(deshake_ctx, values, &old, i);
 931             new_large_s += old * gauss_kernel[j];
 932         }
 933
 934         make_gauss_kernel(gauss_kernel, length, small_sigma);
 935         for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 936             ringbuf_float_at(deshake_ctx, values, &old, i);
 937             new_small_s += old * gauss_kernel[j];
 938         }
 939
 940         diff_between = fabsf(new_large_s - new_small_s);
 941         percent_of_max = diff_between / max_val;
 942         inverted_percent = 1 - percent_of_max;
 943         best_sigma = large_sigma * powf(inverted_percent, 40);
 944     }
 945
 946     make_gauss_kernel(gauss_kernel, length, best_sigma);
 947     for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
 948         ringbuf_float_at(deshake_ctx, values, &old, i);
 949         new_best += old * gauss_kernel[j];
 950     }
 951
 952     return new_best;
 953 }
 954
 955 // Returns the position of the given point after the transform is applied
 956 static cl_float2 transformed_point(float x, float y, float *transform) {
 957     cl_float2 ret;
 958
 959     ret.s[0] = x * transform[0] + y * transform[1] + transform[2];
 960     ret.s[1] = x * transform[3] + y * transform[4] + transform[5];
 961
 962     return ret;
 963 }
 964
 965 // Creates an affine transform that scales from the center of a frame
 966 static void transform_center_scale(
 967     float x_shift,
 968     float y_shift,
 969     float angle,
 970     float scale_x,
 971     float scale_y,
 972     float center_w,
 973     float center_h,
 974     float *matrix
 975 ) {
 976     cl_float2 center_s;
 977     float center_s_w, center_s_h;
 978
 979     ff_get_matrix(
 980         0,
 981         0,
 982         0,
 983         scale_x,
 984         scale_y,
 985         matrix
 986     );
 987
 988     center_s = transformed_point(center_w, center_h, matrix);
 989     center_s_w = center_w - center_s.s[0];
 990     center_s_h = center_h - center_s.s[1];
 991
 992     ff_get_matrix(
 993         x_shift + center_s_w,
 994         y_shift + center_s_h,
 995         angle,
 996         scale_x,
 997         scale_y,
 998         matrix
 999     );
1000 }
1001
1002 // Determines the crop necessary to eliminate black borders from a smoothed frame
1003 // and updates target crop accordingly
1004 static void update_needed_crop(
1005     CropInfo* crop,
1006     float *transform,
1007     float frame_width,
1008     float frame_height
1009 ) {
1010     float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y;
1011
1012     cl_float2 top_left = transformed_point(0, 0, transform);
1013     cl_float2 top_right = transformed_point(frame_width, 0, transform);
1014     cl_float2 bottom_left = transformed_point(0, frame_height, transform);
1015     cl_float2 bottom_right = transformed_point(frame_width, frame_height, transform);
1016     float ar_h = frame_height / frame_width;
1017     float ar_w = frame_width / frame_height;
1018
1019     if (crop->bottom_right.s[0] == 0) {
1020         // The crop hasn't been set to the original size of the plane
1021         crop->bottom_right.s[0] = frame_width;
1022         crop->bottom_right.s[1] = frame_height;
1023     }
1024
1025     crop->top_left.s[0] = FFMAX3(
1026         crop->top_left.s[0],
1027         top_left.s[0],
1028         bottom_left.s[0]
1029     );
1030
1031     crop->top_left.s[1] = FFMAX3(
1032         crop->top_left.s[1],
1033         top_left.s[1],
1034         top_right.s[1]
1035     );
1036
1037     crop->bottom_right.s[0] = FFMIN3(
1038         crop->bottom_right.s[0],
1039         bottom_right.s[0],
1040         top_right.s[0]
1041     );
1042
1043     crop->bottom_right.s[1] = FFMIN3(
1044         crop->bottom_right.s[1],
1045         bottom_right.s[1],
1046         bottom_left.s[1]
1047     );
1048
1049     // Make sure our potentially new bounding box has the same aspect ratio
1050     new_height = crop->bottom_right.s[1] - crop->top_left.s[1];
1051     new_width = crop->bottom_right.s[0] - crop->top_left.s[0];
1052
1053     adjusted_width = new_height * ar_w;
1054     adjusted_x = crop->bottom_right.s[0] - adjusted_width;
1055
1056     if (adjusted_x >= crop->top_left.s[0]) {
1057         crop->top_left.s[0] = adjusted_x;
1058     } else {
1059         adjusted_height = new_width * ar_h;
1060         adjusted_y = crop->bottom_right.s[1] - adjusted_height;
1061         crop->top_left.s[1] = adjusted_y;
1062     }
1063 }
1064
1065 static av_cold void deshake_opencl_uninit(AVFilterContext *avctx)
1066 {
1067     DeshakeOpenCLContext *ctx = avctx->priv;
1068     cl_int cle;
1069
1070     for (int i = 0; i < RingbufCount; i++)
1071         av_fifo_freep(&ctx->abs_motion.ringbuffers[i]);
1072
1073     if (ctx->debug_on)
1074         free_debug_matches(&ctx->abs_motion);
1075
1076     if (ctx->gauss_kernel)
1077         av_freep(&ctx->gauss_kernel);
1078
1079     if (ctx->ransac_err)
1080         av_freep(&ctx->ransac_err);
1081
1082     if (ctx->matches_host)
1083         av_freep(&ctx->matches_host);
1084
1085     if (ctx->matches_contig_host)
1086         av_freep(&ctx->matches_contig_host);
1087
1088     if (ctx->inliers)
1089         av_freep(&ctx->inliers);
1090
1091     ff_framequeue_free(&ctx->fq);
1092
1093     CL_RELEASE_KERNEL(ctx->kernel_grayscale);
1094     CL_RELEASE_KERNEL(ctx->kernel_harris_response);
1095     CL_RELEASE_KERNEL(ctx->kernel_refine_features);
1096     CL_RELEASE_KERNEL(ctx->kernel_brief_descriptors);
1097     CL_RELEASE_KERNEL(ctx->kernel_match_descriptors);
1098     CL_RELEASE_KERNEL(ctx->kernel_crop_upscale);
1099     if (ctx->debug_on)
1100         CL_RELEASE_KERNEL(ctx->kernel_draw_debug_info);
1101
1102     CL_RELEASE_QUEUE(ctx->command_queue);
1103
1104     if (!ctx->is_yuv)
1105         CL_RELEASE_MEMORY(ctx->grayscale);
1106     CL_RELEASE_MEMORY(ctx->harris_buf);
1107     CL_RELEASE_MEMORY(ctx->refined_features);
1108     CL_RELEASE_MEMORY(ctx->prev_refined_features);
1109     CL_RELEASE_MEMORY(ctx->brief_pattern);
1110     CL_RELEASE_MEMORY(ctx->descriptors);
1111     CL_RELEASE_MEMORY(ctx->prev_descriptors);
1112     CL_RELEASE_MEMORY(ctx->matches);
1113     CL_RELEASE_MEMORY(ctx->matches_contig);
1114     CL_RELEASE_MEMORY(ctx->transform_y);
1115     CL_RELEASE_MEMORY(ctx->transform_uv);
1116     if (ctx->debug_on) {
1117         CL_RELEASE_MEMORY(ctx->debug_matches);
1118         CL_RELEASE_MEMORY(ctx->debug_model_matches);
1119     }
1120
1121     ff_opencl_filter_uninit(avctx);
1122 }
1123
1124 static int deshake_opencl_init(AVFilterContext *avctx)
1125 {
1126     DeshakeOpenCLContext *ctx = avctx->priv;
1127     AVFilterLink *outlink = avctx->outputs[0];
1128     AVFilterLink *inlink = avctx->inputs[0];
1129     // Pointer to the host-side pattern buffer to be initialized and then copied
1130     // to the GPU
1131     PointPair *pattern_host;
1132     cl_int cle;
1133     int err;
1134     cl_ulong8 zeroed_ulong8;
1135     FFFrameQueueGlobal fqg;
1136     cl_image_format grayscale_format;
1137     cl_image_desc grayscale_desc;
1138     cl_command_queue_properties queue_props;
1139
1140     const enum AVPixelFormat disallowed_formats[14] = {
1141         AV_PIX_FMT_GBRP,
1142         AV_PIX_FMT_GBRP9BE,
1143         AV_PIX_FMT_GBRP9LE,
1144         AV_PIX_FMT_GBRP10BE,
1145         AV_PIX_FMT_GBRP10LE,
1146         AV_PIX_FMT_GBRP16BE,
1147         AV_PIX_FMT_GBRP16LE,
1148         AV_PIX_FMT_GBRAP,
1149         AV_PIX_FMT_GBRAP16BE,
1150         AV_PIX_FMT_GBRAP16LE,
1151         AV_PIX_FMT_GBRAP12BE,
1152         AV_PIX_FMT_GBRAP12LE,
1153         AV_PIX_FMT_GBRAP10BE,
1154         AV_PIX_FMT_GBRAP10LE
1155     };
1156
1157     // Number of elements for an array
1158     const int image_grid_32 = ROUNDED_UP_DIV(outlink->h, 32) * ROUNDED_UP_DIV(outlink->w, 32);
1159
1160     const int descriptor_buf_size = image_grid_32 * (BREIFN / 8);
1161     const int features_buf_size = image_grid_32 * sizeof(cl_float2);
1162
1163     const AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
1164     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hw_frames_ctx->sw_format);
1165
1166     av_assert0(hw_frames_ctx);
1167     av_assert0(desc);
1168
1169     ff_framequeue_global_init(&fqg);
1170     ff_framequeue_init(&ctx->fq, &fqg);
1171     ctx->eof = false;
1172     ctx->smooth_window = (int)(av_q2d(avctx->inputs[0]->frame_rate) * ctx->smooth_window_multiplier);
1173     ctx->curr_frame = 0;
1174
1175     memset(&zeroed_ulong8, 0, sizeof(cl_ulong8));
1176
1177     ctx->gauss_kernel = av_malloc_array(ctx->smooth_window, sizeof(float));
1178     if (!ctx->gauss_kernel) {
1179         err = AVERROR(ENOMEM);
1180         goto fail;
1181     }
1182
1183     ctx->ransac_err = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(float));
1184     if (!ctx->ransac_err) {
1185         err = AVERROR(ENOMEM);
1186         goto fail;
1187     }
1188
1189     for (int i = 0; i < RingbufCount; i++) {
1190         ctx->abs_motion.ringbuffers[i] = av_fifo_alloc_array(
1191             ctx->smooth_window,
1192             sizeof(float)
1193         );
1194
1195         if (!ctx->abs_motion.ringbuffers[i]) {
1196             err = AVERROR(ENOMEM);
1197             goto fail;
1198         }
1199     }
1200
1201     if (ctx->debug_on) {
1202         ctx->abs_motion.debug_matches = av_fifo_alloc_array(
1203             ctx->smooth_window / 2,
1204             sizeof(DebugMatches)
1205         );
1206
1207         if (!ctx->abs_motion.debug_matches) {
1208             err = AVERROR(ENOMEM);
1209             goto fail;
1210         }
1211     }
1212
1213     ctx->abs_motion.curr_frame_offset = 0;
1214     ctx->abs_motion.data_start_offset = -1;
1215     ctx->abs_motion.data_end_offset = -1;
1216
1217     pattern_host = av_malloc_array(BREIFN, sizeof(PointPair));
1218     if (!pattern_host) {
1219         err = AVERROR(ENOMEM);
1220         goto fail;
1221     }
1222
1223     ctx->matches_host = av_malloc_array(image_grid_32, sizeof(MotionVector));
1224     if (!ctx->matches_host) {
1225         err = AVERROR(ENOMEM);
1226         goto fail;
1227     }
1228
1229     ctx->matches_contig_host = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
1230     if (!ctx->matches_contig_host) {
1231         err = AVERROR(ENOMEM);
1232         goto fail;
1233     }
1234
1235     ctx->inliers = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
1236     if (!ctx->inliers) {
1237         err = AVERROR(ENOMEM);
1238         goto fail;
1239     }
1240
1241     // Initializing the patch pattern for building BREIF descriptors with
1242     av_lfg_init(&ctx->alfg, 234342424);
1243     for (int i = 0; i < BREIFN; ++i) {
1244         PointPair pair;
1245
1246         for (int j = 0; j < 2; ++j) {
1247             pair.p1.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
1248             pair.p2.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
1249         }
1250
1251         pattern_host[i] = pair;
1252     }
1253
1254     for (int i = 0; i < 14; i++) {
1255         if (ctx->sw_format == disallowed_formats[i]) {
1256             av_log(avctx, AV_LOG_ERROR, "unsupported format in deshake_opencl.\n");
1257             err = AVERROR(ENOSYS);
1258             goto fail;
1259         }
1260     }
1261
1262     if (desc->flags & AV_PIX_FMT_FLAG_RGB) {
1263         ctx->is_yuv = false;
1264     } else {
1265         ctx->is_yuv = true;
1266     }
1267     ctx->sw_format = hw_frames_ctx->sw_format;
1268
1269     err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_deshake, 1);
1270     if (err < 0)
1271         goto fail;
1272
1273     if (ctx->debug_on) {
1274         queue_props = CL_QUEUE_PROFILING_ENABLE;
1275     } else {
1276         queue_props = 0;
1277     }
1278     ctx->command_queue = clCreateCommandQueue(
1279         ctx->ocf.hwctx->context,
1280         ctx->ocf.hwctx->device_id,
1281         queue_props,
1282         &cle
1283     );
1284     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL command queue %d.\n", cle);
1285
1286     CL_CREATE_KERNEL(ctx, grayscale);
1287     CL_CREATE_KERNEL(ctx, harris_response);
1288     CL_CREATE_KERNEL(ctx, refine_features);
1289     CL_CREATE_KERNEL(ctx, brief_descriptors);
1290     CL_CREATE_KERNEL(ctx, match_descriptors);
1291     CL_CREATE_KERNEL(ctx, transform);
1292     CL_CREATE_KERNEL(ctx, crop_upscale);
1293     if (ctx->debug_on)
1294         CL_CREATE_KERNEL(ctx, draw_debug_info);
1295
1296     if (!ctx->is_yuv) {
1297         grayscale_format.image_channel_order = CL_R;
1298         grayscale_format.image_channel_data_type = CL_FLOAT;
1299
1300         grayscale_desc = (cl_image_desc) {
1301             .image_type = CL_MEM_OBJECT_IMAGE2D,
1302             .image_width = outlink->w,
1303             .image_height = outlink->h,
1304             .image_depth = 0,
1305             .image_array_size = 0,
1306             .image_row_pitch = 0,
1307             .image_slice_pitch = 0,
1308             .num_mip_levels = 0,
1309             .num_samples = 0,
1310             .buffer = NULL,
1311         };
1312
1313         ctx->grayscale = clCreateImage(
1314             ctx->ocf.hwctx->context,
1315             0,
1316             &grayscale_format,
1317             &grayscale_desc,
1318             NULL,
1319             &cle
1320         );
1321         CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create grayscale image: %d.\n", cle);
1322     }
1323
1324     CL_CREATE_BUFFER(ctx, harris_buf, outlink->h * outlink->w * sizeof(float));
1325     CL_CREATE_BUFFER(ctx, refined_features, features_buf_size);
1326     CL_CREATE_BUFFER(ctx, prev_refined_features, features_buf_size);
1327     CL_CREATE_BUFFER_FLAGS(
1328         ctx,
1329         brief_pattern,
1330         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1331         BREIFN * sizeof(PointPair),
1332         pattern_host
1333     );
1334     CL_CREATE_BUFFER(ctx, descriptors, descriptor_buf_size);
1335     CL_CREATE_BUFFER(ctx, prev_descriptors, descriptor_buf_size);
1336     CL_CREATE_BUFFER(ctx, matches, image_grid_32 * sizeof(MotionVector));
1337     CL_CREATE_BUFFER(ctx, matches_contig, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
1338     CL_CREATE_BUFFER(ctx, transform_y, 9 * sizeof(float));
1339     CL_CREATE_BUFFER(ctx, transform_uv, 9 * sizeof(float));
1340     if (ctx->debug_on) {
1341         CL_CREATE_BUFFER(ctx, debug_matches, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
1342         CL_CREATE_BUFFER(ctx, debug_model_matches, 3 * sizeof(MotionVector));
1343     }
1344
1345     ctx->initialized = 1;
1346     av_freep(&pattern_host);
1347
1348     return 0;
1349
1350 fail:
1351     if (!pattern_host)
1352         av_freep(&pattern_host);
1353     return err;
1354 }
1355
1356 // Logs debug information about the transform data
1357 static void transform_debug(AVFilterContext *avctx, float *new_vals, float *old_vals, int curr_frame) {
1358     av_log(avctx, AV_LOG_VERBOSE,
1359         "Frame %d:\n"
1360         "\tframe moved from: %f x, %f y\n"
1361         "\t              to: %f x, %f y\n"
1362         "\t    rotated from: %f degrees\n"
1363         "\t              to: %f degrees\n"
1364         "\t     scaled from: %f x, %f y\n"
1365         "\t              to: %f x, %f y\n"
1366         "\n"
1367         "\tframe moved by: %f x, %f y\n"
1368         "\t    rotated by: %f degrees\n"
1369         "\t     scaled by: %f x, %f y\n",
1370         curr_frame,
1371         old_vals[RingbufX], old_vals[RingbufY],
1372         new_vals[RingbufX], new_vals[RingbufY],
1373         old_vals[RingbufRot] * (180.0 / M_PI),
1374         new_vals[RingbufRot] * (180.0 / M_PI),
1375         old_vals[RingbufScaleX], old_vals[RingbufScaleY],
1376         new_vals[RingbufScaleX], new_vals[RingbufScaleY],
1377         old_vals[RingbufX] - new_vals[RingbufX], old_vals[RingbufY] - new_vals[RingbufY],
1378         old_vals[RingbufRot] * (180.0 / M_PI) - new_vals[RingbufRot] * (180.0 / M_PI),
1379         new_vals[RingbufScaleX] / old_vals[RingbufScaleX], new_vals[RingbufScaleY] / old_vals[RingbufScaleY]
1380     );
1381 }
1382
1383 // Uses the buffered motion information to determine a transform that smooths the
1384 // given frame and applies it
1385 static int filter_frame(AVFilterLink *link, AVFrame *input_frame)
1386 {
1387     AVFilterContext *avctx = link->dst;
1388     AVFilterLink *outlink = avctx->outputs[0];
1389     DeshakeOpenCLContext *deshake_ctx = avctx->priv;
1390     AVFrame *cropped_frame = NULL, *transformed_frame = NULL;
1391     int err;
1392     cl_int cle;
1393     float new_vals[RingbufCount];
1394     float old_vals[RingbufCount];
1395     // Luma (in the case of YUV) transform, or just the transform in the case of RGB
1396     float transform_y[9];
1397     // Chroma transform
1398     float transform_uv[9];
1399     // Luma crop transform (or RGB)
1400     float transform_crop_y[9];
1401     // Chroma crop transform
1402     float transform_crop_uv[9];
1403     float transform_debug_rgb[9];
1404     size_t global_work[2];
1405     int64_t duration;
1406     cl_mem src, transformed, dst;
1407     cl_mem transforms[3];
1408     CropInfo crops[3];
1409     cl_event transform_event, crop_upscale_event;
1410     DebugMatches debug_matches;
1411     cl_int num_model_matches;
1412
1413     const float center_w = (float)input_frame->width / 2;
1414     const float center_h = (float)input_frame->height / 2;
1415
1416     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(deshake_ctx->sw_format);
1417     const int chroma_width  = AV_CEIL_RSHIFT(input_frame->width, desc->log2_chroma_w);
1418     const int chroma_height = AV_CEIL_RSHIFT(input_frame->height, desc->log2_chroma_h);
1419
1420     const float center_w_chroma = (float)chroma_width / 2;
1421     const float center_h_chroma = (float)chroma_height / 2;
1422
1423     const float luma_w_over_chroma_w = ((float)input_frame->width / (float)chroma_width);
1424     const float luma_h_over_chroma_h = ((float)input_frame->height / (float)chroma_height);
1425
1426     if (deshake_ctx->debug_on) {
1427         av_fifo_generic_read(
1428             deshake_ctx->abs_motion.debug_matches,
1429             &debug_matches,
1430             sizeof(DebugMatches),
1431             NULL
1432         );
1433     }
1434
1435     if (input_frame->pkt_duration) {
1436         duration = input_frame->pkt_duration;
1437     } else {
1438         duration = av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
1439     }
1440     deshake_ctx->duration = input_frame->pts + duration;
1441
1442     // Get the absolute transform data for this frame
1443     for (int i = 0; i < RingbufCount; i++) {
1444         av_fifo_generic_peek_at(
1445             deshake_ctx->abs_motion.ringbuffers[i],
1446             &old_vals[i],
1447             deshake_ctx->abs_motion.curr_frame_offset * sizeof(float),
1448             sizeof(float),
1449             NULL
1450         );
1451     }
1452
1453     if (deshake_ctx->tripod_mode) {
1454         // If tripod mode is turned on we simply undo all motion relative to the
1455         // first frame
1456
1457         new_vals[RingbufX] = 0.0f;
1458         new_vals[RingbufY] = 0.0f;
1459         new_vals[RingbufRot] = 0.0f;
1460         new_vals[RingbufScaleX] = 1.0f;
1461         new_vals[RingbufScaleY] = 1.0f;
1462     } else {
1463         // Tripod mode is off and we need to smooth a moving camera
1464
1465         new_vals[RingbufX] = smooth(
1466             deshake_ctx,
1467             deshake_ctx->gauss_kernel,
1468             deshake_ctx->smooth_window,
1469             input_frame->width,
1470             deshake_ctx->abs_motion.ringbuffers[RingbufX]
1471         );
1472         new_vals[RingbufY] = smooth(
1473             deshake_ctx,
1474             deshake_ctx->gauss_kernel,
1475             deshake_ctx->smooth_window,
1476             input_frame->height,
1477             deshake_ctx->abs_motion.ringbuffers[RingbufY]
1478         );
1479         new_vals[RingbufRot] = smooth(
1480             deshake_ctx,
1481             deshake_ctx->gauss_kernel,
1482             deshake_ctx->smooth_window,
1483             M_PI / 4,
1484             deshake_ctx->abs_motion.ringbuffers[RingbufRot]
1485         );
1486         new_vals[RingbufScaleX] = smooth(
1487             deshake_ctx,
1488             deshake_ctx->gauss_kernel,
1489             deshake_ctx->smooth_window,
1490             2.0f,
1491             deshake_ctx->abs_motion.ringbuffers[RingbufScaleX]
1492         );
1493         new_vals[RingbufScaleY] = smooth(
1494             deshake_ctx,
1495             deshake_ctx->gauss_kernel,
1496             deshake_ctx->smooth_window,
1497             2.0f,
1498             deshake_ctx->abs_motion.ringbuffers[RingbufScaleY]
1499         );
1500     }
1501
1502     transform_center_scale(
1503         old_vals[RingbufX] - new_vals[RingbufX],
1504         old_vals[RingbufY] - new_vals[RingbufY],
1505         old_vals[RingbufRot] - new_vals[RingbufRot],
1506         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1507         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1508         center_w,
1509         center_h,
1510         transform_y
1511     );
1512
1513     transform_center_scale(
1514         (old_vals[RingbufX] - new_vals[RingbufX]) / luma_w_over_chroma_w,
1515         (old_vals[RingbufY] - new_vals[RingbufY]) / luma_h_over_chroma_h,
1516         old_vals[RingbufRot] - new_vals[RingbufRot],
1517         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1518         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1519         center_w_chroma,
1520         center_h_chroma,
1521         transform_uv
1522     );
1523
1524     CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_y, NULL);
1525     CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_uv, 9 * sizeof(float), transform_uv, NULL);
1526
1527     if (deshake_ctx->debug_on)
1528         transform_debug(avctx, new_vals, old_vals, deshake_ctx->curr_frame);
1529
1530     cropped_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
1531     if (!cropped_frame) {
1532         err = AVERROR(ENOMEM);
1533         goto fail;
1534     }
1535
1536     transformed_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
1537     if (!transformed_frame) {
1538         err = AVERROR(ENOMEM);
1539         goto fail;
1540     }
1541
1542     transforms[0] = deshake_ctx->transform_y;
1543     transforms[1] = transforms[2] = deshake_ctx->transform_uv;
1544
1545     for (int p = 0; p < FF_ARRAY_ELEMS(transformed_frame->data); p++) {
1546         // Transform all of the planes appropriately
1547         src = (cl_mem)input_frame->data[p];
1548         transformed = (cl_mem)transformed_frame->data[p];
1549
1550         if (!transformed)
1551             break;
1552
1553         err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
1554         if (err < 0)
1555             goto fail;
1556
1557         CL_RUN_KERNEL_WITH_ARGS(
1558             deshake_ctx->command_queue,
1559             deshake_ctx->kernel_transform,
1560             global_work,
1561             NULL,
1562             &transform_event,
1563             { sizeof(cl_mem), &src },
1564             { sizeof(cl_mem), &transformed },
1565             { sizeof(cl_mem), &transforms[p] },
1566         );
1567     }
1568
1569     if (deshake_ctx->debug_on && !deshake_ctx->is_yuv && debug_matches.num_matches > 0) {
1570         CL_BLOCKING_WRITE_BUFFER(
1571             deshake_ctx->command_queue,
1572             deshake_ctx->debug_matches,
1573             debug_matches.num_matches * sizeof(MotionVector),
1574             debug_matches.matches,
1575             NULL
1576         );
1577
1578         CL_BLOCKING_WRITE_BUFFER(
1579             deshake_ctx->command_queue,
1580             deshake_ctx->debug_model_matches,
1581             debug_matches.num_model_matches * sizeof(MotionVector),
1582             debug_matches.model_matches,
1583             NULL
1584         );
1585
1586         num_model_matches = debug_matches.num_model_matches;
1587
1588         // Invert the transform
1589         transform_center_scale(
1590             new_vals[RingbufX] - old_vals[RingbufX],
1591             new_vals[RingbufY] - old_vals[RingbufY],
1592             new_vals[RingbufRot] - old_vals[RingbufRot],
1593             old_vals[RingbufScaleX] / new_vals[RingbufScaleX],
1594             old_vals[RingbufScaleY] / new_vals[RingbufScaleY],
1595             center_w,
1596             center_h,
1597             transform_debug_rgb
1598         );
1599
1600         CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_debug_rgb, NULL);
1601
1602         transformed = (cl_mem)transformed_frame->data[0];
1603         CL_RUN_KERNEL_WITH_ARGS(
1604             deshake_ctx->command_queue,
1605             deshake_ctx->kernel_draw_debug_info,
1606             (size_t[]){ debug_matches.num_matches },
1607             NULL,
1608             NULL,
1609             { sizeof(cl_mem), &transformed },
1610             { sizeof(cl_mem), &deshake_ctx->debug_matches },
1611             { sizeof(cl_mem), &deshake_ctx->debug_model_matches },
1612             { sizeof(cl_int), &num_model_matches },
1613             { sizeof(cl_mem), &deshake_ctx->transform_y }
1614         );
1615     }
1616
1617     if (deshake_ctx->should_crop) {
1618         // Generate transforms for cropping
1619         transform_center_scale(
1620             (old_vals[RingbufX] - new_vals[RingbufX]) / 5,
1621             (old_vals[RingbufY] - new_vals[RingbufY]) / 5,
1622             (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
1623             new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1624             new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1625             center_w,
1626             center_h,
1627             transform_crop_y
1628         );
1629         update_needed_crop(&deshake_ctx->crop_y, transform_crop_y, input_frame->width, input_frame->height);
1630
1631         transform_center_scale(
1632             (old_vals[RingbufX] - new_vals[RingbufX]) / (5 * luma_w_over_chroma_w),
1633             (old_vals[RingbufY] - new_vals[RingbufY]) / (5 * luma_h_over_chroma_h),
1634             (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
1635             new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1636             new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1637             center_w_chroma,
1638             center_h_chroma,
1639             transform_crop_uv
1640         );
1641         update_needed_crop(&deshake_ctx->crop_uv, transform_crop_uv, chroma_width, chroma_height);
1642
1643         crops[0] = deshake_ctx->crop_y;
1644         crops[1] = crops[2] = deshake_ctx->crop_uv;
1645
1646         for (int p = 0; p < FF_ARRAY_ELEMS(cropped_frame->data); p++) {
1647             // Crop all of the planes appropriately
1648             dst = (cl_mem)cropped_frame->data[p];
1649             transformed = (cl_mem)transformed_frame->data[p];
1650
1651             if (!dst)
1652                 break;
1653
1654             err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
1655             if (err < 0)
1656                 goto fail;
1657
1658             CL_RUN_KERNEL_WITH_ARGS(
1659                 deshake_ctx->command_queue,
1660                 deshake_ctx->kernel_crop_upscale,
1661                 global_work,
1662                 NULL,
1663                 &crop_upscale_event,
1664                 { sizeof(cl_mem), &transformed },
1665                 { sizeof(cl_mem), &dst },
1666                 { sizeof(cl_float2), &crops[p].top_left },
1667                 { sizeof(cl_float2), &crops[p].bottom_right },
1668             );
1669         }
1670     }
1671
1672     if (deshake_ctx->curr_frame < deshake_ctx->smooth_window / 2) {
1673         // This means we are somewhere at the start of the video. We need to
1674         // increment the current frame offset until it reaches the center of
1675         // the ringbuffers (as the current frame will be located there for
1676         // the rest of the video).
1677         //
1678         // The end of the video is taken care of by draining motion data
1679         // one-by-one out of the buffer, causing the (at that point fixed)
1680         // offset to move towards later frames' data.
1681         ++deshake_ctx->abs_motion.curr_frame_offset;
1682     }
1683
1684     if (deshake_ctx->abs_motion.data_end_offset != -1) {
1685         // Keep the end offset in sync with the frame it's supposed to be
1686         // positioned at
1687         --deshake_ctx->abs_motion.data_end_offset;
1688
1689         if (deshake_ctx->abs_motion.data_end_offset == deshake_ctx->abs_motion.curr_frame_offset - 1) {
1690             // The end offset would be the start of the new video sequence; flip to
1691             // start offset
1692             deshake_ctx->abs_motion.data_end_offset = -1;
1693             deshake_ctx->abs_motion.data_start_offset = deshake_ctx->abs_motion.curr_frame_offset;
1694         }
1695     } else if (deshake_ctx->abs_motion.data_start_offset != -1) {
1696         // Keep the start offset in sync with the frame it's supposed to be
1697         // positioned at
1698         --deshake_ctx->abs_motion.data_start_offset;
1699     }
1700
1701     if (deshake_ctx->debug_on) {
1702         deshake_ctx->transform_time += ff_opencl_get_event_time(transform_event);
1703         if (deshake_ctx->should_crop) {
1704             deshake_ctx->crop_upscale_time += ff_opencl_get_event_time(crop_upscale_event);
1705         }
1706     }
1707
1708     ++deshake_ctx->curr_frame;
1709
1710     if (deshake_ctx->debug_on)
1711         av_freep(&debug_matches.matches);
1712
1713     if (deshake_ctx->should_crop) {
1714         err = av_frame_copy_props(cropped_frame, input_frame);
1715         if (err < 0)
1716             goto fail;
1717
1718         av_frame_free(&transformed_frame);
1719         av_frame_free(&input_frame);
1720         return ff_filter_frame(outlink, cropped_frame);
1721
1722     } else {
1723         err = av_frame_copy_props(transformed_frame, input_frame);
1724         if (err < 0)
1725             goto fail;
1726
1727         av_frame_free(&cropped_frame);
1728         av_frame_free(&input_frame);
1729         return ff_filter_frame(outlink, transformed_frame);
1730     }
1731
1732 fail:
1733     clFinish(deshake_ctx->command_queue);
1734
1735     if (deshake_ctx->debug_on)
1736         if (debug_matches.matches)
1737             av_freep(&debug_matches.matches);
1738
1739     av_frame_free(&input_frame);
1740     av_frame_free(&transformed_frame);
1741     av_frame_free(&cropped_frame);
1742     return err;
1743 }
1744
1745 // Add the given frame to the frame queue to eventually be processed.
1746 //
1747 // Also determines the motion from the previous frame and updates the stored
1748 // motion information accordingly.
1749 static int queue_frame(AVFilterLink *link, AVFrame *input_frame)
1750 {
1751     AVFilterContext *avctx = link->dst;
1752     DeshakeOpenCLContext *deshake_ctx = avctx->priv;
1753     int err;
1754     int num_vectors;
1755     int num_inliers = 0;
1756     cl_int cle;
1757     FrameDelta relative;
1758     SimilarityMatrix model;
1759     size_t global_work[2];
1760     size_t harris_global_work[2];
1761     size_t grid_32_global_work[2];
1762     int grid_32_h, grid_32_w;
1763     size_t local_work[2];
1764     cl_mem src, temp;
1765     float prev_vals[5];
1766     float new_vals[5];
1767     cl_event grayscale_event, harris_response_event, refine_features_event,
1768              brief_event, match_descriptors_event, read_buf_event;
1769     DebugMatches debug_matches;
1770
1771     num_vectors = 0;
1772
1773     local_work[0] = 8;
1774     local_work[1] = 8;
1775
1776     err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, 0, 0);
1777     if (err < 0)
1778         goto fail;
1779
1780     err = ff_opencl_filter_work_size_from_image(avctx, harris_global_work, input_frame, 0, 8);
1781     if (err < 0)
1782         goto fail;
1783
1784     err = ff_opencl_filter_work_size_from_image(avctx, grid_32_global_work, input_frame, 0, 32);
1785     if (err < 0)
1786         goto fail;
1787
1788     // We want a single work-item for each 32x32 block of pixels in the input frame
1789     grid_32_global_work[0] /= 32;
1790     grid_32_global_work[1] /= 32;
1791
1792     grid_32_h = ROUNDED_UP_DIV(input_frame->height, 32);
1793     grid_32_w = ROUNDED_UP_DIV(input_frame->width, 32);
1794
1795     if (deshake_ctx->is_yuv) {
1796         deshake_ctx->grayscale = (cl_mem)input_frame->data[0];
1797     } else {
1798         src = (cl_mem)input_frame->data[0];
1799
1800         CL_RUN_KERNEL_WITH_ARGS(
1801             deshake_ctx->command_queue,
1802             deshake_ctx->kernel_grayscale,
1803             global_work,
1804             NULL,
1805             &grayscale_event,
1806             { sizeof(cl_mem), &src },
1807             { sizeof(cl_mem), &deshake_ctx->grayscale }
1808         );
1809     }
1810
1811     CL_RUN_KERNEL_WITH_ARGS(
1812         deshake_ctx->command_queue,
1813         deshake_ctx->kernel_harris_response,
1814         harris_global_work,
1815         local_work,
1816         &harris_response_event,
1817         { sizeof(cl_mem), &deshake_ctx->grayscale },
1818         { sizeof(cl_mem), &deshake_ctx->harris_buf }
1819     );
1820
1821     CL_RUN_KERNEL_WITH_ARGS(
1822         deshake_ctx->command_queue,
1823         deshake_ctx->kernel_refine_features,
1824         grid_32_global_work,
1825         NULL,
1826         &refine_features_event,
1827         { sizeof(cl_mem), &deshake_ctx->grayscale },
1828         { sizeof(cl_mem), &deshake_ctx->harris_buf },
1829         { sizeof(cl_mem), &deshake_ctx->refined_features },
1830         { sizeof(cl_int), &deshake_ctx->refine_features }
1831     );
1832
1833     CL_RUN_KERNEL_WITH_ARGS(
1834         deshake_ctx->command_queue,
1835         deshake_ctx->kernel_brief_descriptors,
1836         grid_32_global_work,
1837         NULL,
1838         &brief_event,
1839         { sizeof(cl_mem), &deshake_ctx->grayscale },
1840         { sizeof(cl_mem), &deshake_ctx->refined_features },
1841         { sizeof(cl_mem), &deshake_ctx->descriptors },
1842         { sizeof(cl_mem), &deshake_ctx->brief_pattern}
1843     );
1844
1845     if (av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) {
1846         // This is the first frame we've been given to queue, meaning there is
1847         // no previous frame to match descriptors to
1848
1849         goto no_motion_data;
1850     }
1851
1852     CL_RUN_KERNEL_WITH_ARGS(
1853         deshake_ctx->command_queue,
1854         deshake_ctx->kernel_match_descriptors,
1855         grid_32_global_work,
1856         NULL,
1857         &match_descriptors_event,
1858         { sizeof(cl_mem), &deshake_ctx->prev_refined_features },
1859         { sizeof(cl_mem), &deshake_ctx->refined_features },
1860         { sizeof(cl_mem), &deshake_ctx->descriptors },
1861         { sizeof(cl_mem), &deshake_ctx->prev_descriptors },
1862         { sizeof(cl_mem), &deshake_ctx->matches }
1863     );
1864
1865     cle = clEnqueueReadBuffer(
1866         deshake_ctx->command_queue,
1867         deshake_ctx->matches,
1868         CL_TRUE,
1869         0,
1870         grid_32_h * grid_32_w * sizeof(MotionVector),
1871         deshake_ctx->matches_host,
1872         0,
1873         NULL,
1874         &read_buf_event
1875     );
1876     CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read matches to host: %d.\n", cle);
1877
1878     num_vectors = make_vectors_contig(deshake_ctx, grid_32_h, grid_32_w);
1879
1880     if (num_vectors < 10) {
1881         // Not enough matches to get reliable motion data for this frame
1882         //
1883         // From this point on all data is relative to this frame rather than the
1884         // original frame. We have to make sure that we don't mix values that were
1885         // relative to the original frame with the new values relative to this
1886         // frame when doing the gaussian smoothing. We keep track of where the old
1887         // values end using this data_end_offset field in order to accomplish
1888         // that goal.
1889         //
1890         // If no motion data is present for multiple frames in a short window of
1891         // time, we leave the end where it was to avoid mixing 0s in with the
1892         // old data (and just treat them all as part of the new values)
1893         if (deshake_ctx->abs_motion.data_end_offset == -1) {
1894             deshake_ctx->abs_motion.data_end_offset =
1895                 av_fifo_size(deshake_ctx->abs_motion.ringbuffers[RingbufX]) / sizeof(float) - 1;
1896         }
1897
1898         goto no_motion_data;
1899     }
1900
1901     if (!estimate_affine_2d(
1902         deshake_ctx,
1903         deshake_ctx->matches_contig_host,
1904         &debug_matches,
1905         num_vectors,
1906         model.matrix,
1907         10.0,
1908         3000,
1909         0.999999999999
1910     )) {
1911         goto no_motion_data;
1912     }
1913
1914     for (int i = 0; i < num_vectors; i++) {
1915         if (deshake_ctx->matches_contig_host[i].should_consider) {
1916             deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[i];
1917             num_inliers++;
1918         }
1919     }
1920
1921     if (!minimize_error(
1922         deshake_ctx,
1923         deshake_ctx->inliers,
1924         &debug_matches,
1925         num_inliers,
1926         model.matrix,
1927         400
1928     )) {
1929         goto no_motion_data;
1930     }
1931
1932
1933     relative = decompose_transform(model.matrix);
1934
1935     // Get the absolute transform data for the previous frame
1936     for (int i = 0; i < RingbufCount; i++) {
1937         av_fifo_generic_peek_at(
1938             deshake_ctx->abs_motion.ringbuffers[i],
1939             &prev_vals[i],
1940             av_fifo_size(deshake_ctx->abs_motion.ringbuffers[i]) - sizeof(float),
1941             sizeof(float),
1942             NULL
1943         );
1944     }
1945
1946     new_vals[RingbufX]      = prev_vals[RingbufX] + relative.translation.s[0];
1947     new_vals[RingbufY]      = prev_vals[RingbufY] + relative.translation.s[1];
1948     new_vals[RingbufRot]    = prev_vals[RingbufRot] + relative.rotation;
1949     new_vals[RingbufScaleX] = prev_vals[RingbufScaleX] / relative.scale.s[0];
1950     new_vals[RingbufScaleY] = prev_vals[RingbufScaleY] / relative.scale.s[1];
1951
1952     if (deshake_ctx->debug_on) {
1953         if (!deshake_ctx->is_yuv) {
1954             deshake_ctx->grayscale_time     += ff_opencl_get_event_time(grayscale_event);
1955         }
1956         deshake_ctx->harris_response_time   += ff_opencl_get_event_time(harris_response_event);
1957         deshake_ctx->refine_features_time   += ff_opencl_get_event_time(refine_features_event);
1958         deshake_ctx->brief_descriptors_time += ff_opencl_get_event_time(brief_event);
1959         deshake_ctx->match_descriptors_time += ff_opencl_get_event_time(match_descriptors_event);
1960         deshake_ctx->read_buf_time          += ff_opencl_get_event_time(read_buf_event);
1961     }
1962
1963     goto end;
1964
1965 no_motion_data:
1966     new_vals[RingbufX]      = 0.0f;
1967     new_vals[RingbufY]      = 0.0f;
1968     new_vals[RingbufRot]    = 0.0f;
1969     new_vals[RingbufScaleX] = 1.0f;
1970     new_vals[RingbufScaleY] = 1.0f;
1971
1972     for (int i = 0; i < num_vectors; i++) {
1973         deshake_ctx->matches_contig_host[i].should_consider = false;
1974     }
1975     debug_matches.num_model_matches = 0;
1976
1977     if (deshake_ctx->debug_on) {
1978         av_log(avctx, AV_LOG_VERBOSE,
1979             "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n"
1980         );
1981     }
1982
1983     goto end;
1984
1985 end:
1986     // Swap the descriptor buffers (we don't need the previous frame's descriptors
1987     // again so we will use that space for the next frame's descriptors)
1988     temp = deshake_ctx->prev_descriptors;
1989     deshake_ctx->prev_descriptors = deshake_ctx->descriptors;
1990     deshake_ctx->descriptors = temp;
1991
1992     // Same for the refined features
1993     temp = deshake_ctx->prev_refined_features;
1994     deshake_ctx->prev_refined_features = deshake_ctx->refined_features;
1995     deshake_ctx->refined_features = temp;
1996
1997     if (deshake_ctx->debug_on) {
1998         if (num_vectors == 0) {
1999             debug_matches.matches = NULL;
2000         } else {
2001             debug_matches.matches = av_malloc_array(num_vectors, sizeof(MotionVector));
2002
2003             if (!debug_matches.matches) {
2004                 err = AVERROR(ENOMEM);
2005                 goto fail;
2006             }
2007         }
2008
2009         for (int i = 0; i < num_vectors; i++) {
2010             debug_matches.matches[i] = deshake_ctx->matches_contig_host[i];
2011         }
2012         debug_matches.num_matches = num_vectors;
2013
2014         av_fifo_generic_write(
2015             deshake_ctx->abs_motion.debug_matches,
2016             &debug_matches,
2017             sizeof(DebugMatches),
2018             NULL
2019         );
2020     }
2021
2022     for (int i = 0; i < RingbufCount; i++) {
2023         av_fifo_generic_write(
2024             deshake_ctx->abs_motion.ringbuffers[i],
2025             &new_vals[i],
2026             sizeof(float),
2027             NULL
2028         );
2029     }
2030
2031     return ff_framequeue_add(&deshake_ctx->fq, input_frame);
2032
2033 fail:
2034     clFinish(deshake_ctx->command_queue);
2035     av_frame_free(&input_frame);
2036     return err;
2037 }
2038
2039 static int activate(AVFilterContext *ctx)
2040 {
2041     AVFilterLink *inlink = ctx->inputs[0];
2042     AVFilterLink *outlink = ctx->outputs[0];
2043     DeshakeOpenCLContext *deshake_ctx = ctx->priv;
2044     AVFrame *frame = NULL;
2045     int ret, status;
2046     int64_t pts;
2047
2048     FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
2049
2050     if (!deshake_ctx->eof) {
2051         ret = ff_inlink_consume_frame(inlink, &frame);
2052         if (ret < 0)
2053             return ret;
2054         if (ret > 0) {
2055             if (!frame->hw_frames_ctx)
2056                 return AVERROR(EINVAL);
2057
2058             if (!deshake_ctx->initialized) {
2059                 ret = deshake_opencl_init(ctx);
2060                 if (ret < 0)
2061                     return ret;
2062             }
2063
2064             // If there is no more space in the ringbuffers, remove the oldest
2065             // values to make room for the new ones
2066             if (av_fifo_space(deshake_ctx->abs_motion.ringbuffers[RingbufX]) == 0) {
2067                 for (int i = 0; i < RingbufCount; i++) {
2068                     av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float));
2069                 }
2070             }
2071             ret = queue_frame(inlink, frame);
2072             if (ret < 0)
2073                 return ret;
2074             if (ret >= 0) {
2075                 // See if we have enough buffered frames to process one
2076                 //
2077                 // "enough" is half the smooth window of queued frames into the future
2078                 if (ff_framequeue_queued_frames(&deshake_ctx->fq) >= deshake_ctx->smooth_window / 2) {
2079                     return filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
2080                 }
2081             }
2082         }
2083     }
2084
2085     if (!deshake_ctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
2086         if (status == AVERROR_EOF) {
2087             deshake_ctx->eof = true;
2088         }
2089     }
2090
2091     if (deshake_ctx->eof) {
2092         // Finish processing the rest of the frames in the queue.
2093         while(ff_framequeue_queued_frames(&deshake_ctx->fq) != 0) {
2094             for (int i = 0; i < RingbufCount; i++) {
2095                 av_fifo_drain(deshake_ctx->abs_motion.ringbuffers[i], sizeof(float));
2096             }
2097
2098             ret = filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
2099             if (ret < 0) {
2100                 return ret;
2101             }
2102         }
2103
2104         if (deshake_ctx->debug_on) {
2105             av_log(ctx, AV_LOG_VERBOSE,
2106                 "Average kernel execution times:\n"
2107                 "\t        grayscale: %0.3f ms\n"
2108                 "\t  harris_response: %0.3f ms\n"
2109                 "\t  refine_features: %0.3f ms\n"
2110                 "\tbrief_descriptors: %0.3f ms\n"
2111                 "\tmatch_descriptors: %0.3f ms\n"
2112                 "\t        transform: %0.3f ms\n"
2113                 "\t     crop_upscale: %0.3f ms\n"
2114                 "Average buffer read times:\n"
2115                 "\t     features buf: %0.3f ms\n",
2116                 averaged_event_time_ms(deshake_ctx->grayscale_time, deshake_ctx->curr_frame),
2117                 averaged_event_time_ms(deshake_ctx->harris_response_time, deshake_ctx->curr_frame),
2118                 averaged_event_time_ms(deshake_ctx->refine_features_time, deshake_ctx->curr_frame),
2119                 averaged_event_time_ms(deshake_ctx->brief_descriptors_time, deshake_ctx->curr_frame),
2120                 averaged_event_time_ms(deshake_ctx->match_descriptors_time, deshake_ctx->curr_frame),
2121                 averaged_event_time_ms(deshake_ctx->transform_time, deshake_ctx->curr_frame),
2122                 averaged_event_time_ms(deshake_ctx->crop_upscale_time, deshake_ctx->curr_frame),
2123                 averaged_event_time_ms(deshake_ctx->read_buf_time, deshake_ctx->curr_frame)
2124             );
2125         }
2126
2127         ff_outlink_set_status(outlink, AVERROR_EOF, deshake_ctx->duration);
2128         return 0;
2129     }
2130
2131     if (!deshake_ctx->eof) {
2132         FF_FILTER_FORWARD_WANTED(outlink, inlink);
2133     }
2134
2135     return FFERROR_NOT_READY;
2136 }
2137
2138 static const AVFilterPad deshake_opencl_inputs[] = {
2139     {
2140         .name = "default",
2141         .type = AVMEDIA_TYPE_VIDEO,
2142         .config_props = &ff_opencl_filter_config_input,
2143     },
2144     { NULL }
2145 };
2146
2147 static const AVFilterPad deshake_opencl_outputs[] = {
2148     {
2149         .name = "default",
2150         .type = AVMEDIA_TYPE_VIDEO,
2151         .config_props = &ff_opencl_filter_config_output,
2152     },
2153     { NULL }
2154 };
2155
// Byte offset of field x within the filter's private context
#define OFFSET(x) offsetof(DeshakeOpenCLContext, x)
// Option flags used by the option table; parenthesized so that the expansion
// always behaves as a single operand regardless of the surrounding expression
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
2158
2159 static const AVOption deshake_opencl_options[] = {
2160     {
2161         "tripod", "simulates a tripod by preventing any camera movement whatsoever "
2162         "from the original frame",
2163         OFFSET(tripod_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
2164     },
2165     {
2166         "debug", "turn on additional debugging information",
2167         OFFSET(debug_on), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
2168     },
2169     {
2170         "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content",
2171         OFFSET(should_crop), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
2172     },
2173     {
2174         "refine_features", "refine feature point locations at a sub-pixel level",
2175         OFFSET(refine_features), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
2176     },
2177     {
2178         "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)",
2179         OFFSET(smooth_percent), AV_OPT_TYPE_FLOAT, {.dbl = 0.0f}, 0.0f, 1.0f, FLAGS
2180     },
2181     {
2182         "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data",
2183         OFFSET(smooth_window_multiplier), AV_OPT_TYPE_FLOAT, {.dbl = 2.0}, 0.1, 10.0, FLAGS
2184     },
2185     { NULL }
2186 };
2187
2188 AVFILTER_DEFINE_CLASS(deshake_opencl);
2189
2190 AVFilter ff_vf_deshake_opencl = {
2191     .name           = "deshake_opencl",
2192     .description    = NULL_IF_CONFIG_SMALL("Feature-point based video stabilization filter"),
2193     .priv_size      = sizeof(DeshakeOpenCLContext),
2194     .priv_class     = &deshake_opencl_class,
2195     .init           = &ff_opencl_filter_init,
2196     .uninit         = &deshake_opencl_uninit,
2197     .query_formats  = &ff_opencl_filter_query_formats,
2198     .activate       = activate,
2199     .inputs         = deshake_opencl_inputs,
2200     .outputs        = deshake_opencl_outputs,
2201     .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE
2202 };