git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27 #include "hevcdsp.h"
  28
  29
  30 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
  31                           GetBitContext *gb, int pcm_bit_depth)
  32 {
  33     int x, y;
  34     pixel *dst = (pixel *)_dst;
  35
  36     stride /= sizeof(pixel);
  37
  38     for (y = 0; y < height; y++) {
  39         for (x = 0; x < width; x++)
  40             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  41         dst += stride;
  42     }
  43 }
  44
  45 static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
  46                                                 ptrdiff_t stride, int size)
  47 {
  48     int x, y;
  49     pixel *dst = (pixel *)_dst;
  50
  51     stride /= sizeof(pixel);
  52
  53     for (y = 0; y < size; y++) {
  54         for (x = 0; x < size; x++) {
  55             dst[x] = av_clip_pixel(dst[x] + *res);
  56             res++;
  57         }
  58         dst += stride;
  59     }
  60 }
  61
  62 static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
  63                                   ptrdiff_t stride)
  64 {
  65     FUNC(add_residual)(_dst, res, stride, 4);
  66 }
  67
  68 static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
  69                                   ptrdiff_t stride)
  70 {
  71     FUNC(add_residual)(_dst, res, stride, 8);
  72 }
  73
  74 static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
  75                                     ptrdiff_t stride)
  76 {
  77     FUNC(add_residual)(_dst, res, stride, 16);
  78 }
  79
  80 static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
  81                                     ptrdiff_t stride)
  82 {
  83     FUNC(add_residual)(_dst, res, stride, 32);
  84 }
  85
  86
  87 static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
  88 {
  89     int16_t *coeffs = (int16_t *) _coeffs;
  90     int x, y;
  91     int size = 1 << log2_size;
  92
  93     if (mode) {
  94         coeffs += size;
  95         for (y = 0; y < size - 1; y++) {
  96             for (x = 0; x < size; x++)
  97                 coeffs[x] += coeffs[x - size];
  98             coeffs += size;
  99         }
 100     } else {
 101         for (y = 0; y < size; y++) {
 102             for (x = 1; x < size; x++)
 103                 coeffs[x] += coeffs[x - 1];
 104             coeffs += size;
 105         }
 106     }
 107 }
 108
 109 static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
 110 {
 111     int shift  = 15 - BIT_DEPTH - log2_size;
 112     int x, y;
 113     int size = 1 << log2_size;
 114
 115     if (shift > 0) {
 116         int offset = 1 << (shift - 1);
 117         for (y = 0; y < size; y++) {
 118             for (x = 0; x < size; x++) {
 119                 *coeffs = (*coeffs + offset) >> shift;
 120                 coeffs++;
 121             }
 122         }
 123     } else {
 124         for (y = 0; y < size; y++) {
 125             for (x = 0; x < size; x++) {
 126                 *coeffs = *coeffs << -shift;
 127                 coeffs++;
 128             }
 129         }
 130     }
 131 }
 132
 133 #define SET(dst, x)   (dst) = (x)
 134 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 135
 136 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 137     do {                                                                \
 138         int c0 = src[0 * step] + src[2 * step];                         \
 139         int c1 = src[2 * step] + src[3 * step];                         \
 140         int c2 = src[0 * step] - src[3 * step];                         \
 141         int c3 = 74 * src[1 * step];                                    \
 142                                                                         \
 143         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 144                                     src[2 * step] +                     \
 145                                     src[3 * step]));                    \
 146         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 147         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 148         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 149     } while (0)
 150
 151 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 152 {
 153     int i;
 154     int shift    = 7;
 155     int add      = 1 << (shift - 1);
 156     int16_t *src = coeffs;
 157
 158     for (i = 0; i < 4; i++) {
 159         TR_4x4_LUMA(src, src, 4, SCALE);
 160         src++;
 161     }
 162
 163     shift = 20 - BIT_DEPTH;
 164     add   = 1 << (shift - 1);
 165     for (i = 0; i < 4; i++) {
 166         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 167         coeffs += 4;
 168     }
 169 }
 170
 171 #undef TR_4x4_LUMA
 172
 173 #define TR_4(dst, src, dstep, sstep, assign, end)                              \
 174     do {                                                                       \
 175         const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
 176         const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
 177         const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
 178         const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
 179                                                                                \
 180         assign(dst[0 * dstep], e0 + o0);                                       \
 181         assign(dst[1 * dstep], e1 + o1);                                       \
 182         assign(dst[2 * dstep], e1 - o1);                                       \
 183         assign(dst[3 * dstep], e0 - o0);                                       \
 184     } while (0)
 185
 186 #define TR_8(dst, src, dstep, sstep, assign, end)                              \
 187     do {                                                                       \
 188         int i, j;                                                              \
 189         int e_8[4];                                                            \
 190         int o_8[4] = { 0 };                                                    \
 191         for (i = 0; i < 4; i++)                                                \
 192             for (j = 1; j < end; j += 2)                                       \
 193                 o_8[i] += transform[4 * j][i] * src[j * sstep];                \
 194         TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
 195                                                                                \
 196         for (i = 0; i < 4; i++) {                                              \
 197             assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
 198             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
 199         }                                                                      \
 200     } while (0)
 201
 202 #define TR_16(dst, src, dstep, sstep, assign, end)                             \
 203     do {                                                                       \
 204         int i, j;                                                              \
 205         int e_16[8];                                                           \
 206         int o_16[8] = { 0 };                                                   \
 207         for (i = 0; i < 8; i++)                                                \
 208             for (j = 1; j < end; j += 2)                                       \
 209                 o_16[i] += transform[2 * j][i] * src[j * sstep];               \
 210         TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
 211                                                                                \
 212         for (i = 0; i < 8; i++) {                                              \
 213             assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
 214             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
 215         }                                                                      \
 216     } while (0)
 217
 218 #define TR_32(dst, src, dstep, sstep, assign, end)                             \
 219     do {                                                                       \
 220         int i, j;                                                              \
 221         int e_32[16];                                                          \
 222         int o_32[16] = { 0 };                                                  \
 223         for (i = 0; i < 16; i++)                                               \
 224             for (j = 1; j < end; j += 2)                                       \
 225                 o_32[i] += transform[j][i] * src[j * sstep];                   \
 226         TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
 227                                                                                \
 228         for (i = 0; i < 16; i++) {                                             \
 229             assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
 230             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
 231         }                                                                      \
 232     } while (0)
 233
 234 #define IDCT_VAR4(H)                                                          \
 235     int      limit2   = FFMIN(col_limit + 4, H)
 236 #define IDCT_VAR8(H)                                                          \
 237         int      limit   = FFMIN(col_limit, H);                               \
 238         int      limit2   = FFMIN(col_limit + 4, H)
 239 #define IDCT_VAR16(H)   IDCT_VAR8(H)
 240 #define IDCT_VAR32(H)   IDCT_VAR8(H)
 241
 242 #define IDCT(H)                                                              \
 243 static void FUNC(idct_##H ##x ##H )(                                         \
 244                    int16_t *coeffs, int col_limit) {                         \
 245     int i;                                                                   \
 246     int      shift   = 7;                                                    \
 247     int      add     = 1 << (shift - 1);                                     \
 248     int16_t *src     = coeffs;                                               \
 249     IDCT_VAR ##H(H);                                                         \
 250                                                                              \
 251     for (i = 0; i < H; i++) {                                                \
 252         TR_ ## H(src, src, H, H, SCALE, limit2);                             \
 253         if (limit2 < H && i%4 == 0 && !!i)                                   \
 254             limit2 -= 4;                                                     \
 255         src++;                                                               \
 256     }                                                                        \
 257                                                                              \
 258     shift   = 20 - BIT_DEPTH;                                                \
 259     add     = 1 << (shift - 1);                                              \
 260     for (i = 0; i < H; i++) {                                                \
 261         TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
 262         coeffs += H;                                                         \
 263     }                                                                        \
 264 }
 265
 266 #define IDCT_DC(H)                                                           \
 267 static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 268                    int16_t *coeffs) {                                        \
 269     int i, j;                                                                \
 270     int      shift   = 14 - BIT_DEPTH;                                       \
 271     int      add     = 1 << (shift - 1);                                     \
 272     int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 273                                                                              \
 274     for (j = 0; j < H; j++) {                                                \
 275         for (i = 0; i < H; i++) {                                            \
 276             coeffs[i+j*H] = coeff;                                           \
 277         }                                                                    \
 278     }                                                                        \
 279 }
 280
 281 IDCT( 4)
 282 IDCT( 8)
 283 IDCT(16)
 284 IDCT(32)
 285
 286 IDCT_DC( 4)
 287 IDCT_DC( 8)
 288 IDCT_DC(16)
 289 IDCT_DC(32)
 290
 291 #undef TR_4
 292 #undef TR_8
 293 #undef TR_16
 294 #undef TR_32
 295
 296 #undef SET
 297 #undef SCALE
 298
 299 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
 300                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
 301                                   int16_t *sao_offset_val, int sao_left_class,
 302                                   int width, int height)
 303 {
 304     pixel *dst = (pixel *)_dst;
 305     pixel *src = (pixel *)_src;
 306     int offset_table[32] = { 0 };
 307     int k, y, x;
 308     int shift  = BIT_DEPTH - 5;
 309
 310     stride_dst /= sizeof(pixel);
 311     stride_src /= sizeof(pixel);
 312
 313     for (k = 0; k < 4; k++)
 314         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 315     for (y = 0; y < height; y++) {
 316         for (x = 0; x < width; x++)
 317             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 318         dst += stride_dst;
 319         src += stride_src;
 320     }
 321 }
 322
 323 #define CMP(a, b) (((a) > (b)) - ((a) < (b)))
 324
 325 static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
 326                                   int eo, int width, int height) {
 327
 328     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 329     static const int8_t pos[4][2][2] = {
 330         { { -1,  0 }, {  1, 0 } }, // horizontal
 331         { {  0, -1 }, {  0, 1 } }, // vertical
 332         { { -1, -1 }, {  1, 1 } }, // 45 degree
 333         { {  1, -1 }, { -1, 1 } }, // 135 degree
 334     };
 335     pixel *dst = (pixel *)_dst;
 336     pixel *src = (pixel *)_src;
 337     int a_stride, b_stride;
 338     int x, y;
 339     ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
 340     stride_dst /= sizeof(pixel);
 341
 342     a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
 343     b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
 344     for (y = 0; y < height; y++) {
 345         for (x = 0; x < width; x++) {
 346             int diff0 = CMP(src[x], src[x + a_stride]);
 347             int diff1 = CMP(src[x], src[x + b_stride]);
 348             int offset_val        = edge_idx[2 + diff0 + diff1];
 349             dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
 350         }
 351         src += stride_src;
 352         dst += stride_dst;
 353     }
 354 }
 355
 356 static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
 357                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 358                                     int *borders, int _width, int _height,
 359                                     int c_idx, uint8_t *vert_edge,
 360                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 361 {
 362     int x, y;
 363     pixel *dst = (pixel *)_dst;
 364     pixel *src = (pixel *)_src;
 365     int16_t *sao_offset_val = sao->offset_val[c_idx];
 366     int sao_eo_class    = sao->eo_class[c_idx];
 367     int init_x = 0, width = _width, height = _height;
 368
 369     stride_dst /= sizeof(pixel);
 370     stride_src /= sizeof(pixel);
 371
 372     if (sao_eo_class != SAO_EO_VERT) {
 373         if (borders[0]) {
 374             int offset_val = sao_offset_val[0];
 375             for (y = 0; y < height; y++) {
 376                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 377             }
 378             init_x = 1;
 379         }
 380         if (borders[2]) {
 381             int offset_val = sao_offset_val[0];
 382             int offset     = width - 1;
 383             for (x = 0; x < height; x++) {
 384                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 385             }
 386             width--;
 387         }
 388     }
 389     if (sao_eo_class != SAO_EO_HORIZ) {
 390         if (borders[1]) {
 391             int offset_val = sao_offset_val[0];
 392             for (x = init_x; x < width; x++)
 393                 dst[x] = av_clip_pixel(src[x] + offset_val);
 394         }
 395         if (borders[3]) {
 396             int offset_val   = sao_offset_val[0];
 397             int y_stride_dst = stride_dst * (height - 1);
 398             int y_stride_src = stride_src * (height - 1);
 399             for (x = init_x; x < width; x++)
 400                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 401             height--;
 402         }
 403     }
 404 }
 405
 406 static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
 407                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 408                                     int *borders, int _width, int _height,
 409                                     int c_idx, uint8_t *vert_edge,
 410                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 411 {
 412     int x, y;
 413     pixel *dst = (pixel *)_dst;
 414     pixel *src = (pixel *)_src;
 415     int16_t *sao_offset_val = sao->offset_val[c_idx];
 416     int sao_eo_class    = sao->eo_class[c_idx];
 417     int init_x = 0, init_y = 0, width = _width, height = _height;
 418
 419     stride_dst /= sizeof(pixel);
 420     stride_src /= sizeof(pixel);
 421
 422     if (sao_eo_class != SAO_EO_VERT) {
 423         if (borders[0]) {
 424             int offset_val = sao_offset_val[0];
 425             for (y = 0; y < height; y++) {
 426                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 427             }
 428             init_x = 1;
 429         }
 430         if (borders[2]) {
 431             int offset_val = sao_offset_val[0];
 432             int offset     = width - 1;
 433             for (x = 0; x < height; x++) {
 434                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 435             }
 436             width--;
 437         }
 438     }
 439     if (sao_eo_class != SAO_EO_HORIZ) {
 440         if (borders[1]) {
 441             int offset_val = sao_offset_val[0];
 442             for (x = init_x; x < width; x++)
 443                 dst[x] = av_clip_pixel(src[x] + offset_val);
 444             init_y = 1;
 445         }
 446         if (borders[3]) {
 447             int offset_val   = sao_offset_val[0];
 448             int y_stride_dst = stride_dst * (height - 1);
 449             int y_stride_src = stride_src * (height - 1);
 450             for (x = init_x; x < width; x++)
 451                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 452             height--;
 453         }
 454     }
 455
 456     {
 457         int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 458         int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
 459         int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
 460         int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
 461
 462         // Restore pixels that can't be modified
 463         if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
 464             for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
 465                 dst[y*stride_dst] = src[y*stride_src];
 466         }
 467         if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
 468             for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
 469                 dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
 470         }
 471
 472         if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
 473             for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
 474                 dst[x] = src[x];
 475         }
 476         if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
 477             for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
 478                 dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
 479         }
 480         if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
 481             dst[0] = src[0];
 482         if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
 483             dst[width-1] = src[width-1];
 484         if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
 485             dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
 486         if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
 487             dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
 488
 489     }
 490 }
 491
 492 #undef CMP
 493
 494 ////////////////////////////////////////////////////////////////////////////////
 495 //
 496 ////////////////////////////////////////////////////////////////////////////////
 497 static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
 498                                       uint8_t *_src, ptrdiff_t _srcstride,
 499                                       int height, intptr_t mx, intptr_t my, int width)
 500 {
 501     int x, y;
 502     pixel *src          = (pixel *)_src;
 503     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 504
 505     for (y = 0; y < height; y++) {
 506         for (x = 0; x < width; x++)
 507             dst[x] = src[x] << (14 - BIT_DEPTH);
 508         src += srcstride;
 509         dst += MAX_PB_SIZE;
 510     }
 511 }
 512
 513 static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 514                                           int height, intptr_t mx, intptr_t my, int width)
 515 {
 516     int y;
 517     pixel *src          = (pixel *)_src;
 518     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 519     pixel *dst          = (pixel *)_dst;
 520     ptrdiff_t dststride = _dststride / sizeof(pixel);
 521
 522     for (y = 0; y < height; y++) {
 523         memcpy(dst, src, width * sizeof(pixel));
 524         src += srcstride;
 525         dst += dststride;
 526     }
 527 }
 528
 529 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 530                                          int16_t *src2,
 531                                          int height, intptr_t mx, intptr_t my, int width)
 532 {
 533     int x, y;
 534     pixel *src          = (pixel *)_src;
 535     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 536     pixel *dst          = (pixel *)_dst;
 537     ptrdiff_t dststride = _dststride / sizeof(pixel);
 538
 539     int shift = 14  + 1 - BIT_DEPTH;
 540 #if BIT_DEPTH < 14
 541     int offset = 1 << (shift - 1);
 542 #else
 543     int offset = 0;
 544 #endif
 545
 546     for (y = 0; y < height; y++) {
 547         for (x = 0; x < width; x++)
 548             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
 549         src  += srcstride;
 550         dst  += dststride;
 551         src2 += MAX_PB_SIZE;
 552     }
 553 }
 554
 555 static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 556                                             int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
 557 {
 558     int x, y;
 559     pixel *src          = (pixel *)_src;
 560     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 561     pixel *dst          = (pixel *)_dst;
 562     ptrdiff_t dststride = _dststride / sizeof(pixel);
 563     int shift = denom + 14 - BIT_DEPTH;
 564 #if BIT_DEPTH < 14
 565     int offset = 1 << (shift - 1);
 566 #else
 567     int offset = 0;
 568 #endif
 569
 570     ox     = ox * (1 << (BIT_DEPTH - 8));
 571     for (y = 0; y < height; y++) {
 572         for (x = 0; x < width; x++)
 573             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
 574         src += srcstride;
 575         dst += dststride;
 576     }
 577 }
 578
 579 static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 580                                            int16_t *src2,
 581                                            int height, int denom, int wx0, int wx1,
 582                                            int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 583 {
 584     int x, y;
 585     pixel *src          = (pixel *)_src;
 586     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 587     pixel *dst          = (pixel *)_dst;
 588     ptrdiff_t dststride = _dststride / sizeof(pixel);
 589
 590     int shift = 14  + 1 - BIT_DEPTH;
 591     int log2Wd = denom + shift - 1;
 592
 593     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 594     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 595     for (y = 0; y < height; y++) {
 596         for (x = 0; x < width; x++) {
 597             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 598         }
 599         src  += srcstride;
 600         dst  += dststride;
 601         src2 += MAX_PB_SIZE;
 602     }
 603 }
 604
 605 ////////////////////////////////////////////////////////////////////////////////
 606 //
 607 ////////////////////////////////////////////////////////////////////////////////
 608 #define QPEL_FILTER(src, stride)                                               \
 609     (filter[0] * src[x - 3 * stride] +                                         \
 610      filter[1] * src[x - 2 * stride] +                                         \
 611      filter[2] * src[x -     stride] +                                         \
 612      filter[3] * src[x             ] +                                         \
 613      filter[4] * src[x +     stride] +                                         \
 614      filter[5] * src[x + 2 * stride] +                                         \
 615      filter[6] * src[x + 3 * stride] +                                         \
 616      filter[7] * src[x + 4 * stride])
 617
 618 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 619                                   uint8_t *_src, ptrdiff_t _srcstride,
 620                                   int height, intptr_t mx, intptr_t my, int width)
 621 {
 622     int x, y;
 623     pixel        *src       = (pixel*)_src;
 624     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 625     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 626     for (y = 0; y < height; y++) {
 627         for (x = 0; x < width; x++)
 628             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 629         src += srcstride;
 630         dst += MAX_PB_SIZE;
 631     }
 632 }
 633
 634 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 635                                   uint8_t *_src, ptrdiff_t _srcstride,
 636                                   int height, intptr_t mx, intptr_t my, int width)
 637 {
 638     int x, y;
 639     pixel        *src       = (pixel*)_src;
 640     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 641     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 642     for (y = 0; y < height; y++)  {
 643         for (x = 0; x < width; x++)
 644             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 645         src += srcstride;
 646         dst += MAX_PB_SIZE;
 647     }
 648 }
 649
 650 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 651                                    uint8_t *_src,
 652                                    ptrdiff_t _srcstride,
 653                                    int height, intptr_t mx,
 654                                    intptr_t my, int width)
 655 {
 656     int x, y;
 657     const int8_t *filter;
 658     pixel *src = (pixel*)_src;
 659     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 660     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 661     int16_t *tmp = tmp_array;
 662
 663     src   -= QPEL_EXTRA_BEFORE * srcstride;
 664     filter = ff_hevc_qpel_filters[mx - 1];
 665     for (y = 0; y < height + QPEL_EXTRA; y++) {
 666         for (x = 0; x < width; x++)
 667             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 668         src += srcstride;
 669         tmp += MAX_PB_SIZE;
 670     }
 671
 672     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 673     filter = ff_hevc_qpel_filters[my - 1];
 674     for (y = 0; y < height; y++) {
 675         for (x = 0; x < width; x++)
 676             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
 677         tmp += MAX_PB_SIZE;
 678         dst += MAX_PB_SIZE;
 679     }
 680 }
 681
 682 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 683                                       uint8_t *_src, ptrdiff_t _srcstride,
 684                                       int height, intptr_t mx, intptr_t my, int width)
 685 {
 686     int x, y;
 687     pixel        *src       = (pixel*)_src;
 688     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 689     pixel *dst          = (pixel *)_dst;
 690     ptrdiff_t dststride = _dststride / sizeof(pixel);
 691     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 692     int shift = 14 - BIT_DEPTH;
 693
 694 #if BIT_DEPTH < 14
 695     int offset = 1 << (shift - 1);
 696 #else
 697     int offset = 0;
 698 #endif
 699
 700     for (y = 0; y < height; y++) {
 701         for (x = 0; x < width; x++)
 702             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
 703         src += srcstride;
 704         dst += dststride;
 705     }
 706 }
 707
 708 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 709                                      int16_t *src2,
 710                                      int height, intptr_t mx, intptr_t my, int width)
 711 {
 712     int x, y;
 713     pixel        *src       = (pixel*)_src;
 714     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 715     pixel *dst          = (pixel *)_dst;
 716     ptrdiff_t dststride = _dststride / sizeof(pixel);
 717
 718     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 719
 720     int shift = 14  + 1 - BIT_DEPTH;
 721 #if BIT_DEPTH < 14
 722     int offset = 1 << (shift - 1);
 723 #else
 724     int offset = 0;
 725 #endif
 726
 727     for (y = 0; y < height; y++) {
 728         for (x = 0; x < width; x++)
 729             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 730         src  += srcstride;
 731         dst  += dststride;
 732         src2 += MAX_PB_SIZE;
 733     }
 734 }
 735
 736 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 737                                      uint8_t *_src, ptrdiff_t _srcstride,
 738                                      int height, intptr_t mx, intptr_t my, int width)
 739 {
 740     int x, y;
 741     pixel        *src       = (pixel*)_src;
 742     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 743     pixel *dst          = (pixel *)_dst;
 744     ptrdiff_t dststride = _dststride / sizeof(pixel);
 745     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 746     int shift = 14 - BIT_DEPTH;
 747
 748 #if BIT_DEPTH < 14
 749     int offset = 1 << (shift - 1);
 750 #else
 751     int offset = 0;
 752 #endif
 753
 754     for (y = 0; y < height; y++) {
 755         for (x = 0; x < width; x++)
 756             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
 757         src += srcstride;
 758         dst += dststride;
 759     }
 760 }
 761
 762
 763 static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 764                                      int16_t *src2,
 765                                      int height, intptr_t mx, intptr_t my, int width)
 766 {
 767     int x, y;
 768     pixel        *src       = (pixel*)_src;
 769     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 770     pixel *dst          = (pixel *)_dst;
 771     ptrdiff_t dststride = _dststride / sizeof(pixel);
 772
 773     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 774
 775     int shift = 14 + 1 - BIT_DEPTH;
 776 #if BIT_DEPTH < 14
 777     int offset = 1 << (shift - 1);
 778 #else
 779     int offset = 0;
 780 #endif
 781
 782     for (y = 0; y < height; y++) {
 783         for (x = 0; x < width; x++)
 784             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 785         src  += srcstride;
 786         dst  += dststride;
 787         src2 += MAX_PB_SIZE;
 788     }
 789 }
 790
 791 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 792                                        uint8_t *_src, ptrdiff_t _srcstride,
 793                                        int height, intptr_t mx, intptr_t my, int width)
 794 {
 795     int x, y;
 796     const int8_t *filter;
 797     pixel *src = (pixel*)_src;
 798     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 799     pixel *dst          = (pixel *)_dst;
 800     ptrdiff_t dststride = _dststride / sizeof(pixel);
 801     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 802     int16_t *tmp = tmp_array;
 803     int shift =  14 - BIT_DEPTH;
 804
 805 #if BIT_DEPTH < 14
 806     int offset = 1 << (shift - 1);
 807 #else
 808     int offset = 0;
 809 #endif
 810
 811     src   -= QPEL_EXTRA_BEFORE * srcstride;
 812     filter = ff_hevc_qpel_filters[mx - 1];
 813     for (y = 0; y < height + QPEL_EXTRA; y++) {
 814         for (x = 0; x < width; x++)
 815             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 816         src += srcstride;
 817         tmp += MAX_PB_SIZE;
 818     }
 819
 820     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 821     filter = ff_hevc_qpel_filters[my - 1];
 822
 823     for (y = 0; y < height; y++) {
 824         for (x = 0; x < width; x++)
 825             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
 826         tmp += MAX_PB_SIZE;
 827         dst += dststride;
 828     }
 829 }
 830
 831 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 832                                       int16_t *src2,
 833                                       int height, intptr_t mx, intptr_t my, int width)
 834 {
 835     int x, y;
 836     const int8_t *filter;
 837     pixel *src = (pixel*)_src;
 838     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 839     pixel *dst          = (pixel *)_dst;
 840     ptrdiff_t dststride = _dststride / sizeof(pixel);
 841     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 842     int16_t *tmp = tmp_array;
 843     int shift = 14 + 1 - BIT_DEPTH;
 844 #if BIT_DEPTH < 14
 845     int offset = 1 << (shift - 1);
 846 #else
 847     int offset = 0;
 848 #endif
 849
 850     src   -= QPEL_EXTRA_BEFORE * srcstride;
 851     filter = ff_hevc_qpel_filters[mx - 1];
 852     for (y = 0; y < height + QPEL_EXTRA; y++) {
 853         for (x = 0; x < width; x++)
 854             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 855         src += srcstride;
 856         tmp += MAX_PB_SIZE;
 857     }
 858
 859     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 860     filter = ff_hevc_qpel_filters[my - 1];
 861
 862     for (y = 0; y < height; y++) {
 863         for (x = 0; x < width; x++)
 864             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
 865         tmp  += MAX_PB_SIZE;
 866         dst  += dststride;
 867         src2 += MAX_PB_SIZE;
 868     }
 869 }
 870
 871 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 872                                         uint8_t *_src, ptrdiff_t _srcstride,
 873                                         int height, int denom, int wx, int ox,
 874                                         intptr_t mx, intptr_t my, int width)
 875 {
 876     int x, y;
 877     pixel        *src       = (pixel*)_src;
 878     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 879     pixel *dst          = (pixel *)_dst;
 880     ptrdiff_t dststride = _dststride / sizeof(pixel);
 881     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 882     int shift = denom + 14 - BIT_DEPTH;
 883 #if BIT_DEPTH < 14
 884     int offset = 1 << (shift - 1);
 885 #else
 886     int offset = 0;
 887 #endif
 888
 889     ox = ox * (1 << (BIT_DEPTH - 8));
 890     for (y = 0; y < height; y++) {
 891         for (x = 0; x < width; x++)
 892             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 893         src += srcstride;
 894         dst += dststride;
 895     }
 896 }
 897
 898 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 899                                        int16_t *src2,
 900                                        int height, int denom, int wx0, int wx1,
 901                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 902 {
 903     int x, y;
 904     pixel        *src       = (pixel*)_src;
 905     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 906     pixel *dst          = (pixel *)_dst;
 907     ptrdiff_t dststride = _dststride / sizeof(pixel);
 908
 909     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 910
 911     int shift = 14  + 1 - BIT_DEPTH;
 912     int log2Wd = denom + shift - 1;
 913
 914     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 915     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 916     for (y = 0; y < height; y++) {
 917         for (x = 0; x < width; x++)
 918             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 919                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 920         src  += srcstride;
 921         dst  += dststride;
 922         src2 += MAX_PB_SIZE;
 923     }
 924 }
 925
 926 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 927                                         uint8_t *_src, ptrdiff_t _srcstride,
 928                                         int height, int denom, int wx, int ox,
 929                                         intptr_t mx, intptr_t my, int width)
 930 {
 931     int x, y;
 932     pixel        *src       = (pixel*)_src;
 933     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 934     pixel *dst          = (pixel *)_dst;
 935     ptrdiff_t dststride = _dststride / sizeof(pixel);
 936     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 937     int shift = denom + 14 - BIT_DEPTH;
 938 #if BIT_DEPTH < 14
 939     int offset = 1 << (shift - 1);
 940 #else
 941     int offset = 0;
 942 #endif
 943
 944     ox = ox * (1 << (BIT_DEPTH - 8));
 945     for (y = 0; y < height; y++) {
 946         for (x = 0; x < width; x++)
 947             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 948         src += srcstride;
 949         dst += dststride;
 950     }
 951 }
 952
 953 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 954                                        int16_t *src2,
 955                                        int height, int denom, int wx0, int wx1,
 956                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 957 {
 958     int x, y;
 959     pixel        *src       = (pixel*)_src;
 960     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 961     pixel *dst          = (pixel *)_dst;
 962     ptrdiff_t dststride = _dststride / sizeof(pixel);
 963
 964     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 965
 966     int shift = 14 + 1 - BIT_DEPTH;
 967     int log2Wd = denom + shift - 1;
 968
 969     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 970     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 971     for (y = 0; y < height; y++) {
 972         for (x = 0; x < width; x++)
 973             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 974                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 975         src  += srcstride;
 976         dst  += dststride;
 977         src2 += MAX_PB_SIZE;
 978     }
 979 }
 980
 981 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 982                                          uint8_t *_src, ptrdiff_t _srcstride,
 983                                          int height, int denom, int wx, int ox,
 984                                          intptr_t mx, intptr_t my, int width)
 985 {
 986     int x, y;
 987     const int8_t *filter;
 988     pixel *src = (pixel*)_src;
 989     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 990     pixel *dst          = (pixel *)_dst;
 991     ptrdiff_t dststride = _dststride / sizeof(pixel);
 992     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 993     int16_t *tmp = tmp_array;
 994     int shift = denom + 14 - BIT_DEPTH;
 995 #if BIT_DEPTH < 14
 996     int offset = 1 << (shift - 1);
 997 #else
 998     int offset = 0;
 999 #endif
1000
1001     src   -= QPEL_EXTRA_BEFORE * srcstride;
1002     filter = ff_hevc_qpel_filters[mx - 1];
1003     for (y = 0; y < height + QPEL_EXTRA; y++) {
1004         for (x = 0; x < width; x++)
1005             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1006         src += srcstride;
1007         tmp += MAX_PB_SIZE;
1008     }
1009
1010     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1011     filter = ff_hevc_qpel_filters[my - 1];
1012
1013     ox = ox * (1 << (BIT_DEPTH - 8));
1014     for (y = 0; y < height; y++) {
1015         for (x = 0; x < width; x++)
1016             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1017         tmp += MAX_PB_SIZE;
1018         dst += dststride;
1019     }
1020 }
1021
1022 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1023                                         int16_t *src2,
1024                                         int height, int denom, int wx0, int wx1,
1025                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1026 {
1027     int x, y;
1028     const int8_t *filter;
1029     pixel *src = (pixel*)_src;
1030     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1031     pixel *dst          = (pixel *)_dst;
1032     ptrdiff_t dststride = _dststride / sizeof(pixel);
1033     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1034     int16_t *tmp = tmp_array;
1035     int shift = 14 + 1 - BIT_DEPTH;
1036     int log2Wd = denom + shift - 1;
1037
1038     src   -= QPEL_EXTRA_BEFORE * srcstride;
1039     filter = ff_hevc_qpel_filters[mx - 1];
1040     for (y = 0; y < height + QPEL_EXTRA; y++) {
1041         for (x = 0; x < width; x++)
1042             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1043         src += srcstride;
1044         tmp += MAX_PB_SIZE;
1045     }
1046
1047     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1048     filter = ff_hevc_qpel_filters[my - 1];
1049
1050     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1051     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1052     for (y = 0; y < height; y++) {
1053         for (x = 0; x < width; x++)
1054             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1055                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1056         tmp  += MAX_PB_SIZE;
1057         dst  += dststride;
1058         src2 += MAX_PB_SIZE;
1059     }
1060 }
1061
1062 ////////////////////////////////////////////////////////////////////////////////
1063 //
1064 ////////////////////////////////////////////////////////////////////////////////
/*
 * 4-tap filter around sample x of src: taps sit at x - stride, x,
 * x + stride and x + 2 * stride and are weighted by filter[0..3].
 * `filter` and `x` are taken from the scope where the macro is expanded.
 * Both arguments are parenthesized so expressions can be passed safely.
 */
#define EPEL_FILTER(src, stride)                                               \
    (filter[0] * (src)[x - (stride)]     +                                     \
     filter[1] * (src)[x]                +                                     \
     filter[2] * (src)[x + (stride)]     +                                     \
     filter[3] * (src)[x + 2 * (stride)])
1070
1071 static void FUNC(put_hevc_epel_h)(int16_t *dst,
1072                                   uint8_t *_src, ptrdiff_t _srcstride,
1073                                   int height, intptr_t mx, intptr_t my, int width)
1074 {
1075     int x, y;
1076     pixel *src = (pixel *)_src;
1077     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1078     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1079     for (y = 0; y < height; y++) {
1080         for (x = 0; x < width; x++)
1081             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1082         src += srcstride;
1083         dst += MAX_PB_SIZE;
1084     }
1085 }
1086
1087 static void FUNC(put_hevc_epel_v)(int16_t *dst,
1088                                   uint8_t *_src, ptrdiff_t _srcstride,
1089                                   int height, intptr_t mx, intptr_t my, int width)
1090 {
1091     int x, y;
1092     pixel *src = (pixel *)_src;
1093     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1094     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1095
1096     for (y = 0; y < height; y++) {
1097         for (x = 0; x < width; x++)
1098             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1099         src += srcstride;
1100         dst += MAX_PB_SIZE;
1101     }
1102 }
1103
1104 static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1105                                    uint8_t *_src, ptrdiff_t _srcstride,
1106                                    int height, intptr_t mx, intptr_t my, int width)
1107 {
1108     int x, y;
1109     pixel *src = (pixel *)_src;
1110     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1111     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1112     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1113     int16_t *tmp = tmp_array;
1114
1115     src -= EPEL_EXTRA_BEFORE * srcstride;
1116
1117     for (y = 0; y < height + EPEL_EXTRA; y++) {
1118         for (x = 0; x < width; x++)
1119             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1120         src += srcstride;
1121         tmp += MAX_PB_SIZE;
1122     }
1123
1124     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1125     filter = ff_hevc_epel_filters[my - 1];
1126
1127     for (y = 0; y < height; y++) {
1128         for (x = 0; x < width; x++)
1129             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1130         tmp += MAX_PB_SIZE;
1131         dst += MAX_PB_SIZE;
1132     }
1133 }
1134
1135 static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1136                                       int height, intptr_t mx, intptr_t my, int width)
1137 {
1138     int x, y;
1139     pixel *src = (pixel *)_src;
1140     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1141     pixel *dst          = (pixel *)_dst;
1142     ptrdiff_t dststride = _dststride / sizeof(pixel);
1143     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1144     int shift = 14 - BIT_DEPTH;
1145 #if BIT_DEPTH < 14
1146     int offset = 1 << (shift - 1);
1147 #else
1148     int offset = 0;
1149 #endif
1150
1151     for (y = 0; y < height; y++) {
1152         for (x = 0; x < width; x++)
1153             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1154         src += srcstride;
1155         dst += dststride;
1156     }
1157 }
1158
1159 static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1160                                      int16_t *src2,
1161                                      int height, intptr_t mx, intptr_t my, int width)
1162 {
1163     int x, y;
1164     pixel *src = (pixel *)_src;
1165     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1166     pixel *dst          = (pixel *)_dst;
1167     ptrdiff_t dststride = _dststride / sizeof(pixel);
1168     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1169     int shift = 14 + 1 - BIT_DEPTH;
1170 #if BIT_DEPTH < 14
1171     int offset = 1 << (shift - 1);
1172 #else
1173     int offset = 0;
1174 #endif
1175
1176     for (y = 0; y < height; y++) {
1177         for (x = 0; x < width; x++) {
1178             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1179         }
1180         dst  += dststride;
1181         src  += srcstride;
1182         src2 += MAX_PB_SIZE;
1183     }
1184 }
1185
1186 static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1187                                       int height, intptr_t mx, intptr_t my, int width)
1188 {
1189     int x, y;
1190     pixel *src = (pixel *)_src;
1191     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1192     pixel *dst          = (pixel *)_dst;
1193     ptrdiff_t dststride = _dststride / sizeof(pixel);
1194     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1195     int shift = 14 - BIT_DEPTH;
1196 #if BIT_DEPTH < 14
1197     int offset = 1 << (shift - 1);
1198 #else
1199     int offset = 0;
1200 #endif
1201
1202     for (y = 0; y < height; y++) {
1203         for (x = 0; x < width; x++)
1204             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1205         src += srcstride;
1206         dst += dststride;
1207     }
1208 }
1209
1210 static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1211                                      int16_t *src2,
1212                                      int height, intptr_t mx, intptr_t my, int width)
1213 {
1214     int x, y;
1215     pixel *src = (pixel *)_src;
1216     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1217     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1218     pixel *dst          = (pixel *)_dst;
1219     ptrdiff_t dststride = _dststride / sizeof(pixel);
1220     int shift = 14 + 1 - BIT_DEPTH;
1221 #if BIT_DEPTH < 14
1222     int offset = 1 << (shift - 1);
1223 #else
1224     int offset = 0;
1225 #endif
1226
1227     for (y = 0; y < height; y++) {
1228         for (x = 0; x < width; x++)
1229             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1230         dst  += dststride;
1231         src  += srcstride;
1232         src2 += MAX_PB_SIZE;
1233     }
1234 }
1235
1236 static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1237                                        int height, intptr_t mx, intptr_t my, int width)
1238 {
1239     int x, y;
1240     pixel *src = (pixel *)_src;
1241     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1242     pixel *dst          = (pixel *)_dst;
1243     ptrdiff_t dststride = _dststride / sizeof(pixel);
1244     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1245     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1246     int16_t *tmp = tmp_array;
1247     int shift = 14 - BIT_DEPTH;
1248 #if BIT_DEPTH < 14
1249     int offset = 1 << (shift - 1);
1250 #else
1251     int offset = 0;
1252 #endif
1253
1254     src -= EPEL_EXTRA_BEFORE * srcstride;
1255
1256     for (y = 0; y < height + EPEL_EXTRA; y++) {
1257         for (x = 0; x < width; x++)
1258             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1259         src += srcstride;
1260         tmp += MAX_PB_SIZE;
1261     }
1262
1263     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1264     filter = ff_hevc_epel_filters[my - 1];
1265
1266     for (y = 0; y < height; y++) {
1267         for (x = 0; x < width; x++)
1268             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1269         tmp += MAX_PB_SIZE;
1270         dst += dststride;
1271     }
1272 }
1273
1274 static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1275                                       int16_t *src2,
1276                                       int height, intptr_t mx, intptr_t my, int width)
1277 {
1278     int x, y;
1279     pixel *src = (pixel *)_src;
1280     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1281     pixel *dst          = (pixel *)_dst;
1282     ptrdiff_t dststride = _dststride / sizeof(pixel);
1283     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1284     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1285     int16_t *tmp = tmp_array;
1286     int shift = 14 + 1 - BIT_DEPTH;
1287 #if BIT_DEPTH < 14
1288     int offset = 1 << (shift - 1);
1289 #else
1290     int offset = 0;
1291 #endif
1292
1293     src -= EPEL_EXTRA_BEFORE * srcstride;
1294
1295     for (y = 0; y < height + EPEL_EXTRA; y++) {
1296         for (x = 0; x < width; x++)
1297             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1298         src += srcstride;
1299         tmp += MAX_PB_SIZE;
1300     }
1301
1302     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1303     filter = ff_hevc_epel_filters[my - 1];
1304
1305     for (y = 0; y < height; y++) {
1306         for (x = 0; x < width; x++)
1307             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1308         tmp  += MAX_PB_SIZE;
1309         dst  += dststride;
1310         src2 += MAX_PB_SIZE;
1311     }
1312 }
1313
1314 static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1315                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1316 {
1317     int x, y;
1318     pixel *src = (pixel *)_src;
1319     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1320     pixel *dst          = (pixel *)_dst;
1321     ptrdiff_t dststride = _dststride / sizeof(pixel);
1322     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1323     int shift = denom + 14 - BIT_DEPTH;
1324 #if BIT_DEPTH < 14
1325     int offset = 1 << (shift - 1);
1326 #else
1327     int offset = 0;
1328 #endif
1329
1330     ox     = ox * (1 << (BIT_DEPTH - 8));
1331     for (y = 0; y < height; y++) {
1332         for (x = 0; x < width; x++) {
1333             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1334         }
1335         dst += dststride;
1336         src += srcstride;
1337     }
1338 }
1339
1340 static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1341                                        int16_t *src2,
1342                                        int height, int denom, int wx0, int wx1,
1343                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1344 {
1345     int x, y;
1346     pixel *src = (pixel *)_src;
1347     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1348     pixel *dst          = (pixel *)_dst;
1349     ptrdiff_t dststride = _dststride / sizeof(pixel);
1350     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1351     int shift = 14 + 1 - BIT_DEPTH;
1352     int log2Wd = denom + shift - 1;
1353
1354     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1355     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1356     for (y = 0; y < height; y++) {
1357         for (x = 0; x < width; x++)
1358             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1359                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1360         src  += srcstride;
1361         dst  += dststride;
1362         src2 += MAX_PB_SIZE;
1363     }
1364 }
1365
1366 static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1367                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1368 {
1369     int x, y;
1370     pixel *src = (pixel *)_src;
1371     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1372     pixel *dst          = (pixel *)_dst;
1373     ptrdiff_t dststride = _dststride / sizeof(pixel);
1374     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1375     int shift = denom + 14 - BIT_DEPTH;
1376 #if BIT_DEPTH < 14
1377     int offset = 1 << (shift - 1);
1378 #else
1379     int offset = 0;
1380 #endif
1381
1382     ox     = ox * (1 << (BIT_DEPTH - 8));
1383     for (y = 0; y < height; y++) {
1384         for (x = 0; x < width; x++) {
1385             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1386         }
1387         dst += dststride;
1388         src += srcstride;
1389     }
1390 }
1391
1392 static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1393                                        int16_t *src2,
1394                                        int height, int denom, int wx0, int wx1,
1395                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1396 {
1397     int x, y;
1398     pixel *src = (pixel *)_src;
1399     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1400     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1401     pixel *dst          = (pixel *)_dst;
1402     ptrdiff_t dststride = _dststride / sizeof(pixel);
1403     int shift = 14 + 1 - BIT_DEPTH;
1404     int log2Wd = denom + shift - 1;
1405
1406     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1407     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1408     for (y = 0; y < height; y++) {
1409         for (x = 0; x < width; x++)
1410             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1411                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1412         src  += srcstride;
1413         dst  += dststride;
1414         src2 += MAX_PB_SIZE;
1415     }
1416 }
1417
1418 static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1419                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1420 {
1421     int x, y;
1422     pixel *src = (pixel *)_src;
1423     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1424     pixel *dst          = (pixel *)_dst;
1425     ptrdiff_t dststride = _dststride / sizeof(pixel);
1426     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1427     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1428     int16_t *tmp = tmp_array;
1429     int shift = denom + 14 - BIT_DEPTH;
1430 #if BIT_DEPTH < 14
1431     int offset = 1 << (shift - 1);
1432 #else
1433     int offset = 0;
1434 #endif
1435
1436     src -= EPEL_EXTRA_BEFORE * srcstride;
1437
1438     for (y = 0; y < height + EPEL_EXTRA; y++) {
1439         for (x = 0; x < width; x++)
1440             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1441         src += srcstride;
1442         tmp += MAX_PB_SIZE;
1443     }
1444
1445     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1446     filter = ff_hevc_epel_filters[my - 1];
1447
1448     ox     = ox * (1 << (BIT_DEPTH - 8));
1449     for (y = 0; y < height; y++) {
1450         for (x = 0; x < width; x++)
1451             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1452         tmp += MAX_PB_SIZE;
1453         dst += dststride;
1454     }
1455 }
1456
1457 static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1458                                         int16_t *src2,
1459                                         int height, int denom, int wx0, int wx1,
1460                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1461 {
1462     int x, y;
1463     pixel *src = (pixel *)_src;
1464     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1465     pixel *dst          = (pixel *)_dst;
1466     ptrdiff_t dststride = _dststride / sizeof(pixel);
1467     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1468     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1469     int16_t *tmp = tmp_array;
1470     int shift = 14 + 1 - BIT_DEPTH;
1471     int log2Wd = denom + shift - 1;
1472
1473     src -= EPEL_EXTRA_BEFORE * srcstride;
1474
1475     for (y = 0; y < height + EPEL_EXTRA; y++) {
1476         for (x = 0; x < width; x++)
1477             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1478         src += srcstride;
1479         tmp += MAX_PB_SIZE;
1480     }
1481
1482     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1483     filter = ff_hevc_epel_filters[my - 1];
1484
1485     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1486     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1487     for (y = 0; y < height; y++) {
1488         for (x = 0; x < width; x++)
1489             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1490                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1491         tmp  += MAX_PB_SIZE;
1492         dst  += dststride;
1493         src2 += MAX_PB_SIZE;
1494     }
1495 }
1496
/*
 * Line zero: samples addressed relative to pix along xstride.
 * P0..P3 lie at negative offsets, Q0..Q3 at zero and positive offsets.
 * `pix` and `xstride` are taken from the scope where the macro is expanded.
 */
#define P3 (pix[-4 * xstride])
#define P2 (pix[-3 * xstride])
#define P1 (pix[-2 * xstride])
#define P0 (pix[-1 * xstride])
#define Q0 (pix[0 * xstride])
#define Q1 (pix[1 * xstride])
#define Q2 (pix[2 * xstride])
#define Q3 (pix[3 * xstride])

/*
 * Line three: the same positions displaced by 3 * ystride.
 * Used only for the deblocking decision.
 */
#define TP3 (pix[-4 * xstride + 3 * ystride])
#define TP2 (pix[-3 * xstride + 3 * ystride])
#define TP1 (pix[-2 * xstride + 3 * ystride])
#define TP0 (pix[-1 * xstride + 3 * ystride])
#define TQ0 (pix[0  * xstride + 3 * ystride])
#define TQ1 (pix[1  * xstride + 3 * ystride])
#define TQ2 (pix[2  * xstride + 3 * ystride])
#define TQ3 (pix[3  * xstride + 3 * ystride])
1516
1517 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1518                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1519                                         int beta, int *_tc,
1520                                         uint8_t *_no_p, uint8_t *_no_q)
1521 {
1522     int d, j;
1523     pixel *pix        = (pixel *)_pix;
1524     ptrdiff_t xstride = _xstride / sizeof(pixel);
1525     ptrdiff_t ystride = _ystride / sizeof(pixel);
1526
1527     beta <<= BIT_DEPTH - 8;
1528
1529     for (j = 0; j < 2; j++) {
1530         const int dp0  = abs(P2  - 2 * P1  + P0);
1531         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1532         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1533         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1534         const int d0   = dp0 + dq0;
1535         const int d3   = dp3 + dq3;
1536         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1537         const int no_p = _no_p[j];
1538         const int no_q = _no_q[j];
1539
1540         if (d0 + d3 >= beta) {
1541             pix += 4 * ystride;
1542             continue;
1543         } else {
1544             const int beta_3 = beta >> 3;
1545             const int beta_2 = beta >> 2;
1546             const int tc25   = ((tc * 5 + 1) >> 1);
1547
1548             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1549                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1550                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1551                 // strong filtering
1552                 const int tc2 = tc << 1;
1553                 for (d = 0; d < 4; d++) {
1554                     const int p3 = P3;
1555                     const int p2 = P2;
1556                     const int p1 = P1;
1557                     const int p0 = P0;
1558                     const int q0 = Q0;
1559                     const int q1 = Q1;
1560                     const int q2 = Q2;
1561                     const int q3 = Q3;
1562                     if (!no_p) {
1563                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1564                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1565                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1566                     }
1567                     if (!no_q) {
1568                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1569                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1570                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1571                     }
1572                     pix += ystride;
1573                 }
1574             } else { // normal filtering
1575                 int nd_p = 1;
1576                 int nd_q = 1;
1577                 const int tc_2 = tc >> 1;
1578                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1579                     nd_p = 2;
1580                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1581                     nd_q = 2;
1582
1583                 for (d = 0; d < 4; d++) {
1584                     const int p2 = P2;
1585                     const int p1 = P1;
1586                     const int p0 = P0;
1587                     const int q0 = Q0;
1588                     const int q1 = Q1;
1589                     const int q2 = Q2;
1590                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1591                     if (abs(delta0) < 10 * tc) {
1592                         delta0 = av_clip(delta0, -tc, tc);
1593                         if (!no_p)
1594                             P0 = av_clip_pixel(p0 + delta0);
1595                         if (!no_q)
1596                             Q0 = av_clip_pixel(q0 - delta0);
1597                         if (!no_p && nd_p > 1) {
1598                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1599                             P1 = av_clip_pixel(p1 + deltap1);
1600                         }
1601                         if (!no_q && nd_q > 1) {
1602                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1603                             Q1 = av_clip_pixel(q1 + deltaq1);
1604                         }
1605                     }
1606                     pix += ystride;
1607                 }
1608             }
1609         }
1610     }
1611 }
1612
1613 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1614                                           ptrdiff_t _ystride, int *_tc,
1615                                           uint8_t *_no_p, uint8_t *_no_q)
1616 {
1617     int d, j, no_p, no_q;
1618     pixel *pix        = (pixel *)_pix;
1619     ptrdiff_t xstride = _xstride / sizeof(pixel);
1620     ptrdiff_t ystride = _ystride / sizeof(pixel);
1621
1622     for (j = 0; j < 2; j++) {
1623         const int tc = _tc[j] << (BIT_DEPTH - 8);
1624         if (tc <= 0) {
1625             pix += 4 * ystride;
1626             continue;
1627         }
1628         no_p = _no_p[j];
1629         no_q = _no_q[j];
1630
1631         for (d = 0; d < 4; d++) {
1632             int delta0;
1633             const int p1 = P1;
1634             const int p0 = P0;
1635             const int q0 = Q0;
1636             const int q1 = Q1;
1637             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1638             if (!no_p)
1639                 P0 = av_clip_pixel(p0 + delta0);
1640             if (!no_q)
1641                 Q0 = av_clip_pixel(q0 - delta0);
1642             pix += ystride;
1643         }
1644     }
1645 }
1646
1647 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1648                                             int32_t *tc, uint8_t *no_p,
1649                                             uint8_t *no_q)
1650 {
1651     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1652 }
1653
1654 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1655                                             int32_t *tc, uint8_t *no_p,
1656                                             uint8_t *no_q)
1657 {
1658     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1659 }
1660
1661 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1662                                           int beta, int32_t *tc, uint8_t *no_p,
1663                                           uint8_t *no_q)
1664 {
1665     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1666                                 beta, tc, no_p, no_q);
1667 }
1668
1669 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1670                                           int beta, int32_t *tc, uint8_t *no_p,
1671                                           uint8_t *no_q)
1672 {
1673     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1674                                 beta, tc, no_p, no_q);
1675 }
1676
/* Remove the deblocking sample accessors so they do not stay defined past
 * this point. */

/* line zero */
#undef P0
#undef P1
#undef P2
#undef P3
#undef Q0
#undef Q1
#undef Q2
#undef Q3

/* line three */
#undef TP0
#undef TP1
#undef TP2
#undef TP3
#undef TQ0
#undef TQ1
#undef TQ2
#undef TQ3