git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27 #include "hevcdsp.h"
  28
  29
  30 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
  31                           GetBitContext *gb, int pcm_bit_depth)
  32 {
  33     int x, y;
  34     pixel *dst = (pixel *)_dst;
  35
  36     stride /= sizeof(pixel);
  37
  38     for (y = 0; y < height; y++) {
  39         for (x = 0; x < width; x++)
  40             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  41         dst += stride;
  42     }
  43 }
  44
  45 static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
  46                                                      ptrdiff_t stride, int size)
  47 {
  48     int x, y;
  49     pixel *dst = (pixel *)_dst;
  50
  51     stride /= sizeof(pixel);
  52
  53     for (y = 0; y < size; y++) {
  54         for (x = 0; x < size; x++) {
  55             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  56             coeffs++;
  57         }
  58         dst += stride;
  59     }
  60 }
  61
  62 static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
  63                                        ptrdiff_t stride)
  64 {
  65     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
  66 }
  67
  68 static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
  69                                        ptrdiff_t stride)
  70 {
  71     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
  72 }
  73
  74 static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
  75                                          ptrdiff_t stride)
  76 {
  77     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
  78 }
  79
  80 static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
  81                                          ptrdiff_t stride)
  82 {
  83     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
  84 }
  85
  86
  87 static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
  88 {
  89     int16_t *coeffs = (int16_t *) _coeffs;
  90     int x, y;
  91     int size = 1 << log2_size;
  92
  93     if (mode) {
  94         coeffs += size;
  95         for (y = 0; y < size - 1; y++) {
  96             for (x = 0; x < size; x++)
  97                 coeffs[x] += coeffs[x - size];
  98             coeffs += size;
  99         }
 100     } else {
 101         for (y = 0; y < size; y++) {
 102             for (x = 1; x < size; x++)
 103                 coeffs[x] += coeffs[x - 1];
 104             coeffs += size;
 105         }
 106     }
 107 }
 108
 109 static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 110 {
 111     int shift  = 15 - BIT_DEPTH - log2_size;
 112     int x, y;
 113     int size = 1 << log2_size;
 114     int16_t *coeffs = _coeffs;
 115
 116
 117     if (shift > 0) {
 118         int offset = 1 << (shift - 1);
 119         for (y = 0; y < size; y++) {
 120             for (x = 0; x < size; x++) {
 121                 *coeffs = (*coeffs + offset) >> shift;
 122                 coeffs++;
 123             }
 124         }
 125     } else {
 126         for (y = 0; y < size; y++) {
 127             for (x = 0; x < size; x++) {
 128                 *coeffs = *coeffs << -shift;
 129                 coeffs++;
 130             }
 131         }
 132     }
 133 }
 134
     /*
      * Output operators handed to the TR_* macros as their "assign" argument.
      * SET stores the value unchanged. SCALE and ADD_AND_SCALE read the
      * variables "add" and "shift" from the scope in which they are expanded:
      * SCALE stores the rounded, shifted value clipped with av_clip_int16();
      * ADD_AND_SCALE adds that value to dst and clips with av_clip_pixel().
      */
 135 #define SET(dst, x)   (dst) = (x)
 136 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 137 #define ADD_AND_SCALE(dst, x)                                           \
 138     (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 139
     /*
      * One 4-point pass of the 4x4 luma transform (constants 29, 55, 74).
      * Reads src[0..3 * step] and writes dst[0..3 * step] through "assign".
      * c0..c3 are computed before any output is written, and the dst[2]
      * expression is evaluated before dst[2] is stored, so dst may be the
      * same pointer as src.
      */
 140 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 141     do {                                                                \
 142         int c0 = src[0 * step] + src[2 * step];                         \
 143         int c1 = src[2 * step] + src[3 * step];                         \
 144         int c2 = src[0 * step] - src[3 * step];                         \
 145         int c3 = 74 * src[1 * step];                                    \
 146                                                                         \
 147         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 148                                     src[2 * step] +                     \
 149                                     src[3 * step]));                    \
 150         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 151         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 152         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 153     } while (0)
 154
 155 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 156 {
 157     int i;
 158     int shift    = 7;
 159     int add      = 1 << (shift - 1);
 160     int16_t *src = coeffs;
 161
 162     for (i = 0; i < 4; i++) {
 163         TR_4x4_LUMA(src, src, 4, SCALE);
 164         src++;
 165     }
 166
 167     shift = 20 - BIT_DEPTH;
 168     add   = 1 << (shift - 1);
 169     for (i = 0; i < 4; i++) {
 170         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 171         coeffs += 4;
 172     }
 173 }
 174
 175 #undef TR_4x4_LUMA
 176
     /*
      * 4-point transform pass (constants 64, 83, 36).
      * Reads src[0..3 * sstep], writes dst[0..3 * dstep] through "assign".
      * The "end" argument is accepted for signature parity with the larger
      * TR_* macros but is not used in the expansion.
      */
 177 #define TR_4(dst, src, dstep, sstep, assign, end)                              \
 178     do {                                                                       \
 179         const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
 180         const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
 181         const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
 182         const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
 183                                                                                \
 184         assign(dst[0 * dstep], e0 + o0);                                       \
 185         assign(dst[1 * dstep], e1 + o1);                                       \
 186         assign(dst[2 * dstep], e1 - o1);                                       \
 187         assign(dst[3 * dstep], e0 - o0);                                       \
 188     } while (0)
 189
     /*
      * 8-point transform pass.
      * The odd part o_8[] accumulates transform[4 * j][i] * src[j * sstep]
      * over odd j below "end"; the even part e_8[] is produced by TR_4 on
      * every second input. Outputs i and 7 - i are e_8[i] +/- o_8[i], written
      * through "assign". The "transform" table is not defined in this part of
      * the file. The macro-local i and j shadow any outer variables of the
      * same name.
      */
 190 #define TR_8(dst, src, dstep, sstep, assign, end)                              \
 191     do {                                                                       \
 192         int i, j;                                                              \
 193         int e_8[4];                                                            \
 194         int o_8[4] = { 0 };                                                    \
 195         for (i = 0; i < 4; i++)                                                \
 196             for (j = 1; j < end; j += 2)                                       \
 197                 o_8[i] += transform[4 * j][i] * src[j * sstep];                \
 198         TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
 199                                                                                \
 200         for (i = 0; i < 4; i++) {                                              \
 201             assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
 202             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
 203         }                                                                      \
 204     } while (0)
 205
     /*
      * 16-point transform pass.
      * The odd part o_16[] accumulates transform[2 * j][i] * src[j * sstep]
      * over odd j below "end"; the even part e_16[] is produced by TR_8 on
      * every second input (always with an end of 8). Outputs i and 15 - i are
      * e_16[i] +/- o_16[i], written through "assign".
      */
 206 #define TR_16(dst, src, dstep, sstep, assign, end)                             \
 207     do {                                                                       \
 208         int i, j;                                                              \
 209         int e_16[8];                                                           \
 210         int o_16[8] = { 0 };                                                   \
 211         for (i = 0; i < 8; i++)                                                \
 212             for (j = 1; j < end; j += 2)                                       \
 213                 o_16[i] += transform[2 * j][i] * src[j * sstep];               \
 214         TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
 215                                                                                \
 216         for (i = 0; i < 8; i++) {                                              \
 217             assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
 218             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
 219         }                                                                      \
 220     } while (0)
 221
     /*
      * 32-point transform pass.
      * The odd part o_32[] accumulates transform[j][i] * src[j * sstep] over
      * odd j below "end"; the even part e_32[] is produced by TR_16 on every
      * second input with an end of end/2. Outputs i and 31 - i are
      * e_32[i] +/- o_32[i], written through "assign".
      */
 222 #define TR_32(dst, src, dstep, sstep, assign, end)                             \
 223     do {                                                                       \
 224         int i, j;                                                              \
 225         int e_32[16];                                                          \
 226         int o_32[16] = { 0 };                                                  \
 227         for (i = 0; i < 16; i++)                                               \
 228             for (j = 1; j < end; j += 2)                                       \
 229                 o_32[i] += transform[j][i] * src[j * sstep];                   \
 230         TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
 231                                                                                \
 232         for (i = 0; i < 16; i++) {                                             \
 233             assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
 234             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
 235         }                                                                      \
 236     } while (0)
 237
     /*
      * Per-size local declarations pasted into IDCT(H).
      * The 4x4 variant declares only limit2 = min(col_limit + 4, H); IDCT(4)
      * still names "limit" in its second pass, which is harmless only because
      * TR_4 never expands its "end" argument.
      */
 238 #define IDCT_VAR4(H)                                                          \
 239     int      limit2   = FFMIN(col_limit + 4, H)
     /* 8x8 and larger: limit = min(col_limit, H), limit2 = min(col_limit + 4, H). */
 240 #define IDCT_VAR8(H)                                                          \
 241         int      limit   = FFMIN(col_limit, H);                               \
 242         int      limit2   = FFMIN(col_limit + 4, H)
 243 #define IDCT_VAR16(H)   IDCT_VAR8(H)
 244 #define IDCT_VAR32(H)   IDCT_VAR8(H)
 245
     /*
      * Defines idct_HxH(coeffs, col_limit): an in-place two-pass transform of
      * an H x H block using TR_H and SCALE.
      *
      * Pass 1 walks the H columns (element stride H) with shift 7 and bounds
      * the odd-part accumulation by limit2; while limit2 < H it is lowered by
      * 4 after each column whose index is a non-zero multiple of 4.
      * Pass 2 walks the H rows (element stride 1) with shift 20 - BIT_DEPTH
      * and bounds the accumulation by limit.
      *
      * "shift" and "add" must keep these names: SCALE reads them from here.
      */
 246 #define IDCT(H)                                                              \
 247 static void FUNC(idct_##H ##x ##H )(                                         \
 248                    int16_t *coeffs, int col_limit) {                         \
 249     int i;                                                                   \
 250     int      shift   = 7;                                                    \
 251     int      add     = 1 << (shift - 1);                                     \
 252     int16_t *src     = coeffs;                                               \
 253     IDCT_VAR ##H(H);                                                         \
 254                                                                              \
 255     for (i = 0; i < H; i++) {                                                \
 256         TR_ ## H(src, src, H, H, SCALE, limit2);                             \
 257         if (limit2 < H && i%4 == 0 && !!i)                                   \
 258             limit2 -= 4;                                                     \
 259         src++;                                                               \
 260     }                                                                        \
 261                                                                              \
 262     shift   = 20 - BIT_DEPTH;                                                \
 263     add     = 1 << (shift - 1);                                              \
 264     for (i = 0; i < H; i++) {                                                \
 265         TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
 266         coeffs += H;                                                         \
 267     }                                                                        \
 268 }
 269
     /*
      * Defines idct_HxH_dc(coeffs): DC-only shortcut that overwrites all
      * H * H coefficients with a single value derived from coeffs[0]:
      *     (((coeffs[0] + 1) >> 1) + add) >> shift
      * where shift = 14 - BIT_DEPTH and add = 1 << (shift - 1).
      */
 270 #define IDCT_DC(H)                                                           \
 271 static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 272                    int16_t *coeffs) {                                        \
 273     int i, j;                                                                \
 274     int      shift   = 14 - BIT_DEPTH;                                       \
 275     int      add     = 1 << (shift - 1);                                     \
 276     int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 277                                                                              \
 278     for (j = 0; j < H; j++) {                                                \
 279         for (i = 0; i < H; i++) {                                            \
 280             coeffs[i+j*H] = coeff;                                           \
 281         }                                                                    \
 282     }                                                                        \
 283 }
 284
     /* Instantiate the full two-pass transforms for 4x4, 8x8, 16x16 and 32x32. */
 285 IDCT( 4)
 286 IDCT( 8)
 287 IDCT(16)
 288 IDCT(32)
 289
     /* Instantiate the DC-only variants for the same block sizes. */
 290 IDCT_DC( 4)
 291 IDCT_DC( 8)
 292 IDCT_DC(16)
 293 IDCT_DC(32)
 294
     /* The transform helper macros are not needed past this point. */
 295 #undef TR_4
 296 #undef TR_8
 297 #undef TR_16
 298 #undef TR_32
 299
 300 #undef SET
 301 #undef SCALE
 302 #undef ADD_AND_SCALE
 303
 304 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
 305                                   ptrdiff_t stride_dst, ptrdiff_t stride_src,
 306                                   int16_t *sao_offset_val, int sao_left_class,
 307                                   int width, int height)
 308 {
 309     pixel *dst = (pixel *)_dst;
 310     pixel *src = (pixel *)_src;
 311     int offset_table[32] = { 0 };
 312     int k, y, x;
 313     int shift  = BIT_DEPTH - 5;
 314
 315     stride_dst /= sizeof(pixel);
 316     stride_src /= sizeof(pixel);
 317
 318     for (k = 0; k < 4; k++)
 319         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 320     for (y = 0; y < height; y++) {
 321         for (x = 0; x < width; x++)
 322             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 323         dst += stride_dst;
 324         src += stride_src;
 325     }
 326 }
 327
 328 #define CMP(a, b) (((a) > (b)) - ((a) < (b)))
 329
 330 static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
 331                                   int eo, int width, int height) {
 332
 333     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 334     static const int8_t pos[4][2][2] = {
 335         { { -1,  0 }, {  1, 0 } }, // horizontal
 336         { {  0, -1 }, {  0, 1 } }, // vertical
 337         { { -1, -1 }, {  1, 1 } }, // 45 degree
 338         { {  1, -1 }, { -1, 1 } }, // 135 degree
 339     };
 340     pixel *dst = (pixel *)_dst;
 341     pixel *src = (pixel *)_src;
 342     int a_stride, b_stride;
 343     int x, y;
 344     ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
 345     stride_dst /= sizeof(pixel);
 346
 347     a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
 348     b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
 349     for (y = 0; y < height; y++) {
 350         for (x = 0; x < width; x++) {
 351             int diff0 = CMP(src[x], src[x + a_stride]);
 352             int diff1 = CMP(src[x], src[x + b_stride]);
 353             int offset_val        = edge_idx[2 + diff0 + diff1];
 354             dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
 355         }
 356         src += stride_src;
 357         dst += stride_dst;
 358     }
 359 }
 360
 361 static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
 362                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 363                                     int *borders, int _width, int _height,
 364                                     int c_idx, uint8_t *vert_edge,
 365                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 366 {
 367     int x, y;
 368     pixel *dst = (pixel *)_dst;
 369     pixel *src = (pixel *)_src;
 370     int16_t *sao_offset_val = sao->offset_val[c_idx];
 371     int sao_eo_class    = sao->eo_class[c_idx];
 372     int init_x = 0, width = _width, height = _height;
 373
 374     stride_dst /= sizeof(pixel);
 375     stride_src /= sizeof(pixel);
 376
 377     if (sao_eo_class != SAO_EO_VERT) {
 378         if (borders[0]) {
 379             int offset_val = sao_offset_val[0];
 380             for (y = 0; y < height; y++) {
 381                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 382             }
 383             init_x = 1;
 384         }
 385         if (borders[2]) {
 386             int offset_val = sao_offset_val[0];
 387             int offset     = width - 1;
 388             for (x = 0; x < height; x++) {
 389                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 390             }
 391             width--;
 392         }
 393     }
 394     if (sao_eo_class != SAO_EO_HORIZ) {
 395         if (borders[1]) {
 396             int offset_val = sao_offset_val[0];
 397             for (x = init_x; x < width; x++)
 398                 dst[x] = av_clip_pixel(src[x] + offset_val);
 399         }
 400         if (borders[3]) {
 401             int offset_val   = sao_offset_val[0];
 402             int y_stride_dst = stride_dst * (height - 1);
 403             int y_stride_src = stride_src * (height - 1);
 404             for (x = init_x; x < width; x++)
 405                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 406             height--;
 407         }
 408     }
 409 }
 410
 411 static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
 412                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 413                                     int *borders, int _width, int _height,
 414                                     int c_idx, uint8_t *vert_edge,
 415                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 416 {
 417     int x, y;
 418     pixel *dst = (pixel *)_dst;
 419     pixel *src = (pixel *)_src;
 420     int16_t *sao_offset_val = sao->offset_val[c_idx];
 421     int sao_eo_class    = sao->eo_class[c_idx];
 422     int init_x = 0, init_y = 0, width = _width, height = _height;
 423
 424     stride_dst /= sizeof(pixel);
 425     stride_src /= sizeof(pixel);
 426
 427     if (sao_eo_class != SAO_EO_VERT) {
 428         if (borders[0]) {
 429             int offset_val = sao_offset_val[0];
 430             for (y = 0; y < height; y++) {
 431                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 432             }
 433             init_x = 1;
 434         }
 435         if (borders[2]) {
 436             int offset_val = sao_offset_val[0];
 437             int offset     = width - 1;
 438             for (x = 0; x < height; x++) {
 439                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 440             }
 441             width--;
 442         }
 443     }
 444     if (sao_eo_class != SAO_EO_HORIZ) {
 445         if (borders[1]) {
 446             int offset_val = sao_offset_val[0];
 447             for (x = init_x; x < width; x++)
 448                 dst[x] = av_clip_pixel(src[x] + offset_val);
 449             init_y = 1;
 450         }
 451         if (borders[3]) {
 452             int offset_val   = sao_offset_val[0];
 453             int y_stride_dst = stride_dst * (height - 1);
 454             int y_stride_src = stride_src * (height - 1);
 455             for (x = init_x; x < width; x++)
 456                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 457             height--;
 458         }
 459     }
 460
 461     {
 462         int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 463         int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
 464         int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
 465         int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
 466
 467         // Restore pixels that can't be modified
 468         if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
 469             for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
 470                 dst[y*stride_dst] = src[y*stride_src];
 471         }
 472         if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
 473             for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
 474                 dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
 475         }
 476
 477         if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
 478             for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
 479                 dst[x] = src[x];
 480         }
 481         if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
 482             for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
 483                 dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
 484         }
 485         if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
 486             dst[0] = src[0];
 487         if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
 488             dst[width-1] = src[width-1];
 489         if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
 490             dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
 491         if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
 492             dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
 493
 494     }
 495 }
 496
 497 #undef CMP
 498
 499 ////////////////////////////////////////////////////////////////////////////////
 500 //
 501 ////////////////////////////////////////////////////////////////////////////////
 502 static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
 503                                       uint8_t *_src, ptrdiff_t _srcstride,
 504                                       int height, intptr_t mx, intptr_t my, int width)
 505 {
 506     int x, y;
 507     pixel *src          = (pixel *)_src;
 508     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 509
 510     for (y = 0; y < height; y++) {
 511         for (x = 0; x < width; x++)
 512             dst[x] = src[x] << (14 - BIT_DEPTH);
 513         src += srcstride;
 514         dst += MAX_PB_SIZE;
 515     }
 516 }
 517
 518 static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 519                                           int height, intptr_t mx, intptr_t my, int width)
 520 {
 521     int y;
 522     pixel *src          = (pixel *)_src;
 523     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 524     pixel *dst          = (pixel *)_dst;
 525     ptrdiff_t dststride = _dststride / sizeof(pixel);
 526
 527     for (y = 0; y < height; y++) {
 528         memcpy(dst, src, width * sizeof(pixel));
 529         src += srcstride;
 530         dst += dststride;
 531     }
 532 }
 533
 534 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 535                                          int16_t *src2,
 536                                          int height, intptr_t mx, intptr_t my, int width)
 537 {
 538     int x, y;
 539     pixel *src          = (pixel *)_src;
 540     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 541     pixel *dst          = (pixel *)_dst;
 542     ptrdiff_t dststride = _dststride / sizeof(pixel);
 543
 544     int shift = 14  + 1 - BIT_DEPTH;
 545 #if BIT_DEPTH < 14
 546     int offset = 1 << (shift - 1);
 547 #else
 548     int offset = 0;
 549 #endif
 550
 551     for (y = 0; y < height; y++) {
 552         for (x = 0; x < width; x++)
 553             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
 554         src  += srcstride;
 555         dst  += dststride;
 556         src2 += MAX_PB_SIZE;
 557     }
 558 }
 559
 560 static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 561                                             int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
 562 {
 563     int x, y;
 564     pixel *src          = (pixel *)_src;
 565     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 566     pixel *dst          = (pixel *)_dst;
 567     ptrdiff_t dststride = _dststride / sizeof(pixel);
 568     int shift = denom + 14 - BIT_DEPTH;
 569 #if BIT_DEPTH < 14
 570     int offset = 1 << (shift - 1);
 571 #else
 572     int offset = 0;
 573 #endif
 574
 575     ox     = ox * (1 << (BIT_DEPTH - 8));
 576     for (y = 0; y < height; y++) {
 577         for (x = 0; x < width; x++)
 578             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
 579         src += srcstride;
 580         dst += dststride;
 581     }
 582 }
 583
 584 static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 585                                            int16_t *src2,
 586                                            int height, int denom, int wx0, int wx1,
 587                                            int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 588 {
 589     int x, y;
 590     pixel *src          = (pixel *)_src;
 591     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 592     pixel *dst          = (pixel *)_dst;
 593     ptrdiff_t dststride = _dststride / sizeof(pixel);
 594
 595     int shift = 14  + 1 - BIT_DEPTH;
 596     int log2Wd = denom + shift - 1;
 597
 598     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 599     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 600     for (y = 0; y < height; y++) {
 601         for (x = 0; x < width; x++) {
 602             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 603         }
 604         src  += srcstride;
 605         dst  += dststride;
 606         src2 += MAX_PB_SIZE;
 607     }
 608 }
 609
 610 ////////////////////////////////////////////////////////////////////////////////
 611 //
 612 ////////////////////////////////////////////////////////////////////////////////
 613 #define QPEL_FILTER(src, stride)                                               \
 614     (filter[0] * src[x - 3 * stride] +                                         \
 615      filter[1] * src[x - 2 * stride] +                                         \
 616      filter[2] * src[x -     stride] +                                         \
 617      filter[3] * src[x             ] +                                         \
 618      filter[4] * src[x +     stride] +                                         \
 619      filter[5] * src[x + 2 * stride] +                                         \
 620      filter[6] * src[x + 3 * stride] +                                         \
 621      filter[7] * src[x + 4 * stride])
 622
 623 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 624                                   uint8_t *_src, ptrdiff_t _srcstride,
 625                                   int height, intptr_t mx, intptr_t my, int width)
 626 {
 627     int x, y;
 628     pixel        *src       = (pixel*)_src;
 629     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 630     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 631     for (y = 0; y < height; y++) {
 632         for (x = 0; x < width; x++)
 633             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 634         src += srcstride;
 635         dst += MAX_PB_SIZE;
 636     }
 637 }
 638
 639 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 640                                   uint8_t *_src, ptrdiff_t _srcstride,
 641                                   int height, intptr_t mx, intptr_t my, int width)
 642 {
 643     int x, y;
 644     pixel        *src       = (pixel*)_src;
 645     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 646     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 647     for (y = 0; y < height; y++)  {
 648         for (x = 0; x < width; x++)
 649             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 650         src += srcstride;
 651         dst += MAX_PB_SIZE;
 652     }
 653 }
 654
 655 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 656                                    uint8_t *_src,
 657                                    ptrdiff_t _srcstride,
 658                                    int height, intptr_t mx,
 659                                    intptr_t my, int width)
 660 {
 661     int x, y;
 662     const int8_t *filter;
 663     pixel *src = (pixel*)_src;
 664     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 665     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 666     int16_t *tmp = tmp_array;
 667
 668     src   -= QPEL_EXTRA_BEFORE * srcstride;
 669     filter = ff_hevc_qpel_filters[mx - 1];
 670     for (y = 0; y < height + QPEL_EXTRA; y++) {
 671         for (x = 0; x < width; x++)
 672             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 673         src += srcstride;
 674         tmp += MAX_PB_SIZE;
 675     }
 676
 677     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 678     filter = ff_hevc_qpel_filters[my - 1];
 679     for (y = 0; y < height; y++) {
 680         for (x = 0; x < width; x++)
 681             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
 682         tmp += MAX_PB_SIZE;
 683         dst += MAX_PB_SIZE;
 684     }
 685 }
 686
 687 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 688                                       uint8_t *_src, ptrdiff_t _srcstride,
 689                                       int height, intptr_t mx, intptr_t my, int width)
 690 {
 691     int x, y;
 692     pixel        *src       = (pixel*)_src;
 693     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 694     pixel *dst          = (pixel *)_dst;
 695     ptrdiff_t dststride = _dststride / sizeof(pixel);
 696     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 697     int shift = 14 - BIT_DEPTH;
 698
 699 #if BIT_DEPTH < 14
 700     int offset = 1 << (shift - 1);
 701 #else
 702     int offset = 0;
 703 #endif
 704
 705     for (y = 0; y < height; y++) {
 706         for (x = 0; x < width; x++)
 707             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
 708         src += srcstride;
 709         dst += dststride;
 710     }
 711 }
 712
 713 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 714                                      int16_t *src2,
 715                                      int height, intptr_t mx, intptr_t my, int width)
 716 {
 717     int x, y;
 718     pixel        *src       = (pixel*)_src;
 719     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 720     pixel *dst          = (pixel *)_dst;
 721     ptrdiff_t dststride = _dststride / sizeof(pixel);
 722
 723     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 724
 725     int shift = 14  + 1 - BIT_DEPTH;
 726 #if BIT_DEPTH < 14
 727     int offset = 1 << (shift - 1);
 728 #else
 729     int offset = 0;
 730 #endif
 731
 732     for (y = 0; y < height; y++) {
 733         for (x = 0; x < width; x++)
 734             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 735         src  += srcstride;
 736         dst  += dststride;
 737         src2 += MAX_PB_SIZE;
 738     }
 739 }
 740
 741 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 742                                      uint8_t *_src, ptrdiff_t _srcstride,
 743                                      int height, intptr_t mx, intptr_t my, int width)
 744 {
 745     int x, y;
 746     pixel        *src       = (pixel*)_src;
 747     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 748     pixel *dst          = (pixel *)_dst;
 749     ptrdiff_t dststride = _dststride / sizeof(pixel);
 750     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 751     int shift = 14 - BIT_DEPTH;
 752
 753 #if BIT_DEPTH < 14
 754     int offset = 1 << (shift - 1);
 755 #else
 756     int offset = 0;
 757 #endif
 758
 759     for (y = 0; y < height; y++) {
 760         for (x = 0; x < width; x++)
 761             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
 762         src += srcstride;
 763         dst += dststride;
 764     }
 765 }
 766
 767
 768 static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 769                                      int16_t *src2,
 770                                      int height, intptr_t mx, intptr_t my, int width)
 771 {
 772     int x, y;
 773     pixel        *src       = (pixel*)_src;
 774     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 775     pixel *dst          = (pixel *)_dst;
 776     ptrdiff_t dststride = _dststride / sizeof(pixel);
 777
 778     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 779
 780     int shift = 14 + 1 - BIT_DEPTH;
 781 #if BIT_DEPTH < 14
 782     int offset = 1 << (shift - 1);
 783 #else
 784     int offset = 0;
 785 #endif
 786
 787     for (y = 0; y < height; y++) {
 788         for (x = 0; x < width; x++)
 789             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 790         src  += srcstride;
 791         dst  += dststride;
 792         src2 += MAX_PB_SIZE;
 793     }
 794 }
 795
 796 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 797                                        uint8_t *_src, ptrdiff_t _srcstride,
 798                                        int height, intptr_t mx, intptr_t my, int width)
 799 {
 800     int x, y;
 801     const int8_t *filter;
 802     pixel *src = (pixel*)_src;
 803     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 804     pixel *dst          = (pixel *)_dst;
 805     ptrdiff_t dststride = _dststride / sizeof(pixel);
 806     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 807     int16_t *tmp = tmp_array;
 808     int shift =  14 - BIT_DEPTH;
 809
 810 #if BIT_DEPTH < 14
 811     int offset = 1 << (shift - 1);
 812 #else
 813     int offset = 0;
 814 #endif
 815
 816     src   -= QPEL_EXTRA_BEFORE * srcstride;
 817     filter = ff_hevc_qpel_filters[mx - 1];
 818     for (y = 0; y < height + QPEL_EXTRA; y++) {
 819         for (x = 0; x < width; x++)
 820             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 821         src += srcstride;
 822         tmp += MAX_PB_SIZE;
 823     }
 824
 825     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 826     filter = ff_hevc_qpel_filters[my - 1];
 827
 828     for (y = 0; y < height; y++) {
 829         for (x = 0; x < width; x++)
 830             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
 831         tmp += MAX_PB_SIZE;
 832         dst += dststride;
 833     }
 834 }
 835
 836 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 837                                       int16_t *src2,
 838                                       int height, intptr_t mx, intptr_t my, int width)
 839 {
 840     int x, y;
 841     const int8_t *filter;
 842     pixel *src = (pixel*)_src;
 843     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 844     pixel *dst          = (pixel *)_dst;
 845     ptrdiff_t dststride = _dststride / sizeof(pixel);
 846     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 847     int16_t *tmp = tmp_array;
 848     int shift = 14 + 1 - BIT_DEPTH;
 849 #if BIT_DEPTH < 14
 850     int offset = 1 << (shift - 1);
 851 #else
 852     int offset = 0;
 853 #endif
 854
 855     src   -= QPEL_EXTRA_BEFORE * srcstride;
 856     filter = ff_hevc_qpel_filters[mx - 1];
 857     for (y = 0; y < height + QPEL_EXTRA; y++) {
 858         for (x = 0; x < width; x++)
 859             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 860         src += srcstride;
 861         tmp += MAX_PB_SIZE;
 862     }
 863
 864     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 865     filter = ff_hevc_qpel_filters[my - 1];
 866
 867     for (y = 0; y < height; y++) {
 868         for (x = 0; x < width; x++)
 869             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
 870         tmp  += MAX_PB_SIZE;
 871         dst  += dststride;
 872         src2 += MAX_PB_SIZE;
 873     }
 874 }
 875
 876 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 877                                         uint8_t *_src, ptrdiff_t _srcstride,
 878                                         int height, int denom, int wx, int ox,
 879                                         intptr_t mx, intptr_t my, int width)
 880 {
 881     int x, y;
 882     pixel        *src       = (pixel*)_src;
 883     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 884     pixel *dst          = (pixel *)_dst;
 885     ptrdiff_t dststride = _dststride / sizeof(pixel);
 886     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 887     int shift = denom + 14 - BIT_DEPTH;
 888 #if BIT_DEPTH < 14
 889     int offset = 1 << (shift - 1);
 890 #else
 891     int offset = 0;
 892 #endif
 893
 894     ox = ox * (1 << (BIT_DEPTH - 8));
 895     for (y = 0; y < height; y++) {
 896         for (x = 0; x < width; x++)
 897             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 898         src += srcstride;
 899         dst += dststride;
 900     }
 901 }
 902
 903 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 904                                        int16_t *src2,
 905                                        int height, int denom, int wx0, int wx1,
 906                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 907 {
 908     int x, y;
 909     pixel        *src       = (pixel*)_src;
 910     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 911     pixel *dst          = (pixel *)_dst;
 912     ptrdiff_t dststride = _dststride / sizeof(pixel);
 913
 914     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 915
 916     int shift = 14  + 1 - BIT_DEPTH;
 917     int log2Wd = denom + shift - 1;
 918
 919     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 920     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 921     for (y = 0; y < height; y++) {
 922         for (x = 0; x < width; x++)
 923             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 924                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 925         src  += srcstride;
 926         dst  += dststride;
 927         src2 += MAX_PB_SIZE;
 928     }
 929 }
 930
 931 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 932                                         uint8_t *_src, ptrdiff_t _srcstride,
 933                                         int height, int denom, int wx, int ox,
 934                                         intptr_t mx, intptr_t my, int width)
 935 {
 936     int x, y;
 937     pixel        *src       = (pixel*)_src;
 938     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 939     pixel *dst          = (pixel *)_dst;
 940     ptrdiff_t dststride = _dststride / sizeof(pixel);
 941     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 942     int shift = denom + 14 - BIT_DEPTH;
 943 #if BIT_DEPTH < 14
 944     int offset = 1 << (shift - 1);
 945 #else
 946     int offset = 0;
 947 #endif
 948
 949     ox = ox * (1 << (BIT_DEPTH - 8));
 950     for (y = 0; y < height; y++) {
 951         for (x = 0; x < width; x++)
 952             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 953         src += srcstride;
 954         dst += dststride;
 955     }
 956 }
 957
 958 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 959                                        int16_t *src2,
 960                                        int height, int denom, int wx0, int wx1,
 961                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 962 {
 963     int x, y;
 964     pixel        *src       = (pixel*)_src;
 965     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 966     pixel *dst          = (pixel *)_dst;
 967     ptrdiff_t dststride = _dststride / sizeof(pixel);
 968
 969     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 970
 971     int shift = 14 + 1 - BIT_DEPTH;
 972     int log2Wd = denom + shift - 1;
 973
 974     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 975     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 976     for (y = 0; y < height; y++) {
 977         for (x = 0; x < width; x++)
 978             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 979                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 980         src  += srcstride;
 981         dst  += dststride;
 982         src2 += MAX_PB_SIZE;
 983     }
 984 }
 985
 986 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 987                                          uint8_t *_src, ptrdiff_t _srcstride,
 988                                          int height, int denom, int wx, int ox,
 989                                          intptr_t mx, intptr_t my, int width)
 990 {
 991     int x, y;
 992     const int8_t *filter;
 993     pixel *src = (pixel*)_src;
 994     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 995     pixel *dst          = (pixel *)_dst;
 996     ptrdiff_t dststride = _dststride / sizeof(pixel);
 997     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 998     int16_t *tmp = tmp_array;
 999     int shift = denom + 14 - BIT_DEPTH;
1000 #if BIT_DEPTH < 14
1001     int offset = 1 << (shift - 1);
1002 #else
1003     int offset = 0;
1004 #endif
1005
1006     src   -= QPEL_EXTRA_BEFORE * srcstride;
1007     filter = ff_hevc_qpel_filters[mx - 1];
1008     for (y = 0; y < height + QPEL_EXTRA; y++) {
1009         for (x = 0; x < width; x++)
1010             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1011         src += srcstride;
1012         tmp += MAX_PB_SIZE;
1013     }
1014
1015     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1016     filter = ff_hevc_qpel_filters[my - 1];
1017
1018     ox = ox * (1 << (BIT_DEPTH - 8));
1019     for (y = 0; y < height; y++) {
1020         for (x = 0; x < width; x++)
1021             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1022         tmp += MAX_PB_SIZE;
1023         dst += dststride;
1024     }
1025 }
1026
1027 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1028                                         int16_t *src2,
1029                                         int height, int denom, int wx0, int wx1,
1030                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1031 {
1032     int x, y;
1033     const int8_t *filter;
1034     pixel *src = (pixel*)_src;
1035     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1036     pixel *dst          = (pixel *)_dst;
1037     ptrdiff_t dststride = _dststride / sizeof(pixel);
1038     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1039     int16_t *tmp = tmp_array;
1040     int shift = 14 + 1 - BIT_DEPTH;
1041     int log2Wd = denom + shift - 1;
1042
1043     src   -= QPEL_EXTRA_BEFORE * srcstride;
1044     filter = ff_hevc_qpel_filters[mx - 1];
1045     for (y = 0; y < height + QPEL_EXTRA; y++) {
1046         for (x = 0; x < width; x++)
1047             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1048         src += srcstride;
1049         tmp += MAX_PB_SIZE;
1050     }
1051
1052     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1053     filter = ff_hevc_qpel_filters[my - 1];
1054
1055     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1056     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1057     for (y = 0; y < height; y++) {
1058         for (x = 0; x < width; x++)
1059             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1060                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1061         tmp  += MAX_PB_SIZE;
1062         dst  += dststride;
1063         src2 += MAX_PB_SIZE;
1064     }
1065 }
1066
1067 ////////////////////////////////////////////////////////////////////////////////
1068 //
1069 ////////////////////////////////////////////////////////////////////////////////
/*
 * 4-tap filter kernel used by the epel functions.
 *
 * Multiplies the four coefficients in `filter` with the samples at
 * src[x - stride], src[x], src[x + stride] and src[x + 2 * stride] and sums
 * the products.  `filter` and `x` are not parameters: they are taken from the
 * scope in which the macro is expanded.
 *
 * Both arguments are parenthesised so that an expression argument (a pointer
 * plus an offset, a computed stride) expands with the intended precedence.
 */
#define EPEL_FILTER(src, stride)                                               \
    (filter[0] * (src)[x -     (stride)] +                                     \
     filter[1] * (src)[x               ] +                                     \
     filter[2] * (src)[x +     (stride)] +                                     \
     filter[3] * (src)[x + 2 * (stride)])
1075
1076 static void FUNC(put_hevc_epel_h)(int16_t *dst,
1077                                   uint8_t *_src, ptrdiff_t _srcstride,
1078                                   int height, intptr_t mx, intptr_t my, int width)
1079 {
1080     int x, y;
1081     pixel *src = (pixel *)_src;
1082     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1083     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1084     for (y = 0; y < height; y++) {
1085         for (x = 0; x < width; x++)
1086             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1087         src += srcstride;
1088         dst += MAX_PB_SIZE;
1089     }
1090 }
1091
1092 static void FUNC(put_hevc_epel_v)(int16_t *dst,
1093                                   uint8_t *_src, ptrdiff_t _srcstride,
1094                                   int height, intptr_t mx, intptr_t my, int width)
1095 {
1096     int x, y;
1097     pixel *src = (pixel *)_src;
1098     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1099     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1100
1101     for (y = 0; y < height; y++) {
1102         for (x = 0; x < width; x++)
1103             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1104         src += srcstride;
1105         dst += MAX_PB_SIZE;
1106     }
1107 }
1108
1109 static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1110                                    uint8_t *_src, ptrdiff_t _srcstride,
1111                                    int height, intptr_t mx, intptr_t my, int width)
1112 {
1113     int x, y;
1114     pixel *src = (pixel *)_src;
1115     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1116     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1117     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1118     int16_t *tmp = tmp_array;
1119
1120     src -= EPEL_EXTRA_BEFORE * srcstride;
1121
1122     for (y = 0; y < height + EPEL_EXTRA; y++) {
1123         for (x = 0; x < width; x++)
1124             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1125         src += srcstride;
1126         tmp += MAX_PB_SIZE;
1127     }
1128
1129     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1130     filter = ff_hevc_epel_filters[my - 1];
1131
1132     for (y = 0; y < height; y++) {
1133         for (x = 0; x < width; x++)
1134             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1135         tmp += MAX_PB_SIZE;
1136         dst += MAX_PB_SIZE;
1137     }
1138 }
1139
1140 static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1141                                       int height, intptr_t mx, intptr_t my, int width)
1142 {
1143     int x, y;
1144     pixel *src = (pixel *)_src;
1145     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1146     pixel *dst          = (pixel *)_dst;
1147     ptrdiff_t dststride = _dststride / sizeof(pixel);
1148     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1149     int shift = 14 - BIT_DEPTH;
1150 #if BIT_DEPTH < 14
1151     int offset = 1 << (shift - 1);
1152 #else
1153     int offset = 0;
1154 #endif
1155
1156     for (y = 0; y < height; y++) {
1157         for (x = 0; x < width; x++)
1158             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1159         src += srcstride;
1160         dst += dststride;
1161     }
1162 }
1163
1164 static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1165                                      int16_t *src2,
1166                                      int height, intptr_t mx, intptr_t my, int width)
1167 {
1168     int x, y;
1169     pixel *src = (pixel *)_src;
1170     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1171     pixel *dst          = (pixel *)_dst;
1172     ptrdiff_t dststride = _dststride / sizeof(pixel);
1173     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1174     int shift = 14 + 1 - BIT_DEPTH;
1175 #if BIT_DEPTH < 14
1176     int offset = 1 << (shift - 1);
1177 #else
1178     int offset = 0;
1179 #endif
1180
1181     for (y = 0; y < height; y++) {
1182         for (x = 0; x < width; x++) {
1183             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1184         }
1185         dst  += dststride;
1186         src  += srcstride;
1187         src2 += MAX_PB_SIZE;
1188     }
1189 }
1190
1191 static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1192                                       int height, intptr_t mx, intptr_t my, int width)
1193 {
1194     int x, y;
1195     pixel *src = (pixel *)_src;
1196     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1197     pixel *dst          = (pixel *)_dst;
1198     ptrdiff_t dststride = _dststride / sizeof(pixel);
1199     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1200     int shift = 14 - BIT_DEPTH;
1201 #if BIT_DEPTH < 14
1202     int offset = 1 << (shift - 1);
1203 #else
1204     int offset = 0;
1205 #endif
1206
1207     for (y = 0; y < height; y++) {
1208         for (x = 0; x < width; x++)
1209             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1210         src += srcstride;
1211         dst += dststride;
1212     }
1213 }
1214
1215 static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1216                                      int16_t *src2,
1217                                      int height, intptr_t mx, intptr_t my, int width)
1218 {
1219     int x, y;
1220     pixel *src = (pixel *)_src;
1221     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1222     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1223     pixel *dst          = (pixel *)_dst;
1224     ptrdiff_t dststride = _dststride / sizeof(pixel);
1225     int shift = 14 + 1 - BIT_DEPTH;
1226 #if BIT_DEPTH < 14
1227     int offset = 1 << (shift - 1);
1228 #else
1229     int offset = 0;
1230 #endif
1231
1232     for (y = 0; y < height; y++) {
1233         for (x = 0; x < width; x++)
1234             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1235         dst  += dststride;
1236         src  += srcstride;
1237         src2 += MAX_PB_SIZE;
1238     }
1239 }
1240
1241 static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1242                                        int height, intptr_t mx, intptr_t my, int width)
1243 {
1244     int x, y;
1245     pixel *src = (pixel *)_src;
1246     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1247     pixel *dst          = (pixel *)_dst;
1248     ptrdiff_t dststride = _dststride / sizeof(pixel);
1249     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1250     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1251     int16_t *tmp = tmp_array;
1252     int shift = 14 - BIT_DEPTH;
1253 #if BIT_DEPTH < 14
1254     int offset = 1 << (shift - 1);
1255 #else
1256     int offset = 0;
1257 #endif
1258
1259     src -= EPEL_EXTRA_BEFORE * srcstride;
1260
1261     for (y = 0; y < height + EPEL_EXTRA; y++) {
1262         for (x = 0; x < width; x++)
1263             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1264         src += srcstride;
1265         tmp += MAX_PB_SIZE;
1266     }
1267
1268     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1269     filter = ff_hevc_epel_filters[my - 1];
1270
1271     for (y = 0; y < height; y++) {
1272         for (x = 0; x < width; x++)
1273             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1274         tmp += MAX_PB_SIZE;
1275         dst += dststride;
1276     }
1277 }
1278
1279 static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1280                                       int16_t *src2,
1281                                       int height, intptr_t mx, intptr_t my, int width)
1282 {
1283     int x, y;
1284     pixel *src = (pixel *)_src;
1285     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1286     pixel *dst          = (pixel *)_dst;
1287     ptrdiff_t dststride = _dststride / sizeof(pixel);
1288     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1289     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1290     int16_t *tmp = tmp_array;
1291     int shift = 14 + 1 - BIT_DEPTH;
1292 #if BIT_DEPTH < 14
1293     int offset = 1 << (shift - 1);
1294 #else
1295     int offset = 0;
1296 #endif
1297
1298     src -= EPEL_EXTRA_BEFORE * srcstride;
1299
1300     for (y = 0; y < height + EPEL_EXTRA; y++) {
1301         for (x = 0; x < width; x++)
1302             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1303         src += srcstride;
1304         tmp += MAX_PB_SIZE;
1305     }
1306
1307     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1308     filter = ff_hevc_epel_filters[my - 1];
1309
1310     for (y = 0; y < height; y++) {
1311         for (x = 0; x < width; x++)
1312             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1313         tmp  += MAX_PB_SIZE;
1314         dst  += dststride;
1315         src2 += MAX_PB_SIZE;
1316     }
1317 }
1318
1319 static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1320                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1321 {
1322     int x, y;
1323     pixel *src = (pixel *)_src;
1324     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1325     pixel *dst          = (pixel *)_dst;
1326     ptrdiff_t dststride = _dststride / sizeof(pixel);
1327     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1328     int shift = denom + 14 - BIT_DEPTH;
1329 #if BIT_DEPTH < 14
1330     int offset = 1 << (shift - 1);
1331 #else
1332     int offset = 0;
1333 #endif
1334
1335     ox     = ox * (1 << (BIT_DEPTH - 8));
1336     for (y = 0; y < height; y++) {
1337         for (x = 0; x < width; x++) {
1338             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1339         }
1340         dst += dststride;
1341         src += srcstride;
1342     }
1343 }
1344
1345 static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1346                                        int16_t *src2,
1347                                        int height, int denom, int wx0, int wx1,
1348                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1349 {
1350     int x, y;
1351     pixel *src = (pixel *)_src;
1352     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1353     pixel *dst          = (pixel *)_dst;
1354     ptrdiff_t dststride = _dststride / sizeof(pixel);
1355     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1356     int shift = 14 + 1 - BIT_DEPTH;
1357     int log2Wd = denom + shift - 1;
1358
1359     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1360     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1361     for (y = 0; y < height; y++) {
1362         for (x = 0; x < width; x++)
1363             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1364                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1365         src  += srcstride;
1366         dst  += dststride;
1367         src2 += MAX_PB_SIZE;
1368     }
1369 }
1370
1371 static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1372                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1373 {
1374     int x, y;
1375     pixel *src = (pixel *)_src;
1376     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1377     pixel *dst          = (pixel *)_dst;
1378     ptrdiff_t dststride = _dststride / sizeof(pixel);
1379     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1380     int shift = denom + 14 - BIT_DEPTH;
1381 #if BIT_DEPTH < 14
1382     int offset = 1 << (shift - 1);
1383 #else
1384     int offset = 0;
1385 #endif
1386
1387     ox     = ox * (1 << (BIT_DEPTH - 8));
1388     for (y = 0; y < height; y++) {
1389         for (x = 0; x < width; x++) {
1390             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1391         }
1392         dst += dststride;
1393         src += srcstride;
1394     }
1395 }
1396
1397 static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1398                                        int16_t *src2,
1399                                        int height, int denom, int wx0, int wx1,
1400                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1401 {
1402     int x, y;
1403     pixel *src = (pixel *)_src;
1404     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1405     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1406     pixel *dst          = (pixel *)_dst;
1407     ptrdiff_t dststride = _dststride / sizeof(pixel);
1408     int shift = 14 + 1 - BIT_DEPTH;
1409     int log2Wd = denom + shift - 1;
1410
1411     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1412     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1413     for (y = 0; y < height; y++) {
1414         for (x = 0; x < width; x++)
1415             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1416                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1417         src  += srcstride;
1418         dst  += dststride;
1419         src2 += MAX_PB_SIZE;
1420     }
1421 }
1422
1423 static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1424                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1425 {
1426     int x, y;
1427     pixel *src = (pixel *)_src;
1428     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1429     pixel *dst          = (pixel *)_dst;
1430     ptrdiff_t dststride = _dststride / sizeof(pixel);
1431     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1432     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1433     int16_t *tmp = tmp_array;
1434     int shift = denom + 14 - BIT_DEPTH;
1435 #if BIT_DEPTH < 14
1436     int offset = 1 << (shift - 1);
1437 #else
1438     int offset = 0;
1439 #endif
1440
1441     src -= EPEL_EXTRA_BEFORE * srcstride;
1442
1443     for (y = 0; y < height + EPEL_EXTRA; y++) {
1444         for (x = 0; x < width; x++)
1445             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1446         src += srcstride;
1447         tmp += MAX_PB_SIZE;
1448     }
1449
1450     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1451     filter = ff_hevc_epel_filters[my - 1];
1452
1453     ox     = ox * (1 << (BIT_DEPTH - 8));
1454     for (y = 0; y < height; y++) {
1455         for (x = 0; x < width; x++)
1456             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1457         tmp += MAX_PB_SIZE;
1458         dst += dststride;
1459     }
1460 }
1461
1462 static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1463                                         int16_t *src2,
1464                                         int height, int denom, int wx0, int wx1,
1465                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1466 {
1467     int x, y;
1468     pixel *src = (pixel *)_src;
1469     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1470     pixel *dst          = (pixel *)_dst;
1471     ptrdiff_t dststride = _dststride / sizeof(pixel);
1472     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1473     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1474     int16_t *tmp = tmp_array;
1475     int shift = 14 + 1 - BIT_DEPTH;
1476     int log2Wd = denom + shift - 1;
1477
1478     src -= EPEL_EXTRA_BEFORE * srcstride;
1479
1480     for (y = 0; y < height + EPEL_EXTRA; y++) {
1481         for (x = 0; x < width; x++)
1482             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1483         src += srcstride;
1484         tmp += MAX_PB_SIZE;
1485     }
1486
1487     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1488     filter = ff_hevc_epel_filters[my - 1];
1489
1490     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1491     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1492     for (y = 0; y < height; y++) {
1493         for (x = 0; x < width; x++)
1494             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1495                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1496         tmp  += MAX_PB_SIZE;
1497         dst  += dststride;
1498         src2 += MAX_PB_SIZE;
1499     }
1500 }// line zero
// Samples on the first line of the current segment. P0..P3 step away from
// the edge in the negative xstride direction; Q0 is pix itself and Q1..Q3
// follow it. The macros expand to lvalues in terms of the local variables
// pix, xstride and ystride.
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
#define P1 pix[-2 * xstride]
#define P0 pix[-xstride]
#define Q0 pix[0]
#define Q1 pix[xstride]
#define Q2 pix[2 * xstride]
#define Q3 pix[3 * xstride]

// The same eight samples three lines further along the edge (3 * ystride).
// Used only for the deblocking decision.
#define TP3 pix[3 * ystride - 4 * xstride]
#define TP2 pix[3 * ystride - 3 * xstride]
#define TP1 pix[3 * ystride - 2 * xstride]
#define TP0 pix[3 * ystride - xstride]
#define TQ0 pix[3 * ystride]
#define TQ1 pix[3 * ystride + xstride]
#define TQ2 pix[3 * ystride + 2 * xstride]
#define TQ3 pix[3 * ystride + 3 * xstride]
1519
1520 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1521                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1522                                         int beta, int *_tc,
1523                                         uint8_t *_no_p, uint8_t *_no_q)
1524 {
1525     int d, j;
1526     pixel *pix        = (pixel *)_pix;
1527     ptrdiff_t xstride = _xstride / sizeof(pixel);
1528     ptrdiff_t ystride = _ystride / sizeof(pixel);
1529
1530     beta <<= BIT_DEPTH - 8;
1531
1532     for (j = 0; j < 2; j++) {
1533         const int dp0  = abs(P2  - 2 * P1  + P0);
1534         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1535         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1536         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1537         const int d0   = dp0 + dq0;
1538         const int d3   = dp3 + dq3;
1539         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1540         const int no_p = _no_p[j];
1541         const int no_q = _no_q[j];
1542
1543         if (d0 + d3 >= beta) {
1544             pix += 4 * ystride;
1545             continue;
1546         } else {
1547             const int beta_3 = beta >> 3;
1548             const int beta_2 = beta >> 2;
1549             const int tc25   = ((tc * 5 + 1) >> 1);
1550
1551             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1552                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1553                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1554                 // strong filtering
1555                 const int tc2 = tc << 1;
1556                 for (d = 0; d < 4; d++) {
1557                     const int p3 = P3;
1558                     const int p2 = P2;
1559                     const int p1 = P1;
1560                     const int p0 = P0;
1561                     const int q0 = Q0;
1562                     const int q1 = Q1;
1563                     const int q2 = Q2;
1564                     const int q3 = Q3;
1565                     if (!no_p) {
1566                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1567                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1568                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1569                     }
1570                     if (!no_q) {
1571                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1572                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1573                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1574                     }
1575                     pix += ystride;
1576                 }
1577             } else { // normal filtering
1578                 int nd_p = 1;
1579                 int nd_q = 1;
1580                 const int tc_2 = tc >> 1;
1581                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1582                     nd_p = 2;
1583                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1584                     nd_q = 2;
1585
1586                 for (d = 0; d < 4; d++) {
1587                     const int p2 = P2;
1588                     const int p1 = P1;
1589                     const int p0 = P0;
1590                     const int q0 = Q0;
1591                     const int q1 = Q1;
1592                     const int q2 = Q2;
1593                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1594                     if (abs(delta0) < 10 * tc) {
1595                         delta0 = av_clip(delta0, -tc, tc);
1596                         if (!no_p)
1597                             P0 = av_clip_pixel(p0 + delta0);
1598                         if (!no_q)
1599                             Q0 = av_clip_pixel(q0 - delta0);
1600                         if (!no_p && nd_p > 1) {
1601                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1602                             P1 = av_clip_pixel(p1 + deltap1);
1603                         }
1604                         if (!no_q && nd_q > 1) {
1605                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1606                             Q1 = av_clip_pixel(q1 + deltaq1);
1607                         }
1608                     }
1609                     pix += ystride;
1610                 }
1611             }
1612         }
1613     }
1614 }
1615
1616 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1617                                           ptrdiff_t _ystride, int *_tc,
1618                                           uint8_t *_no_p, uint8_t *_no_q)
1619 {
1620     int d, j, no_p, no_q;
1621     pixel *pix        = (pixel *)_pix;
1622     ptrdiff_t xstride = _xstride / sizeof(pixel);
1623     ptrdiff_t ystride = _ystride / sizeof(pixel);
1624
1625     for (j = 0; j < 2; j++) {
1626         const int tc = _tc[j] << (BIT_DEPTH - 8);
1627         if (tc <= 0) {
1628             pix += 4 * ystride;
1629             continue;
1630         }
1631         no_p = _no_p[j];
1632         no_q = _no_q[j];
1633
1634         for (d = 0; d < 4; d++) {
1635             int delta0;
1636             const int p1 = P1;
1637             const int p0 = P0;
1638             const int q0 = Q0;
1639             const int q1 = Q1;
1640             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1641             if (!no_p)
1642                 P0 = av_clip_pixel(p0 + delta0);
1643             if (!no_q)
1644                 Q0 = av_clip_pixel(q0 - delta0);
1645             pix += ystride;
1646         }
1647     }
1648 }
1649
1650 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1651                                             int32_t *tc, uint8_t *no_p,
1652                                             uint8_t *no_q)
1653 {
1654     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1655 }
1656
1657 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1658                                             int32_t *tc, uint8_t *no_p,
1659                                             uint8_t *no_q)
1660 {
1661     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1662 }
1663
1664 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1665                                           int beta, int32_t *tc, uint8_t *no_p,
1666                                           uint8_t *no_q)
1667 {
1668     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1669                                 beta, tc, no_p, no_q);
1670 }
1671
1672 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1673                                           int beta, int32_t *tc, uint8_t *no_p,
1674                                           uint8_t *no_q)
1675 {
1676     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1677                                 beta, tc, no_p, no_q);
1678 }
1679
// The deblocking sample macros are short, generic names; drop them again so
// they cannot leak into code that follows.

// first-line samples
#undef P0
#undef P1
#undef P2
#undef P3
#undef Q0
#undef Q1
#undef Q2
#undef Q3

// fourth-line samples (decision only)
#undef TP0
#undef TP1
#undef TP2
#undef TP3
#undef TQ0
#undef TQ1
#undef TQ2
#undef TQ3