git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27 #include "hevcdsp.h"
  28
  29
  30 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
  31                           GetBitContext *gb, int pcm_bit_depth)
  32 {
  33     int x, y;
  34     pixel *dst = (pixel *)_dst;
  35
  36     stride /= sizeof(pixel);
  37
  38     for (y = 0; y < height; y++) {
  39         for (x = 0; x < width; x++)
  40             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  41         dst += stride;
  42     }
  43 }
  44
  45 static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
  46                                                      ptrdiff_t stride, int size)
  47 {
  48     int x, y;
  49     pixel *dst = (pixel *)_dst;
  50
  51     stride /= sizeof(pixel);
  52
  53     for (y = 0; y < size; y++) {
  54         for (x = 0; x < size; x++) {
  55             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  56             coeffs++;
  57         }
  58         dst += stride;
  59     }
  60 }
  61
  62 static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
  63                                        ptrdiff_t stride)
  64 {
  65     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
  66 }
  67
  68 static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
  69                                        ptrdiff_t stride)
  70 {
  71     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
  72 }
  73
  74 static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
  75                                          ptrdiff_t stride)
  76 {
  77     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
  78 }
  79
  80 static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
  81                                          ptrdiff_t stride)
  82 {
  83     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
  84 }
  85
  86
  87 static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
  88 {
  89     int16_t *coeffs = (int16_t *) _coeffs;
  90     int x, y;
  91     int size = 1 << log2_size;
  92
  93     if (mode) {
  94         coeffs += size;
  95         for (y = 0; y < size - 1; y++) {
  96             for (x = 0; x < size; x++)
  97                 coeffs[x] += coeffs[x - size];
  98             coeffs += size;
  99         }
 100     } else {
 101         for (y = 0; y < size; y++) {
 102             for (x = 1; x < size; x++)
 103                 coeffs[x] += coeffs[x - 1];
 104             coeffs += size;
 105         }
 106     }
 107 }
 108
 109 static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 110 {
 111     int shift  = 15 - BIT_DEPTH - log2_size;
 112     int x, y;
 113     int size = 1 << log2_size;
 114     int16_t *coeffs = _coeffs;
 115
 116
 117     if (shift > 0) {
 118         int offset = 1 << (shift - 1);
 119         for (y = 0; y < size; y++) {
 120             for (x = 0; x < size; x++) {
 121                 *coeffs = (*coeffs + offset) >> shift;
 122                 coeffs++;
 123             }
 124         }
 125     } else {
 126         for (y = 0; y < size; y++) {
 127             for (x = 0; x < size; x++) {
 128                 *coeffs = *coeffs << -shift;
 129                 coeffs++;
 130             }
 131         }
 132     }
 133 }
 134
/*
 * Assignment policies handed to the TR_* transform macros as their `assign`
 * argument.
 *
 * SET           stores the raw value.
 * SCALE         adds `add`, shifts right by `shift` and clips to int16.
 * ADD_AND_SCALE computes the same clipped value, adds it to the current
 *               content of dst and clips the sum with av_clip_pixel().
 *
 * SCALE and ADD_AND_SCALE read the variables `add` and `shift` from the
 * scope in which they are expanded.
 */
#define SET(dst, x)   (dst) = (x)
#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
#define ADD_AND_SCALE(dst, x)                                           \
    (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 139
/*
 * One 4-point pass of the 4x4 luma transform (constants 29, 55, 74).
 *
 * Reads src[0], src[step], src[2 * step], src[3 * step] and writes the four
 * results to the same offsets of dst through `assign` (SET, SCALE, ...).
 *
 * dst may be the same array as src: c0..c3 capture the inputs first, and the
 * only output computed directly from src (index 2) is assigned before the
 * other three, which depend solely on c0..c3.
 */
#define TR_4x4_LUMA(dst, src, step, assign)                             \
    do {                                                                \
        int c0 = src[0 * step] + src[2 * step];                         \
        int c1 = src[2 * step] + src[3 * step];                         \
        int c2 = src[0 * step] - src[3 * step];                         \
        int c3 = 74 * src[1 * step];                                    \
                                                                        \
        assign(dst[2 * step], 74 * (src[0 * step] -                     \
                                    src[2 * step] +                     \
                                    src[3 * step]));                    \
        assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
        assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
        assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
    } while (0)
 154
 155 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 156 {
 157     int i;
 158     int shift    = 7;
 159     int add      = 1 << (shift - 1);
 160     int16_t *src = coeffs;
 161
 162     for (i = 0; i < 4; i++) {
 163         TR_4x4_LUMA(src, src, 4, SCALE);
 164         src++;
 165     }
 166
 167     shift = 20 - BIT_DEPTH;
 168     add   = 1 << (shift - 1);
 169     for (i = 0; i < 4; i++) {
 170         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 171         coeffs += 4;
 172     }
 173 }
 174
 175 #undef TR_4x4_LUMA
 176
/*
 * 4-point transform pass.
 *
 * The even part (e0, e1) is built from src[0] and src[2 * sstep] with weight
 * 64; the odd part (o0, o1) from src[sstep] and src[3 * sstep] with weights
 * 83 and 36. The four outputs are their sums and differences, written to
 * dst at multiples of dstep through `assign`.
 *
 * `end` is accepted so that all TR_N macros share one parameter list; it is
 * not used here.
 */
#define TR_4(dst, src, dstep, sstep, assign, end)                              \
    do {                                                                       \
        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
                                                                               \
        assign(dst[0 * dstep], e0 + o0);                                       \
        assign(dst[1 * dstep], e1 + o1);                                       \
        assign(dst[2 * dstep], e1 - o1);                                       \
        assign(dst[3 * dstep], e0 - o0);                                       \
    } while (0)
 189
/*
 * 8-point transform pass.
 *
 * Odd part: o_8[i] accumulates transform[4 * j][i] * src[j * sstep] over the
 * odd j below `end`, so `end` bounds how many inputs are taken into account.
 * Even part: e_8 is produced by TR_4 on every second input (stride
 * 2 * sstep), stored unscaled via SET.
 * Outputs i and 7 - i are e_8[i] + o_8[i] and e_8[i] - o_8[i].
 *
 * `transform` is a coefficient table defined outside this macro.
 */
#define TR_8(dst, src, dstep, sstep, assign, end)                              \
    do {                                                                       \
        int i, j;                                                              \
        int e_8[4];                                                            \
        int o_8[4] = { 0 };                                                    \
        for (i = 0; i < 4; i++)                                                \
            for (j = 1; j < end; j += 2)                                       \
                o_8[i] += transform[4 * j][i] * src[j * sstep];                \
        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
                                                                               \
        for (i = 0; i < 4; i++) {                                              \
            assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
        }                                                                      \
    } while (0)
 205
/*
 * 16-point transform pass.
 *
 * Odd part: o_16[i] accumulates transform[2 * j][i] * src[j * sstep] over
 * the odd j below `end`.
 * Even part: e_16 is produced by TR_8 on every second input (stride
 * 2 * sstep) with a fixed bound of 8, stored unscaled via SET.
 * Outputs i and 15 - i are e_16[i] + o_16[i] and e_16[i] - o_16[i].
 *
 * `transform` is a coefficient table defined outside this macro.
 */
#define TR_16(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_16[8];                                                           \
        int o_16[8] = { 0 };                                                   \
        for (i = 0; i < 8; i++)                                                \
            for (j = 1; j < end; j += 2)                                       \
                o_16[i] += transform[2 * j][i] * src[j * sstep];               \
        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
                                                                               \
        for (i = 0; i < 8; i++) {                                              \
            assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
        }                                                                      \
    } while (0)
 221
/*
 * 32-point transform pass.
 *
 * Odd part: o_32[i] accumulates transform[j][i] * src[j * sstep] over the
 * odd j below `end`.
 * Even part: e_32 is produced by TR_16 on every second input (stride
 * 2 * sstep) with the bound halved (end / 2), stored unscaled via SET.
 * Outputs i and 31 - i are e_32[i] + o_32[i] and e_32[i] - o_32[i].
 *
 * `transform` is a coefficient table defined outside this macro.
 */
#define TR_32(dst, src, dstep, sstep, assign, end)                             \
    do {                                                                       \
        int i, j;                                                              \
        int e_32[16];                                                          \
        int o_32[16] = { 0 };                                                  \
        for (i = 0; i < 16; i++)                                               \
            for (j = 1; j < end; j += 2)                                       \
                o_32[i] += transform[j][i] * src[j * sstep];                   \
        TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
                                                                               \
        for (i = 0; i < 16; i++) {                                             \
            assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
        }                                                                      \
    } while (0)
 237
/*
 * Per-size local declarations expanded at the top of the functions produced
 * by IDCT(H). They read `col_limit` from the enclosing function.
 *
 * limit2 = min(col_limit + 4, H) bounds the first (column) pass,
 * limit  = min(col_limit, H)     bounds the second (row) pass.
 *
 * The 4x4 variant declares only limit2: TR_4 ignores its `end` argument, so
 * `limit` is never referenced once IDCT(4) has been expanded.
 */
#define IDCT_VAR4(H)                                                          \
    int      limit2   = FFMIN(col_limit + 4, H)
#define IDCT_VAR8(H)                                                          \
        int      limit   = FFMIN(col_limit, H);                               \
        int      limit2   = FFMIN(col_limit + 4, H)
#define IDCT_VAR16(H)   IDCT_VAR8(H)
#define IDCT_VAR32(H)   IDCT_VAR8(H)
 244 #define IDCT_VAR32(H)   IDCT_VAR8(H)
 245
/*
 * Define idct_HxH(coeffs, col_limit): an in-place H x H inverse transform
 * made of two TR_H passes whose results are rounded and clipped by SCALE.
 *
 * Pass 1 walks the H columns (src advances by one element, element stride
 * H) with shift 7. The number of inputs taken into account, limit2, starts
 * at min(col_limit + 4, H) and, while it is still below H, is reduced by 4
 * after the iterations with i = 4, 8, 12, ...
 *
 * Pass 2 walks the H rows (coeffs advances by H, element stride 1) with
 * shift 20 - BIT_DEPTH and the bound limit = min(col_limit, H).
 *
 * `shift` and `add` are read by SCALE; `limit`/`limit2` come from
 * IDCT_VAR##H.
 */
#define IDCT(H)                                                              \
static void FUNC(idct_##H ##x ##H )(                                         \
                   int16_t *coeffs, int col_limit) {                         \
    int i;                                                                   \
    int      shift   = 7;                                                    \
    int      add     = 1 << (shift - 1);                                     \
    int16_t *src     = coeffs;                                               \
    IDCT_VAR ##H(H);                                                         \
                                                                             \
    for (i = 0; i < H; i++) {                                                \
        TR_ ## H(src, src, H, H, SCALE, limit2);                             \
        if (limit2 < H && i%4 == 0 && !!i)                                   \
            limit2 -= 4;                                                     \
        src++;                                                               \
    }                                                                        \
                                                                             \
    shift   = 20 - BIT_DEPTH;                                                \
    add     = 1 << (shift - 1);                                              \
    for (i = 0; i < H; i++) {                                                \
        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
        coeffs += H;                                                         \
    }                                                                        \
}
 269
 270 #define IDCT_DC(H)                                                           \
 271 static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 272                    int16_t *coeffs) {                                        \
 273     int i, j;                                                                \
 274     int      shift   = 14 - BIT_DEPTH;                                       \
 275     int      add     = 1 << (shift - 1);                                     \
 276     int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 277                                                                              \
 278     for (j = 0; j < H; j++) {                                                \
 279         for (i = 0; i < H; i++) {                                            \
 280             coeffs[i+j*H] = coeff;                                           \
 281         }                                                                    \
 282     }                                                                        \
 283 }
 284
/* Full inverse transforms: idct_4x4, idct_8x8, idct_16x16, idct_32x32. */
IDCT( 4)
IDCT( 8)
IDCT(16)
IDCT(32)

/* DC-only variants: idct_4x4_dc, idct_8x8_dc, idct_16x16_dc, idct_32x32_dc. */
IDCT_DC( 4)
IDCT_DC( 8)
IDCT_DC(16)
IDCT_DC(32)
 294
 295 #undef TR_4
 296 #undef TR_8
 297 #undef TR_16
 298 #undef TR_32
 299
 300 #undef SET
 301 #undef SCALE
 302 #undef ADD_AND_SCALE
 303
 304 static void FUNC(sao_band_filter_0)(uint8_t *_dst, uint8_t *_src,
 305                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
 306                                     int16_t *sao_offset_val, int sao_left_class,
 307                                     int width, int height)
 308 {
 309     pixel *dst = (pixel *)_dst;
 310     pixel *src = (pixel *)_src;
 311     int offset_table[32] = { 0 };
 312     int k, y, x;
 313     int shift  = BIT_DEPTH - 5;
 314
 315     stride_dst /= sizeof(pixel);
 316     stride_src /= sizeof(pixel);
 317
 318     for (k = 0; k < 4; k++)
 319         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 320     for (y = 0; y < height; y++) {
 321         for (x = 0; x < width; x++)
 322             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 323         dst += stride_dst;
 324         src += stride_src;
 325     }
 326 }
 327
/* Three-way comparison: 1 if a > b, 0 if a == b, -1 otherwise.
 * Each argument may be evaluated twice. */
#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 329
 330 static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src,
 331                                   ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 332                                   int width, int height,
 333                                   int c_idx, int init_x, int init_y) {
 334
 335     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 336     static const int8_t pos[4][2][2] = {
 337         { { -1,  0 }, {  1, 0 } }, // horizontal
 338         { {  0, -1 }, {  0, 1 } }, // vertical
 339         { { -1, -1 }, {  1, 1 } }, // 45 degree
 340         { {  1, -1 }, { -1, 1 } }, // 135 degree
 341     };
 342     int16_t *sao_offset_val = sao->offset_val[c_idx];
 343     int sao_eo_class    = sao->eo_class[c_idx];
 344     pixel *dst = (pixel *)_dst;
 345     pixel *src = (pixel *)_src;
 346
 347     int y_stride_src = init_y * stride_src;
 348     int y_stride_dst = init_y * stride_dst;
 349     int pos_0_0  = pos[sao_eo_class][0][0];
 350     int pos_0_1  = pos[sao_eo_class][0][1];
 351     int pos_1_0  = pos[sao_eo_class][1][0];
 352     int pos_1_1  = pos[sao_eo_class][1][1];
 353     int x, y;
 354
 355     int y_stride_0_1 = (init_y + pos_0_1) * stride_src;
 356     int y_stride_1_1 = (init_y + pos_1_1) * stride_src;
 357     for (y = init_y; y < height; y++) {
 358         for (x = init_x; x < width; x++) {
 359             int diff0             = CMP(src[x + y_stride_src], src[x + pos_0_0 + y_stride_0_1]);
 360             int diff1             = CMP(src[x + y_stride_src], src[x + pos_1_0 + y_stride_1_1]);
 361             int offset_val        = edge_idx[2 + diff0 + diff1];
 362             dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + sao_offset_val[offset_val]);
 363         }
 364         y_stride_src += stride_src;
 365         y_stride_dst += stride_dst;
 366         y_stride_0_1 += stride_src;
 367         y_stride_1_1 += stride_src;
 368     }
 369 }
 370
 371 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
 372                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 373                                     int *borders, int _width, int _height,
 374                                     int c_idx, uint8_t *vert_edge,
 375                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 376 {
 377     int x, y;
 378     pixel *dst = (pixel *)_dst;
 379     pixel *src = (pixel *)_src;
 380     int16_t *sao_offset_val = sao->offset_val[c_idx];
 381     int sao_eo_class    = sao->eo_class[c_idx];
 382     int init_x = 0, init_y = 0, width = _width, height = _height;
 383
 384     stride_dst /= sizeof(pixel);
 385     stride_src /= sizeof(pixel);
 386
 387     if (sao_eo_class != SAO_EO_VERT) {
 388         if (borders[0]) {
 389             int offset_val = sao_offset_val[0];
 390             for (y = 0; y < height; y++) {
 391                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 392             }
 393             init_x = 1;
 394         }
 395         if (borders[2]) {
 396             int offset_val = sao_offset_val[0];
 397             int offset     = width - 1;
 398             for (x = 0; x < height; x++) {
 399                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 400             }
 401             width--;
 402         }
 403     }
 404     if (sao_eo_class != SAO_EO_HORIZ) {
 405         if (borders[1]) {
 406             int offset_val = sao_offset_val[0];
 407             for (x = init_x; x < width; x++)
 408                 dst[x] = av_clip_pixel(src[x] + offset_val);
 409             init_y = 1;
 410         }
 411         if (borders[3]) {
 412             int offset_val   = sao_offset_val[0];
 413             int y_stride_dst = stride_dst * (height - 1);
 414             int y_stride_src = stride_src * (height - 1);
 415             for (x = init_x; x < width; x++)
 416                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 417             height--;
 418         }
 419     }
 420
 421     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 422 }
 423
 424 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
 425                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 426                                     int *borders, int _width, int _height,
 427                                     int c_idx, uint8_t *vert_edge,
 428                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 429 {
 430     int x, y;
 431     pixel *dst = (pixel *)_dst;
 432     pixel *src = (pixel *)_src;
 433     int16_t *sao_offset_val = sao->offset_val[c_idx];
 434     int sao_eo_class    = sao->eo_class[c_idx];
 435     int init_x = 0, init_y = 0, width = _width, height = _height;
 436
 437     stride_dst /= sizeof(pixel);
 438     stride_src /= sizeof(pixel);
 439
 440     if (sao_eo_class != SAO_EO_VERT) {
 441         if (borders[0]) {
 442             int offset_val = sao_offset_val[0];
 443             for (y = 0; y < height; y++) {
 444                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 445             }
 446             init_x = 1;
 447         }
 448         if (borders[2]) {
 449             int offset_val = sao_offset_val[0];
 450             int offset     = width - 1;
 451             for (x = 0; x < height; x++) {
 452                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 453             }
 454             width--;
 455         }
 456     }
 457     if (sao_eo_class != SAO_EO_HORIZ) {
 458         if (borders[1]) {
 459             int offset_val = sao_offset_val[0];
 460             for (x = init_x; x < width; x++)
 461                 dst[x] = av_clip_pixel(src[x] + offset_val);
 462             init_y = 1;
 463         }
 464         if (borders[3]) {
 465             int offset_val   = sao_offset_val[0];
 466             int y_stride_dst = stride_dst * (height - 1);
 467             int y_stride_src = stride_src * (height - 1);
 468             for (x = init_x; x < width; x++)
 469                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 470             height--;
 471         }
 472     }
 473
 474     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 475
 476     {
 477         int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 478         int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
 479         int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
 480         int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
 481
 482         // Restore pixels that can't be modified
 483         if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
 484             for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
 485                 dst[y*stride_dst] = src[y*stride_src];
 486         }
 487         if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
 488             for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
 489                 dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
 490         }
 491
 492         if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
 493             for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
 494                 dst[x] = src[x];
 495         }
 496         if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
 497             for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
 498                 dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
 499         }
 500         if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
 501             dst[0] = src[0];
 502         if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
 503             dst[width-1] = src[width-1];
 504         if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
 505             dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
 506         if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
 507             dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
 508
 509     }
 510 }
 511
 512 #undef CMP
 513
 514 ////////////////////////////////////////////////////////////////////////////////
 515 //
 516 ////////////////////////////////////////////////////////////////////////////////
 517 static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
 518                                       uint8_t *_src, ptrdiff_t _srcstride,
 519                                       int height, intptr_t mx, intptr_t my, int width)
 520 {
 521     int x, y;
 522     pixel *src          = (pixel *)_src;
 523     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 524
 525     for (y = 0; y < height; y++) {
 526         for (x = 0; x < width; x++)
 527             dst[x] = src[x] << (14 - BIT_DEPTH);
 528         src += srcstride;
 529         dst += MAX_PB_SIZE;
 530     }
 531 }
 532
 533 static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 534                                           int height, intptr_t mx, intptr_t my, int width)
 535 {
 536     int y;
 537     pixel *src          = (pixel *)_src;
 538     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 539     pixel *dst          = (pixel *)_dst;
 540     ptrdiff_t dststride = _dststride / sizeof(pixel);
 541
 542     for (y = 0; y < height; y++) {
 543         memcpy(dst, src, width * sizeof(pixel));
 544         src += srcstride;
 545         dst += dststride;
 546     }
 547 }
 548
 549 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 550                                          int16_t *src2,
 551                                          int height, intptr_t mx, intptr_t my, int width)
 552 {
 553     int x, y;
 554     pixel *src          = (pixel *)_src;
 555     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 556     pixel *dst          = (pixel *)_dst;
 557     ptrdiff_t dststride = _dststride / sizeof(pixel);
 558
 559     int shift = 14  + 1 - BIT_DEPTH;
 560 #if BIT_DEPTH < 14
 561     int offset = 1 << (shift - 1);
 562 #else
 563     int offset = 0;
 564 #endif
 565
 566     for (y = 0; y < height; y++) {
 567         for (x = 0; x < width; x++)
 568             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
 569         src  += srcstride;
 570         dst  += dststride;
 571         src2 += MAX_PB_SIZE;
 572     }
 573 }
 574
 575 static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 576                                             int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
 577 {
 578     int x, y;
 579     pixel *src          = (pixel *)_src;
 580     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 581     pixel *dst          = (pixel *)_dst;
 582     ptrdiff_t dststride = _dststride / sizeof(pixel);
 583     int shift = denom + 14 - BIT_DEPTH;
 584 #if BIT_DEPTH < 14
 585     int offset = 1 << (shift - 1);
 586 #else
 587     int offset = 0;
 588 #endif
 589
 590     ox     = ox * (1 << (BIT_DEPTH - 8));
 591     for (y = 0; y < height; y++) {
 592         for (x = 0; x < width; x++)
 593             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
 594         src += srcstride;
 595         dst += dststride;
 596     }
 597 }
 598
 599 static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 600                                            int16_t *src2,
 601                                            int height, int denom, int wx0, int wx1,
 602                                            int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 603 {
 604     int x, y;
 605     pixel *src          = (pixel *)_src;
 606     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 607     pixel *dst          = (pixel *)_dst;
 608     ptrdiff_t dststride = _dststride / sizeof(pixel);
 609
 610     int shift = 14  + 1 - BIT_DEPTH;
 611     int log2Wd = denom + shift - 1;
 612
 613     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 614     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 615     for (y = 0; y < height; y++) {
 616         for (x = 0; x < width; x++) {
 617             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 618         }
 619         src  += srcstride;
 620         dst  += dststride;
 621         src2 += MAX_PB_SIZE;
 622     }
 623 }
 624
 625 ////////////////////////////////////////////////////////////////////////////////
 626 //
 627 ////////////////////////////////////////////////////////////////////////////////
 628 #define QPEL_FILTER(src, stride)                                               \
 629     (filter[0] * src[x - 3 * stride] +                                         \
 630      filter[1] * src[x - 2 * stride] +                                         \
 631      filter[2] * src[x -     stride] +                                         \
 632      filter[3] * src[x             ] +                                         \
 633      filter[4] * src[x +     stride] +                                         \
 634      filter[5] * src[x + 2 * stride] +                                         \
 635      filter[6] * src[x + 3 * stride] +                                         \
 636      filter[7] * src[x + 4 * stride])
 637
 638 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 639                                   uint8_t *_src, ptrdiff_t _srcstride,
 640                                   int height, intptr_t mx, intptr_t my, int width)
 641 {
 642     int x, y;
 643     pixel        *src       = (pixel*)_src;
 644     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 645     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 646     for (y = 0; y < height; y++) {
 647         for (x = 0; x < width; x++)
 648             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 649         src += srcstride;
 650         dst += MAX_PB_SIZE;
 651     }
 652 }
 653
 654 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 655                                   uint8_t *_src, ptrdiff_t _srcstride,
 656                                   int height, intptr_t mx, intptr_t my, int width)
 657 {
 658     int x, y;
 659     pixel        *src       = (pixel*)_src;
 660     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 661     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 662     for (y = 0; y < height; y++)  {
 663         for (x = 0; x < width; x++)
 664             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 665         src += srcstride;
 666         dst += MAX_PB_SIZE;
 667     }
 668 }
 669
 670 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 671                                    uint8_t *_src,
 672                                    ptrdiff_t _srcstride,
 673                                    int height, intptr_t mx,
 674                                    intptr_t my, int width)
 675 {
 676     int x, y;
 677     const int8_t *filter;
 678     pixel *src = (pixel*)_src;
 679     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 680     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 681     int16_t *tmp = tmp_array;
 682
 683     src   -= QPEL_EXTRA_BEFORE * srcstride;
 684     filter = ff_hevc_qpel_filters[mx - 1];
 685     for (y = 0; y < height + QPEL_EXTRA; y++) {
 686         for (x = 0; x < width; x++)
 687             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 688         src += srcstride;
 689         tmp += MAX_PB_SIZE;
 690     }
 691
 692     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 693     filter = ff_hevc_qpel_filters[my - 1];
 694     for (y = 0; y < height; y++) {
 695         for (x = 0; x < width; x++)
 696             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
 697         tmp += MAX_PB_SIZE;
 698         dst += MAX_PB_SIZE;
 699     }
 700 }
 701
 702 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 703                                       uint8_t *_src, ptrdiff_t _srcstride,
 704                                       int height, intptr_t mx, intptr_t my, int width)
 705 {
 706     int x, y;
 707     pixel        *src       = (pixel*)_src;
 708     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 709     pixel *dst          = (pixel *)_dst;
 710     ptrdiff_t dststride = _dststride / sizeof(pixel);
 711     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 712     int shift = 14 - BIT_DEPTH;
 713
 714 #if BIT_DEPTH < 14
 715     int offset = 1 << (shift - 1);
 716 #else
 717     int offset = 0;
 718 #endif
 719
 720     for (y = 0; y < height; y++) {
 721         for (x = 0; x < width; x++)
 722             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
 723         src += srcstride;
 724         dst += dststride;
 725     }
 726 }
 727
 728 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 729                                      int16_t *src2,
 730                                      int height, intptr_t mx, intptr_t my, int width)
 731 {
 732     int x, y;
 733     pixel        *src       = (pixel*)_src;
 734     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 735     pixel *dst          = (pixel *)_dst;
 736     ptrdiff_t dststride = _dststride / sizeof(pixel);
 737
 738     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 739
 740     int shift = 14  + 1 - BIT_DEPTH;
 741 #if BIT_DEPTH < 14
 742     int offset = 1 << (shift - 1);
 743 #else
 744     int offset = 0;
 745 #endif
 746
 747     for (y = 0; y < height; y++) {
 748         for (x = 0; x < width; x++)
 749             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 750         src  += srcstride;
 751         dst  += dststride;
 752         src2 += MAX_PB_SIZE;
 753     }
 754 }
 755
 756 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 757                                      uint8_t *_src, ptrdiff_t _srcstride,
 758                                      int height, intptr_t mx, intptr_t my, int width)
 759 {
 760     int x, y;
 761     pixel        *src       = (pixel*)_src;
 762     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 763     pixel *dst          = (pixel *)_dst;
 764     ptrdiff_t dststride = _dststride / sizeof(pixel);
 765     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 766     int shift = 14 - BIT_DEPTH;
 767
 768 #if BIT_DEPTH < 14
 769     int offset = 1 << (shift - 1);
 770 #else
 771     int offset = 0;
 772 #endif
 773
 774     for (y = 0; y < height; y++) {
 775         for (x = 0; x < width; x++)
 776             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
 777         src += srcstride;
 778         dst += dststride;
 779     }
 780 }
 781
 782
 783 static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 784                                      int16_t *src2,
 785                                      int height, intptr_t mx, intptr_t my, int width)
 786 {
 787     int x, y;
 788     pixel        *src       = (pixel*)_src;
 789     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 790     pixel *dst          = (pixel *)_dst;
 791     ptrdiff_t dststride = _dststride / sizeof(pixel);
 792
 793     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 794
 795     int shift = 14 + 1 - BIT_DEPTH;
 796 #if BIT_DEPTH < 14
 797     int offset = 1 << (shift - 1);
 798 #else
 799     int offset = 0;
 800 #endif
 801
 802     for (y = 0; y < height; y++) {
 803         for (x = 0; x < width; x++)
 804             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 805         src  += srcstride;
 806         dst  += dststride;
 807         src2 += MAX_PB_SIZE;
 808     }
 809 }
 810
 811 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 812                                        uint8_t *_src, ptrdiff_t _srcstride,
 813                                        int height, intptr_t mx, intptr_t my, int width)
 814 {
 815     int x, y;
 816     const int8_t *filter;
 817     pixel *src = (pixel*)_src;
 818     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 819     pixel *dst          = (pixel *)_dst;
 820     ptrdiff_t dststride = _dststride / sizeof(pixel);
 821     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 822     int16_t *tmp = tmp_array;
 823     int shift =  14 - BIT_DEPTH;
 824
 825 #if BIT_DEPTH < 14
 826     int offset = 1 << (shift - 1);
 827 #else
 828     int offset = 0;
 829 #endif
 830
 831     src   -= QPEL_EXTRA_BEFORE * srcstride;
 832     filter = ff_hevc_qpel_filters[mx - 1];
 833     for (y = 0; y < height + QPEL_EXTRA; y++) {
 834         for (x = 0; x < width; x++)
 835             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 836         src += srcstride;
 837         tmp += MAX_PB_SIZE;
 838     }
 839
 840     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 841     filter = ff_hevc_qpel_filters[my - 1];
 842
 843     for (y = 0; y < height; y++) {
 844         for (x = 0; x < width; x++)
 845             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
 846         tmp += MAX_PB_SIZE;
 847         dst += dststride;
 848     }
 849 }
 850
 851 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 852                                       int16_t *src2,
 853                                       int height, intptr_t mx, intptr_t my, int width)
 854 {
 855     int x, y;
 856     const int8_t *filter;
 857     pixel *src = (pixel*)_src;
 858     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 859     pixel *dst          = (pixel *)_dst;
 860     ptrdiff_t dststride = _dststride / sizeof(pixel);
 861     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 862     int16_t *tmp = tmp_array;
 863     int shift = 14 + 1 - BIT_DEPTH;
 864 #if BIT_DEPTH < 14
 865     int offset = 1 << (shift - 1);
 866 #else
 867     int offset = 0;
 868 #endif
 869
 870     src   -= QPEL_EXTRA_BEFORE * srcstride;
 871     filter = ff_hevc_qpel_filters[mx - 1];
 872     for (y = 0; y < height + QPEL_EXTRA; y++) {
 873         for (x = 0; x < width; x++)
 874             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 875         src += srcstride;
 876         tmp += MAX_PB_SIZE;
 877     }
 878
 879     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 880     filter = ff_hevc_qpel_filters[my - 1];
 881
 882     for (y = 0; y < height; y++) {
 883         for (x = 0; x < width; x++)
 884             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
 885         tmp  += MAX_PB_SIZE;
 886         dst  += dststride;
 887         src2 += MAX_PB_SIZE;
 888     }
 889 }
 890
 891 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 892                                         uint8_t *_src, ptrdiff_t _srcstride,
 893                                         int height, int denom, int wx, int ox,
 894                                         intptr_t mx, intptr_t my, int width)
 895 {
 896     int x, y;
 897     pixel        *src       = (pixel*)_src;
 898     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 899     pixel *dst          = (pixel *)_dst;
 900     ptrdiff_t dststride = _dststride / sizeof(pixel);
 901     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 902     int shift = denom + 14 - BIT_DEPTH;
 903 #if BIT_DEPTH < 14
 904     int offset = 1 << (shift - 1);
 905 #else
 906     int offset = 0;
 907 #endif
 908
 909     ox = ox * (1 << (BIT_DEPTH - 8));
 910     for (y = 0; y < height; y++) {
 911         for (x = 0; x < width; x++)
 912             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 913         src += srcstride;
 914         dst += dststride;
 915     }
 916 }
 917
 918 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 919                                        int16_t *src2,
 920                                        int height, int denom, int wx0, int wx1,
 921                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 922 {
 923     int x, y;
 924     pixel        *src       = (pixel*)_src;
 925     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 926     pixel *dst          = (pixel *)_dst;
 927     ptrdiff_t dststride = _dststride / sizeof(pixel);
 928
 929     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 930
 931     int shift = 14  + 1 - BIT_DEPTH;
 932     int log2Wd = denom + shift - 1;
 933
 934     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 935     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 936     for (y = 0; y < height; y++) {
 937         for (x = 0; x < width; x++)
 938             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 939                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 940         src  += srcstride;
 941         dst  += dststride;
 942         src2 += MAX_PB_SIZE;
 943     }
 944 }
 945
 946 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 947                                         uint8_t *_src, ptrdiff_t _srcstride,
 948                                         int height, int denom, int wx, int ox,
 949                                         intptr_t mx, intptr_t my, int width)
 950 {
 951     int x, y;
 952     pixel        *src       = (pixel*)_src;
 953     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 954     pixel *dst          = (pixel *)_dst;
 955     ptrdiff_t dststride = _dststride / sizeof(pixel);
 956     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 957     int shift = denom + 14 - BIT_DEPTH;
 958 #if BIT_DEPTH < 14
 959     int offset = 1 << (shift - 1);
 960 #else
 961     int offset = 0;
 962 #endif
 963
 964     ox = ox * (1 << (BIT_DEPTH - 8));
 965     for (y = 0; y < height; y++) {
 966         for (x = 0; x < width; x++)
 967             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 968         src += srcstride;
 969         dst += dststride;
 970     }
 971 }
 972
 973 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 974                                        int16_t *src2,
 975                                        int height, int denom, int wx0, int wx1,
 976                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 977 {
 978     int x, y;
 979     pixel        *src       = (pixel*)_src;
 980     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 981     pixel *dst          = (pixel *)_dst;
 982     ptrdiff_t dststride = _dststride / sizeof(pixel);
 983
 984     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 985
 986     int shift = 14 + 1 - BIT_DEPTH;
 987     int log2Wd = denom + shift - 1;
 988
 989     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 990     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 991     for (y = 0; y < height; y++) {
 992         for (x = 0; x < width; x++)
 993             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 994                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 995         src  += srcstride;
 996         dst  += dststride;
 997         src2 += MAX_PB_SIZE;
 998     }
 999 }
1000
1001 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
1002                                          uint8_t *_src, ptrdiff_t _srcstride,
1003                                          int height, int denom, int wx, int ox,
1004                                          intptr_t mx, intptr_t my, int width)
1005 {
1006     int x, y;
1007     const int8_t *filter;
1008     pixel *src = (pixel*)_src;
1009     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1010     pixel *dst          = (pixel *)_dst;
1011     ptrdiff_t dststride = _dststride / sizeof(pixel);
1012     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1013     int16_t *tmp = tmp_array;
1014     int shift = denom + 14 - BIT_DEPTH;
1015 #if BIT_DEPTH < 14
1016     int offset = 1 << (shift - 1);
1017 #else
1018     int offset = 0;
1019 #endif
1020
1021     src   -= QPEL_EXTRA_BEFORE * srcstride;
1022     filter = ff_hevc_qpel_filters[mx - 1];
1023     for (y = 0; y < height + QPEL_EXTRA; y++) {
1024         for (x = 0; x < width; x++)
1025             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1026         src += srcstride;
1027         tmp += MAX_PB_SIZE;
1028     }
1029
1030     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1031     filter = ff_hevc_qpel_filters[my - 1];
1032
1033     ox = ox * (1 << (BIT_DEPTH - 8));
1034     for (y = 0; y < height; y++) {
1035         for (x = 0; x < width; x++)
1036             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1037         tmp += MAX_PB_SIZE;
1038         dst += dststride;
1039     }
1040 }
1041
1042 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1043                                         int16_t *src2,
1044                                         int height, int denom, int wx0, int wx1,
1045                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1046 {
1047     int x, y;
1048     const int8_t *filter;
1049     pixel *src = (pixel*)_src;
1050     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1051     pixel *dst          = (pixel *)_dst;
1052     ptrdiff_t dststride = _dststride / sizeof(pixel);
1053     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1054     int16_t *tmp = tmp_array;
1055     int shift = 14 + 1 - BIT_DEPTH;
1056     int log2Wd = denom + shift - 1;
1057
1058     src   -= QPEL_EXTRA_BEFORE * srcstride;
1059     filter = ff_hevc_qpel_filters[mx - 1];
1060     for (y = 0; y < height + QPEL_EXTRA; y++) {
1061         for (x = 0; x < width; x++)
1062             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1063         src += srcstride;
1064         tmp += MAX_PB_SIZE;
1065     }
1066
1067     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1068     filter = ff_hevc_qpel_filters[my - 1];
1069
1070     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1071     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1072     for (y = 0; y < height; y++) {
1073         for (x = 0; x < width; x++)
1074             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1075                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1076         tmp  += MAX_PB_SIZE;
1077         dst  += dststride;
1078         src2 += MAX_PB_SIZE;
1079     }
1080 }
1081
1082 ////////////////////////////////////////////////////////////////////////////////
1083 //
1084 ////////////////////////////////////////////////////////////////////////////////
/*
 * 4-tap filter centred between src[x] and src[x + stride]: taps are taken
 * at x - stride, x, x + stride and x + 2 * stride.
 *
 * Implicitly uses two names from the expansion site: `filter` (the four
 * coefficients) and `x` (the current column). Both macro parameters are
 * parenthesized so that expression arguments such as `p + 1` or `a + b`
 * expand correctly.
 */
#define EPEL_FILTER(src, stride)                                               \
    (filter[0] * (src)[x - (stride)] +                                         \
     filter[1] * (src)[x]            +                                         \
     filter[2] * (src)[x + (stride)] +                                         \
     filter[3] * (src)[x + 2 * (stride)])
1090
1091 static void FUNC(put_hevc_epel_h)(int16_t *dst,
1092                                   uint8_t *_src, ptrdiff_t _srcstride,
1093                                   int height, intptr_t mx, intptr_t my, int width)
1094 {
1095     int x, y;
1096     pixel *src = (pixel *)_src;
1097     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1098     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1099     for (y = 0; y < height; y++) {
1100         for (x = 0; x < width; x++)
1101             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1102         src += srcstride;
1103         dst += MAX_PB_SIZE;
1104     }
1105 }
1106
1107 static void FUNC(put_hevc_epel_v)(int16_t *dst,
1108                                   uint8_t *_src, ptrdiff_t _srcstride,
1109                                   int height, intptr_t mx, intptr_t my, int width)
1110 {
1111     int x, y;
1112     pixel *src = (pixel *)_src;
1113     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1114     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1115
1116     for (y = 0; y < height; y++) {
1117         for (x = 0; x < width; x++)
1118             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1119         src += srcstride;
1120         dst += MAX_PB_SIZE;
1121     }
1122 }
1123
1124 static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1125                                    uint8_t *_src, ptrdiff_t _srcstride,
1126                                    int height, intptr_t mx, intptr_t my, int width)
1127 {
1128     int x, y;
1129     pixel *src = (pixel *)_src;
1130     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1131     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1132     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1133     int16_t *tmp = tmp_array;
1134
1135     src -= EPEL_EXTRA_BEFORE * srcstride;
1136
1137     for (y = 0; y < height + EPEL_EXTRA; y++) {
1138         for (x = 0; x < width; x++)
1139             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1140         src += srcstride;
1141         tmp += MAX_PB_SIZE;
1142     }
1143
1144     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1145     filter = ff_hevc_epel_filters[my - 1];
1146
1147     for (y = 0; y < height; y++) {
1148         for (x = 0; x < width; x++)
1149             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1150         tmp += MAX_PB_SIZE;
1151         dst += MAX_PB_SIZE;
1152     }
1153 }
1154
1155 static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1156                                       int height, intptr_t mx, intptr_t my, int width)
1157 {
1158     int x, y;
1159     pixel *src = (pixel *)_src;
1160     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1161     pixel *dst          = (pixel *)_dst;
1162     ptrdiff_t dststride = _dststride / sizeof(pixel);
1163     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1164     int shift = 14 - BIT_DEPTH;
1165 #if BIT_DEPTH < 14
1166     int offset = 1 << (shift - 1);
1167 #else
1168     int offset = 0;
1169 #endif
1170
1171     for (y = 0; y < height; y++) {
1172         for (x = 0; x < width; x++)
1173             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1174         src += srcstride;
1175         dst += dststride;
1176     }
1177 }
1178
1179 static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1180                                      int16_t *src2,
1181                                      int height, intptr_t mx, intptr_t my, int width)
1182 {
1183     int x, y;
1184     pixel *src = (pixel *)_src;
1185     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1186     pixel *dst          = (pixel *)_dst;
1187     ptrdiff_t dststride = _dststride / sizeof(pixel);
1188     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1189     int shift = 14 + 1 - BIT_DEPTH;
1190 #if BIT_DEPTH < 14
1191     int offset = 1 << (shift - 1);
1192 #else
1193     int offset = 0;
1194 #endif
1195
1196     for (y = 0; y < height; y++) {
1197         for (x = 0; x < width; x++) {
1198             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1199         }
1200         dst  += dststride;
1201         src  += srcstride;
1202         src2 += MAX_PB_SIZE;
1203     }
1204 }
1205
1206 static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1207                                       int height, intptr_t mx, intptr_t my, int width)
1208 {
1209     int x, y;
1210     pixel *src = (pixel *)_src;
1211     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1212     pixel *dst          = (pixel *)_dst;
1213     ptrdiff_t dststride = _dststride / sizeof(pixel);
1214     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1215     int shift = 14 - BIT_DEPTH;
1216 #if BIT_DEPTH < 14
1217     int offset = 1 << (shift - 1);
1218 #else
1219     int offset = 0;
1220 #endif
1221
1222     for (y = 0; y < height; y++) {
1223         for (x = 0; x < width; x++)
1224             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1225         src += srcstride;
1226         dst += dststride;
1227     }
1228 }
1229
1230 static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1231                                      int16_t *src2,
1232                                      int height, intptr_t mx, intptr_t my, int width)
1233 {
1234     int x, y;
1235     pixel *src = (pixel *)_src;
1236     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1237     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1238     pixel *dst          = (pixel *)_dst;
1239     ptrdiff_t dststride = _dststride / sizeof(pixel);
1240     int shift = 14 + 1 - BIT_DEPTH;
1241 #if BIT_DEPTH < 14
1242     int offset = 1 << (shift - 1);
1243 #else
1244     int offset = 0;
1245 #endif
1246
1247     for (y = 0; y < height; y++) {
1248         for (x = 0; x < width; x++)
1249             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1250         dst  += dststride;
1251         src  += srcstride;
1252         src2 += MAX_PB_SIZE;
1253     }
1254 }
1255
1256 static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1257                                        int height, intptr_t mx, intptr_t my, int width)
1258 {
1259     int x, y;
1260     pixel *src = (pixel *)_src;
1261     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1262     pixel *dst          = (pixel *)_dst;
1263     ptrdiff_t dststride = _dststride / sizeof(pixel);
1264     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1265     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1266     int16_t *tmp = tmp_array;
1267     int shift = 14 - BIT_DEPTH;
1268 #if BIT_DEPTH < 14
1269     int offset = 1 << (shift - 1);
1270 #else
1271     int offset = 0;
1272 #endif
1273
1274     src -= EPEL_EXTRA_BEFORE * srcstride;
1275
1276     for (y = 0; y < height + EPEL_EXTRA; y++) {
1277         for (x = 0; x < width; x++)
1278             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1279         src += srcstride;
1280         tmp += MAX_PB_SIZE;
1281     }
1282
1283     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1284     filter = ff_hevc_epel_filters[my - 1];
1285
1286     for (y = 0; y < height; y++) {
1287         for (x = 0; x < width; x++)
1288             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1289         tmp += MAX_PB_SIZE;
1290         dst += dststride;
1291     }
1292 }
1293
1294 static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1295                                       int16_t *src2,
1296                                       int height, intptr_t mx, intptr_t my, int width)
1297 {
1298     int x, y;
1299     pixel *src = (pixel *)_src;
1300     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1301     pixel *dst          = (pixel *)_dst;
1302     ptrdiff_t dststride = _dststride / sizeof(pixel);
1303     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1304     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1305     int16_t *tmp = tmp_array;
1306     int shift = 14 + 1 - BIT_DEPTH;
1307 #if BIT_DEPTH < 14
1308     int offset = 1 << (shift - 1);
1309 #else
1310     int offset = 0;
1311 #endif
1312
1313     src -= EPEL_EXTRA_BEFORE * srcstride;
1314
1315     for (y = 0; y < height + EPEL_EXTRA; y++) {
1316         for (x = 0; x < width; x++)
1317             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1318         src += srcstride;
1319         tmp += MAX_PB_SIZE;
1320     }
1321
1322     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1323     filter = ff_hevc_epel_filters[my - 1];
1324
1325     for (y = 0; y < height; y++) {
1326         for (x = 0; x < width; x++)
1327             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1328         tmp  += MAX_PB_SIZE;
1329         dst  += dststride;
1330         src2 += MAX_PB_SIZE;
1331     }
1332 }
1333
1334 static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1335                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1336 {
1337     int x, y;
1338     pixel *src = (pixel *)_src;
1339     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1340     pixel *dst          = (pixel *)_dst;
1341     ptrdiff_t dststride = _dststride / sizeof(pixel);
1342     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1343     int shift = denom + 14 - BIT_DEPTH;
1344 #if BIT_DEPTH < 14
1345     int offset = 1 << (shift - 1);
1346 #else
1347     int offset = 0;
1348 #endif
1349
1350     ox     = ox * (1 << (BIT_DEPTH - 8));
1351     for (y = 0; y < height; y++) {
1352         for (x = 0; x < width; x++) {
1353             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1354         }
1355         dst += dststride;
1356         src += srcstride;
1357     }
1358 }
1359
1360 static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1361                                        int16_t *src2,
1362                                        int height, int denom, int wx0, int wx1,
1363                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1364 {
1365     int x, y;
1366     pixel *src = (pixel *)_src;
1367     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1368     pixel *dst          = (pixel *)_dst;
1369     ptrdiff_t dststride = _dststride / sizeof(pixel);
1370     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1371     int shift = 14 + 1 - BIT_DEPTH;
1372     int log2Wd = denom + shift - 1;
1373
1374     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1375     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1376     for (y = 0; y < height; y++) {
1377         for (x = 0; x < width; x++)
1378             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1379                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1380         src  += srcstride;
1381         dst  += dststride;
1382         src2 += MAX_PB_SIZE;
1383     }
1384 }
1385
1386 static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1387                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1388 {
1389     int x, y;
1390     pixel *src = (pixel *)_src;
1391     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1392     pixel *dst          = (pixel *)_dst;
1393     ptrdiff_t dststride = _dststride / sizeof(pixel);
1394     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1395     int shift = denom + 14 - BIT_DEPTH;
1396 #if BIT_DEPTH < 14
1397     int offset = 1 << (shift - 1);
1398 #else
1399     int offset = 0;
1400 #endif
1401
1402     ox     = ox * (1 << (BIT_DEPTH - 8));
1403     for (y = 0; y < height; y++) {
1404         for (x = 0; x < width; x++) {
1405             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1406         }
1407         dst += dststride;
1408         src += srcstride;
1409     }
1410 }
1411
1412 static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1413                                        int16_t *src2,
1414                                        int height, int denom, int wx0, int wx1,
1415                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1416 {
1417     int x, y;
1418     pixel *src = (pixel *)_src;
1419     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1420     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1421     pixel *dst          = (pixel *)_dst;
1422     ptrdiff_t dststride = _dststride / sizeof(pixel);
1423     int shift = 14 + 1 - BIT_DEPTH;
1424     int log2Wd = denom + shift - 1;
1425
1426     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1427     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1428     for (y = 0; y < height; y++) {
1429         for (x = 0; x < width; x++)
1430             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1431                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1432         src  += srcstride;
1433         dst  += dststride;
1434         src2 += MAX_PB_SIZE;
1435     }
1436 }
1437
1438 static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1439                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1440 {
1441     int x, y;
1442     pixel *src = (pixel *)_src;
1443     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1444     pixel *dst          = (pixel *)_dst;
1445     ptrdiff_t dststride = _dststride / sizeof(pixel);
1446     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1447     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1448     int16_t *tmp = tmp_array;
1449     int shift = denom + 14 - BIT_DEPTH;
1450 #if BIT_DEPTH < 14
1451     int offset = 1 << (shift - 1);
1452 #else
1453     int offset = 0;
1454 #endif
1455
1456     src -= EPEL_EXTRA_BEFORE * srcstride;
1457
1458     for (y = 0; y < height + EPEL_EXTRA; y++) {
1459         for (x = 0; x < width; x++)
1460             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1461         src += srcstride;
1462         tmp += MAX_PB_SIZE;
1463     }
1464
1465     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1466     filter = ff_hevc_epel_filters[my - 1];
1467
1468     ox     = ox * (1 << (BIT_DEPTH - 8));
1469     for (y = 0; y < height; y++) {
1470         for (x = 0; x < width; x++)
1471             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1472         tmp += MAX_PB_SIZE;
1473         dst += dststride;
1474     }
1475 }
1476
1477 static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1478                                         int16_t *src2,
1479                                         int height, int denom, int wx0, int wx1,
1480                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1481 {
1482     int x, y;
1483     pixel *src = (pixel *)_src;
1484     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1485     pixel *dst          = (pixel *)_dst;
1486     ptrdiff_t dststride = _dststride / sizeof(pixel);
1487     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1488     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1489     int16_t *tmp = tmp_array;
1490     int shift = 14 + 1 - BIT_DEPTH;
1491     int log2Wd = denom + shift - 1;
1492
1493     src -= EPEL_EXTRA_BEFORE * srcstride;
1494
1495     for (y = 0; y < height + EPEL_EXTRA; y++) {
1496         for (x = 0; x < width; x++)
1497             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1498         src += srcstride;
1499         tmp += MAX_PB_SIZE;
1500     }
1501
1502     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1503     filter = ff_hevc_epel_filters[my - 1];
1504
1505     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1506     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1507     for (y = 0; y < height; y++) {
1508         for (x = 0; x < width; x++)
1509             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1510                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1511         tmp  += MAX_PB_SIZE;
1512         dst  += dststride;
1513         src2 += MAX_PB_SIZE;
1514     }
1515 }// line zero
/*
 * Sample accessors for the loop filters. Each expands to an element of pix
 * addressed in steps of xstride: P0..P3 are at offsets -1..-4 and Q0..Q3 at
 * offsets 0..3. They are usable as lvalues and rely on local variables named
 * pix and xstride being in scope.
 */
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
#define P1 pix[-2 * xstride]
#define P0 pix[-1 * xstride]
#define Q0 pix[0 * xstride]
#define Q1 pix[1 * xstride]
#define Q2 pix[2 * xstride]
#define Q3 pix[3 * xstride]

// line three: the same eight positions displaced by 3 * ystride (also needs
// a local named ystride). used only for deblocking decision
#define TP3 pix[-4 * xstride + 3 * ystride]
#define TP2 pix[-3 * xstride + 3 * ystride]
#define TP1 pix[-2 * xstride + 3 * ystride]
#define TP0 pix[-1 * xstride + 3 * ystride]
#define TQ0 pix[0  * xstride + 3 * ystride]
#define TQ1 pix[1  * xstride + 3 * ystride]
#define TQ2 pix[2  * xstride + 3 * ystride]
#define TQ3 pix[3  * xstride + 3 * ystride]
1534
1535 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1536                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1537                                         int beta, int *_tc,
1538                                         uint8_t *_no_p, uint8_t *_no_q)
1539 {
1540     int d, j;
1541     pixel *pix        = (pixel *)_pix;
1542     ptrdiff_t xstride = _xstride / sizeof(pixel);
1543     ptrdiff_t ystride = _ystride / sizeof(pixel);
1544
1545     beta <<= BIT_DEPTH - 8;
1546
1547     for (j = 0; j < 2; j++) {
1548         const int dp0  = abs(P2  - 2 * P1  + P0);
1549         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1550         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1551         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1552         const int d0   = dp0 + dq0;
1553         const int d3   = dp3 + dq3;
1554         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1555         const int no_p = _no_p[j];
1556         const int no_q = _no_q[j];
1557
1558         if (d0 + d3 >= beta) {
1559             pix += 4 * ystride;
1560             continue;
1561         } else {
1562             const int beta_3 = beta >> 3;
1563             const int beta_2 = beta >> 2;
1564             const int tc25   = ((tc * 5 + 1) >> 1);
1565
1566             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1567                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1568                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1569                 // strong filtering
1570                 const int tc2 = tc << 1;
1571                 for (d = 0; d < 4; d++) {
1572                     const int p3 = P3;
1573                     const int p2 = P2;
1574                     const int p1 = P1;
1575                     const int p0 = P0;
1576                     const int q0 = Q0;
1577                     const int q1 = Q1;
1578                     const int q2 = Q2;
1579                     const int q3 = Q3;
1580                     if (!no_p) {
1581                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1582                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1583                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1584                     }
1585                     if (!no_q) {
1586                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1587                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1588                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1589                     }
1590                     pix += ystride;
1591                 }
1592             } else { // normal filtering
1593                 int nd_p = 1;
1594                 int nd_q = 1;
1595                 const int tc_2 = tc >> 1;
1596                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1597                     nd_p = 2;
1598                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1599                     nd_q = 2;
1600
1601                 for (d = 0; d < 4; d++) {
1602                     const int p2 = P2;
1603                     const int p1 = P1;
1604                     const int p0 = P0;
1605                     const int q0 = Q0;
1606                     const int q1 = Q1;
1607                     const int q2 = Q2;
1608                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1609                     if (abs(delta0) < 10 * tc) {
1610                         delta0 = av_clip(delta0, -tc, tc);
1611                         if (!no_p)
1612                             P0 = av_clip_pixel(p0 + delta0);
1613                         if (!no_q)
1614                             Q0 = av_clip_pixel(q0 - delta0);
1615                         if (!no_p && nd_p > 1) {
1616                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1617                             P1 = av_clip_pixel(p1 + deltap1);
1618                         }
1619                         if (!no_q && nd_q > 1) {
1620                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1621                             Q1 = av_clip_pixel(q1 + deltaq1);
1622                         }
1623                     }
1624                     pix += ystride;
1625                 }
1626             }
1627         }
1628     }
1629 }
1630
1631 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1632                                           ptrdiff_t _ystride, int *_tc,
1633                                           uint8_t *_no_p, uint8_t *_no_q)
1634 {
1635     int d, j, no_p, no_q;
1636     pixel *pix        = (pixel *)_pix;
1637     ptrdiff_t xstride = _xstride / sizeof(pixel);
1638     ptrdiff_t ystride = _ystride / sizeof(pixel);
1639
1640     for (j = 0; j < 2; j++) {
1641         const int tc = _tc[j] << (BIT_DEPTH - 8);
1642         if (tc <= 0) {
1643             pix += 4 * ystride;
1644             continue;
1645         }
1646         no_p = _no_p[j];
1647         no_q = _no_q[j];
1648
1649         for (d = 0; d < 4; d++) {
1650             int delta0;
1651             const int p1 = P1;
1652             const int p0 = P0;
1653             const int q0 = Q0;
1654             const int q1 = Q1;
1655             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1656             if (!no_p)
1657                 P0 = av_clip_pixel(p0 + delta0);
1658             if (!no_q)
1659                 Q0 = av_clip_pixel(q0 - delta0);
1660             pix += ystride;
1661         }
1662     }
1663 }
1664
/*
 * Chroma loop filter entry point: forwards to hevc_loop_filter_chroma with
 * stride as the step between the P/Q samples (xstride) and sizeof(pixel),
 * i.e. one pixel, as the step between successive filtered lines (ystride).
 */
static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
                                            int32_t *tc, uint8_t *no_p,
                                            uint8_t *no_q)
{
    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
}
1671
/*
 * Chroma loop filter entry point: forwards to hevc_loop_filter_chroma with
 * sizeof(pixel), i.e. one pixel, as the step between the P/Q samples
 * (xstride) and stride as the step between successive filtered lines
 * (ystride).
 */
static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
                                            int32_t *tc, uint8_t *no_p,
                                            uint8_t *no_q)
{
    FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
}
1678
/*
 * Luma loop filter entry point: forwards to hevc_loop_filter_luma with
 * stride as the step between the P/Q samples (xstride) and sizeof(pixel),
 * i.e. one pixel, as the step between successive filtered lines (ystride).
 * beta, tc, no_p and no_q are passed through unchanged.
 */
static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
                                          int beta, int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q)
{
    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
                                beta, tc, no_p, no_q);
}
1686
/*
 * Luma loop filter entry point: forwards to hevc_loop_filter_luma with
 * sizeof(pixel), i.e. one pixel, as the step between the P/Q samples
 * (xstride) and stride as the step between successive filtered lines
 * (ystride). beta, tc, no_p and no_q are passed through unchanged.
 */
static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
                                          int beta, int32_t *tc, uint8_t *no_p,
                                          uint8_t *no_q)
{
    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
                                beta, tc, no_p, no_q);
}
1694
/* Remove the loop-filter sample accessors so the short names P0..P3, Q0..Q3
 * and TP0..TP3, TQ0..TQ3 do not stay defined past this point. */
#undef P3
#undef P2
#undef P1
#undef P0
#undef Q0
#undef Q1
#undef Q2
#undef Q3

#undef TP3
#undef TP2
#undef TP1
#undef TP0
#undef TQ0
#undef TQ1
#undef TQ2
#undef TQ3