git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27 #include "hevcdsp.h"
  28
  29
  30 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
  31                           GetBitContext *gb, int pcm_bit_depth)
  32 {
  33     int x, y;
  34     pixel *dst = (pixel *)_dst;
  35
  36     stride /= sizeof(pixel);
  37
  38     for (y = 0; y < height; y++) {
  39         for (x = 0; x < width; x++)
  40             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  41         dst += stride;
  42     }
  43 }
  44
  45 static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
  46                                                      ptrdiff_t stride, int size)
  47 {
  48     int x, y;
  49     pixel *dst = (pixel *)_dst;
  50
  51     stride /= sizeof(pixel);
  52
  53     for (y = 0; y < size; y++) {
  54         for (x = 0; x < size; x++) {
  55             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  56             coeffs++;
  57         }
  58         dst += stride;
  59     }
  60 }
  61
  62 static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
  63                                        ptrdiff_t stride)
  64 {
  65     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
  66 }
  67
  68 static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
  69                                        ptrdiff_t stride)
  70 {
  71     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
  72 }
  73
  74 static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
  75                                          ptrdiff_t stride)
  76 {
  77     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
  78 }
  79
  80 static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
  81                                          ptrdiff_t stride)
  82 {
  83     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
  84 }
  85
  86
  87 static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
  88 {
  89     int16_t *coeffs = (int16_t *) _coeffs;
  90     int x, y;
  91     int size = 1 << log2_size;
  92
  93     if (mode) {
  94         coeffs += size;
  95         for (y = 0; y < size - 1; y++) {
  96             for (x = 0; x < size; x++)
  97                 coeffs[x] += coeffs[x - size];
  98             coeffs += size;
  99         }
 100     } else {
 101         for (y = 0; y < size; y++) {
 102             for (x = 1; x < size; x++)
 103                 coeffs[x] += coeffs[x - 1];
 104             coeffs += size;
 105         }
 106     }
 107 }
 108
 109 static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 110 {
 111     int shift  = 15 - BIT_DEPTH - log2_size;
 112     int x, y;
 113     int size = 1 << log2_size;
 114     int16_t *coeffs = _coeffs;
 115
 116
 117     if (shift > 0) {
 118         int offset = 1 << (shift - 1);
 119         for (y = 0; y < size; y++) {
 120             for (x = 0; x < size; x++) {
 121                 *coeffs = (*coeffs + offset) >> shift;
 122                 coeffs++;
 123             }
 124         }
 125     } else {
 126         for (y = 0; y < size; y++) {
 127             for (x = 0; x < size; x++) {
 128                 *coeffs = *coeffs << -shift;
 129                 coeffs++;
 130             }
 131         }
 132     }
 133 }
 134
 135 #define SET(dst, x)   (dst) = (x)
 136 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 137 #define ADD_AND_SCALE(dst, x)                                           \
 138     (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 139
 140 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 141     do {                                                                \
 142         int c0 = src[0 * step] + src[2 * step];                         \
 143         int c1 = src[2 * step] + src[3 * step];                         \
 144         int c2 = src[0 * step] - src[3 * step];                         \
 145         int c3 = 74 * src[1 * step];                                    \
 146                                                                         \
 147         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 148                                     src[2 * step] +                     \
 149                                     src[3 * step]));                    \
 150         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 151         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 152         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 153     } while (0)
 154
 155 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 156 {
 157     int i;
 158     int shift    = 7;
 159     int add      = 1 << (shift - 1);
 160     int16_t *src = coeffs;
 161
 162     for (i = 0; i < 4; i++) {
 163         TR_4x4_LUMA(src, src, 4, SCALE);
 164         src++;
 165     }
 166
 167     shift = 20 - BIT_DEPTH;
 168     add   = 1 << (shift - 1);
 169     for (i = 0; i < 4; i++) {
 170         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 171         coeffs += 4;
 172     }
 173 }
 174
 175 #undef TR_4x4_LUMA
 176
 177 #define TR_4(dst, src, dstep, sstep, assign, end)                              \
 178     do {                                                                       \
 179         const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
 180         const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
 181         const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
 182         const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
 183                                                                                \
 184         assign(dst[0 * dstep], e0 + o0);                                       \
 185         assign(dst[1 * dstep], e1 + o1);                                       \
 186         assign(dst[2 * dstep], e1 - o1);                                       \
 187         assign(dst[3 * dstep], e0 - o0);                                       \
 188     } while (0)
 189
 190 #define TR_8(dst, src, dstep, sstep, assign, end)                              \
 191     do {                                                                       \
 192         int i, j;                                                              \
 193         int e_8[4];                                                            \
 194         int o_8[4] = { 0 };                                                    \
 195         for (i = 0; i < 4; i++)                                                \
 196             for (j = 1; j < end; j += 2)                                       \
 197                 o_8[i] += transform[4 * j][i] * src[j * sstep];                \
 198         TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
 199                                                                                \
 200         for (i = 0; i < 4; i++) {                                              \
 201             assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
 202             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
 203         }                                                                      \
 204     } while (0)
 205
 206 #define TR_16(dst, src, dstep, sstep, assign, end)                             \
 207     do {                                                                       \
 208         int i, j;                                                              \
 209         int e_16[8];                                                           \
 210         int o_16[8] = { 0 };                                                   \
 211         for (i = 0; i < 8; i++)                                                \
 212             for (j = 1; j < end; j += 2)                                       \
 213                 o_16[i] += transform[2 * j][i] * src[j * sstep];               \
 214         TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
 215                                                                                \
 216         for (i = 0; i < 8; i++) {                                              \
 217             assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
 218             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
 219         }                                                                      \
 220     } while (0)
 221
 222 #define TR_32(dst, src, dstep, sstep, assign, end)                             \
 223     do {                                                                       \
 224         int i, j;                                                              \
 225         int e_32[16];                                                          \
 226         int o_32[16] = { 0 };                                                  \
 227         for (i = 0; i < 16; i++)                                               \
 228             for (j = 1; j < end; j += 2)                                       \
 229                 o_32[i] += transform[j][i] * src[j * sstep];                   \
 230         TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
 231                                                                                \
 232         for (i = 0; i < 16; i++) {                                             \
 233             assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
 234             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
 235         }                                                                      \
 236     } while (0)
 237
 238 #define IDCT_VAR4(H)                                                          \
 239     int      limit2   = FFMIN(col_limit + 4, H)
 240 #define IDCT_VAR8(H)                                                          \
 241         int      limit   = FFMIN(col_limit, H);                               \
 242         int      limit2   = FFMIN(col_limit + 4, H)
 243 #define IDCT_VAR16(H)   IDCT_VAR8(H)
 244 #define IDCT_VAR32(H)   IDCT_VAR8(H)
 245
 246 #define IDCT(H)                                                              \
 247 static void FUNC(idct_##H ##x ##H )(                                         \
 248                    int16_t *coeffs, int col_limit) {                         \
 249     int i;                                                                   \
 250     int      shift   = 7;                                                    \
 251     int      add     = 1 << (shift - 1);                                     \
 252     int16_t *src     = coeffs;                                               \
 253     IDCT_VAR ##H(H);                                                         \
 254                                                                              \
 255     for (i = 0; i < H; i++) {                                                \
 256         TR_ ## H(src, src, H, H, SCALE, limit2);                             \
 257         if (limit2 < H && i%4 == 0 && !!i)                                   \
 258             limit2 -= 4;                                                     \
 259         src++;                                                               \
 260     }                                                                        \
 261                                                                              \
 262     shift   = 20 - BIT_DEPTH;                                                \
 263     add     = 1 << (shift - 1);                                              \
 264     for (i = 0; i < H; i++) {                                                \
 265         TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
 266         coeffs += H;                                                         \
 267     }                                                                        \
 268 }
 269
 270 #define IDCT_DC(H)                                                           \
 271 static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 272                    int16_t *coeffs) {                                        \
 273     int i, j;                                                                \
 274     int      shift   = 14 - BIT_DEPTH;                                       \
 275     int      add     = 1 << (shift - 1);                                     \
 276     int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 277                                                                              \
 278     for (j = 0; j < H; j++) {                                                \
 279         for (i = 0; i < H; i++) {                                            \
 280             coeffs[i+j*H] = coeff;                                           \
 281         }                                                                    \
 282     }                                                                        \
 283 }
 284
 285 IDCT( 4)
 286 IDCT( 8)
 287 IDCT(16)
 288 IDCT(32)
 289
 290 IDCT_DC( 4)
 291 IDCT_DC( 8)
 292 IDCT_DC(16)
 293 IDCT_DC(32)
 294
 295 #undef TR_4
 296 #undef TR_8
 297 #undef TR_16
 298 #undef TR_32
 299
 300 #undef SET
 301 #undef SCALE
 302 #undef ADD_AND_SCALE
 303
 304 static void FUNC(sao_band_filter_0)(uint8_t *_dst, uint8_t *_src,
 305                                   ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 306                                   int *borders, int width, int height,
 307                                   int c_idx)
 308 {
 309     pixel *dst = (pixel *)_dst;
 310     pixel *src = (pixel *)_src;
 311     int offset_table[32] = { 0 };
 312     int k, y, x;
 313     int shift  = BIT_DEPTH - 5;
 314     int16_t *sao_offset_val = sao->offset_val[c_idx];
 315     int sao_left_class  = sao->band_position[c_idx];
 316
 317     stride_dst /= sizeof(pixel);
 318     stride_src /= sizeof(pixel);
 319
 320     for (k = 0; k < 4; k++)
 321         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 322     for (y = 0; y < height; y++) {
 323         for (x = 0; x < width; x++)
 324             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 325         dst += stride_dst;
 326         src += stride_src;
 327     }
 328 }
 329
 330 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 331
 332 static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src,
 333                                   ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 334                                   int width, int height,
 335                                   int c_idx, int init_x, int init_y) {
 336
 337     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 338     static const int8_t pos[4][2][2] = {
 339         { { -1,  0 }, {  1, 0 } }, // horizontal
 340         { {  0, -1 }, {  0, 1 } }, // vertical
 341         { { -1, -1 }, {  1, 1 } }, // 45 degree
 342         { {  1, -1 }, { -1, 1 } }, // 135 degree
 343     };
 344     int16_t *sao_offset_val = sao->offset_val[c_idx];
 345     int sao_eo_class    = sao->eo_class[c_idx];
 346     pixel *dst = (pixel *)_dst;
 347     pixel *src = (pixel *)_src;
 348
 349     int y_stride_src = init_y * stride_src;
 350     int y_stride_dst = init_y * stride_dst;
 351     int pos_0_0  = pos[sao_eo_class][0][0];
 352     int pos_0_1  = pos[sao_eo_class][0][1];
 353     int pos_1_0  = pos[sao_eo_class][1][0];
 354     int pos_1_1  = pos[sao_eo_class][1][1];
 355     int x, y;
 356
 357     int y_stride_0_1 = (init_y + pos_0_1) * stride_src;
 358     int y_stride_1_1 = (init_y + pos_1_1) * stride_src;
 359     for (y = init_y; y < height; y++) {
 360         for (x = init_x; x < width; x++) {
 361             int diff0             = CMP(src[x + y_stride_src], src[x + pos_0_0 + y_stride_0_1]);
 362             int diff1             = CMP(src[x + y_stride_src], src[x + pos_1_0 + y_stride_1_1]);
 363             int offset_val        = edge_idx[2 + diff0 + diff1];
 364             dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + sao_offset_val[offset_val]);
 365         }
 366         y_stride_src += stride_src;
 367         y_stride_dst += stride_dst;
 368         y_stride_0_1 += stride_src;
 369         y_stride_1_1 += stride_src;
 370     }
 371 }
 372
 373 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
 374                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 375                                     int *borders, int _width, int _height,
 376                                     int c_idx, uint8_t *vert_edge,
 377                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 378 {
 379     int x, y;
 380     pixel *dst = (pixel *)_dst;
 381     pixel *src = (pixel *)_src;
 382     int16_t *sao_offset_val = sao->offset_val[c_idx];
 383     int sao_eo_class    = sao->eo_class[c_idx];
 384     int init_x = 0, init_y = 0, width = _width, height = _height;
 385
 386     stride_dst /= sizeof(pixel);
 387     stride_src /= sizeof(pixel);
 388
 389     if (sao_eo_class != SAO_EO_VERT) {
 390         if (borders[0]) {
 391             int offset_val = sao_offset_val[0];
 392             for (y = 0; y < height; y++) {
 393                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 394             }
 395             init_x = 1;
 396         }
 397         if (borders[2]) {
 398             int offset_val = sao_offset_val[0];
 399             int offset     = width - 1;
 400             for (x = 0; x < height; x++) {
 401                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 402             }
 403             width--;
 404         }
 405     }
 406     if (sao_eo_class != SAO_EO_HORIZ) {
 407         if (borders[1]) {
 408             int offset_val = sao_offset_val[0];
 409             for (x = init_x; x < width; x++)
 410                 dst[x] = av_clip_pixel(src[x] + offset_val);
 411             init_y = 1;
 412         }
 413         if (borders[3]) {
 414             int offset_val   = sao_offset_val[0];
 415             int y_stride_dst = stride_dst * (height - 1);
 416             int y_stride_src = stride_src * (height - 1);
 417             for (x = init_x; x < width; x++)
 418                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 419             height--;
 420         }
 421     }
 422
 423     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 424 }
 425
 426 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
 427                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 428                                     int *borders, int _width, int _height,
 429                                     int c_idx, uint8_t *vert_edge,
 430                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 431 {
 432     int x, y;
 433     pixel *dst = (pixel *)_dst;
 434     pixel *src = (pixel *)_src;
 435     int16_t *sao_offset_val = sao->offset_val[c_idx];
 436     int sao_eo_class    = sao->eo_class[c_idx];
 437     int init_x = 0, init_y = 0, width = _width, height = _height;
 438
 439     stride_dst /= sizeof(pixel);
 440     stride_src /= sizeof(pixel);
 441
 442     if (sao_eo_class != SAO_EO_VERT) {
 443         if (borders[0]) {
 444             int offset_val = sao_offset_val[0];
 445             for (y = 0; y < height; y++) {
 446                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 447             }
 448             init_x = 1;
 449         }
 450         if (borders[2]) {
 451             int offset_val = sao_offset_val[0];
 452             int offset     = width - 1;
 453             for (x = 0; x < height; x++) {
 454                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 455             }
 456             width--;
 457         }
 458     }
 459     if (sao_eo_class != SAO_EO_HORIZ) {
 460         if (borders[1]) {
 461             int offset_val = sao_offset_val[0];
 462             for (x = init_x; x < width; x++)
 463                 dst[x] = av_clip_pixel(src[x] + offset_val);
 464             init_y = 1;
 465         }
 466         if (borders[3]) {
 467             int offset_val   = sao_offset_val[0];
 468             int y_stride_dst = stride_dst * (height - 1);
 469             int y_stride_src = stride_src * (height - 1);
 470             for (x = init_x; x < width; x++)
 471                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 472             height--;
 473         }
 474     }
 475
 476     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 477
 478     {
 479         int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 480         int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
 481         int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
 482         int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
 483
 484         // Restore pixels that can't be modified
 485         if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
 486             for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
 487                 dst[y*stride_dst] = src[y*stride_src];
 488         }
 489         if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
 490             for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
 491                 dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
 492         }
 493
 494         if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
 495             for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
 496                 dst[x] = src[x];
 497         }
 498         if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
 499             for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
 500                 dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
 501         }
 502         if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
 503             dst[0] = src[0];
 504         if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
 505             dst[width-1] = src[width-1];
 506         if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
 507             dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
 508         if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
 509             dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
 510
 511     }
 512 }
 513
 514 #undef CMP
 515
 516 ////////////////////////////////////////////////////////////////////////////////
 517 //
 518 ////////////////////////////////////////////////////////////////////////////////
 519 static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
 520                                       uint8_t *_src, ptrdiff_t _srcstride,
 521                                       int height, intptr_t mx, intptr_t my, int width)
 522 {
 523     int x, y;
 524     pixel *src          = (pixel *)_src;
 525     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 526
 527     for (y = 0; y < height; y++) {
 528         for (x = 0; x < width; x++)
 529             dst[x] = src[x] << (14 - BIT_DEPTH);
 530         src += srcstride;
 531         dst += MAX_PB_SIZE;
 532     }
 533 }
 534
 535 static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 536                                           int height, intptr_t mx, intptr_t my, int width)
 537 {
 538     int y;
 539     pixel *src          = (pixel *)_src;
 540     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 541     pixel *dst          = (pixel *)_dst;
 542     ptrdiff_t dststride = _dststride / sizeof(pixel);
 543
 544     for (y = 0; y < height; y++) {
 545         memcpy(dst, src, width * sizeof(pixel));
 546         src += srcstride;
 547         dst += dststride;
 548     }
 549 }
 550
 551 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 552                                          int16_t *src2,
 553                                          int height, intptr_t mx, intptr_t my, int width)
 554 {
 555     int x, y;
 556     pixel *src          = (pixel *)_src;
 557     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 558     pixel *dst          = (pixel *)_dst;
 559     ptrdiff_t dststride = _dststride / sizeof(pixel);
 560
 561     int shift = 14  + 1 - BIT_DEPTH;
 562 #if BIT_DEPTH < 14
 563     int offset = 1 << (shift - 1);
 564 #else
 565     int offset = 0;
 566 #endif
 567
 568     for (y = 0; y < height; y++) {
 569         for (x = 0; x < width; x++)
 570             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
 571         src  += srcstride;
 572         dst  += dststride;
 573         src2 += MAX_PB_SIZE;
 574     }
 575 }
 576
 577 static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 578                                             int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
 579 {
 580     int x, y;
 581     pixel *src          = (pixel *)_src;
 582     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 583     pixel *dst          = (pixel *)_dst;
 584     ptrdiff_t dststride = _dststride / sizeof(pixel);
 585     int shift = denom + 14 - BIT_DEPTH;
 586 #if BIT_DEPTH < 14
 587     int offset = 1 << (shift - 1);
 588 #else
 589     int offset = 0;
 590 #endif
 591
 592     ox     = ox * (1 << (BIT_DEPTH - 8));
 593     for (y = 0; y < height; y++) {
 594         for (x = 0; x < width; x++)
 595             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
 596         src += srcstride;
 597         dst += dststride;
 598     }
 599 }
 600
 601 static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 602                                            int16_t *src2,
 603                                            int height, int denom, int wx0, int wx1,
 604                                            int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 605 {
 606     int x, y;
 607     pixel *src          = (pixel *)_src;
 608     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 609     pixel *dst          = (pixel *)_dst;
 610     ptrdiff_t dststride = _dststride / sizeof(pixel);
 611
 612     int shift = 14  + 1 - BIT_DEPTH;
 613     int log2Wd = denom + shift - 1;
 614
 615     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 616     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 617     for (y = 0; y < height; y++) {
 618         for (x = 0; x < width; x++) {
 619             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 620         }
 621         src  += srcstride;
 622         dst  += dststride;
 623         src2 += MAX_PB_SIZE;
 624     }
 625 }
 626
 627 ////////////////////////////////////////////////////////////////////////////////
 628 //
 629 ////////////////////////////////////////////////////////////////////////////////
 630 #define QPEL_FILTER(src, stride)                                               \
 631     (filter[0] * src[x - 3 * stride] +                                         \
 632      filter[1] * src[x - 2 * stride] +                                         \
 633      filter[2] * src[x -     stride] +                                         \
 634      filter[3] * src[x             ] +                                         \
 635      filter[4] * src[x +     stride] +                                         \
 636      filter[5] * src[x + 2 * stride] +                                         \
 637      filter[6] * src[x + 3 * stride] +                                         \
 638      filter[7] * src[x + 4 * stride])
 639
 640 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 641                                   uint8_t *_src, ptrdiff_t _srcstride,
 642                                   int height, intptr_t mx, intptr_t my, int width)
 643 {
 644     int x, y;
 645     pixel        *src       = (pixel*)_src;
 646     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 647     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 648     for (y = 0; y < height; y++) {
 649         for (x = 0; x < width; x++)
 650             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 651         src += srcstride;
 652         dst += MAX_PB_SIZE;
 653     }
 654 }
 655
 656 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 657                                   uint8_t *_src, ptrdiff_t _srcstride,
 658                                   int height, intptr_t mx, intptr_t my, int width)
 659 {
 660     int x, y;
 661     pixel        *src       = (pixel*)_src;
 662     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 663     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 664     for (y = 0; y < height; y++)  {
 665         for (x = 0; x < width; x++)
 666             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 667         src += srcstride;
 668         dst += MAX_PB_SIZE;
 669     }
 670 }
 671
 672 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 673                                    uint8_t *_src,
 674                                    ptrdiff_t _srcstride,
 675                                    int height, intptr_t mx,
 676                                    intptr_t my, int width)
 677 {
 678     int x, y;
 679     const int8_t *filter;
 680     pixel *src = (pixel*)_src;
 681     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 682     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 683     int16_t *tmp = tmp_array;
 684
 685     src   -= QPEL_EXTRA_BEFORE * srcstride;
 686     filter = ff_hevc_qpel_filters[mx - 1];
 687     for (y = 0; y < height + QPEL_EXTRA; y++) {
 688         for (x = 0; x < width; x++)
 689             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 690         src += srcstride;
 691         tmp += MAX_PB_SIZE;
 692     }
 693
 694     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 695     filter = ff_hevc_qpel_filters[my - 1];
 696     for (y = 0; y < height; y++) {
 697         for (x = 0; x < width; x++)
 698             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
 699         tmp += MAX_PB_SIZE;
 700         dst += MAX_PB_SIZE;
 701     }
 702 }
 703
 704 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 705                                       uint8_t *_src, ptrdiff_t _srcstride,
 706                                       int height, intptr_t mx, intptr_t my, int width)
 707 {
 708     int x, y;
 709     pixel        *src       = (pixel*)_src;
 710     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 711     pixel *dst          = (pixel *)_dst;
 712     ptrdiff_t dststride = _dststride / sizeof(pixel);
 713     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 714     int shift = 14 - BIT_DEPTH;
 715
 716 #if BIT_DEPTH < 14
 717     int offset = 1 << (shift - 1);
 718 #else
 719     int offset = 0;
 720 #endif
 721
 722     for (y = 0; y < height; y++) {
 723         for (x = 0; x < width; x++)
 724             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
 725         src += srcstride;
 726         dst += dststride;
 727     }
 728 }
 729
 730 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 731                                      int16_t *src2,
 732                                      int height, intptr_t mx, intptr_t my, int width)
 733 {
 734     int x, y;
 735     pixel        *src       = (pixel*)_src;
 736     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 737     pixel *dst          = (pixel *)_dst;
 738     ptrdiff_t dststride = _dststride / sizeof(pixel);
 739
 740     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 741
 742     int shift = 14  + 1 - BIT_DEPTH;
 743 #if BIT_DEPTH < 14
 744     int offset = 1 << (shift - 1);
 745 #else
 746     int offset = 0;
 747 #endif
 748
 749     for (y = 0; y < height; y++) {
 750         for (x = 0; x < width; x++)
 751             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 752         src  += srcstride;
 753         dst  += dststride;
 754         src2 += MAX_PB_SIZE;
 755     }
 756 }
 757
 758 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 759                                      uint8_t *_src, ptrdiff_t _srcstride,
 760                                      int height, intptr_t mx, intptr_t my, int width)
 761 {
 762     int x, y;
 763     pixel        *src       = (pixel*)_src;
 764     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 765     pixel *dst          = (pixel *)_dst;
 766     ptrdiff_t dststride = _dststride / sizeof(pixel);
 767     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 768     int shift = 14 - BIT_DEPTH;
 769
 770 #if BIT_DEPTH < 14
 771     int offset = 1 << (shift - 1);
 772 #else
 773     int offset = 0;
 774 #endif
 775
 776     for (y = 0; y < height; y++) {
 777         for (x = 0; x < width; x++)
 778             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
 779         src += srcstride;
 780         dst += dststride;
 781     }
 782 }
 783
 784
 785 static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 786                                      int16_t *src2,
 787                                      int height, intptr_t mx, intptr_t my, int width)
 788 {
 789     int x, y;
 790     pixel        *src       = (pixel*)_src;
 791     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 792     pixel *dst          = (pixel *)_dst;
 793     ptrdiff_t dststride = _dststride / sizeof(pixel);
 794
 795     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 796
 797     int shift = 14 + 1 - BIT_DEPTH;
 798 #if BIT_DEPTH < 14
 799     int offset = 1 << (shift - 1);
 800 #else
 801     int offset = 0;
 802 #endif
 803
 804     for (y = 0; y < height; y++) {
 805         for (x = 0; x < width; x++)
 806             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 807         src  += srcstride;
 808         dst  += dststride;
 809         src2 += MAX_PB_SIZE;
 810     }
 811 }
 812
 813 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 814                                        uint8_t *_src, ptrdiff_t _srcstride,
 815                                        int height, intptr_t mx, intptr_t my, int width)
 816 {
 817     int x, y;
 818     const int8_t *filter;
 819     pixel *src = (pixel*)_src;
 820     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 821     pixel *dst          = (pixel *)_dst;
 822     ptrdiff_t dststride = _dststride / sizeof(pixel);
 823     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 824     int16_t *tmp = tmp_array;
 825     int shift =  14 - BIT_DEPTH;
 826
 827 #if BIT_DEPTH < 14
 828     int offset = 1 << (shift - 1);
 829 #else
 830     int offset = 0;
 831 #endif
 832
 833     src   -= QPEL_EXTRA_BEFORE * srcstride;
 834     filter = ff_hevc_qpel_filters[mx - 1];
 835     for (y = 0; y < height + QPEL_EXTRA; y++) {
 836         for (x = 0; x < width; x++)
 837             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 838         src += srcstride;
 839         tmp += MAX_PB_SIZE;
 840     }
 841
 842     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 843     filter = ff_hevc_qpel_filters[my - 1];
 844
 845     for (y = 0; y < height; y++) {
 846         for (x = 0; x < width; x++)
 847             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
 848         tmp += MAX_PB_SIZE;
 849         dst += dststride;
 850     }
 851 }
 852
 853 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 854                                       int16_t *src2,
 855                                       int height, intptr_t mx, intptr_t my, int width)
 856 {
 857     int x, y;
 858     const int8_t *filter;
 859     pixel *src = (pixel*)_src;
 860     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 861     pixel *dst          = (pixel *)_dst;
 862     ptrdiff_t dststride = _dststride / sizeof(pixel);
 863     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 864     int16_t *tmp = tmp_array;
 865     int shift = 14 + 1 - BIT_DEPTH;
 866 #if BIT_DEPTH < 14
 867     int offset = 1 << (shift - 1);
 868 #else
 869     int offset = 0;
 870 #endif
 871
 872     src   -= QPEL_EXTRA_BEFORE * srcstride;
 873     filter = ff_hevc_qpel_filters[mx - 1];
 874     for (y = 0; y < height + QPEL_EXTRA; y++) {
 875         for (x = 0; x < width; x++)
 876             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 877         src += srcstride;
 878         tmp += MAX_PB_SIZE;
 879     }
 880
 881     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 882     filter = ff_hevc_qpel_filters[my - 1];
 883
 884     for (y = 0; y < height; y++) {
 885         for (x = 0; x < width; x++)
 886             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
 887         tmp  += MAX_PB_SIZE;
 888         dst  += dststride;
 889         src2 += MAX_PB_SIZE;
 890     }
 891 }
 892
 893 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 894                                         uint8_t *_src, ptrdiff_t _srcstride,
 895                                         int height, int denom, int wx, int ox,
 896                                         intptr_t mx, intptr_t my, int width)
 897 {
 898     int x, y;
 899     pixel        *src       = (pixel*)_src;
 900     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 901     pixel *dst          = (pixel *)_dst;
 902     ptrdiff_t dststride = _dststride / sizeof(pixel);
 903     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 904     int shift = denom + 14 - BIT_DEPTH;
 905 #if BIT_DEPTH < 14
 906     int offset = 1 << (shift - 1);
 907 #else
 908     int offset = 0;
 909 #endif
 910
 911     ox = ox * (1 << (BIT_DEPTH - 8));
 912     for (y = 0; y < height; y++) {
 913         for (x = 0; x < width; x++)
 914             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 915         src += srcstride;
 916         dst += dststride;
 917     }
 918 }
 919
 920 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 921                                        int16_t *src2,
 922                                        int height, int denom, int wx0, int wx1,
 923                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 924 {
 925     int x, y;
 926     pixel        *src       = (pixel*)_src;
 927     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 928     pixel *dst          = (pixel *)_dst;
 929     ptrdiff_t dststride = _dststride / sizeof(pixel);
 930
 931     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 932
 933     int shift = 14  + 1 - BIT_DEPTH;
 934     int log2Wd = denom + shift - 1;
 935
 936     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 937     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 938     for (y = 0; y < height; y++) {
 939         for (x = 0; x < width; x++)
 940             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 941                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 942         src  += srcstride;
 943         dst  += dststride;
 944         src2 += MAX_PB_SIZE;
 945     }
 946 }
 947
 948 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 949                                         uint8_t *_src, ptrdiff_t _srcstride,
 950                                         int height, int denom, int wx, int ox,
 951                                         intptr_t mx, intptr_t my, int width)
 952 {
 953     int x, y;
 954     pixel        *src       = (pixel*)_src;
 955     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 956     pixel *dst          = (pixel *)_dst;
 957     ptrdiff_t dststride = _dststride / sizeof(pixel);
 958     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 959     int shift = denom + 14 - BIT_DEPTH;
 960 #if BIT_DEPTH < 14
 961     int offset = 1 << (shift - 1);
 962 #else
 963     int offset = 0;
 964 #endif
 965
 966     ox = ox * (1 << (BIT_DEPTH - 8));
 967     for (y = 0; y < height; y++) {
 968         for (x = 0; x < width; x++)
 969             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 970         src += srcstride;
 971         dst += dststride;
 972     }
 973 }
 974
 975 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 976                                        int16_t *src2,
 977                                        int height, int denom, int wx0, int wx1,
 978                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 979 {
 980     int x, y;
 981     pixel        *src       = (pixel*)_src;
 982     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 983     pixel *dst          = (pixel *)_dst;
 984     ptrdiff_t dststride = _dststride / sizeof(pixel);
 985
 986     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 987
 988     int shift = 14 + 1 - BIT_DEPTH;
 989     int log2Wd = denom + shift - 1;
 990
 991     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 992     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 993     for (y = 0; y < height; y++) {
 994         for (x = 0; x < width; x++)
 995             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 996                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 997         src  += srcstride;
 998         dst  += dststride;
 999         src2 += MAX_PB_SIZE;
1000     }
1001 }
1002
1003 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
1004                                          uint8_t *_src, ptrdiff_t _srcstride,
1005                                          int height, int denom, int wx, int ox,
1006                                          intptr_t mx, intptr_t my, int width)
1007 {
1008     int x, y;
1009     const int8_t *filter;
1010     pixel *src = (pixel*)_src;
1011     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1012     pixel *dst          = (pixel *)_dst;
1013     ptrdiff_t dststride = _dststride / sizeof(pixel);
1014     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1015     int16_t *tmp = tmp_array;
1016     int shift = denom + 14 - BIT_DEPTH;
1017 #if BIT_DEPTH < 14
1018     int offset = 1 << (shift - 1);
1019 #else
1020     int offset = 0;
1021 #endif
1022
1023     src   -= QPEL_EXTRA_BEFORE * srcstride;
1024     filter = ff_hevc_qpel_filters[mx - 1];
1025     for (y = 0; y < height + QPEL_EXTRA; y++) {
1026         for (x = 0; x < width; x++)
1027             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1028         src += srcstride;
1029         tmp += MAX_PB_SIZE;
1030     }
1031
1032     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1033     filter = ff_hevc_qpel_filters[my - 1];
1034
1035     ox = ox * (1 << (BIT_DEPTH - 8));
1036     for (y = 0; y < height; y++) {
1037         for (x = 0; x < width; x++)
1038             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1039         tmp += MAX_PB_SIZE;
1040         dst += dststride;
1041     }
1042 }
1043
1044 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1045                                         int16_t *src2,
1046                                         int height, int denom, int wx0, int wx1,
1047                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1048 {
1049     int x, y;
1050     const int8_t *filter;
1051     pixel *src = (pixel*)_src;
1052     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1053     pixel *dst          = (pixel *)_dst;
1054     ptrdiff_t dststride = _dststride / sizeof(pixel);
1055     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1056     int16_t *tmp = tmp_array;
1057     int shift = 14 + 1 - BIT_DEPTH;
1058     int log2Wd = denom + shift - 1;
1059
1060     src   -= QPEL_EXTRA_BEFORE * srcstride;
1061     filter = ff_hevc_qpel_filters[mx - 1];
1062     for (y = 0; y < height + QPEL_EXTRA; y++) {
1063         for (x = 0; x < width; x++)
1064             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1065         src += srcstride;
1066         tmp += MAX_PB_SIZE;
1067     }
1068
1069     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1070     filter = ff_hevc_qpel_filters[my - 1];
1071
1072     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1073     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1074     for (y = 0; y < height; y++) {
1075         for (x = 0; x < width; x++)
1076             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1077                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1078         tmp  += MAX_PB_SIZE;
1079         dst  += dststride;
1080         src2 += MAX_PB_SIZE;
1081     }
1082 }
1083
1084 ////////////////////////////////////////////////////////////////////////////////
1085 //
1086 ////////////////////////////////////////////////////////////////////////////////
/*
 * 4-tap filter over src at positions x - stride, x, x + stride and
 * x + 2 * stride, weighted by filter[0..3].
 *
 * `x` and `filter` are taken from the scope the macro expands in. Both
 * arguments are parenthesized so that expression arguments (for example
 * a + b passed as stride) expand with the intended precedence.
 */
#define EPEL_FILTER(src, stride)                                               \
    (filter[0] * (src)[x - (stride)] +                                         \
     filter[1] * (src)[x]            +                                         \
     filter[2] * (src)[x + (stride)] +                                         \
     filter[3] * (src)[x + 2 * (stride)])
1092
1093 static void FUNC(put_hevc_epel_h)(int16_t *dst,
1094                                   uint8_t *_src, ptrdiff_t _srcstride,
1095                                   int height, intptr_t mx, intptr_t my, int width)
1096 {
1097     int x, y;
1098     pixel *src = (pixel *)_src;
1099     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1100     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1101     for (y = 0; y < height; y++) {
1102         for (x = 0; x < width; x++)
1103             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1104         src += srcstride;
1105         dst += MAX_PB_SIZE;
1106     }
1107 }
1108
1109 static void FUNC(put_hevc_epel_v)(int16_t *dst,
1110                                   uint8_t *_src, ptrdiff_t _srcstride,
1111                                   int height, intptr_t mx, intptr_t my, int width)
1112 {
1113     int x, y;
1114     pixel *src = (pixel *)_src;
1115     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1116     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1117
1118     for (y = 0; y < height; y++) {
1119         for (x = 0; x < width; x++)
1120             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1121         src += srcstride;
1122         dst += MAX_PB_SIZE;
1123     }
1124 }
1125
1126 static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1127                                    uint8_t *_src, ptrdiff_t _srcstride,
1128                                    int height, intptr_t mx, intptr_t my, int width)
1129 {
1130     int x, y;
1131     pixel *src = (pixel *)_src;
1132     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1133     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1134     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1135     int16_t *tmp = tmp_array;
1136
1137     src -= EPEL_EXTRA_BEFORE * srcstride;
1138
1139     for (y = 0; y < height + EPEL_EXTRA; y++) {
1140         for (x = 0; x < width; x++)
1141             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1142         src += srcstride;
1143         tmp += MAX_PB_SIZE;
1144     }
1145
1146     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1147     filter = ff_hevc_epel_filters[my - 1];
1148
1149     for (y = 0; y < height; y++) {
1150         for (x = 0; x < width; x++)
1151             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1152         tmp += MAX_PB_SIZE;
1153         dst += MAX_PB_SIZE;
1154     }
1155 }
1156
1157 static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1158                                       int height, intptr_t mx, intptr_t my, int width)
1159 {
1160     int x, y;
1161     pixel *src = (pixel *)_src;
1162     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1163     pixel *dst          = (pixel *)_dst;
1164     ptrdiff_t dststride = _dststride / sizeof(pixel);
1165     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1166     int shift = 14 - BIT_DEPTH;
1167 #if BIT_DEPTH < 14
1168     int offset = 1 << (shift - 1);
1169 #else
1170     int offset = 0;
1171 #endif
1172
1173     for (y = 0; y < height; y++) {
1174         for (x = 0; x < width; x++)
1175             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1176         src += srcstride;
1177         dst += dststride;
1178     }
1179 }
1180
1181 static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1182                                      int16_t *src2,
1183                                      int height, intptr_t mx, intptr_t my, int width)
1184 {
1185     int x, y;
1186     pixel *src = (pixel *)_src;
1187     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1188     pixel *dst          = (pixel *)_dst;
1189     ptrdiff_t dststride = _dststride / sizeof(pixel);
1190     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1191     int shift = 14 + 1 - BIT_DEPTH;
1192 #if BIT_DEPTH < 14
1193     int offset = 1 << (shift - 1);
1194 #else
1195     int offset = 0;
1196 #endif
1197
1198     for (y = 0; y < height; y++) {
1199         for (x = 0; x < width; x++) {
1200             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1201         }
1202         dst  += dststride;
1203         src  += srcstride;
1204         src2 += MAX_PB_SIZE;
1205     }
1206 }
1207
1208 static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1209                                       int height, intptr_t mx, intptr_t my, int width)
1210 {
1211     int x, y;
1212     pixel *src = (pixel *)_src;
1213     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1214     pixel *dst          = (pixel *)_dst;
1215     ptrdiff_t dststride = _dststride / sizeof(pixel);
1216     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1217     int shift = 14 - BIT_DEPTH;
1218 #if BIT_DEPTH < 14
1219     int offset = 1 << (shift - 1);
1220 #else
1221     int offset = 0;
1222 #endif
1223
1224     for (y = 0; y < height; y++) {
1225         for (x = 0; x < width; x++)
1226             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1227         src += srcstride;
1228         dst += dststride;
1229     }
1230 }
1231
1232 static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1233                                      int16_t *src2,
1234                                      int height, intptr_t mx, intptr_t my, int width)
1235 {
1236     int x, y;
1237     pixel *src = (pixel *)_src;
1238     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1239     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1240     pixel *dst          = (pixel *)_dst;
1241     ptrdiff_t dststride = _dststride / sizeof(pixel);
1242     int shift = 14 + 1 - BIT_DEPTH;
1243 #if BIT_DEPTH < 14
1244     int offset = 1 << (shift - 1);
1245 #else
1246     int offset = 0;
1247 #endif
1248
1249     for (y = 0; y < height; y++) {
1250         for (x = 0; x < width; x++)
1251             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1252         dst  += dststride;
1253         src  += srcstride;
1254         src2 += MAX_PB_SIZE;
1255     }
1256 }
1257
1258 static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1259                                        int height, intptr_t mx, intptr_t my, int width)
1260 {
1261     int x, y;
1262     pixel *src = (pixel *)_src;
1263     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1264     pixel *dst          = (pixel *)_dst;
1265     ptrdiff_t dststride = _dststride / sizeof(pixel);
1266     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1267     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1268     int16_t *tmp = tmp_array;
1269     int shift = 14 - BIT_DEPTH;
1270 #if BIT_DEPTH < 14
1271     int offset = 1 << (shift - 1);
1272 #else
1273     int offset = 0;
1274 #endif
1275
1276     src -= EPEL_EXTRA_BEFORE * srcstride;
1277
1278     for (y = 0; y < height + EPEL_EXTRA; y++) {
1279         for (x = 0; x < width; x++)
1280             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1281         src += srcstride;
1282         tmp += MAX_PB_SIZE;
1283     }
1284
1285     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1286     filter = ff_hevc_epel_filters[my - 1];
1287
1288     for (y = 0; y < height; y++) {
1289         for (x = 0; x < width; x++)
1290             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1291         tmp += MAX_PB_SIZE;
1292         dst += dststride;
1293     }
1294 }
1295
1296 static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1297                                       int16_t *src2,
1298                                       int height, intptr_t mx, intptr_t my, int width)
1299 {
1300     int x, y;
1301     pixel *src = (pixel *)_src;
1302     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1303     pixel *dst          = (pixel *)_dst;
1304     ptrdiff_t dststride = _dststride / sizeof(pixel);
1305     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1306     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1307     int16_t *tmp = tmp_array;
1308     int shift = 14 + 1 - BIT_DEPTH;
1309 #if BIT_DEPTH < 14
1310     int offset = 1 << (shift - 1);
1311 #else
1312     int offset = 0;
1313 #endif
1314
1315     src -= EPEL_EXTRA_BEFORE * srcstride;
1316
1317     for (y = 0; y < height + EPEL_EXTRA; y++) {
1318         for (x = 0; x < width; x++)
1319             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1320         src += srcstride;
1321         tmp += MAX_PB_SIZE;
1322     }
1323
1324     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1325     filter = ff_hevc_epel_filters[my - 1];
1326
1327     for (y = 0; y < height; y++) {
1328         for (x = 0; x < width; x++)
1329             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1330         tmp  += MAX_PB_SIZE;
1331         dst  += dststride;
1332         src2 += MAX_PB_SIZE;
1333     }
1334 }
1335
1336 static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1337                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1338 {
1339     int x, y;
1340     pixel *src = (pixel *)_src;
1341     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1342     pixel *dst          = (pixel *)_dst;
1343     ptrdiff_t dststride = _dststride / sizeof(pixel);
1344     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1345     int shift = denom + 14 - BIT_DEPTH;
1346 #if BIT_DEPTH < 14
1347     int offset = 1 << (shift - 1);
1348 #else
1349     int offset = 0;
1350 #endif
1351
1352     ox     = ox * (1 << (BIT_DEPTH - 8));
1353     for (y = 0; y < height; y++) {
1354         for (x = 0; x < width; x++) {
1355             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1356         }
1357         dst += dststride;
1358         src += srcstride;
1359     }
1360 }
1361
1362 static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1363                                        int16_t *src2,
1364                                        int height, int denom, int wx0, int wx1,
1365                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1366 {
1367     int x, y;
1368     pixel *src = (pixel *)_src;
1369     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1370     pixel *dst          = (pixel *)_dst;
1371     ptrdiff_t dststride = _dststride / sizeof(pixel);
1372     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1373     int shift = 14 + 1 - BIT_DEPTH;
1374     int log2Wd = denom + shift - 1;
1375
1376     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1377     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1378     for (y = 0; y < height; y++) {
1379         for (x = 0; x < width; x++)
1380             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1381                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1382         src  += srcstride;
1383         dst  += dststride;
1384         src2 += MAX_PB_SIZE;
1385     }
1386 }
1387
1388 static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1389                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1390 {
1391     int x, y;
1392     pixel *src = (pixel *)_src;
1393     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1394     pixel *dst          = (pixel *)_dst;
1395     ptrdiff_t dststride = _dststride / sizeof(pixel);
1396     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1397     int shift = denom + 14 - BIT_DEPTH;
1398 #if BIT_DEPTH < 14
1399     int offset = 1 << (shift - 1);
1400 #else
1401     int offset = 0;
1402 #endif
1403
1404     ox     = ox * (1 << (BIT_DEPTH - 8));
1405     for (y = 0; y < height; y++) {
1406         for (x = 0; x < width; x++) {
1407             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1408         }
1409         dst += dststride;
1410         src += srcstride;
1411     }
1412 }
1413
1414 static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1415                                        int16_t *src2,
1416                                        int height, int denom, int wx0, int wx1,
1417                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1418 {
1419     int x, y;
1420     pixel *src = (pixel *)_src;
1421     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1422     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1423     pixel *dst          = (pixel *)_dst;
1424     ptrdiff_t dststride = _dststride / sizeof(pixel);
1425     int shift = 14 + 1 - BIT_DEPTH;
1426     int log2Wd = denom + shift - 1;
1427
1428     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1429     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1430     for (y = 0; y < height; y++) {
1431         for (x = 0; x < width; x++)
1432             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1433                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1434         src  += srcstride;
1435         dst  += dststride;
1436         src2 += MAX_PB_SIZE;
1437     }
1438 }
1439
1440 static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1441                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1442 {
1443     int x, y;
1444     pixel *src = (pixel *)_src;
1445     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1446     pixel *dst          = (pixel *)_dst;
1447     ptrdiff_t dststride = _dststride / sizeof(pixel);
1448     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1449     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1450     int16_t *tmp = tmp_array;
1451     int shift = denom + 14 - BIT_DEPTH;
1452 #if BIT_DEPTH < 14
1453     int offset = 1 << (shift - 1);
1454 #else
1455     int offset = 0;
1456 #endif
1457
1458     src -= EPEL_EXTRA_BEFORE * srcstride;
1459
1460     for (y = 0; y < height + EPEL_EXTRA; y++) {
1461         for (x = 0; x < width; x++)
1462             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1463         src += srcstride;
1464         tmp += MAX_PB_SIZE;
1465     }
1466
1467     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1468     filter = ff_hevc_epel_filters[my - 1];
1469
1470     ox     = ox * (1 << (BIT_DEPTH - 8));
1471     for (y = 0; y < height; y++) {
1472         for (x = 0; x < width; x++)
1473             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1474         tmp += MAX_PB_SIZE;
1475         dst += dststride;
1476     }
1477 }
1478
1479 static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1480                                         int16_t *src2,
1481                                         int height, int denom, int wx0, int wx1,
1482                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1483 {
1484     int x, y;
1485     pixel *src = (pixel *)_src;
1486     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1487     pixel *dst          = (pixel *)_dst;
1488     ptrdiff_t dststride = _dststride / sizeof(pixel);
1489     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1490     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1491     int16_t *tmp = tmp_array;
1492     int shift = 14 + 1 - BIT_DEPTH;
1493     int log2Wd = denom + shift - 1;
1494
1495     src -= EPEL_EXTRA_BEFORE * srcstride;
1496
1497     for (y = 0; y < height + EPEL_EXTRA; y++) {
1498         for (x = 0; x < width; x++)
1499             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1500         src += srcstride;
1501         tmp += MAX_PB_SIZE;
1502     }
1503
1504     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1505     filter = ff_hevc_epel_filters[my - 1];
1506
1507     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1508     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1509     for (y = 0; y < height; y++) {
1510         for (x = 0; x < width; x++)
1511             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1512                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1513         tmp  += MAX_PB_SIZE;
1514         dst  += dststride;
1515         src2 += MAX_PB_SIZE;
1516     }
1517 }// line zero
/*
 * Sample accessors for the loop filters that follow. Each one expands to an
 * element of `pix` indexed in units of `xstride` (and `ystride`), so locals
 * with exactly those names must be in scope wherever they are used.
 * P3..P0 sit at offsets -4..-1 and Q0..Q3 at offsets 0..3.
 */
#define P3 (pix[-4 * xstride])
#define P2 (pix[-3 * xstride])
#define P1 (pix[-2 * xstride])
#define P0 (pix[-1 * xstride])
#define Q0 (pix[0 * xstride])
#define Q1 (pix[1 * xstride])
#define Q2 (pix[2 * xstride])
#define Q3 (pix[3 * xstride])

/*
 * Line three: the same positions displaced by 3 * ystride. These are used
 * only for the deblocking decision.
 */
#define TP3 (pix[-4 * xstride + 3 * ystride])
#define TP2 (pix[-3 * xstride + 3 * ystride])
#define TP1 (pix[-2 * xstride + 3 * ystride])
#define TP0 (pix[-1 * xstride + 3 * ystride])
#define TQ0 (pix[0 * xstride + 3 * ystride])
#define TQ1 (pix[1 * xstride + 3 * ystride])
#define TQ2 (pix[2 * xstride + 3 * ystride])
#define TQ3 (pix[3 * xstride + 3 * ystride])
1536
1537 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1538                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1539                                         int beta, int *_tc,
1540                                         uint8_t *_no_p, uint8_t *_no_q)
1541 {
1542     int d, j;
1543     pixel *pix        = (pixel *)_pix;
1544     ptrdiff_t xstride = _xstride / sizeof(pixel);
1545     ptrdiff_t ystride = _ystride / sizeof(pixel);
1546
1547     beta <<= BIT_DEPTH - 8;
1548
1549     for (j = 0; j < 2; j++) {
1550         const int dp0  = abs(P2  - 2 * P1  + P0);
1551         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1552         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1553         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1554         const int d0   = dp0 + dq0;
1555         const int d3   = dp3 + dq3;
1556         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1557         const int no_p = _no_p[j];
1558         const int no_q = _no_q[j];
1559
1560         if (d0 + d3 >= beta) {
1561             pix += 4 * ystride;
1562             continue;
1563         } else {
1564             const int beta_3 = beta >> 3;
1565             const int beta_2 = beta >> 2;
1566             const int tc25   = ((tc * 5 + 1) >> 1);
1567
1568             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1569                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1570                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1571                 // strong filtering
1572                 const int tc2 = tc << 1;
1573                 for (d = 0; d < 4; d++) {
1574                     const int p3 = P3;
1575                     const int p2 = P2;
1576                     const int p1 = P1;
1577                     const int p0 = P0;
1578                     const int q0 = Q0;
1579                     const int q1 = Q1;
1580                     const int q2 = Q2;
1581                     const int q3 = Q3;
1582                     if (!no_p) {
1583                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1584                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1585                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1586                     }
1587                     if (!no_q) {
1588                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1589                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1590                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1591                     }
1592                     pix += ystride;
1593                 }
1594             } else { // normal filtering
1595                 int nd_p = 1;
1596                 int nd_q = 1;
1597                 const int tc_2 = tc >> 1;
1598                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1599                     nd_p = 2;
1600                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1601                     nd_q = 2;
1602
1603                 for (d = 0; d < 4; d++) {
1604                     const int p2 = P2;
1605                     const int p1 = P1;
1606                     const int p0 = P0;
1607                     const int q0 = Q0;
1608                     const int q1 = Q1;
1609                     const int q2 = Q2;
1610                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1611                     if (abs(delta0) < 10 * tc) {
1612                         delta0 = av_clip(delta0, -tc, tc);
1613                         if (!no_p)
1614                             P0 = av_clip_pixel(p0 + delta0);
1615                         if (!no_q)
1616                             Q0 = av_clip_pixel(q0 - delta0);
1617                         if (!no_p && nd_p > 1) {
1618                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1619                             P1 = av_clip_pixel(p1 + deltap1);
1620                         }
1621                         if (!no_q && nd_q > 1) {
1622                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1623                             Q1 = av_clip_pixel(q1 + deltaq1);
1624                         }
1625                     }
1626                     pix += ystride;
1627                 }
1628             }
1629         }
1630     }
1631 }
1632
1633 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1634                                           ptrdiff_t _ystride, int *_tc,
1635                                           uint8_t *_no_p, uint8_t *_no_q)
1636 {
1637     int d, j, no_p, no_q;
1638     pixel *pix        = (pixel *)_pix;
1639     ptrdiff_t xstride = _xstride / sizeof(pixel);
1640     ptrdiff_t ystride = _ystride / sizeof(pixel);
1641
1642     for (j = 0; j < 2; j++) {
1643         const int tc = _tc[j] << (BIT_DEPTH - 8);
1644         if (tc <= 0) {
1645             pix += 4 * ystride;
1646             continue;
1647         }
1648         no_p = _no_p[j];
1649         no_q = _no_q[j];
1650
1651         for (d = 0; d < 4; d++) {
1652             int delta0;
1653             const int p1 = P1;
1654             const int p0 = P0;
1655             const int q0 = Q0;
1656             const int q1 = Q1;
1657             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1658             if (!no_p)
1659                 P0 = av_clip_pixel(p0 + delta0);
1660             if (!no_q)
1661                 Q0 = av_clip_pixel(q0 - delta0);
1662             pix += ystride;
1663         }
1664     }
1665 }
1666
1667 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1668                                             int32_t *tc, uint8_t *no_p,
1669                                             uint8_t *no_q)
1670 {
1671     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1672 }
1673
1674 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1675                                             int32_t *tc, uint8_t *no_p,
1676                                             uint8_t *no_q)
1677 {
1678     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1679 }
1680
1681 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1682                                           int beta, int32_t *tc, uint8_t *no_p,
1683                                           uint8_t *no_q)
1684 {
1685     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1686                                 beta, tc, no_p, no_q);
1687 }
1688
1689 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1690                                           int beta, int32_t *tc, uint8_t *no_p,
1691                                           uint8_t *no_q)
1692 {
1693     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1694                                 beta, tc, no_p, no_q);
1695 }
1696
/* Remove the line-zero sample accessors so that they are not visible to any
 * code after this point. */
#undef P3
#undef P2
#undef P1
#undef P0
#undef Q0
#undef Q1
#undef Q2
#undef Q3

/* Likewise for the line-three accessors. */
#undef TP3
#undef TP2
#undef TP1
#undef TP0
#undef TQ0
#undef TQ1
#undef TQ2
#undef TQ3