git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27 #include "hevcdsp.h"
  28
  29
  30 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
  31                           GetBitContext *gb, int pcm_bit_depth)
  32 {
  33     int x, y;
  34     pixel *dst = (pixel *)_dst;
  35
  36     stride /= sizeof(pixel);
  37
  38     for (y = 0; y < height; y++) {
  39         for (x = 0; x < width; x++)
  40             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  41         dst += stride;
  42     }
  43 }
  44
  45 static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
  46                                        ptrdiff_t stride)
  47 {
  48     int x, y;
  49     pixel *dst = (pixel *)_dst;
  50
  51     stride /= sizeof(pixel);
  52
  53     for (y = 0; y < 4; y++) {
  54         for (x = 0; x < 4; x++) {
  55             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  56             coeffs++;
  57         }
  58         dst += stride;
  59     }
  60 }
  61
  62 static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
  63                                        ptrdiff_t stride)
  64 {
  65     int x, y;
  66     pixel *dst = (pixel *)_dst;
  67
  68     stride /= sizeof(pixel);
  69
  70     for (y = 0; y < 8; y++) {
  71         for (x = 0; x < 8; x++) {
  72             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  73             coeffs++;
  74         }
  75         dst += stride;
  76     }
  77 }
  78
  79 static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
  80                                          ptrdiff_t stride)
  81 {
  82     int x, y;
  83     pixel *dst = (pixel *)_dst;
  84
  85     stride /= sizeof(pixel);
  86
  87     for (y = 0; y < 16; y++) {
  88         for (x = 0; x < 16; x++) {
  89             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  90             coeffs++;
  91         }
  92         dst += stride;
  93     }
  94 }
  95
  96 static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
  97                                          ptrdiff_t stride)
  98 {
  99     int x, y;
 100     pixel *dst = (pixel *)_dst;
 101
 102     stride /= sizeof(pixel);
 103
 104     for (y = 0; y < 32; y++) {
 105         for (x = 0; x < 32; x++) {
 106             dst[x] = av_clip_pixel(dst[x] + *coeffs);
 107             coeffs++;
 108         }
 109         dst += stride;
 110     }
 111 }
 112
 113
 114 static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
 115 {
 116     int16_t *coeffs = (int16_t *) _coeffs;
 117     int x, y;
 118     int size = 1 << log2_size;
 119
 120     if (mode) {
 121         coeffs += size;
 122         for (y = 0; y < size - 1; y++) {
 123             for (x = 0; x < size; x++)
 124                 coeffs[x] += coeffs[x - size];
 125             coeffs += size;
 126         }
 127     } else {
 128         for (y = 0; y < size; y++) {
 129             for (x = 1; x < size; x++)
 130                 coeffs[x] += coeffs[x - 1];
 131             coeffs += size;
 132         }
 133     }
 134 }
 135
 136 static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
 137 {
 138     int shift  = 15 - BIT_DEPTH - log2_size;
 139     int x, y;
 140     int size = 1 << log2_size;
 141     int16_t *coeffs = _coeffs;
 142
 143
 144     if (shift > 0) {
 145         int offset = 1 << (shift - 1);
 146         for (y = 0; y < size; y++) {
 147             for (x = 0; x < size; x++) {
 148                 *coeffs = (*coeffs + offset) >> shift;
 149                 coeffs++;
 150             }
 151         }
 152     } else {
 153         for (y = 0; y < size; y++) {
 154             for (x = 0; x < size; x++) {
 155                 *coeffs = *coeffs << -shift;
 156                 coeffs++;
 157             }
 158         }
 159     }
 160 }
 161
 162 #define SET(dst, x)   (dst) = (x)
 163 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 164 #define ADD_AND_SCALE(dst, x)                                           \
 165     (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 166
 167 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 168     do {                                                                \
 169         int c0 = src[0 * step] + src[2 * step];                         \
 170         int c1 = src[2 * step] + src[3 * step];                         \
 171         int c2 = src[0 * step] - src[3 * step];                         \
 172         int c3 = 74 * src[1 * step];                                    \
 173                                                                         \
 174         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 175                                     src[2 * step] +                     \
 176                                     src[3 * step]));                    \
 177         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 178         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 179         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 180     } while (0)
 181
 182 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 183 {
 184     int i;
 185     int shift    = 7;
 186     int add      = 1 << (shift - 1);
 187     int16_t *src = coeffs;
 188
 189     for (i = 0; i < 4; i++) {
 190         TR_4x4_LUMA(src, src, 4, SCALE);
 191         src++;
 192     }
 193
 194     shift = 20 - BIT_DEPTH;
 195     add   = 1 << (shift - 1);
 196     for (i = 0; i < 4; i++) {
 197         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 198         coeffs += 4;
 199     }
 200 }
 201
 202 #undef TR_4x4_LUMA
 203
 204 #define TR_4(dst, src, dstep, sstep, assign, end)                              \
 205     do {                                                                       \
 206         const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep];              \
 207         const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep];              \
 208         const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep];              \
 209         const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep];              \
 210                                                                                \
 211         assign(dst[0 * dstep], e0 + o0);                                       \
 212         assign(dst[1 * dstep], e1 + o1);                                       \
 213         assign(dst[2 * dstep], e1 - o1);                                       \
 214         assign(dst[3 * dstep], e0 - o0);                                       \
 215     } while (0)
 216
 217 #define TR_8(dst, src, dstep, sstep, assign, end)                              \
 218     do {                                                                       \
 219         int i, j;                                                              \
 220         int e_8[4];                                                            \
 221         int o_8[4] = { 0 };                                                    \
 222         for (i = 0; i < 4; i++)                                                \
 223             for (j = 1; j < end; j += 2)                                       \
 224                 o_8[i] += transform[4 * j][i] * src[j * sstep];                \
 225         TR_4(e_8, src, 1, 2 * sstep, SET, 4);                                  \
 226                                                                                \
 227         for (i = 0; i < 4; i++) {                                              \
 228             assign(dst[i * dstep], e_8[i] + o_8[i]);                           \
 229             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);                     \
 230         }                                                                      \
 231     } while (0)
 232
 233 #define TR_16(dst, src, dstep, sstep, assign, end)                             \
 234     do {                                                                       \
 235         int i, j;                                                              \
 236         int e_16[8];                                                           \
 237         int o_16[8] = { 0 };                                                   \
 238         for (i = 0; i < 8; i++)                                                \
 239             for (j = 1; j < end; j += 2)                                       \
 240                 o_16[i] += transform[2 * j][i] * src[j * sstep];               \
 241         TR_8(e_16, src, 1, 2 * sstep, SET, 8);                                 \
 242                                                                                \
 243         for (i = 0; i < 8; i++) {                                              \
 244             assign(dst[i * dstep], e_16[i] + o_16[i]);                         \
 245             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);                  \
 246         }                                                                      \
 247     } while (0)
 248
 249 #define TR_32(dst, src, dstep, sstep, assign, end)                             \
 250     do {                                                                       \
 251         int i, j;                                                              \
 252         int e_32[16];                                                          \
 253         int o_32[16] = { 0 };                                                  \
 254         for (i = 0; i < 16; i++)                                               \
 255             for (j = 1; j < end; j += 2)                                       \
 256                 o_32[i] += transform[j][i] * src[j * sstep];                   \
 257         TR_16(e_32, src, 1, 2 * sstep, SET, end/2);                            \
 258                                                                                \
 259         for (i = 0; i < 16; i++) {                                             \
 260             assign(dst[i * dstep], e_32[i] + o_32[i]);                         \
 261             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);                  \
 262         }                                                                      \
 263     } while (0)
 264
 265 #define IDCT_VAR4(H)                                                          \
 266     int      limit2   = FFMIN(col_limit + 4, H)
 267 #define IDCT_VAR8(H)                                                          \
 268         int      limit   = FFMIN(col_limit, H);                               \
 269         int      limit2   = FFMIN(col_limit + 4, H)
 270 #define IDCT_VAR16(H)   IDCT_VAR8(H)
 271 #define IDCT_VAR32(H)   IDCT_VAR8(H)
 272
 273 #define IDCT(H)                                                              \
 274 static void FUNC(idct_##H ##x ##H )(                                         \
 275                    int16_t *coeffs, int col_limit) {                         \
 276     int i;                                                                   \
 277     int      shift   = 7;                                                    \
 278     int      add     = 1 << (shift - 1);                                     \
 279     int16_t *src     = coeffs;                                               \
 280     IDCT_VAR ##H(H);                                                         \
 281                                                                              \
 282     for (i = 0; i < H; i++) {                                                \
 283         TR_ ## H(src, src, H, H, SCALE, limit2);                             \
 284         if (limit2 < H && i%4 == 0 && !!i)                                   \
 285             limit2 -= 4;                                                     \
 286         src++;                                                               \
 287     }                                                                        \
 288                                                                              \
 289     shift   = 20 - BIT_DEPTH;                                                \
 290     add     = 1 << (shift - 1);                                              \
 291     for (i = 0; i < H; i++) {                                                \
 292         TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);                        \
 293         coeffs += H;                                                         \
 294     }                                                                        \
 295 }
 296
 297 #define IDCT_DC(H)                                                           \
 298 static void FUNC(idct_##H ##x ##H ##_dc)(                                    \
 299                    int16_t *coeffs) {                                        \
 300     int i, j;                                                                \
 301     int      shift   = 14 - BIT_DEPTH;                                       \
 302     int      add     = 1 << (shift - 1);                                     \
 303     int      coeff   = (((coeffs[0] + 1) >> 1) + add) >> shift;              \
 304                                                                              \
 305     for (j = 0; j < H; j++) {                                                \
 306         for (i = 0; i < H; i++) {                                            \
 307             coeffs[i+j*H] = coeff;                                           \
 308         }                                                                    \
 309     }                                                                        \
 310 }
 311
 312 IDCT( 4)
 313 IDCT( 8)
 314 IDCT(16)
 315 IDCT(32)
 316
 317 IDCT_DC( 4)
 318 IDCT_DC( 8)
 319 IDCT_DC(16)
 320 IDCT_DC(32)
 321
 322 #undef TR_4
 323 #undef TR_8
 324 #undef TR_16
 325 #undef TR_32
 326
 327 #undef SET
 328 #undef SCALE
 329 #undef ADD_AND_SCALE
 330
 331 static void FUNC(sao_band_filter_0)(uint8_t *_dst, uint8_t *_src,
 332                                   ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 333                                   int *borders, int width, int height,
 334                                   int c_idx)
 335 {
 336     pixel *dst = (pixel *)_dst;
 337     pixel *src = (pixel *)_src;
 338     int offset_table[32] = { 0 };
 339     int k, y, x;
 340     int shift  = BIT_DEPTH - 5;
 341     int16_t *sao_offset_val = sao->offset_val[c_idx];
 342     int sao_left_class  = sao->band_position[c_idx];
 343
 344     stride_dst /= sizeof(pixel);
 345     stride_src /= sizeof(pixel);
 346
 347     for (k = 0; k < 4; k++)
 348         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 349     for (y = 0; y < height; y++) {
 350         for (x = 0; x < width; x++)
 351             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 352         dst += stride_dst;
 353         src += stride_src;
 354     }
 355 }
 356
 357 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 358
 359 static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src,
 360                                   ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 361                                   int width, int height,
 362                                   int c_idx, int init_x, int init_y) {
 363
 364     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 365     static const int8_t pos[4][2][2] = {
 366         { { -1,  0 }, {  1, 0 } }, // horizontal
 367         { {  0, -1 }, {  0, 1 } }, // vertical
 368         { { -1, -1 }, {  1, 1 } }, // 45 degree
 369         { {  1, -1 }, { -1, 1 } }, // 135 degree
 370     };
 371     int16_t *sao_offset_val = sao->offset_val[c_idx];
 372     int sao_eo_class    = sao->eo_class[c_idx];
 373     pixel *dst = (pixel *)_dst;
 374     pixel *src = (pixel *)_src;
 375
 376     int y_stride_src = init_y * stride_src;
 377     int y_stride_dst = init_y * stride_dst;
 378     int pos_0_0  = pos[sao_eo_class][0][0];
 379     int pos_0_1  = pos[sao_eo_class][0][1];
 380     int pos_1_0  = pos[sao_eo_class][1][0];
 381     int pos_1_1  = pos[sao_eo_class][1][1];
 382     int x, y;
 383
 384     int y_stride_0_1 = (init_y + pos_0_1) * stride_src;
 385     int y_stride_1_1 = (init_y + pos_1_1) * stride_src;
 386     for (y = init_y; y < height; y++) {
 387         for (x = init_x; x < width; x++) {
 388             int diff0             = CMP(src[x + y_stride_src], src[x + pos_0_0 + y_stride_0_1]);
 389             int diff1             = CMP(src[x + y_stride_src], src[x + pos_1_0 + y_stride_1_1]);
 390             int offset_val        = edge_idx[2 + diff0 + diff1];
 391             dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + sao_offset_val[offset_val]);
 392         }
 393         y_stride_src += stride_src;
 394         y_stride_dst += stride_dst;
 395         y_stride_0_1 += stride_src;
 396         y_stride_1_1 += stride_src;
 397     }
 398 }
 399
 400 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
 401                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 402                                     int *borders, int _width, int _height,
 403                                     int c_idx, uint8_t *vert_edge,
 404                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 405 {
 406     int x, y;
 407     pixel *dst = (pixel *)_dst;
 408     pixel *src = (pixel *)_src;
 409     int16_t *sao_offset_val = sao->offset_val[c_idx];
 410     int sao_eo_class    = sao->eo_class[c_idx];
 411     int init_x = 0, init_y = 0, width = _width, height = _height;
 412
 413     stride_dst /= sizeof(pixel);
 414     stride_src /= sizeof(pixel);
 415
 416     if (sao_eo_class != SAO_EO_VERT) {
 417         if (borders[0]) {
 418             int offset_val = sao_offset_val[0];
 419             for (y = 0; y < height; y++) {
 420                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 421             }
 422             init_x = 1;
 423         }
 424         if (borders[2]) {
 425             int offset_val = sao_offset_val[0];
 426             int offset     = width - 1;
 427             for (x = 0; x < height; x++) {
 428                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 429             }
 430             width--;
 431         }
 432     }
 433     if (sao_eo_class != SAO_EO_HORIZ) {
 434         if (borders[1]) {
 435             int offset_val = sao_offset_val[0];
 436             for (x = init_x; x < width; x++)
 437                 dst[x] = av_clip_pixel(src[x] + offset_val);
 438             init_y = 1;
 439         }
 440         if (borders[3]) {
 441             int offset_val   = sao_offset_val[0];
 442             int y_stride_dst = stride_dst * (height - 1);
 443             int y_stride_src = stride_src * (height - 1);
 444             for (x = init_x; x < width; x++)
 445                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 446             height--;
 447         }
 448     }
 449
 450     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 451 }
 452
 453 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
 454                                     ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
 455                                     int *borders, int _width, int _height,
 456                                     int c_idx, uint8_t *vert_edge,
 457                                     uint8_t *horiz_edge, uint8_t *diag_edge)
 458 {
 459     int x, y;
 460     pixel *dst = (pixel *)_dst;
 461     pixel *src = (pixel *)_src;
 462     int16_t *sao_offset_val = sao->offset_val[c_idx];
 463     int sao_eo_class    = sao->eo_class[c_idx];
 464     int init_x = 0, init_y = 0, width = _width, height = _height;
 465
 466     stride_dst /= sizeof(pixel);
 467     stride_src /= sizeof(pixel);
 468
 469     if (sao_eo_class != SAO_EO_VERT) {
 470         if (borders[0]) {
 471             int offset_val = sao_offset_val[0];
 472             for (y = 0; y < height; y++) {
 473                 dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
 474             }
 475             init_x = 1;
 476         }
 477         if (borders[2]) {
 478             int offset_val = sao_offset_val[0];
 479             int offset     = width - 1;
 480             for (x = 0; x < height; x++) {
 481                 dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
 482             }
 483             width--;
 484         }
 485     }
 486     if (sao_eo_class != SAO_EO_HORIZ) {
 487         if (borders[1]) {
 488             int offset_val = sao_offset_val[0];
 489             for (x = init_x; x < width; x++)
 490                 dst[x] = av_clip_pixel(src[x] + offset_val);
 491             init_y = 1;
 492         }
 493         if (borders[3]) {
 494             int offset_val   = sao_offset_val[0];
 495             int y_stride_dst = stride_dst * (height - 1);
 496             int y_stride_src = stride_src * (height - 1);
 497             for (x = init_x; x < width; x++)
 498                 dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
 499             height--;
 500         }
 501     }
 502
 503     FUNC(sao_edge_filter)((uint8_t *)dst, (uint8_t *)src, stride_dst, stride_src, sao, width, height, c_idx, init_x, init_y);
 504
 505     {
 506         int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 507         int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
 508         int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
 509         int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
 510
 511         // Restore pixels that can't be modified
 512         if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
 513             for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
 514                 dst[y*stride_dst] = src[y*stride_src];
 515         }
 516         if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
 517             for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
 518                 dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
 519         }
 520
 521         if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
 522             for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
 523                 dst[x] = src[x];
 524         }
 525         if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
 526             for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
 527                 dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
 528         }
 529         if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
 530             dst[0] = src[0];
 531         if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
 532             dst[width-1] = src[width-1];
 533         if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
 534             dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
 535         if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
 536             dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
 537
 538     }
 539 }
 540
 541 #undef CMP
 542
 543 ////////////////////////////////////////////////////////////////////////////////
 544 //
 545 ////////////////////////////////////////////////////////////////////////////////
 546 static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
 547                                       uint8_t *_src, ptrdiff_t _srcstride,
 548                                       int height, intptr_t mx, intptr_t my, int width)
 549 {
 550     int x, y;
 551     pixel *src          = (pixel *)_src;
 552     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 553
 554     for (y = 0; y < height; y++) {
 555         for (x = 0; x < width; x++)
 556             dst[x] = src[x] << (14 - BIT_DEPTH);
 557         src += srcstride;
 558         dst += MAX_PB_SIZE;
 559     }
 560 }
 561
 562 static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 563                                           int height, intptr_t mx, intptr_t my, int width)
 564 {
 565     int y;
 566     pixel *src          = (pixel *)_src;
 567     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 568     pixel *dst          = (pixel *)_dst;
 569     ptrdiff_t dststride = _dststride / sizeof(pixel);
 570
 571     for (y = 0; y < height; y++) {
 572         memcpy(dst, src, width * sizeof(pixel));
 573         src += srcstride;
 574         dst += dststride;
 575     }
 576 }
 577
 578 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 579                                          int16_t *src2,
 580                                          int height, intptr_t mx, intptr_t my, int width)
 581 {
 582     int x, y;
 583     pixel *src          = (pixel *)_src;
 584     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 585     pixel *dst          = (pixel *)_dst;
 586     ptrdiff_t dststride = _dststride / sizeof(pixel);
 587
 588     int shift = 14  + 1 - BIT_DEPTH;
 589 #if BIT_DEPTH < 14
 590     int offset = 1 << (shift - 1);
 591 #else
 592     int offset = 0;
 593 #endif
 594
 595     for (y = 0; y < height; y++) {
 596         for (x = 0; x < width; x++)
 597             dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
 598         src  += srcstride;
 599         dst  += dststride;
 600         src2 += MAX_PB_SIZE;
 601     }
 602 }
 603
 604 static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 605                                             int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
 606 {
 607     int x, y;
 608     pixel *src          = (pixel *)_src;
 609     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 610     pixel *dst          = (pixel *)_dst;
 611     ptrdiff_t dststride = _dststride / sizeof(pixel);
 612     int shift = denom + 14 - BIT_DEPTH;
 613 #if BIT_DEPTH < 14
 614     int offset = 1 << (shift - 1);
 615 #else
 616     int offset = 0;
 617 #endif
 618
 619     ox     = ox * (1 << (BIT_DEPTH - 8));
 620     for (y = 0; y < height; y++) {
 621         for (x = 0; x < width; x++)
 622             dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
 623         src += srcstride;
 624         dst += dststride;
 625     }
 626 }
 627
 628 static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 629                                            int16_t *src2,
 630                                            int height, int denom, int wx0, int wx1,
 631                                            int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 632 {
 633     int x, y;
 634     pixel *src          = (pixel *)_src;
 635     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 636     pixel *dst          = (pixel *)_dst;
 637     ptrdiff_t dststride = _dststride / sizeof(pixel);
 638
 639     int shift = 14  + 1 - BIT_DEPTH;
 640     int log2Wd = denom + shift - 1;
 641
 642     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 643     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 644     for (y = 0; y < height; y++) {
 645         for (x = 0; x < width; x++) {
 646             dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 647         }
 648         src  += srcstride;
 649         dst  += dststride;
 650         src2 += MAX_PB_SIZE;
 651     }
 652 }
 653
 654 ////////////////////////////////////////////////////////////////////////////////
 655 //
 656 ////////////////////////////////////////////////////////////////////////////////
 657 #define QPEL_FILTER(src, stride)                                               \
 658     (filter[0] * src[x - 3 * stride] +                                         \
 659      filter[1] * src[x - 2 * stride] +                                         \
 660      filter[2] * src[x -     stride] +                                         \
 661      filter[3] * src[x             ] +                                         \
 662      filter[4] * src[x +     stride] +                                         \
 663      filter[5] * src[x + 2 * stride] +                                         \
 664      filter[6] * src[x + 3 * stride] +                                         \
 665      filter[7] * src[x + 4 * stride])
 666
 667 static void FUNC(put_hevc_qpel_h)(int16_t *dst,
 668                                   uint8_t *_src, ptrdiff_t _srcstride,
 669                                   int height, intptr_t mx, intptr_t my, int width)
 670 {
 671     int x, y;
 672     pixel        *src       = (pixel*)_src;
 673     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 674     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 675     for (y = 0; y < height; y++) {
 676         for (x = 0; x < width; x++)
 677             dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 678         src += srcstride;
 679         dst += MAX_PB_SIZE;
 680     }
 681 }
 682
 683 static void FUNC(put_hevc_qpel_v)(int16_t *dst,
 684                                   uint8_t *_src, ptrdiff_t _srcstride,
 685                                   int height, intptr_t mx, intptr_t my, int width)
 686 {
 687     int x, y;
 688     pixel        *src       = (pixel*)_src;
 689     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 690     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 691     for (y = 0; y < height; y++)  {
 692         for (x = 0; x < width; x++)
 693             dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
 694         src += srcstride;
 695         dst += MAX_PB_SIZE;
 696     }
 697 }
 698
 699 static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
 700                                    uint8_t *_src,
 701                                    ptrdiff_t _srcstride,
 702                                    int height, intptr_t mx,
 703                                    intptr_t my, int width)
 704 {
 705     int x, y;
 706     const int8_t *filter;
 707     pixel *src = (pixel*)_src;
 708     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 709     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 710     int16_t *tmp = tmp_array;
 711
 712     src   -= QPEL_EXTRA_BEFORE * srcstride;
 713     filter = ff_hevc_qpel_filters[mx - 1];
 714     for (y = 0; y < height + QPEL_EXTRA; y++) {
 715         for (x = 0; x < width; x++)
 716             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 717         src += srcstride;
 718         tmp += MAX_PB_SIZE;
 719     }
 720
 721     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 722     filter = ff_hevc_qpel_filters[my - 1];
 723     for (y = 0; y < height; y++) {
 724         for (x = 0; x < width; x++)
 725             dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
 726         tmp += MAX_PB_SIZE;
 727         dst += MAX_PB_SIZE;
 728     }
 729 }
 730
 731 static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 732                                       uint8_t *_src, ptrdiff_t _srcstride,
 733                                       int height, intptr_t mx, intptr_t my, int width)
 734 {
 735     int x, y;
 736     pixel        *src       = (pixel*)_src;
 737     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 738     pixel *dst          = (pixel *)_dst;
 739     ptrdiff_t dststride = _dststride / sizeof(pixel);
 740     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 741     int shift = 14 - BIT_DEPTH;
 742
 743 #if BIT_DEPTH < 14
 744     int offset = 1 << (shift - 1);
 745 #else
 746     int offset = 0;
 747 #endif
 748
 749     for (y = 0; y < height; y++) {
 750         for (x = 0; x < width; x++)
 751             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
 752         src += srcstride;
 753         dst += dststride;
 754     }
 755 }
 756
 757 static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 758                                      int16_t *src2,
 759                                      int height, intptr_t mx, intptr_t my, int width)
 760 {
 761     int x, y;
 762     pixel        *src       = (pixel*)_src;
 763     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 764     pixel *dst          = (pixel *)_dst;
 765     ptrdiff_t dststride = _dststride / sizeof(pixel);
 766
 767     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 768
 769     int shift = 14  + 1 - BIT_DEPTH;
 770 #if BIT_DEPTH < 14
 771     int offset = 1 << (shift - 1);
 772 #else
 773     int offset = 0;
 774 #endif
 775
 776     for (y = 0; y < height; y++) {
 777         for (x = 0; x < width; x++)
 778             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 779         src  += srcstride;
 780         dst  += dststride;
 781         src2 += MAX_PB_SIZE;
 782     }
 783 }
 784
 785 static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 786                                      uint8_t *_src, ptrdiff_t _srcstride,
 787                                      int height, intptr_t mx, intptr_t my, int width)
 788 {
 789     int x, y;
 790     pixel        *src       = (pixel*)_src;
 791     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 792     pixel *dst          = (pixel *)_dst;
 793     ptrdiff_t dststride = _dststride / sizeof(pixel);
 794     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 795     int shift = 14 - BIT_DEPTH;
 796
 797 #if BIT_DEPTH < 14
 798     int offset = 1 << (shift - 1);
 799 #else
 800     int offset = 0;
 801 #endif
 802
 803     for (y = 0; y < height; y++) {
 804         for (x = 0; x < width; x++)
 805             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
 806         src += srcstride;
 807         dst += dststride;
 808     }
 809 }
 810
 811
 812 static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 813                                      int16_t *src2,
 814                                      int height, intptr_t mx, intptr_t my, int width)
 815 {
 816     int x, y;
 817     pixel        *src       = (pixel*)_src;
 818     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 819     pixel *dst          = (pixel *)_dst;
 820     ptrdiff_t dststride = _dststride / sizeof(pixel);
 821
 822     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 823
 824     int shift = 14 + 1 - BIT_DEPTH;
 825 #if BIT_DEPTH < 14
 826     int offset = 1 << (shift - 1);
 827 #else
 828     int offset = 0;
 829 #endif
 830
 831     for (y = 0; y < height; y++) {
 832         for (x = 0; x < width; x++)
 833             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
 834         src  += srcstride;
 835         dst  += dststride;
 836         src2 += MAX_PB_SIZE;
 837     }
 838 }
 839
 840 static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
 841                                        uint8_t *_src, ptrdiff_t _srcstride,
 842                                        int height, intptr_t mx, intptr_t my, int width)
 843 {
 844     int x, y;
 845     const int8_t *filter;
 846     pixel *src = (pixel*)_src;
 847     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 848     pixel *dst          = (pixel *)_dst;
 849     ptrdiff_t dststride = _dststride / sizeof(pixel);
 850     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 851     int16_t *tmp = tmp_array;
 852     int shift =  14 - BIT_DEPTH;
 853
 854 #if BIT_DEPTH < 14
 855     int offset = 1 << (shift - 1);
 856 #else
 857     int offset = 0;
 858 #endif
 859
 860     src   -= QPEL_EXTRA_BEFORE * srcstride;
 861     filter = ff_hevc_qpel_filters[mx - 1];
 862     for (y = 0; y < height + QPEL_EXTRA; y++) {
 863         for (x = 0; x < width; x++)
 864             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 865         src += srcstride;
 866         tmp += MAX_PB_SIZE;
 867     }
 868
 869     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 870     filter = ff_hevc_qpel_filters[my - 1];
 871
 872     for (y = 0; y < height; y++) {
 873         for (x = 0; x < width; x++)
 874             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
 875         tmp += MAX_PB_SIZE;
 876         dst += dststride;
 877     }
 878 }
 879
 880 static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 881                                       int16_t *src2,
 882                                       int height, intptr_t mx, intptr_t my, int width)
 883 {
 884     int x, y;
 885     const int8_t *filter;
 886     pixel *src = (pixel*)_src;
 887     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 888     pixel *dst          = (pixel *)_dst;
 889     ptrdiff_t dststride = _dststride / sizeof(pixel);
 890     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
 891     int16_t *tmp = tmp_array;
 892     int shift = 14 + 1 - BIT_DEPTH;
 893 #if BIT_DEPTH < 14
 894     int offset = 1 << (shift - 1);
 895 #else
 896     int offset = 0;
 897 #endif
 898
 899     src   -= QPEL_EXTRA_BEFORE * srcstride;
 900     filter = ff_hevc_qpel_filters[mx - 1];
 901     for (y = 0; y < height + QPEL_EXTRA; y++) {
 902         for (x = 0; x < width; x++)
 903             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 904         src += srcstride;
 905         tmp += MAX_PB_SIZE;
 906     }
 907
 908     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
 909     filter = ff_hevc_qpel_filters[my - 1];
 910
 911     for (y = 0; y < height; y++) {
 912         for (x = 0; x < width; x++)
 913             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
 914         tmp  += MAX_PB_SIZE;
 915         dst  += dststride;
 916         src2 += MAX_PB_SIZE;
 917     }
 918 }
 919
 920 static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
 921                                         uint8_t *_src, ptrdiff_t _srcstride,
 922                                         int height, int denom, int wx, int ox,
 923                                         intptr_t mx, intptr_t my, int width)
 924 {
 925     int x, y;
 926     pixel        *src       = (pixel*)_src;
 927     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 928     pixel *dst          = (pixel *)_dst;
 929     ptrdiff_t dststride = _dststride / sizeof(pixel);
 930     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 931     int shift = denom + 14 - BIT_DEPTH;
 932 #if BIT_DEPTH < 14
 933     int offset = 1 << (shift - 1);
 934 #else
 935     int offset = 0;
 936 #endif
 937
 938     ox = ox * (1 << (BIT_DEPTH - 8));
 939     for (y = 0; y < height; y++) {
 940         for (x = 0; x < width; x++)
 941             dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 942         src += srcstride;
 943         dst += dststride;
 944     }
 945 }
 946
 947 static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
 948                                        int16_t *src2,
 949                                        int height, int denom, int wx0, int wx1,
 950                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
 951 {
 952     int x, y;
 953     pixel        *src       = (pixel*)_src;
 954     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 955     pixel *dst          = (pixel *)_dst;
 956     ptrdiff_t dststride = _dststride / sizeof(pixel);
 957
 958     const int8_t *filter    = ff_hevc_qpel_filters[mx - 1];
 959
 960     int shift = 14  + 1 - BIT_DEPTH;
 961     int log2Wd = denom + shift - 1;
 962
 963     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
 964     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
 965     for (y = 0; y < height; y++) {
 966         for (x = 0; x < width; x++)
 967             dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
 968                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
 969         src  += srcstride;
 970         dst  += dststride;
 971         src2 += MAX_PB_SIZE;
 972     }
 973 }
 974
 975 static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
 976                                         uint8_t *_src, ptrdiff_t _srcstride,
 977                                         int height, int denom, int wx, int ox,
 978                                         intptr_t mx, intptr_t my, int width)
 979 {
 980     int x, y;
 981     pixel        *src       = (pixel*)_src;
 982     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
 983     pixel *dst          = (pixel *)_dst;
 984     ptrdiff_t dststride = _dststride / sizeof(pixel);
 985     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
 986     int shift = denom + 14 - BIT_DEPTH;
 987 #if BIT_DEPTH < 14
 988     int offset = 1 << (shift - 1);
 989 #else
 990     int offset = 0;
 991 #endif
 992
 993     ox = ox * (1 << (BIT_DEPTH - 8));
 994     for (y = 0; y < height; y++) {
 995         for (x = 0; x < width; x++)
 996             dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
 997         src += srcstride;
 998         dst += dststride;
 999     }
1000 }
1001
1002 static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1003                                        int16_t *src2,
1004                                        int height, int denom, int wx0, int wx1,
1005                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1006 {
1007     int x, y;
1008     pixel        *src       = (pixel*)_src;
1009     ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
1010     pixel *dst          = (pixel *)_dst;
1011     ptrdiff_t dststride = _dststride / sizeof(pixel);
1012
1013     const int8_t *filter    = ff_hevc_qpel_filters[my - 1];
1014
1015     int shift = 14 + 1 - BIT_DEPTH;
1016     int log2Wd = denom + shift - 1;
1017
1018     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1019     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1020     for (y = 0; y < height; y++) {
1021         for (x = 0; x < width; x++)
1022             dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1023                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1024         src  += srcstride;
1025         dst  += dststride;
1026         src2 += MAX_PB_SIZE;
1027     }
1028 }
1029
1030 static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
1031                                          uint8_t *_src, ptrdiff_t _srcstride,
1032                                          int height, int denom, int wx, int ox,
1033                                          intptr_t mx, intptr_t my, int width)
1034 {
1035     int x, y;
1036     const int8_t *filter;
1037     pixel *src = (pixel*)_src;
1038     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1039     pixel *dst          = (pixel *)_dst;
1040     ptrdiff_t dststride = _dststride / sizeof(pixel);
1041     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1042     int16_t *tmp = tmp_array;
1043     int shift = denom + 14 - BIT_DEPTH;
1044 #if BIT_DEPTH < 14
1045     int offset = 1 << (shift - 1);
1046 #else
1047     int offset = 0;
1048 #endif
1049
1050     src   -= QPEL_EXTRA_BEFORE * srcstride;
1051     filter = ff_hevc_qpel_filters[mx - 1];
1052     for (y = 0; y < height + QPEL_EXTRA; y++) {
1053         for (x = 0; x < width; x++)
1054             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1055         src += srcstride;
1056         tmp += MAX_PB_SIZE;
1057     }
1058
1059     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1060     filter = ff_hevc_qpel_filters[my - 1];
1061
1062     ox = ox * (1 << (BIT_DEPTH - 8));
1063     for (y = 0; y < height; y++) {
1064         for (x = 0; x < width; x++)
1065             dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1066         tmp += MAX_PB_SIZE;
1067         dst += dststride;
1068     }
1069 }
1070
1071 static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1072                                         int16_t *src2,
1073                                         int height, int denom, int wx0, int wx1,
1074                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1075 {
1076     int x, y;
1077     const int8_t *filter;
1078     pixel *src = (pixel*)_src;
1079     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1080     pixel *dst          = (pixel *)_dst;
1081     ptrdiff_t dststride = _dststride / sizeof(pixel);
1082     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
1083     int16_t *tmp = tmp_array;
1084     int shift = 14 + 1 - BIT_DEPTH;
1085     int log2Wd = denom + shift - 1;
1086
1087     src   -= QPEL_EXTRA_BEFORE * srcstride;
1088     filter = ff_hevc_qpel_filters[mx - 1];
1089     for (y = 0; y < height + QPEL_EXTRA; y++) {
1090         for (x = 0; x < width; x++)
1091             tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1092         src += srcstride;
1093         tmp += MAX_PB_SIZE;
1094     }
1095
1096     tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1097     filter = ff_hevc_qpel_filters[my - 1];
1098
1099     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1100     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1101     for (y = 0; y < height; y++) {
1102         for (x = 0; x < width; x++)
1103             dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1104                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1105         tmp  += MAX_PB_SIZE;
1106         dst  += dststride;
1107         src2 += MAX_PB_SIZE;
1108     }
1109 }
1110
1111 ////////////////////////////////////////////////////////////////////////////////
1112 //
1113 ////////////////////////////////////////////////////////////////////////////////
/*
 * 4-tap filter sum around sample x of src, with taps `stride` elements
 * apart: positions x - stride, x, x + stride and x + 2 * stride.
 * Relies on the variables `filter` (coefficient array) and `x` (current
 * index) being in scope at the expansion site.
 * Both arguments are parenthesized so that expression arguments such as
 * `buf + 1` or `a + b` expand as intended.
 */
#define EPEL_FILTER(src, stride)                                               \
    (filter[0] * (src)[x - (stride)] +                                         \
     filter[1] * (src)[x]            +                                         \
     filter[2] * (src)[x + (stride)] +                                         \
     filter[3] * (src)[x + 2 * (stride)])
1119
1120 static void FUNC(put_hevc_epel_h)(int16_t *dst,
1121                                   uint8_t *_src, ptrdiff_t _srcstride,
1122                                   int height, intptr_t mx, intptr_t my, int width)
1123 {
1124     int x, y;
1125     pixel *src = (pixel *)_src;
1126     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1127     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1128     for (y = 0; y < height; y++) {
1129         for (x = 0; x < width; x++)
1130             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1131         src += srcstride;
1132         dst += MAX_PB_SIZE;
1133     }
1134 }
1135
1136 static void FUNC(put_hevc_epel_v)(int16_t *dst,
1137                                   uint8_t *_src, ptrdiff_t _srcstride,
1138                                   int height, intptr_t mx, intptr_t my, int width)
1139 {
1140     int x, y;
1141     pixel *src = (pixel *)_src;
1142     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1143     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1144
1145     for (y = 0; y < height; y++) {
1146         for (x = 0; x < width; x++)
1147             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1148         src += srcstride;
1149         dst += MAX_PB_SIZE;
1150     }
1151 }
1152
1153 static void FUNC(put_hevc_epel_hv)(int16_t *dst,
1154                                    uint8_t *_src, ptrdiff_t _srcstride,
1155                                    int height, intptr_t mx, intptr_t my, int width)
1156 {
1157     int x, y;
1158     pixel *src = (pixel *)_src;
1159     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1160     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1161     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1162     int16_t *tmp = tmp_array;
1163
1164     src -= EPEL_EXTRA_BEFORE * srcstride;
1165
1166     for (y = 0; y < height + EPEL_EXTRA; y++) {
1167         for (x = 0; x < width; x++)
1168             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1169         src += srcstride;
1170         tmp += MAX_PB_SIZE;
1171     }
1172
1173     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1174     filter = ff_hevc_epel_filters[my - 1];
1175
1176     for (y = 0; y < height; y++) {
1177         for (x = 0; x < width; x++)
1178             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1179         tmp += MAX_PB_SIZE;
1180         dst += MAX_PB_SIZE;
1181     }
1182 }
1183
1184 static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1185                                       int height, intptr_t mx, intptr_t my, int width)
1186 {
1187     int x, y;
1188     pixel *src = (pixel *)_src;
1189     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1190     pixel *dst          = (pixel *)_dst;
1191     ptrdiff_t dststride = _dststride / sizeof(pixel);
1192     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1193     int shift = 14 - BIT_DEPTH;
1194 #if BIT_DEPTH < 14
1195     int offset = 1 << (shift - 1);
1196 #else
1197     int offset = 0;
1198 #endif
1199
1200     for (y = 0; y < height; y++) {
1201         for (x = 0; x < width; x++)
1202             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
1203         src += srcstride;
1204         dst += dststride;
1205     }
1206 }
1207
1208 static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1209                                      int16_t *src2,
1210                                      int height, intptr_t mx, intptr_t my, int width)
1211 {
1212     int x, y;
1213     pixel *src = (pixel *)_src;
1214     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1215     pixel *dst          = (pixel *)_dst;
1216     ptrdiff_t dststride = _dststride / sizeof(pixel);
1217     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1218     int shift = 14 + 1 - BIT_DEPTH;
1219 #if BIT_DEPTH < 14
1220     int offset = 1 << (shift - 1);
1221 #else
1222     int offset = 0;
1223 #endif
1224
1225     for (y = 0; y < height; y++) {
1226         for (x = 0; x < width; x++) {
1227             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1228         }
1229         dst  += dststride;
1230         src  += srcstride;
1231         src2 += MAX_PB_SIZE;
1232     }
1233 }
1234
1235 static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1236                                       int height, intptr_t mx, intptr_t my, int width)
1237 {
1238     int x, y;
1239     pixel *src = (pixel *)_src;
1240     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1241     pixel *dst          = (pixel *)_dst;
1242     ptrdiff_t dststride = _dststride / sizeof(pixel);
1243     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1244     int shift = 14 - BIT_DEPTH;
1245 #if BIT_DEPTH < 14
1246     int offset = 1 << (shift - 1);
1247 #else
1248     int offset = 0;
1249 #endif
1250
1251     for (y = 0; y < height; y++) {
1252         for (x = 0; x < width; x++)
1253             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
1254         src += srcstride;
1255         dst += dststride;
1256     }
1257 }
1258
1259 static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1260                                      int16_t *src2,
1261                                      int height, intptr_t mx, intptr_t my, int width)
1262 {
1263     int x, y;
1264     pixel *src = (pixel *)_src;
1265     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1266     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1267     pixel *dst          = (pixel *)_dst;
1268     ptrdiff_t dststride = _dststride / sizeof(pixel);
1269     int shift = 14 + 1 - BIT_DEPTH;
1270 #if BIT_DEPTH < 14
1271     int offset = 1 << (shift - 1);
1272 #else
1273     int offset = 0;
1274 #endif
1275
1276     for (y = 0; y < height; y++) {
1277         for (x = 0; x < width; x++)
1278             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
1279         dst  += dststride;
1280         src  += srcstride;
1281         src2 += MAX_PB_SIZE;
1282     }
1283 }
1284
1285 static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1286                                        int height, intptr_t mx, intptr_t my, int width)
1287 {
1288     int x, y;
1289     pixel *src = (pixel *)_src;
1290     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1291     pixel *dst          = (pixel *)_dst;
1292     ptrdiff_t dststride = _dststride / sizeof(pixel);
1293     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1294     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1295     int16_t *tmp = tmp_array;
1296     int shift = 14 - BIT_DEPTH;
1297 #if BIT_DEPTH < 14
1298     int offset = 1 << (shift - 1);
1299 #else
1300     int offset = 0;
1301 #endif
1302
1303     src -= EPEL_EXTRA_BEFORE * srcstride;
1304
1305     for (y = 0; y < height + EPEL_EXTRA; y++) {
1306         for (x = 0; x < width; x++)
1307             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1308         src += srcstride;
1309         tmp += MAX_PB_SIZE;
1310     }
1311
1312     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1313     filter = ff_hevc_epel_filters[my - 1];
1314
1315     for (y = 0; y < height; y++) {
1316         for (x = 0; x < width; x++)
1317             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
1318         tmp += MAX_PB_SIZE;
1319         dst += dststride;
1320     }
1321 }
1322
1323 static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1324                                       int16_t *src2,
1325                                       int height, intptr_t mx, intptr_t my, int width)
1326 {
1327     int x, y;
1328     pixel *src = (pixel *)_src;
1329     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1330     pixel *dst          = (pixel *)_dst;
1331     ptrdiff_t dststride = _dststride / sizeof(pixel);
1332     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1333     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1334     int16_t *tmp = tmp_array;
1335     int shift = 14 + 1 - BIT_DEPTH;
1336 #if BIT_DEPTH < 14
1337     int offset = 1 << (shift - 1);
1338 #else
1339     int offset = 0;
1340 #endif
1341
1342     src -= EPEL_EXTRA_BEFORE * srcstride;
1343
1344     for (y = 0; y < height + EPEL_EXTRA; y++) {
1345         for (x = 0; x < width; x++)
1346             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1347         src += srcstride;
1348         tmp += MAX_PB_SIZE;
1349     }
1350
1351     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1352     filter = ff_hevc_epel_filters[my - 1];
1353
1354     for (y = 0; y < height; y++) {
1355         for (x = 0; x < width; x++)
1356             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
1357         tmp  += MAX_PB_SIZE;
1358         dst  += dststride;
1359         src2 += MAX_PB_SIZE;
1360     }
1361 }
1362
1363 static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1364                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1365 {
1366     int x, y;
1367     pixel *src = (pixel *)_src;
1368     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1369     pixel *dst          = (pixel *)_dst;
1370     ptrdiff_t dststride = _dststride / sizeof(pixel);
1371     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1372     int shift = denom + 14 - BIT_DEPTH;
1373 #if BIT_DEPTH < 14
1374     int offset = 1 << (shift - 1);
1375 #else
1376     int offset = 0;
1377 #endif
1378
1379     ox     = ox * (1 << (BIT_DEPTH - 8));
1380     for (y = 0; y < height; y++) {
1381         for (x = 0; x < width; x++) {
1382             dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1383         }
1384         dst += dststride;
1385         src += srcstride;
1386     }
1387 }
1388
1389 static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1390                                        int16_t *src2,
1391                                        int height, int denom, int wx0, int wx1,
1392                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1393 {
1394     int x, y;
1395     pixel *src = (pixel *)_src;
1396     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1397     pixel *dst          = (pixel *)_dst;
1398     ptrdiff_t dststride = _dststride / sizeof(pixel);
1399     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1400     int shift = 14 + 1 - BIT_DEPTH;
1401     int log2Wd = denom + shift - 1;
1402
1403     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1404     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1405     for (y = 0; y < height; y++) {
1406         for (x = 0; x < width; x++)
1407             dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1408                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1409         src  += srcstride;
1410         dst  += dststride;
1411         src2 += MAX_PB_SIZE;
1412     }
1413 }
1414
1415 static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1416                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1417 {
1418     int x, y;
1419     pixel *src = (pixel *)_src;
1420     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1421     pixel *dst          = (pixel *)_dst;
1422     ptrdiff_t dststride = _dststride / sizeof(pixel);
1423     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1424     int shift = denom + 14 - BIT_DEPTH;
1425 #if BIT_DEPTH < 14
1426     int offset = 1 << (shift - 1);
1427 #else
1428     int offset = 0;
1429 #endif
1430
1431     ox     = ox * (1 << (BIT_DEPTH - 8));
1432     for (y = 0; y < height; y++) {
1433         for (x = 0; x < width; x++) {
1434             dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
1435         }
1436         dst += dststride;
1437         src += srcstride;
1438     }
1439 }
1440
1441 static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1442                                        int16_t *src2,
1443                                        int height, int denom, int wx0, int wx1,
1444                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1445 {
1446     int x, y;
1447     pixel *src = (pixel *)_src;
1448     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1449     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1450     pixel *dst          = (pixel *)_dst;
1451     ptrdiff_t dststride = _dststride / sizeof(pixel);
1452     int shift = 14 + 1 - BIT_DEPTH;
1453     int log2Wd = denom + shift - 1;
1454
1455     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1456     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1457     for (y = 0; y < height; y++) {
1458         for (x = 0; x < width; x++)
1459             dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
1460                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1461         src  += srcstride;
1462         dst  += dststride;
1463         src2 += MAX_PB_SIZE;
1464     }
1465 }
1466
1467 static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1468                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
1469 {
1470     int x, y;
1471     pixel *src = (pixel *)_src;
1472     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1473     pixel *dst          = (pixel *)_dst;
1474     ptrdiff_t dststride = _dststride / sizeof(pixel);
1475     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1476     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1477     int16_t *tmp = tmp_array;
1478     int shift = denom + 14 - BIT_DEPTH;
1479 #if BIT_DEPTH < 14
1480     int offset = 1 << (shift - 1);
1481 #else
1482     int offset = 0;
1483 #endif
1484
1485     src -= EPEL_EXTRA_BEFORE * srcstride;
1486
1487     for (y = 0; y < height + EPEL_EXTRA; y++) {
1488         for (x = 0; x < width; x++)
1489             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1490         src += srcstride;
1491         tmp += MAX_PB_SIZE;
1492     }
1493
1494     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1495     filter = ff_hevc_epel_filters[my - 1];
1496
1497     ox     = ox * (1 << (BIT_DEPTH - 8));
1498     for (y = 0; y < height; y++) {
1499         for (x = 0; x < width; x++)
1500             dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
1501         tmp += MAX_PB_SIZE;
1502         dst += dststride;
1503     }
1504 }
1505
1506 static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
1507                                         int16_t *src2,
1508                                         int height, int denom, int wx0, int wx1,
1509                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width)
1510 {
1511     int x, y;
1512     pixel *src = (pixel *)_src;
1513     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1514     pixel *dst          = (pixel *)_dst;
1515     ptrdiff_t dststride = _dststride / sizeof(pixel);
1516     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1517     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
1518     int16_t *tmp = tmp_array;
1519     int shift = 14 + 1 - BIT_DEPTH;
1520     int log2Wd = denom + shift - 1;
1521
1522     src -= EPEL_EXTRA_BEFORE * srcstride;
1523
1524     for (y = 0; y < height + EPEL_EXTRA; y++) {
1525         for (x = 0; x < width; x++)
1526             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1527         src += srcstride;
1528         tmp += MAX_PB_SIZE;
1529     }
1530
1531     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1532     filter = ff_hevc_epel_filters[my - 1];
1533
1534     ox0     = ox0 * (1 << (BIT_DEPTH - 8));
1535     ox1     = ox1 * (1 << (BIT_DEPTH - 8));
1536     for (y = 0; y < height; y++) {
1537         for (x = 0; x < width; x++)
1538             dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
1539                                     ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
1540         tmp  += MAX_PB_SIZE;
1541         dst  += dststride;
1542         src2 += MAX_PB_SIZE;
1543     }
1544 }// line zero
/*
 * Sample accessors for the loop filters. They expand in terms of the locals
 * pix, xstride and ystride of the function that uses them: P0..P3 are the
 * samples at increasing distance before pix along xstride, Q0..Q3 the samples
 * from pix onwards.
 */
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
#define P1 pix[-2 * xstride]
#define P0 pix[-xstride]
#define Q0 pix[0]
#define Q1 pix[xstride]
#define Q2 pix[2 * xstride]
#define Q3 pix[3 * xstride]

/*
 * The same eight positions three steps further along ystride ("line three").
 * Used only for the deblocking decision.
 */
#define TP3 pix[-4 * xstride + 3 * ystride]
#define TP2 pix[-3 * xstride + 3 * ystride]
#define TP1 pix[-2 * xstride + 3 * ystride]
#define TP0 pix[-xstride + 3 * ystride]
#define TQ0 pix[3 * ystride]
#define TQ1 pix[xstride + 3 * ystride]
#define TQ2 pix[2 * xstride + 3 * ystride]
#define TQ3 pix[3 * xstride + 3 * ystride]
1563
1564 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1565                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1566                                         int beta, int *_tc,
1567                                         uint8_t *_no_p, uint8_t *_no_q)
1568 {
1569     int d, j;
1570     pixel *pix        = (pixel *)_pix;
1571     ptrdiff_t xstride = _xstride / sizeof(pixel);
1572     ptrdiff_t ystride = _ystride / sizeof(pixel);
1573
1574     beta <<= BIT_DEPTH - 8;
1575
1576     for (j = 0; j < 2; j++) {
1577         const int dp0  = abs(P2  - 2 * P1  + P0);
1578         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1579         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1580         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1581         const int d0   = dp0 + dq0;
1582         const int d3   = dp3 + dq3;
1583         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1584         const int no_p = _no_p[j];
1585         const int no_q = _no_q[j];
1586
1587         if (d0 + d3 >= beta) {
1588             pix += 4 * ystride;
1589             continue;
1590         } else {
1591             const int beta_3 = beta >> 3;
1592             const int beta_2 = beta >> 2;
1593             const int tc25   = ((tc * 5 + 1) >> 1);
1594
1595             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1596                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1597                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1598                 // strong filtering
1599                 const int tc2 = tc << 1;
1600                 for (d = 0; d < 4; d++) {
1601                     const int p3 = P3;
1602                     const int p2 = P2;
1603                     const int p1 = P1;
1604                     const int p0 = P0;
1605                     const int q0 = Q0;
1606                     const int q1 = Q1;
1607                     const int q2 = Q2;
1608                     const int q3 = Q3;
1609                     if (!no_p) {
1610                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1611                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1612                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1613                     }
1614                     if (!no_q) {
1615                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1616                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1617                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1618                     }
1619                     pix += ystride;
1620                 }
1621             } else { // normal filtering
1622                 int nd_p = 1;
1623                 int nd_q = 1;
1624                 const int tc_2 = tc >> 1;
1625                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1626                     nd_p = 2;
1627                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1628                     nd_q = 2;
1629
1630                 for (d = 0; d < 4; d++) {
1631                     const int p2 = P2;
1632                     const int p1 = P1;
1633                     const int p0 = P0;
1634                     const int q0 = Q0;
1635                     const int q1 = Q1;
1636                     const int q2 = Q2;
1637                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1638                     if (abs(delta0) < 10 * tc) {
1639                         delta0 = av_clip(delta0, -tc, tc);
1640                         if (!no_p)
1641                             P0 = av_clip_pixel(p0 + delta0);
1642                         if (!no_q)
1643                             Q0 = av_clip_pixel(q0 - delta0);
1644                         if (!no_p && nd_p > 1) {
1645                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1646                             P1 = av_clip_pixel(p1 + deltap1);
1647                         }
1648                         if (!no_q && nd_q > 1) {
1649                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1650                             Q1 = av_clip_pixel(q1 + deltaq1);
1651                         }
1652                     }
1653                     pix += ystride;
1654                 }
1655             }
1656         }
1657     }
1658 }
1659
1660 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1661                                           ptrdiff_t _ystride, int *_tc,
1662                                           uint8_t *_no_p, uint8_t *_no_q)
1663 {
1664     int d, j, no_p, no_q;
1665     pixel *pix        = (pixel *)_pix;
1666     ptrdiff_t xstride = _xstride / sizeof(pixel);
1667     ptrdiff_t ystride = _ystride / sizeof(pixel);
1668
1669     for (j = 0; j < 2; j++) {
1670         const int tc = _tc[j] << (BIT_DEPTH - 8);
1671         if (tc <= 0) {
1672             pix += 4 * ystride;
1673             continue;
1674         }
1675         no_p = _no_p[j];
1676         no_q = _no_q[j];
1677
1678         for (d = 0; d < 4; d++) {
1679             int delta0;
1680             const int p1 = P1;
1681             const int p0 = P0;
1682             const int q0 = Q0;
1683             const int q1 = Q1;
1684             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1685             if (!no_p)
1686                 P0 = av_clip_pixel(p0 + delta0);
1687             if (!no_q)
1688                 Q0 = av_clip_pixel(q0 - delta0);
1689             pix += ystride;
1690         }
1691     }
1692 }
1693
1694 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1695                                             int32_t *tc, uint8_t *no_p,
1696                                             uint8_t *no_q)
1697 {
1698     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1699 }
1700
1701 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1702                                             int32_t *tc, uint8_t *no_p,
1703                                             uint8_t *no_q)
1704 {
1705     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1706 }
1707
1708 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1709                                           int beta, int32_t *tc, uint8_t *no_p,
1710                                           uint8_t *no_q)
1711 {
1712     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1713                                 beta, tc, no_p, no_q);
1714 }
1715
1716 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1717                                           int beta, int32_t *tc, uint8_t *no_p,
1718                                           uint8_t *no_q)
1719 {
1720     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1721                                 beta, tc, no_p, no_q);
1722 }
1723
/* The short sample-accessor names are only meant for the loop filters above;
 * remove them so they do not leak past this point. */

/* decision-line accessors */
#undef TQ3
#undef TQ2
#undef TQ1
#undef TQ0
#undef TP0
#undef TP1
#undef TP2
#undef TP3

/* line-zero accessors */
#undef Q3
#undef Q2
#undef Q1
#undef Q0
#undef P0
#undef P1
#undef P2
#undef P3