git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of Libav.
   7  *
   8  * Libav is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * Libav is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with Libav; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27
  28 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
  29                           GetBitContext *gb, int pcm_bit_depth)
  30 {
  31     int x, y;
  32     pixel *dst = (pixel *)_dst;
  33
  34     stride /= sizeof(pixel);
  35
  36     for (y = 0; y < size; y++) {
  37         for (x = 0; x < size; x++)
  38             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  39         dst += stride;
  40     }
  41 }
  42
  43 static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
  44                                                 ptrdiff_t stride, int size)
  45 {
  46     int x, y;
  47     pixel *dst = (pixel *)_dst;
  48
  49     stride /= sizeof(pixel);
  50
  51     for (y = 0; y < size; y++) {
  52         for (x = 0; x < size; x++) {
  53             dst[x] = av_clip_pixel(dst[x] + *res);
  54             res++;
  55         }
  56         dst += stride;
  57     }
  58 }
  59
  60 static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
  61                                   ptrdiff_t stride)
  62 {
  63     FUNC(add_residual)(_dst, res, stride, 4);
  64 }
  65
  66 static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
  67                                   ptrdiff_t stride)
  68 {
  69     FUNC(add_residual)(_dst, res, stride, 8);
  70 }
  71
  72 static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
  73                                     ptrdiff_t stride)
  74 {
  75     FUNC(add_residual)(_dst, res, stride, 16);
  76 }
  77
  78 static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
  79                                     ptrdiff_t stride)
  80 {
  81     FUNC(add_residual)(_dst, res, stride, 32);
  82 }
  83
  84 static void FUNC(dequant)(int16_t *coeffs)
  85 {
  86     int shift  = 13 - BIT_DEPTH;
  87 #if BIT_DEPTH <= 13
  88     int offset = 1 << (shift - 1);
  89 #else
  90     int offset = 0;
  91 #endif
  92     int x, y;
  93
  94     for (y = 0; y < 4 * 4; y += 4) {
  95         for (x = 0; x < 4; x++)
  96             coeffs[y + x] = (coeffs[y + x] + offset) >> shift;
  97     }
  98 }
  99
 100 #define SET(dst, x)   (dst) = (x)
 101 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 102
 103 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 104     do {                                                                \
 105         int c0 = src[0 * step] + src[2 * step];                         \
 106         int c1 = src[2 * step] + src[3 * step];                         \
 107         int c2 = src[0 * step] - src[3 * step];                         \
 108         int c3 = 74 * src[1 * step];                                    \
 109                                                                         \
 110         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 111                                     src[2 * step] +                     \
 112                                     src[3 * step]));                    \
 113         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 114         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 115         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 116     } while (0)
 117
 118 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
 119 {
 120     int i;
 121     int shift    = 7;
 122     int add      = 1 << (shift - 1);
 123     int16_t *src = coeffs;
 124
 125     for (i = 0; i < 4; i++) {
 126         TR_4x4_LUMA(src, src, 4, SCALE);
 127         src++;
 128     }
 129
 130     shift = 20 - BIT_DEPTH;
 131     add   = 1 << (shift - 1);
 132     for (i = 0; i < 4; i++) {
 133         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
 134         coeffs += 4;
 135     }
 136 }
 137
 138 #undef TR_4x4_LUMA
 139
 140 #define TR_4(dst, src, dstep, sstep, assign)                            \
 141     do {                                                                \
 142         const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
 143                        transform[8 * 2][0] * src[2 * sstep];            \
 144         const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
 145                        transform[8 * 2][1] * src[2 * sstep];            \
 146         const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
 147                        transform[8 * 3][0] * src[3 * sstep];            \
 148         const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
 149                        transform[8 * 3][1] * src[3 * sstep];            \
 150                                                                         \
 151         assign(dst[0 * dstep], e0 + o0);                                \
 152         assign(dst[1 * dstep], e1 + o1);                                \
 153         assign(dst[2 * dstep], e1 - o1);                                \
 154         assign(dst[3 * dstep], e0 - o0);                                \
 155     } while (0)
 156
 157 static void FUNC(idct_4x4)(int16_t *coeffs)
 158 {
 159     int i;
 160     int shift    = 7;
 161     int add      = 1 << (shift - 1);
 162     int16_t *src = coeffs;
 163
 164     for (i = 0; i < 4; i++) {
 165         TR_4(src, src, 4, 4, SCALE);
 166         src++;
 167     }
 168
 169     shift = 20 - BIT_DEPTH;
 170     add   = 1 << (shift - 1);
 171     for (i = 0; i < 4; i++) {
 172         TR_4(coeffs, coeffs, 1, 1, SCALE);
 173         coeffs += 4;
 174     }
 175 }
 176
 177 #define TR_8(dst, src, dstep, sstep, assign)                      \
 178     do {                                                          \
 179         int i, j;                                                 \
 180         int e_8[4];                                               \
 181         int o_8[4] = { 0 };                                       \
 182         for (i = 0; i < 4; i++)                                   \
 183             for (j = 1; j < 8; j += 2)                            \
 184                 o_8[i] += transform[4 * j][i] * src[j * sstep];   \
 185         TR_4(e_8, src, 1, 2 * sstep, SET);                        \
 186                                                                   \
 187         for (i = 0; i < 4; i++) {                                 \
 188             assign(dst[i * dstep], e_8[i] + o_8[i]);              \
 189             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
 190         }                                                         \
 191     } while (0)
 192
 193 #define TR_16(dst, src, dstep, sstep, assign)                     \
 194     do {                                                          \
 195         int i, j;                                                 \
 196         int e_16[8];                                              \
 197         int o_16[8] = { 0 };                                      \
 198         for (i = 0; i < 8; i++)                                   \
 199             for (j = 1; j < 16; j += 2)                           \
 200                 o_16[i] += transform[2 * j][i] * src[j * sstep];  \
 201         TR_8(e_16, src, 1, 2 * sstep, SET);                       \
 202                                                                   \
 203         for (i = 0; i < 8; i++) {                                 \
 204             assign(dst[i * dstep], e_16[i] + o_16[i]);            \
 205             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
 206         }                                                         \
 207     } while (0)
 208
 209 #define TR_32(dst, src, dstep, sstep, assign)                     \
 210     do {                                                          \
 211         int i, j;                                                 \
 212         int e_32[16];                                             \
 213         int o_32[16] = { 0 };                                     \
 214         for (i = 0; i < 16; i++)                                  \
 215             for (j = 1; j < 32; j += 2)                           \
 216                 o_32[i] += transform[j][i] * src[j * sstep];      \
 217         TR_16(e_32, src, 1, 2 * sstep, SET);                      \
 218                                                                   \
 219         for (i = 0; i < 16; i++) {                                \
 220             assign(dst[i * dstep], e_32[i] + o_32[i]);            \
 221             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
 222         }                                                         \
 223     } while (0)
 224
 225
 226
 227 static void FUNC(idct_8x8)(int16_t *coeffs)
 228 {
 229     int i;
 230     int shift    = 7;
 231     int add      = 1 << (shift - 1);
 232     int16_t *src = coeffs;
 233
 234     for (i = 0; i < 8; i++) {
 235         TR_8(src, src, 8, 8, SCALE);
 236         src++;
 237     }
 238
 239     shift = 20 - BIT_DEPTH;
 240     add   = 1 << (shift - 1);
 241     for (i = 0; i < 8; i++) {
 242         TR_8(coeffs, coeffs, 1, 1, SCALE);
 243         coeffs += 8;
 244     }
 245 }
 246
 247 static void FUNC(idct_16x16)(int16_t *coeffs)
 248 {
 249     int i;
 250     int shift    = 7;
 251     int add      = 1 << (shift - 1);
 252     int16_t *src = coeffs;
 253
 254     for (i = 0; i < 16; i++) {
 255         TR_16(src, src, 16, 16, SCALE);
 256         src++;
 257     }
 258
 259     shift = 20 - BIT_DEPTH;
 260     add   = 1 << (shift - 1);
 261     for (i = 0; i < 16; i++) {
 262         TR_16(coeffs, coeffs, 1, 1, SCALE);
 263         coeffs += 16;
 264     }
 265 }
 266
 267 static void FUNC(idct_32x32)(int16_t *coeffs)
 268 {
 269     int i;
 270     int shift    = 7;
 271     int add      = 1 << (shift - 1);
 272     int16_t *src = coeffs;
 273
 274     for (i = 0; i < 32; i++) {
 275         TR_32(src, src, 32, 32, SCALE);
 276         src++;
 277     }
 278     src   = coeffs;
 279     shift = 20 - BIT_DEPTH;
 280     add   = 1 << (shift - 1);
 281     for (i = 0; i < 32; i++) {
 282         TR_32(coeffs, coeffs, 1, 1, SCALE);
 283         coeffs += 32;
 284     }
 285 }
 286
 287 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
 288                                   ptrdiff_t stride, SAOParams *sao,
 289                                   int *borders, int width, int height,
 290                                   int c_idx, int class)
 291 {
 292     pixel *dst = (pixel *)_dst;
 293     pixel *src = (pixel *)_src;
 294     int offset_table[32] = { 0 };
 295     int k, y, x;
 296     int chroma = !!c_idx;
 297     int shift  = BIT_DEPTH - 5;
 298     int *sao_offset_val = sao->offset_val[c_idx];
 299     int sao_left_class  = sao->band_position[c_idx];
 300     int init_y = 0, init_x = 0;
 301
 302     stride /= sizeof(pixel);
 303
 304     switch (class) {
 305     case 0:
 306         if (!borders[2])
 307             width -= (8 >> chroma) + 2;
 308         if (!borders[3])
 309             height -= (4 >> chroma) + 2;
 310         break;
 311     case 1:
 312         init_y = -(4 >> chroma) - 2;
 313         if (!borders[2])
 314             width -= (8 >> chroma) + 2;
 315         height = (4 >> chroma) + 2;
 316         break;
 317     case 2:
 318         init_x = -(8 >> chroma) - 2;
 319         width  =  (8 >> chroma) + 2;
 320         if (!borders[3])
 321             height -= (4 >> chroma) + 2;
 322         break;
 323     case 3:
 324         init_y = -(4 >> chroma) - 2;
 325         init_x = -(8 >> chroma) - 2;
 326         width  =  (8 >> chroma) + 2;
 327         height =  (4 >> chroma) + 2;
 328         break;
 329     }
 330
 331     dst = dst + (init_y * stride + init_x);
 332     src = src + (init_y * stride + init_x);
 333     for (k = 0; k < 4; k++)
 334         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 335     for (y = 0; y < height; y++) {
 336         for (x = 0; x < width; x++)
 337             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 338         dst += stride;
 339         src += stride;
 340     }
 341 }
 342
 343 static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
 344                                     ptrdiff_t stride, SAOParams *sao,
 345                                     int *borders, int width, int height,
 346                                     int c_idx)
 347 {
 348     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 349                           width, height, c_idx, 0);
 350 }
 351
 352 static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
 353                                     ptrdiff_t stride, SAOParams *sao,
 354                                     int *borders, int width, int height,
 355                                     int c_idx)
 356 {
 357     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 358                           width, height, c_idx, 1);
 359 }
 360
 361 static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
 362                                     ptrdiff_t stride, SAOParams *sao,
 363                                     int *borders, int width, int height,
 364                                     int c_idx)
 365 {
 366     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 367                           width, height, c_idx, 2);
 368 }
 369
 370 static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
 371                                     ptrdiff_t stride, SAOParams *sao,
 372                                     int *borders, int width, int height,
 373                                     int c_idx)
 374 {
 375     FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
 376                           width, height, c_idx, 3);
 377 }
 378
 379 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
 380                                     ptrdiff_t stride, SAOParams *sao,
 381                                     int *borders, int _width, int _height,
 382                                     int c_idx, uint8_t vert_edge,
 383                                     uint8_t horiz_edge, uint8_t diag_edge)
 384 {
 385     int x, y;
 386     pixel *dst = (pixel *)_dst;
 387     pixel *src = (pixel *)_src;
 388     int chroma = !!c_idx;
 389     int *sao_offset_val = sao->offset_val[c_idx];
 390     int sao_eo_class    = sao->eo_class[c_idx];
 391     int init_x = 0, init_y = 0, width = _width, height = _height;
 392
 393     static const int8_t pos[4][2][2] = {
 394         { { -1,  0 }, {  1, 0 } }, // horizontal
 395         { {  0, -1 }, {  0, 1 } }, // vertical
 396         { { -1, -1 }, {  1, 1 } }, // 45 degree
 397         { {  1, -1 }, { -1, 1 } }, // 135 degree
 398     };
 399     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 400
 401 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 402
 403     stride /= sizeof(pixel);
 404
 405     if (!borders[2])
 406         width -= (8 >> chroma) + 2;
 407     if (!borders[3])
 408         height -= (4 >> chroma) + 2;
 409
 410     dst = dst + (init_y * stride + init_x);
 411     src = src + (init_y * stride + init_x);
 412     init_y = init_x = 0;
 413     if (sao_eo_class != SAO_EO_VERT) {
 414         if (borders[0]) {
 415             int offset_val = sao_offset_val[0];
 416             int y_stride   = 0;
 417             for (y = 0; y < height; y++) {
 418                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
 419                 y_stride     += stride;
 420             }
 421             init_x = 1;
 422         }
 423         if (borders[2]) {
 424             int offset_val = sao_offset_val[0];
 425             int x_stride   = width - 1;
 426             for (x = 0; x < height; x++) {
 427                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
 428                 x_stride     += stride;
 429             }
 430             width--;
 431         }
 432     }
 433     if (sao_eo_class != SAO_EO_HORIZ) {
 434         if (borders[1]) {
 435             int offset_val = sao_offset_val[0];
 436             for (x = init_x; x < width; x++)
 437                 dst[x] = av_clip_pixel(src[x] + offset_val);
 438             init_y = 1;
 439         }
 440         if (borders[3]) {
 441             int offset_val = sao_offset_val[0];
 442             int y_stride   = stride * (height - 1);
 443             for (x = init_x; x < width; x++)
 444                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
 445             height--;
 446         }
 447     }
 448     {
 449         int y_stride = init_y * stride;
 450         int pos_0_0  = pos[sao_eo_class][0][0];
 451         int pos_0_1  = pos[sao_eo_class][0][1];
 452         int pos_1_0  = pos[sao_eo_class][1][0];
 453         int pos_1_1  = pos[sao_eo_class][1][1];
 454
 455         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 456         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 457         for (y = init_y; y < height; y++) {
 458             for (x = init_x; x < width; x++) {
 459                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 460                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 461                 int offset_val    = edge_idx[2 + diff0 + diff1];
 462                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 463             }
 464             y_stride     += stride;
 465             y_stride_0_1 += stride;
 466             y_stride_1_1 += stride;
 467         }
 468     }
 469
 470     {
 471         // Restore pixels that can't be modified
 472         int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 473         if (vert_edge && sao_eo_class != SAO_EO_VERT)
 474             for (y = init_y+save_upper_left; y< height; y++)
 475                 dst[y*stride] = src[y*stride];
 476         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 477             for(x = init_x+save_upper_left; x<width; x++)
 478                 dst[x] = src[x];
 479         if(diag_edge && sao_eo_class == SAO_EO_135D)
 480             dst[0] = src[0];
 481     }
 482
 483 #undef CMP
 484 }
 485
 486 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
 487                                     ptrdiff_t stride, SAOParams *sao,
 488                                     int *borders, int _width, int _height,
 489                                     int c_idx, uint8_t vert_edge,
 490                                     uint8_t horiz_edge, uint8_t diag_edge)
 491 {
 492     int x, y;
 493     pixel *dst = (pixel *)_dst;
 494     pixel *src = (pixel *)_src;
 495     int chroma = !!c_idx;
 496     int *sao_offset_val = sao->offset_val[c_idx];
 497     int sao_eo_class    = sao->eo_class[c_idx];
 498     int init_x = 0, init_y = 0, width = _width, height = _height;
 499
 500     static const int8_t pos[4][2][2] = {
 501         { { -1, 0  }, { 1,  0 } }, // horizontal
 502         { { 0,  -1 }, { 0,  1 } }, // vertical
 503         { { -1, -1 }, { 1,  1 } }, // 45 degree
 504         { { 1,  -1 }, { -1, 1 } }, // 135 degree
 505     };
 506     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 507
 508 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 509
 510     stride /= sizeof(pixel);
 511
 512     init_y = -(4 >> chroma) - 2;
 513     if (!borders[2])
 514         width -= (8 >> chroma) + 2;
 515     height = (4 >> chroma) + 2;
 516
 517     dst = dst + (init_y * stride + init_x);
 518     src = src + (init_y * stride + init_x);
 519     init_y = init_x = 0;
 520     if (sao_eo_class != SAO_EO_VERT) {
 521         if (borders[0]) {
 522             int offset_val = sao_offset_val[0];
 523             int y_stride   = 0;
 524             for (y = 0; y < height; y++) {
 525                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
 526                 y_stride     += stride;
 527             }
 528             init_x = 1;
 529         }
 530         if (borders[2]) {
 531             int offset_val = sao_offset_val[0];
 532             int x_stride   = width - 1;
 533             for (x = 0; x < height; x++) {
 534                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
 535                 x_stride     += stride;
 536             }
 537             width--;
 538         }
 539     }
 540     {
 541         int y_stride = init_y * stride;
 542         int pos_0_0  = pos[sao_eo_class][0][0];
 543         int pos_0_1  = pos[sao_eo_class][0][1];
 544         int pos_1_0  = pos[sao_eo_class][1][0];
 545         int pos_1_1  = pos[sao_eo_class][1][1];
 546
 547         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 548         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 549         for (y = init_y; y < height; y++) {
 550             for (x = init_x; x < width; x++) {
 551                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 552                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 553                 int offset_val    = edge_idx[2 + diff0 + diff1];
 554                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 555             }
 556             y_stride     += stride;
 557             y_stride_0_1 += stride;
 558             y_stride_1_1 += stride;
 559         }
 560     }
 561
 562     {
 563         // Restore pixels that can't be modified
 564         int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
 565         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 566             for(y = init_y; y< height-save_lower_left; y++)
 567                 dst[y*stride] = src[y*stride];
 568         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 569             for(x = init_x+save_lower_left; x<width; x++)
 570                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
 571         if(diag_edge && sao_eo_class == SAO_EO_45D)
 572             dst[stride*(height-1)] = src[stride*(height-1)];
 573     }
 574
 575 #undef CMP
 576 }
 577
 578 static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
 579                                     ptrdiff_t stride, SAOParams *sao,
 580                                     int *borders, int _width, int _height,
 581                                     int c_idx, uint8_t vert_edge,
 582                                     uint8_t horiz_edge, uint8_t diag_edge)
 583 {
 584     int x, y;
 585     pixel *dst = (pixel *)_dst;
 586     pixel *src = (pixel *)_src;
 587     int chroma = !!c_idx;
 588     int *sao_offset_val = sao->offset_val[c_idx];
 589     int sao_eo_class    = sao->eo_class[c_idx];
 590     int init_x = 0, init_y = 0, width = _width, height = _height;
 591
 592     static const int8_t pos[4][2][2] = {
 593         { { -1,  0 }, {  1, 0 } }, // horizontal
 594         { {  0, -1 }, {  0, 1 } }, // vertical
 595         { { -1, -1 }, {  1, 1 } }, // 45 degree
 596         { {  1, -1 }, { -1, 1 } }, // 135 degree
 597     };
 598     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 599
 600 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 601
 602     stride /= sizeof(pixel);
 603
 604     init_x = -(8 >> chroma) - 2;
 605     width  =  (8 >> chroma) + 2;
 606     if (!borders[3])
 607         height -= (4 >> chroma) + 2;
 608
 609     dst = dst + (init_y * stride + init_x);
 610     src = src + (init_y * stride + init_x);
 611     init_y = init_x = 0;
 612     if (sao_eo_class != SAO_EO_HORIZ) {
 613         if (borders[1]) {
 614             int offset_val = sao_offset_val[0];
 615             for (x = init_x; x < width; x++)
 616                 dst[x] = av_clip_pixel(src[x] + offset_val);
 617             init_y = 1;
 618         }
 619         if (borders[3]) {
 620             int offset_val = sao_offset_val[0];
 621             int y_stride   = stride * (height - 1);
 622             for (x = init_x; x < width; x++)
 623                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
 624             height--;
 625         }
 626     }
 627     {
 628         int y_stride = init_y * stride;
 629         int pos_0_0  = pos[sao_eo_class][0][0];
 630         int pos_0_1  = pos[sao_eo_class][0][1];
 631         int pos_1_0  = pos[sao_eo_class][1][0];
 632         int pos_1_1  = pos[sao_eo_class][1][1];
 633
 634         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 635         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 636         for (y = init_y; y < height; y++) {
 637             for (x = init_x; x < width; x++) {
 638                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 639                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 640                 int offset_val    = edge_idx[2 + diff0 + diff1];
 641                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 642             }
 643             y_stride     += stride;
 644             y_stride_0_1 += stride;
 645             y_stride_1_1 += stride;
 646         }
 647     }
 648
 649     {
 650         // Restore pixels that can't be modified
 651         int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
 652         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 653             for(y = init_y+save_upper_right; y< height; y++)
 654                 dst[y*stride+width-1] = src[y*stride+width-1];
 655         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 656             for(x = init_x; x<width-save_upper_right; x++)
 657                 dst[x] = src[x];
 658         if(diag_edge && sao_eo_class == SAO_EO_45D)
 659             dst[width-1] = src[width-1];
 660     }
 661 #undef CMP
 662 }
 663
 664 static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
 665                                     ptrdiff_t stride, SAOParams *sao,
 666                                     int *borders, int _width, int _height,
 667                                     int c_idx, uint8_t vert_edge,
 668                                     uint8_t horiz_edge, uint8_t diag_edge)
 669 {
 670     int x, y;
 671     pixel *dst = (pixel *)_dst;
 672     pixel *src = (pixel *)_src;
 673     int chroma = !!c_idx;
 674     int *sao_offset_val = sao->offset_val[c_idx];
 675     int sao_eo_class    = sao->eo_class[c_idx];
 676     int init_x = 0, init_y = 0, width = _width, height = _height;
 677
 678     static const int8_t pos[4][2][2] = {
 679         { { -1,  0 }, {  1, 0 } }, // horizontal
 680         { {  0, -1 }, {  0, 1 } }, // vertical
 681         { { -1, -1 }, {  1, 1 } }, // 45 degree
 682         { {  1, -1 }, { -1, 1 } }, // 135 degree
 683     };
 684     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 685
 686 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 687
 688     stride /= sizeof(pixel);
 689
 690     init_y = -(4 >> chroma) - 2;
 691     init_x = -(8 >> chroma) - 2;
 692     width  =  (8 >> chroma) + 2;
 693     height =  (4 >> chroma) + 2;
 694
 695
 696     dst    = dst + (init_y * stride + init_x);
 697     src    = src + (init_y * stride + init_x);
 698     init_y = init_x = 0;
 699
 700     {
 701         int y_stride = init_y * stride;
 702         int pos_0_0  = pos[sao_eo_class][0][0];
 703         int pos_0_1  = pos[sao_eo_class][0][1];
 704         int pos_1_0  = pos[sao_eo_class][1][0];
 705         int pos_1_1  = pos[sao_eo_class][1][1];
 706
 707         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 708         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 709
 710         for (y = init_y; y < height; y++) {
 711             for (x = init_x; x < width; x++) {
 712                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 713                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 714                 int offset_val    = edge_idx[2 + diff0 + diff1];
 715                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 716             }
 717             y_stride     += stride;
 718             y_stride_0_1 += stride;
 719             y_stride_1_1 += stride;
 720         }
 721     }
 722
 723     {
 724         // Restore pixels that can't be modified
 725         int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
 726         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 727             for(y = init_y; y< height-save_lower_right; y++)
 728                 dst[y*stride+width-1] = src[y*stride+width-1];
 729         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 730             for(x = init_x; x<width-save_lower_right; x++)
 731                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
 732         if(diag_edge && sao_eo_class == SAO_EO_135D)
 733             dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
 734     }
 735 #undef CMP
 736 }
 737
 738 #undef SET
 739 #undef SCALE
 740 #undef TR_4
 741 #undef TR_8
 742 #undef TR_16
 743 #undef TR_32
 744
 745 static av_always_inline void
 746 FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
 747                            uint8_t *_src, ptrdiff_t _srcstride,
 748                            int width, int height, int mx, int my,
 749                            int16_t* mcbuffer)
 750 {
 751     int x, y;
 752     pixel *src          = (pixel *)_src;
 753     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 754
 755     dststride /= sizeof(*dst);
 756     for (y = 0; y < height; y++) {
 757         for (x = 0; x < width; x++)
 758             dst[x] = src[x] << (14 - BIT_DEPTH);
 759         src += srcstride;
 760         dst += dststride;
 761     }
 762 }
 763
 764 #define QPEL_FILTER_1(src, stride)      \
 765     (1 * -src[x - 3 * stride] +         \
 766      4 *  src[x - 2 * stride] -         \
 767     10 *  src[x -     stride] +         \
 768     58 *  src[x]              +         \
 769     17 *  src[x +     stride] -         \
 770      5 *  src[x + 2 * stride] +         \
 771      1 *  src[x + 3 * stride])
 772
 773 #define QPEL_FILTER_2(src, stride)      \
 774     (1  * -src[x - 3 * stride] +        \
 775      4  *  src[x - 2 * stride] -        \
 776     11  *  src[x -     stride] +        \
 777     40  *  src[x]              +        \
 778     40  *  src[x +     stride] -        \
 779     11  *  src[x + 2 * stride] +        \
 780      4  *  src[x + 3 * stride] -        \
 781      1  *  src[x + 4 * stride])
 782
 783 #define QPEL_FILTER_3(src, stride)      \
 784     (1  * src[x - 2 * stride] -         \
 785      5  * src[x -     stride] +         \
 786     17  * src[x]              +         \
 787     58  * src[x + stride]     -         \
 788     10  * src[x + 2 * stride] +         \
 789      4  * src[x + 3 * stride] -         \
 790      1  * src[x + 4 * stride])
 791
 792
 793 #define PUT_HEVC_QPEL_H(H)                                                     \
 794 static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
 795                                        uint8_t *_src, ptrdiff_t _srcstride,    \
 796                                        int width, int height,                  \
 797                                        int16_t* mcbuffer)                      \
 798 {                                                                              \
 799     int x, y;                                                                  \
 800     pixel *src = (pixel*)_src;                                                 \
 801     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 802                                                                                \
 803     dststride /= sizeof(*dst);                                                 \
 804     for (y = 0; y < height; y++) {                                             \
 805         for (x = 0; x < width; x++)                                            \
 806             dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
 807         src += srcstride;                                                      \
 808         dst += dststride;                                                      \
 809     }                                                                          \
 810 }
 811
 812 #define PUT_HEVC_QPEL_V(V)                                                     \
 813 static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
 814                                        uint8_t *_src, ptrdiff_t _srcstride,    \
 815                                        int width, int height,                  \
 816                                        int16_t* mcbuffer)                      \
 817 {                                                                              \
 818     int x, y;                                                                  \
 819     pixel *src = (pixel*)_src;                                                 \
 820     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 821                                                                                \
 822     dststride /= sizeof(*dst);                                                 \
 823     for (y = 0; y < height; y++)  {                                            \
 824         for (x = 0; x < width; x++)                                            \
 825             dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
 826         src += srcstride;                                                      \
 827         dst += dststride;                                                      \
 828     }                                                                          \
 829 }
 830
 831 #define PUT_HEVC_QPEL_HV(H, V)                                                 \
 832 static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
 833                                                  ptrdiff_t dststride,          \
 834                                                  uint8_t *_src,                \
 835                                                  ptrdiff_t _srcstride,         \
 836                                                  int width, int height,        \
 837                                                  int16_t* mcbuffer)            \
 838 {                                                                              \
 839     int x, y;                                                                  \
 840     pixel *src = (pixel*)_src;                                                 \
 841     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 842                                                                                \
 843     int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
 844     int16_t *tmp = tmp_array;                                                  \
 845                                                                                \
 846     dststride /= sizeof(*dst);                                                 \
 847     src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
 848                                                                                \
 849     for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
 850         for (x = 0; x < width; x++)                                            \
 851             tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
 852         src += srcstride;                                                      \
 853         tmp += MAX_PB_SIZE;                                                    \
 854     }                                                                          \
 855                                                                                \
 856     tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
 857                                                                                \
 858     for (y = 0; y < height; y++) {                                             \
 859         for (x = 0; x < width; x++)                                            \
 860             dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
 861         tmp += MAX_PB_SIZE;                                                    \
 862         dst += dststride;                                                      \
 863     }                                                                          \
 864 }
 865
 866 PUT_HEVC_QPEL_H(1)
 867 PUT_HEVC_QPEL_H(2)
 868 PUT_HEVC_QPEL_H(3)
 869 PUT_HEVC_QPEL_V(1)
 870 PUT_HEVC_QPEL_V(2)
 871 PUT_HEVC_QPEL_V(3)
 872 PUT_HEVC_QPEL_HV(1, 1)
 873 PUT_HEVC_QPEL_HV(1, 2)
 874 PUT_HEVC_QPEL_HV(1, 3)
 875 PUT_HEVC_QPEL_HV(2, 1)
 876 PUT_HEVC_QPEL_HV(2, 2)
 877 PUT_HEVC_QPEL_HV(2, 3)
 878 PUT_HEVC_QPEL_HV(3, 1)
 879 PUT_HEVC_QPEL_HV(3, 2)
 880 PUT_HEVC_QPEL_HV(3, 3)
 881
 882 #define QPEL(W)                                                                             \
 883 static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
 884                                              uint8_t *src, ptrdiff_t srcstride,             \
 885                                              int height, int mx, int my,                    \
 886                                              int16_t *mcbuffer)                             \
 887 {                                                                                           \
 888     FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
 889                                mx, my, mcbuffer);                                           \
 890 }                                                                                           \
 891                                                                                             \
 892 static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
 893                                         uint8_t *src, ptrdiff_t srcstride,                  \
 894                                         int height, int mx, int my,                         \
 895                                         int16_t *mcbuffer)                                  \
 896 {                                                                                           \
 897     if (mx == 1)                                                                            \
 898         FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 899     else if (mx == 2)                                                                       \
 900         FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 901     else                                                                                    \
 902         FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 903 }                                                                                           \
 904                                                                                             \
 905 static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
 906                                              uint8_t *src, ptrdiff_t srcstride,             \
 907                                              int height, int mx, int my,                    \
 908                                              int16_t *mcbuffer)                             \
 909 {                                                                                           \
 910     if (my == 1)                                                                            \
 911         FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 912     else if (my == 2)                                                                       \
 913         FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 914     else                                                                                    \
 915         FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 916 }                                                                                           \
 917                                                                                             \
 918 static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
 919                                              uint8_t *src, ptrdiff_t srcstride,             \
 920                                              int height, int mx, int my,                    \
 921                                              int16_t *mcbuffer)                             \
 922 {                                                                                           \
 923     if (my == 1) {                                                                          \
 924         if (mx == 1)                                                                        \
 925             FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 926         else if (mx == 2)                                                                   \
 927             FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 928         else                                                                                \
 929             FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 930     } else if (my == 2) {                                                                   \
 931         if (mx == 1)                                                                        \
 932             FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 933         else if (mx == 2)                                                                   \
 934             FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 935         else                                                                                \
 936             FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 937     } else {                                                                                \
 938         if (mx == 1)                                                                        \
 939             FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 940         else if (mx == 2)                                                                   \
 941             FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 942         else                                                                                \
 943             FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 944     }                                                                                       \
 945 }
 946
 947 QPEL(64)
 948 QPEL(48)
 949 QPEL(32)
 950 QPEL(24)
 951 QPEL(16)
 952 QPEL(12)
 953 QPEL(8)
 954 QPEL(4)
 955
 956 static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
 957                                               uint8_t *_src, ptrdiff_t _srcstride,
 958                                               int width, int height, int mx, int my,
 959                                               int16_t* mcbuffer)
 960 {
 961     int x, y;
 962     pixel *src          = (pixel *)_src;
 963     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 964
 965     dststride /= sizeof(*dst);
 966     for (y = 0; y < height; y++) {
 967         for (x = 0; x < width; x++)
 968             dst[x] = src[x] << (14 - BIT_DEPTH);
 969         src += srcstride;
 970         dst += dststride;
 971     }
 972 }
 973
 974 #define EPEL_FILTER(src, stride)                \
 975     (filter_0 * src[x - stride] +               \
 976      filter_1 * src[x]          +               \
 977      filter_2 * src[x + stride] +               \
 978      filter_3 * src[x + 2 * stride])
 979
 980 static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
 981                                          uint8_t *_src, ptrdiff_t _srcstride,
 982                                          int width, int height, int mx, int my,
 983                                          int16_t* mcbuffer)
 984 {
 985     int x, y;
 986     pixel *src = (pixel *)_src;
 987     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
 988     const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
 989     int8_t filter_0 = filter[0];
 990     int8_t filter_1 = filter[1];
 991     int8_t filter_2 = filter[2];
 992     int8_t filter_3 = filter[3];
 993     dststride /= sizeof(*dst);
 994     for (y = 0; y < height; y++) {
 995         for (x = 0; x < width; x++)
 996             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
 997         src += srcstride;
 998         dst += dststride;
 999     }
1000 }
1001
1002 static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
1003                                          uint8_t *_src, ptrdiff_t _srcstride,
1004                                          int width, int height, int mx, int my,
1005                                          int16_t* mcbuffer)
1006 {
1007     int x, y;
1008     pixel *src = (pixel *)_src;
1009     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1010     const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
1011     int8_t filter_0 = filter[0];
1012     int8_t filter_1 = filter[1];
1013     int8_t filter_2 = filter[2];
1014     int8_t filter_3 = filter[3];
1015
1016     dststride /= sizeof(*dst);
1017     for (y = 0; y < height; y++) {
1018         for (x = 0; x < width; x++)
1019             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1020         src += srcstride;
1021         dst += dststride;
1022     }
1023 }
1024
1025 static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
1026                                           uint8_t *_src, ptrdiff_t _srcstride,
1027                                           int width, int height, int mx, int my,
1028                                           int16_t* mcbuffer)
1029 {
1030     int x, y;
1031     pixel *src = (pixel *)_src;
1032     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1033     const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
1034     const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
1035     int8_t filter_0 = filter_h[0];
1036     int8_t filter_1 = filter_h[1];
1037     int8_t filter_2 = filter_h[2];
1038     int8_t filter_3 = filter_h[3];
1039     int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
1040     int16_t *tmp = tmp_array;
1041
1042     dststride /= sizeof(*dst);
1043     src -= EPEL_EXTRA_BEFORE * srcstride;
1044
1045     for (y = 0; y < height + EPEL_EXTRA; y++) {
1046         for (x = 0; x < width; x++)
1047             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1048         src += srcstride;
1049         tmp += MAX_PB_SIZE;
1050     }
1051
1052     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1053     filter_0 = filter_v[0];
1054     filter_1 = filter_v[1];
1055     filter_2 = filter_v[2];
1056     filter_3 = filter_v[3];
1057     for (y = 0; y < height; y++) {
1058         for (x = 0; x < width; x++)
1059             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1060         tmp += MAX_PB_SIZE;
1061         dst += dststride;
1062     }
1063 }
1064
1065 #define EPEL(W)                                                                 \
1066 static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
1067                                              uint8_t *src, ptrdiff_t srcstride, \
1068                                              int height, int mx, int my,        \
1069                                              int16_t *mcbuffer)                 \
1070 {                                                                               \
1071     FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
1072                                W, height, mx, my, mcbuffer);                    \
1073 }                                                                               \
1074 static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
1075                                         uint8_t *src, ptrdiff_t srcstride,      \
1076                                         int height, int mx, int my,             \
1077                                         int16_t *mcbuffer)                      \
1078 {                                                                               \
1079     FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
1080                           W, height, mx, my, mcbuffer);                         \
1081 }                                                                               \
1082 static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
1083                                         uint8_t *src, ptrdiff_t srcstride,      \
1084                                         int height, int mx, int my,             \
1085                                         int16_t *mcbuffer)                      \
1086 {                                                                               \
1087     FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
1088                           W, height, mx, my, mcbuffer);                         \
1089 }                                                                               \
1090 static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
1091                                          uint8_t *src, ptrdiff_t srcstride,     \
1092                                          int height, int mx, int my,            \
1093                                          int16_t *mcbuffer)                     \
1094 {                                                                               \
1095     FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
1096                            W, height, mx, my, mcbuffer);                        \
1097 }
1098
1099 EPEL(32)
1100 EPEL(24)
1101 EPEL(16)
1102 EPEL(12)
1103 EPEL(8)
1104 EPEL(6)
1105 EPEL(4)
1106 EPEL(2)
1107
1108 static av_always_inline void
1109 FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
1110                           int16_t *src, ptrdiff_t srcstride,
1111                           int width, int height)
1112 {
1113     int x, y;
1114     pixel *dst          = (pixel *)_dst;
1115     ptrdiff_t dststride = _dststride / sizeof(pixel);
1116
1117     int shift = 14 - BIT_DEPTH;
1118 #if BIT_DEPTH < 14
1119     int offset = 1 << (shift - 1);
1120 #else
1121     int offset = 0;
1122 #endif
1123     srcstride /= sizeof(*src);
1124     for (y = 0; y < height; y++) {
1125         for (x = 0; x < width; x++)
1126             dst[x] = av_clip_pixel((src[x] + offset) >> shift);
1127         dst += dststride;
1128         src += srcstride;
1129     }
1130 }
1131
1132 static av_always_inline void
1133 FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
1134                               int16_t *src1, int16_t *src2,
1135                               ptrdiff_t srcstride,
1136                               int width, int height)
1137 {
1138     int x, y;
1139     pixel *dst          = (pixel *)_dst;
1140     ptrdiff_t dststride = _dststride / sizeof(pixel);
1141
1142     int shift = 14 + 1 - BIT_DEPTH;
1143 #if BIT_DEPTH < 14
1144     int offset = 1 << (shift - 1);
1145 #else
1146     int offset = 0;
1147 #endif
1148
1149     srcstride /= sizeof(*src1);
1150     for (y = 0; y < height; y++) {
1151         for (x = 0; x < width; x++)
1152             dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
1153         dst  += dststride;
1154         src1 += srcstride;
1155         src2 += srcstride;
1156     }
1157 }
1158
1159 static av_always_inline void
1160 FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
1161                     uint8_t *_dst, ptrdiff_t _dststride,
1162                     int16_t *src, ptrdiff_t srcstride,
1163                     int width, int height)
1164 {
1165     int shift, log2Wd, wx, ox, x, y, offset;
1166     pixel *dst          = (pixel *)_dst;
1167     ptrdiff_t dststride = _dststride / sizeof(pixel);
1168
1169     shift  = 14 - BIT_DEPTH;
1170     log2Wd = denom + shift;
1171     offset = 1 << (log2Wd - 1);
1172     wx     = wlxFlag;
1173     ox     = olxFlag * (1 << (BIT_DEPTH - 8));
1174
1175     srcstride /= sizeof(*src);
1176     for (y = 0; y < height; y++) {
1177         for (x = 0; x < width; x++) {
1178             if (log2Wd >= 1) {
1179                 dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
1180             } else {
1181                 dst[x] = av_clip_pixel(src[x] * wx + ox);
1182             }
1183         }
1184         dst += dststride;
1185         src += srcstride;
1186     }
1187 }
1188
1189 static av_always_inline void
1190 FUNC(weighted_pred_avg)(uint8_t denom,
1191                         int16_t wl0Flag, int16_t wl1Flag,
1192                         int16_t ol0Flag, int16_t ol1Flag,
1193                         uint8_t *_dst, ptrdiff_t _dststride,
1194                         int16_t *src1, int16_t *src2,
1195                         ptrdiff_t srcstride,
1196                         int width, int height)
1197 {
1198     int shift, log2Wd, w0, w1, o0, o1, x, y;
1199     pixel *dst = (pixel *)_dst;
1200     ptrdiff_t dststride = _dststride / sizeof(pixel);
1201
1202     shift  = 14 - BIT_DEPTH;
1203     log2Wd = denom + shift;
1204     w0     = wl0Flag;
1205     w1     = wl1Flag;
1206     o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
1207     o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
1208
1209     srcstride /= sizeof(*src1);
1210     for (y = 0; y < height; y++) {
1211         for (x = 0; x < width; x++)
1212             dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
1213                                     ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
1214         dst  += dststride;
1215         src1 += srcstride;
1216         src2 += srcstride;
1217     }
1218 }
1219
1220 #define PUT_PRED(w)                                                                            \
1221 static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
1222                                             int16_t *src, ptrdiff_t srcstride,                 \
1223                                             int height)                                        \
1224 {                                                                                              \
1225     FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
1226 }                                                                                              \
1227 static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
1228                                                 int16_t *src1, int16_t *src2,                  \
1229                                                 ptrdiff_t srcstride, int height)               \
1230 {                                                                                              \
1231     FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
1232 }                                                                                              \
1233 static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
1234                                           uint8_t *dst, ptrdiff_t dststride,                   \
1235                                           int16_t *src, ptrdiff_t srcstride, int height)       \
1236 {                                                                                              \
1237     FUNC(weighted_pred)(denom, weight, offset,                                                 \
1238                         dst, dststride, src, srcstride, w, height);                            \
1239 }                                                                                              \
1240 static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
1241                                               int16_t offset0, int16_t offset1,                \
1242                                               uint8_t *dst, ptrdiff_t dststride,               \
1243                                               int16_t *src1, int16_t *src2,                    \
1244                                               ptrdiff_t srcstride, int height)                 \
1245 {                                                                                              \
1246     FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
1247                             dst, dststride, src1, src2, srcstride, w, height);                 \
1248 }
1249
1250 PUT_PRED(64)
1251 PUT_PRED(48)
1252 PUT_PRED(32)
1253 PUT_PRED(24)
1254 PUT_PRED(16)
1255 PUT_PRED(12)
1256 PUT_PRED(8)
1257 PUT_PRED(6)
1258 PUT_PRED(4)
1259 PUT_PRED(2)
1260
/*
 * Sample accessors used by the loop filters. They expand to lvalues and
 * expect `pix`, `xstride` and `ystride` in the caller's scope.
 *
 * Line zero: the four samples before pix (P3..P0) and the four starting at
 * pix (Q0..Q3), spaced xstride apart.
 */
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
#define P1 pix[-2 * xstride]
#define P0 pix[-1 * xstride]
#define Q0 pix[0]
#define Q1 pix[1 * xstride]
#define Q2 pix[2 * xstride]
#define Q3 pix[3 * xstride]

/*
 * Line three (3 * ystride further on): same layout as above.
 * Used only for the deblocking decision.
 */
#define TP3 (pix + 3 * ystride)[-4 * xstride]
#define TP2 (pix + 3 * ystride)[-3 * xstride]
#define TP1 (pix + 3 * ystride)[-2 * xstride]
#define TP0 (pix + 3 * ystride)[-1 * xstride]
#define TQ0 (pix + 3 * ystride)[0]
#define TQ1 (pix + 3 * ystride)[1 * xstride]
#define TQ2 (pix + 3 * ystride)[2 * xstride]
#define TQ3 (pix + 3 * ystride)[3 * xstride]
1280
1281 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1282                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1283                                         int beta, int *_tc,
1284                                         uint8_t *_no_p, uint8_t *_no_q)
1285 {
1286     int d, j;
1287     pixel *pix        = (pixel *)_pix;
1288     ptrdiff_t xstride = _xstride / sizeof(pixel);
1289     ptrdiff_t ystride = _ystride / sizeof(pixel);
1290
1291     beta <<= BIT_DEPTH - 8;
1292
1293     for (j = 0; j < 2; j++) {
1294         const int dp0  = abs(P2  - 2 * P1  + P0);
1295         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1296         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1297         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1298         const int d0   = dp0 + dq0;
1299         const int d3   = dp3 + dq3;
1300         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1301         const int no_p = _no_p[j];
1302         const int no_q = _no_q[j];
1303
1304         if (d0 + d3 >= beta) {
1305             pix += 4 * ystride;
1306             continue;
1307         } else {
1308             const int beta_3 = beta >> 3;
1309             const int beta_2 = beta >> 2;
1310             const int tc25   = ((tc * 5 + 1) >> 1);
1311
1312             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1313                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1314                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1315                 // strong filtering
1316                 const int tc2 = tc << 1;
1317                 for (d = 0; d < 4; d++) {
1318                     const int p3 = P3;
1319                     const int p2 = P2;
1320                     const int p1 = P1;
1321                     const int p0 = P0;
1322                     const int q0 = Q0;
1323                     const int q1 = Q1;
1324                     const int q2 = Q2;
1325                     const int q3 = Q3;
1326                     if (!no_p) {
1327                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1328                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1329                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1330                     }
1331                     if (!no_q) {
1332                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1333                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1334                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1335                     }
1336                     pix += ystride;
1337                 }
1338             } else { // normal filtering
1339                 int nd_p = 1;
1340                 int nd_q = 1;
1341                 const int tc_2 = tc >> 1;
1342                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1343                     nd_p = 2;
1344                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1345                     nd_q = 2;
1346
1347                 for (d = 0; d < 4; d++) {
1348                     const int p2 = P2;
1349                     const int p1 = P1;
1350                     const int p0 = P0;
1351                     const int q0 = Q0;
1352                     const int q1 = Q1;
1353                     const int q2 = Q2;
1354                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1355                     if (abs(delta0) < 10 * tc) {
1356                         delta0 = av_clip(delta0, -tc, tc);
1357                         if (!no_p)
1358                             P0 = av_clip_pixel(p0 + delta0);
1359                         if (!no_q)
1360                             Q0 = av_clip_pixel(q0 - delta0);
1361                         if (!no_p && nd_p > 1) {
1362                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1363                             P1 = av_clip_pixel(p1 + deltap1);
1364                         }
1365                         if (!no_q && nd_q > 1) {
1366                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1367                             Q1 = av_clip_pixel(q1 + deltaq1);
1368                         }
1369                     }
1370                     pix += ystride;
1371                 }
1372             }
1373         }
1374     }
1375 }
1376
1377 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1378                                           ptrdiff_t _ystride, int *_tc,
1379                                           uint8_t *_no_p, uint8_t *_no_q)
1380 {
1381     int d, j, no_p, no_q;
1382     pixel *pix        = (pixel *)_pix;
1383     ptrdiff_t xstride = _xstride / sizeof(pixel);
1384     ptrdiff_t ystride = _ystride / sizeof(pixel);
1385
1386     for (j = 0; j < 2; j++) {
1387         const int tc = _tc[j] << (BIT_DEPTH - 8);
1388         if (tc <= 0) {
1389             pix += 4 * ystride;
1390             continue;
1391         }
1392         no_p = _no_p[j];
1393         no_q = _no_q[j];
1394
1395         for (d = 0; d < 4; d++) {
1396             int delta0;
1397             const int p1 = P1;
1398             const int p0 = P0;
1399             const int q0 = Q0;
1400             const int q1 = Q1;
1401             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1402             if (!no_p)
1403                 P0 = av_clip_pixel(p0 + delta0);
1404             if (!no_q)
1405                 Q0 = av_clip_pixel(q0 - delta0);
1406             pix += ystride;
1407         }
1408     }
1409 }
1410
1411 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1412                                             int *tc, uint8_t *no_p,
1413                                             uint8_t *no_q)
1414 {
1415     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1416 }
1417
1418 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1419                                             int *tc, uint8_t *no_p,
1420                                             uint8_t *no_q)
1421 {
1422     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1423 }
1424
1425 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1426                                           int beta, int *tc, uint8_t *no_p,
1427                                           uint8_t *no_q)
1428 {
1429     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1430                                 beta, tc, no_p, no_q);
1431 }
1432
1433 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1434                                           int beta, int *tc, uint8_t *no_p,
1435                                           uint8_t *no_q)
1436 {
1437     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1438                                 beta, tc, no_p, no_q);
1439 }
1440
1441 #undef P3
1442 #undef P2
1443 #undef P1
1444 #undef P0
1445 #undef Q0
1446 #undef Q1
1447 #undef Q2
1448 #undef Q3
1449
1450 #undef TP3
1451 #undef TP2
1452 #undef TP1
1453 #undef TP0
1454 #undef TQ0
1455 #undef TQ1
1456 #undef TQ2
1457 #undef TQ3