git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c

   1 /*
   2  * HEVC video decoder
   3  *
   4  * Copyright (C) 2012 - 2013 Guillaume Martres
   5  *
   6  * This file is part of Libav.
   7  *
   8  * Libav is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * Libav is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with Libav; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "get_bits.h"
  24 #include "hevc.h"
  25
  26 #include "bit_depth_template.c"
  27
  28 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
  29                           GetBitContext *gb, int pcm_bit_depth)
  30 {
  31     int x, y;
  32     pixel *dst = (pixel *)_dst;
  33
  34     stride /= sizeof(pixel);
  35
  36     for (y = 0; y < size; y++) {
  37         for (x = 0; x < size; x++)
  38             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
  39         dst += stride;
  40     }
  41 }
  42
  43 static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
  44                                                      ptrdiff_t stride, int size)
  45 {
  46     int x, y;
  47     pixel *dst = (pixel *)_dst;
  48
  49     stride /= sizeof(pixel);
  50
  51     for (y = 0; y < size; y++) {
  52         for (x = 0; x < size; x++) {
  53             dst[x] = av_clip_pixel(dst[x] + *coeffs);
  54             coeffs++;
  55         }
  56         dst += stride;
  57     }
  58 }
  59
  60 static void FUNC(transquant_bypass4x4)(uint8_t *_dst, int16_t *coeffs,
  61                                        ptrdiff_t stride)
  62 {
  63     FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
  64 }
  65
  66 static void FUNC(transquant_bypass8x8)(uint8_t *_dst, int16_t *coeffs,
  67                                        ptrdiff_t stride)
  68 {
  69     FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
  70 }
  71
  72 static void FUNC(transquant_bypass16x16)(uint8_t *_dst, int16_t *coeffs,
  73                                          ptrdiff_t stride)
  74 {
  75     FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
  76 }
  77
  78 static void FUNC(transquant_bypass32x32)(uint8_t *_dst, int16_t *coeffs,
  79                                          ptrdiff_t stride)
  80 {
  81     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
  82 }
  83
  84 static void FUNC(transform_skip)(uint8_t *_dst, int16_t *coeffs,
  85                                  ptrdiff_t stride)
  86 {
  87     pixel *dst = (pixel *)_dst;
  88     int shift  = 13 - BIT_DEPTH;
  89 #if BIT_DEPTH <= 13
  90     int offset = 1 << (shift - 1);
  91 #else
  92     int offset = 0;
  93 #endif
  94     int x, y;
  95
  96     stride /= sizeof(pixel);
  97
  98     for (y = 0; y < 4 * 4; y += 4) {
  99         for (x = 0; x < 4; x++)
 100             dst[x] = av_clip_pixel(dst[x] + ((coeffs[y + x] + offset) >> shift));
 101         dst += stride;
 102     }
 103 }
 104
 105 #define SET(dst, x)   (dst) = (x)
 106 #define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
 107 #define ADD_AND_SCALE(dst, x)                                           \
 108     (dst) = av_clip_pixel((dst) + av_clip_int16(((x) + add) >> shift))
 109
 110 #define TR_4x4_LUMA(dst, src, step, assign)                             \
 111     do {                                                                \
 112         int c0 = src[0 * step] + src[2 * step];                         \
 113         int c1 = src[2 * step] + src[3 * step];                         \
 114         int c2 = src[0 * step] - src[3 * step];                         \
 115         int c3 = 74 * src[1 * step];                                    \
 116                                                                         \
 117         assign(dst[2 * step], 74 * (src[0 * step] -                     \
 118                                     src[2 * step] +                     \
 119                                     src[3 * step]));                    \
 120         assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
 121         assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
 122         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
 123     } while (0)
 124
 125 static void FUNC(transform_4x4_luma_add)(uint8_t *_dst, int16_t *coeffs,
 126                                          ptrdiff_t stride)
 127 {
 128     int i;
 129     pixel *dst   = (pixel *)_dst;
 130     int shift    = 7;
 131     int add      = 1 << (shift - 1);
 132     int16_t *src = coeffs;
 133
 134     stride /= sizeof(pixel);
 135
 136     for (i = 0; i < 4; i++) {
 137         TR_4x4_LUMA(src, src, 4, SCALE);
 138         src++;
 139     }
 140
 141     shift = 20 - BIT_DEPTH;
 142     add   = 1 << (shift - 1);
 143     for (i = 0; i < 4; i++) {
 144         TR_4x4_LUMA(dst, coeffs, 1, ADD_AND_SCALE);
 145         coeffs += 4;
 146         dst    += stride;
 147     }
 148 }
 149
 150 #undef TR_4x4_LUMA
 151
 152 #define TR_4(dst, src, dstep, sstep, assign)                            \
 153     do {                                                                \
 154         const int e0 = transform[8 * 0][0] * src[0 * sstep] +           \
 155                        transform[8 * 2][0] * src[2 * sstep];            \
 156         const int e1 = transform[8 * 0][1] * src[0 * sstep] +           \
 157                        transform[8 * 2][1] * src[2 * sstep];            \
 158         const int o0 = transform[8 * 1][0] * src[1 * sstep] +           \
 159                        transform[8 * 3][0] * src[3 * sstep];            \
 160         const int o1 = transform[8 * 1][1] * src[1 * sstep] +           \
 161                        transform[8 * 3][1] * src[3 * sstep];            \
 162                                                                         \
 163         assign(dst[0 * dstep], e0 + o0);                                \
 164         assign(dst[1 * dstep], e1 + o1);                                \
 165         assign(dst[2 * dstep], e1 - o1);                                \
 166         assign(dst[3 * dstep], e0 - o0);                                \
 167     } while (0)
 168
 169 static void FUNC(transform_4x4_add)(uint8_t *_dst, int16_t *coeffs,
 170                                     ptrdiff_t stride)
 171 {
 172     int i;
 173     pixel *dst   = (pixel *)_dst;
 174     int shift    = 7;
 175     int add      = 1 << (shift - 1);
 176     int16_t *src = coeffs;
 177
 178     stride /= sizeof(pixel);
 179
 180     for (i = 0; i < 4; i++) {
 181         TR_4(src, src, 4, 4, SCALE);
 182         src++;
 183     }
 184
 185     shift = 20 - BIT_DEPTH;
 186     add   = 1 << (shift - 1);
 187     for (i = 0; i < 4; i++) {
 188         TR_4(dst, coeffs, 1, 1, ADD_AND_SCALE);
 189         coeffs += 4;
 190         dst    += stride;
 191     }
 192 }
 193
 194 #define TR_8(dst, src, dstep, sstep, assign)                      \
 195     do {                                                          \
 196         int i, j;                                                 \
 197         int e_8[4];                                               \
 198         int o_8[4] = { 0 };                                       \
 199         for (i = 0; i < 4; i++)                                   \
 200             for (j = 1; j < 8; j += 2)                            \
 201                 o_8[i] += transform[4 * j][i] * src[j * sstep];   \
 202         TR_4(e_8, src, 1, 2 * sstep, SET);                        \
 203                                                                   \
 204         for (i = 0; i < 4; i++) {                                 \
 205             assign(dst[i * dstep], e_8[i] + o_8[i]);              \
 206             assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
 207         }                                                         \
 208     } while (0)
 209
 210 #define TR_16(dst, src, dstep, sstep, assign)                     \
 211     do {                                                          \
 212         int i, j;                                                 \
 213         int e_16[8];                                              \
 214         int o_16[8] = { 0 };                                      \
 215         for (i = 0; i < 8; i++)                                   \
 216             for (j = 1; j < 16; j += 2)                           \
 217                 o_16[i] += transform[2 * j][i] * src[j * sstep];  \
 218         TR_8(e_16, src, 1, 2 * sstep, SET);                       \
 219                                                                   \
 220         for (i = 0; i < 8; i++) {                                 \
 221             assign(dst[i * dstep], e_16[i] + o_16[i]);            \
 222             assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
 223         }                                                         \
 224     } while (0)
 225
 226 #define TR_32(dst, src, dstep, sstep, assign)                     \
 227     do {                                                          \
 228         int i, j;                                                 \
 229         int e_32[16];                                             \
 230         int o_32[16] = { 0 };                                     \
 231         for (i = 0; i < 16; i++)                                  \
 232             for (j = 1; j < 32; j += 2)                           \
 233                 o_32[i] += transform[j][i] * src[j * sstep];      \
 234         TR_16(e_32, src, 1, 2 * sstep, SET);                      \
 235                                                                   \
 236         for (i = 0; i < 16; i++) {                                \
 237             assign(dst[i * dstep], e_32[i] + o_32[i]);            \
 238             assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
 239         }                                                         \
 240     } while (0)
 241
 242
 243
 244 static void FUNC(transform_8x8_add)(uint8_t *_dst, int16_t *coeffs,
 245                                     ptrdiff_t stride)
 246 {
 247     int i;
 248     pixel *dst   = (pixel *)_dst;
 249     int shift    = 7;
 250     int add      = 1 << (shift - 1);
 251     int16_t *src = coeffs;
 252
 253     stride /= sizeof(pixel);
 254
 255     for (i = 0; i < 8; i++) {
 256         TR_8(src, src, 8, 8, SCALE);
 257         src++;
 258     }
 259
 260     shift = 20 - BIT_DEPTH;
 261     add   = 1 << (shift - 1);
 262     for (i = 0; i < 8; i++) {
 263         TR_8(dst, coeffs, 1, 1, ADD_AND_SCALE);
 264         coeffs += 8;
 265         dst    += stride;
 266     }
 267 }
 268
 269 static void FUNC(transform_16x16_add)(uint8_t *_dst, int16_t *coeffs,
 270                                       ptrdiff_t stride)
 271 {
 272     int i;
 273     pixel *dst   = (pixel *)_dst;
 274     int shift    = 7;
 275     int add      = 1 << (shift - 1);
 276     int16_t *src = coeffs;
 277
 278     stride /= sizeof(pixel);
 279
 280     for (i = 0; i < 16; i++) {
 281         TR_16(src, src, 16, 16, SCALE);
 282         src++;
 283     }
 284
 285     shift = 20 - BIT_DEPTH;
 286     add   = 1 << (shift - 1);
 287     for (i = 0; i < 16; i++) {
 288         TR_16(dst, coeffs, 1, 1, ADD_AND_SCALE);
 289         coeffs += 16;
 290         dst    += stride;
 291     }
 292 }
 293
 294 static void FUNC(transform_32x32_add)(uint8_t *_dst, int16_t *coeffs,
 295                                       ptrdiff_t stride)
 296 {
 297     int i;
 298     pixel *dst   = (pixel *)_dst;
 299     int shift    = 7;
 300     int add      = 1 << (shift - 1);
 301     int16_t *src = coeffs;
 302
 303     stride /= sizeof(pixel);
 304
 305     for (i = 0; i < 32; i++) {
 306         TR_32(src, src, 32, 32, SCALE);
 307         src++;
 308     }
 309     src   = coeffs;
 310     shift = 20 - BIT_DEPTH;
 311     add   = 1 << (shift - 1);
 312     for (i = 0; i < 32; i++) {
 313         TR_32(dst, coeffs, 1, 1, ADD_AND_SCALE);
 314         coeffs += 32;
 315         dst    += stride;
 316     }
 317 }
 318
 319 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
 320                                   ptrdiff_t stride, SAOParams *sao,
 321                                   int *borders, int width, int height,
 322                                   int c_idx, int class)
 323 {
 324     pixel *dst = (pixel *)_dst;
 325     pixel *src = (pixel *)_src;
 326     int offset_table[32] = { 0 };
 327     int k, y, x;
 328     int chroma = !!c_idx;
 329     int shift  = BIT_DEPTH - 5;
 330     int *sao_offset_val = sao->offset_val[c_idx];
 331     int sao_left_class  = sao->band_position[c_idx];
 332     int init_y = 0, init_x = 0;
 333
 334     stride /= sizeof(pixel);
 335
 336     switch (class) {
 337     case 0:
 338         if (!borders[2])
 339             width -= (8 >> chroma) + 2;
 340         if (!borders[3])
 341             height -= (4 >> chroma) + 2;
 342         break;
 343     case 1:
 344         init_y = -(4 >> chroma) - 2;
 345         if (!borders[2])
 346             width -= (8 >> chroma) + 2;
 347         height = (4 >> chroma) + 2;
 348         break;
 349     case 2:
 350         init_x = -(8 >> chroma) - 2;
 351         width  =  (8 >> chroma) + 2;
 352         if (!borders[3])
 353             height -= (4 >> chroma) + 2;
 354         break;
 355     case 3:
 356         init_y = -(4 >> chroma) - 2;
 357         init_x = -(8 >> chroma) - 2;
 358         width  =  (8 >> chroma) + 2;
 359         height =  (4 >> chroma) + 2;
 360         break;
 361     }
 362
 363     dst = dst + (init_y * stride + init_x);
 364     src = src + (init_y * stride + init_x);
 365     for (k = 0; k < 4; k++)
 366         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
 367     for (y = 0; y < height; y++) {
 368         for (x = 0; x < width; x++)
 369             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
 370         dst += stride;
 371         src += stride;
 372     }
 373 }
 374
 375 static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
 376                                     ptrdiff_t stride, SAOParams *sao,
 377                                     int *borders, int width, int height,
 378                                     int c_idx)
 379 {
 380     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 381                           width, height, c_idx, 0);
 382 }
 383
 384 static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
 385                                     ptrdiff_t stride, SAOParams *sao,
 386                                     int *borders, int width, int height,
 387                                     int c_idx)
 388 {
 389     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 390                           width, height, c_idx, 1);
 391 }
 392
 393 static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
 394                                     ptrdiff_t stride, SAOParams *sao,
 395                                     int *borders, int width, int height,
 396                                     int c_idx)
 397 {
 398     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
 399                           width, height, c_idx, 2);
 400 }
 401
 402 static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
 403                                     ptrdiff_t stride, SAOParams *sao,
 404                                     int *borders, int width, int height,
 405                                     int c_idx)
 406 {
 407     FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
 408                           width, height, c_idx, 3);
 409 }
 410
 411 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
 412                                     ptrdiff_t stride, SAOParams *sao,
 413                                     int *borders, int _width, int _height,
 414                                     int c_idx, uint8_t vert_edge,
 415                                     uint8_t horiz_edge, uint8_t diag_edge)
 416 {
 417     int x, y;
 418     pixel *dst = (pixel *)_dst;
 419     pixel *src = (pixel *)_src;
 420     int chroma = !!c_idx;
 421     int *sao_offset_val = sao->offset_val[c_idx];
 422     int sao_eo_class    = sao->eo_class[c_idx];
 423     int init_x = 0, init_y = 0, width = _width, height = _height;
 424
 425     static const int8_t pos[4][2][2] = {
 426         { { -1,  0 }, {  1, 0 } }, // horizontal
 427         { {  0, -1 }, {  0, 1 } }, // vertical
 428         { { -1, -1 }, {  1, 1 } }, // 45 degree
 429         { {  1, -1 }, { -1, 1 } }, // 135 degree
 430     };
 431     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 432
 433 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 434
 435     stride /= sizeof(pixel);
 436
 437     if (!borders[2])
 438         width -= (8 >> chroma) + 2;
 439     if (!borders[3])
 440         height -= (4 >> chroma) + 2;
 441
 442     dst = dst + (init_y * stride + init_x);
 443     src = src + (init_y * stride + init_x);
 444     init_y = init_x = 0;
 445     if (sao_eo_class != SAO_EO_VERT) {
 446         if (borders[0]) {
 447             int offset_val = sao_offset_val[0];
 448             int y_stride   = 0;
 449             for (y = 0; y < height; y++) {
 450                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
 451                 y_stride     += stride;
 452             }
 453             init_x = 1;
 454         }
 455         if (borders[2]) {
 456             int offset_val = sao_offset_val[0];
 457             int x_stride   = width - 1;
 458             for (x = 0; x < height; x++) {
 459                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
 460                 x_stride     += stride;
 461             }
 462             width--;
 463         }
 464     }
 465     if (sao_eo_class != SAO_EO_HORIZ) {
 466         if (borders[1]) {
 467             int offset_val = sao_offset_val[0];
 468             for (x = init_x; x < width; x++)
 469                 dst[x] = av_clip_pixel(src[x] + offset_val);
 470             init_y = 1;
 471         }
 472         if (borders[3]) {
 473             int offset_val = sao_offset_val[0];
 474             int y_stride   = stride * (height - 1);
 475             for (x = init_x; x < width; x++)
 476                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
 477             height--;
 478         }
 479     }
 480     {
 481         int y_stride = init_y * stride;
 482         int pos_0_0  = pos[sao_eo_class][0][0];
 483         int pos_0_1  = pos[sao_eo_class][0][1];
 484         int pos_1_0  = pos[sao_eo_class][1][0];
 485         int pos_1_1  = pos[sao_eo_class][1][1];
 486
 487         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 488         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 489         for (y = init_y; y < height; y++) {
 490             for (x = init_x; x < width; x++) {
 491                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 492                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 493                 int offset_val    = edge_idx[2 + diff0 + diff1];
 494                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 495             }
 496             y_stride     += stride;
 497             y_stride_0_1 += stride;
 498             y_stride_1_1 += stride;
 499         }
 500     }
 501
 502     {
 503         // Restore pixels that can't be modified
 504         int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
 505         if (vert_edge && sao_eo_class != SAO_EO_VERT)
 506             for (y = init_y+save_upper_left; y< height; y++)
 507                 dst[y*stride] = src[y*stride];
 508         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 509             for(x = init_x+save_upper_left; x<width; x++)
 510                 dst[x] = src[x];
 511         if(diag_edge && sao_eo_class == SAO_EO_135D)
 512             dst[0] = src[0];
 513     }
 514
 515 #undef CMP
 516 }
 517
 518 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
 519                                     ptrdiff_t stride, SAOParams *sao,
 520                                     int *borders, int _width, int _height,
 521                                     int c_idx, uint8_t vert_edge,
 522                                     uint8_t horiz_edge, uint8_t diag_edge)
 523 {
 524     int x, y;
 525     pixel *dst = (pixel *)_dst;
 526     pixel *src = (pixel *)_src;
 527     int chroma = !!c_idx;
 528     int *sao_offset_val = sao->offset_val[c_idx];
 529     int sao_eo_class    = sao->eo_class[c_idx];
 530     int init_x = 0, init_y = 0, width = _width, height = _height;
 531
 532     static const int8_t pos[4][2][2] = {
 533         { { -1, 0  }, { 1,  0 } }, // horizontal
 534         { { 0,  -1 }, { 0,  1 } }, // vertical
 535         { { -1, -1 }, { 1,  1 } }, // 45 degree
 536         { { 1,  -1 }, { -1, 1 } }, // 135 degree
 537     };
 538     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 539
 540 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 541
 542     stride /= sizeof(pixel);
 543
 544     init_y = -(4 >> chroma) - 2;
 545     if (!borders[2])
 546         width -= (8 >> chroma) + 2;
 547     height = (4 >> chroma) + 2;
 548
 549     dst = dst + (init_y * stride + init_x);
 550     src = src + (init_y * stride + init_x);
 551     init_y = init_x = 0;
 552     if (sao_eo_class != SAO_EO_VERT) {
 553         if (borders[0]) {
 554             int offset_val = sao_offset_val[0];
 555             int y_stride   = 0;
 556             for (y = 0; y < height; y++) {
 557                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
 558                 y_stride     += stride;
 559             }
 560             init_x = 1;
 561         }
 562         if (borders[2]) {
 563             int offset_val = sao_offset_val[0];
 564             int x_stride   = width - 1;
 565             for (x = 0; x < height; x++) {
 566                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
 567                 x_stride     += stride;
 568             }
 569             width--;
 570         }
 571     }
 572     {
 573         int y_stride = init_y * stride;
 574         int pos_0_0  = pos[sao_eo_class][0][0];
 575         int pos_0_1  = pos[sao_eo_class][0][1];
 576         int pos_1_0  = pos[sao_eo_class][1][0];
 577         int pos_1_1  = pos[sao_eo_class][1][1];
 578
 579         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 580         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 581         for (y = init_y; y < height; y++) {
 582             for (x = init_x; x < width; x++) {
 583                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 584                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 585                 int offset_val    = edge_idx[2 + diff0 + diff1];
 586                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 587             }
 588             y_stride     += stride;
 589             y_stride_0_1 += stride;
 590             y_stride_1_1 += stride;
 591         }
 592     }
 593
 594     {
 595         // Restore pixels that can't be modified
 596         int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
 597         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 598             for(y = init_y; y< height-save_lower_left; y++)
 599                 dst[y*stride] = src[y*stride];
 600         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 601             for(x = init_x+save_lower_left; x<width; x++)
 602                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
 603         if(diag_edge && sao_eo_class == SAO_EO_45D)
 604             dst[stride*(height-1)] = src[stride*(height-1)];
 605     }
 606
 607 #undef CMP
 608 }
 609
 610 static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
 611                                     ptrdiff_t stride, SAOParams *sao,
 612                                     int *borders, int _width, int _height,
 613                                     int c_idx, uint8_t vert_edge,
 614                                     uint8_t horiz_edge, uint8_t diag_edge)
 615 {
 616     int x, y;
 617     pixel *dst = (pixel *)_dst;
 618     pixel *src = (pixel *)_src;
 619     int chroma = !!c_idx;
 620     int *sao_offset_val = sao->offset_val[c_idx];
 621     int sao_eo_class    = sao->eo_class[c_idx];
 622     int init_x = 0, init_y = 0, width = _width, height = _height;
 623
 624     static const int8_t pos[4][2][2] = {
 625         { { -1,  0 }, {  1, 0 } }, // horizontal
 626         { {  0, -1 }, {  0, 1 } }, // vertical
 627         { { -1, -1 }, {  1, 1 } }, // 45 degree
 628         { {  1, -1 }, { -1, 1 } }, // 135 degree
 629     };
 630     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 631
 632 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 633
 634     stride /= sizeof(pixel);
 635
 636     init_x = -(8 >> chroma) - 2;
 637     width  =  (8 >> chroma) + 2;
 638     if (!borders[3])
 639         height -= (4 >> chroma) + 2;
 640
 641     dst = dst + (init_y * stride + init_x);
 642     src = src + (init_y * stride + init_x);
 643     init_y = init_x = 0;
 644     if (sao_eo_class != SAO_EO_HORIZ) {
 645         if (borders[1]) {
 646             int offset_val = sao_offset_val[0];
 647             for (x = init_x; x < width; x++)
 648                 dst[x] = av_clip_pixel(src[x] + offset_val);
 649             init_y = 1;
 650         }
 651         if (borders[3]) {
 652             int offset_val = sao_offset_val[0];
 653             int y_stride   = stride * (height - 1);
 654             for (x = init_x; x < width; x++)
 655                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
 656             height--;
 657         }
 658     }
 659     {
 660         int y_stride = init_y * stride;
 661         int pos_0_0  = pos[sao_eo_class][0][0];
 662         int pos_0_1  = pos[sao_eo_class][0][1];
 663         int pos_1_0  = pos[sao_eo_class][1][0];
 664         int pos_1_1  = pos[sao_eo_class][1][1];
 665
 666         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 667         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 668         for (y = init_y; y < height; y++) {
 669             for (x = init_x; x < width; x++) {
 670                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 671                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 672                 int offset_val    = edge_idx[2 + diff0 + diff1];
 673                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 674             }
 675             y_stride     += stride;
 676             y_stride_0_1 += stride;
 677             y_stride_1_1 += stride;
 678         }
 679     }
 680
 681     {
 682         // Restore pixels that can't be modified
 683         int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
 684         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 685             for(y = init_y+save_upper_right; y< height; y++)
 686                 dst[y*stride+width-1] = src[y*stride+width-1];
 687         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 688             for(x = init_x; x<width-save_upper_right; x++)
 689                 dst[x] = src[x];
 690         if(diag_edge && sao_eo_class == SAO_EO_45D)
 691             dst[width-1] = src[width-1];
 692     }
 693 #undef CMP
 694 }
 695
 696 static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
 697                                     ptrdiff_t stride, SAOParams *sao,
 698                                     int *borders, int _width, int _height,
 699                                     int c_idx, uint8_t vert_edge,
 700                                     uint8_t horiz_edge, uint8_t diag_edge)
 701 {
 702     int x, y;
 703     pixel *dst = (pixel *)_dst;
 704     pixel *src = (pixel *)_src;
 705     int chroma = !!c_idx;
 706     int *sao_offset_val = sao->offset_val[c_idx];
 707     int sao_eo_class    = sao->eo_class[c_idx];
 708     int init_x = 0, init_y = 0, width = _width, height = _height;
 709
 710     static const int8_t pos[4][2][2] = {
 711         { { -1,  0 }, {  1, 0 } }, // horizontal
 712         { {  0, -1 }, {  0, 1 } }, // vertical
 713         { { -1, -1 }, {  1, 1 } }, // 45 degree
 714         { {  1, -1 }, { -1, 1 } }, // 135 degree
 715     };
 716     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
 717
 718 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
 719
 720     stride /= sizeof(pixel);
 721
 722     init_y = -(4 >> chroma) - 2;
 723     init_x = -(8 >> chroma) - 2;
 724     width  =  (8 >> chroma) + 2;
 725     height =  (4 >> chroma) + 2;
 726
 727
 728     dst    = dst + (init_y * stride + init_x);
 729     src    = src + (init_y * stride + init_x);
 730     init_y = init_x = 0;
 731
 732     {
 733         int y_stride = init_y * stride;
 734         int pos_0_0  = pos[sao_eo_class][0][0];
 735         int pos_0_1  = pos[sao_eo_class][0][1];
 736         int pos_1_0  = pos[sao_eo_class][1][0];
 737         int pos_1_1  = pos[sao_eo_class][1][1];
 738
 739         int y_stride_0_1 = (init_y + pos_0_1) * stride;
 740         int y_stride_1_1 = (init_y + pos_1_1) * stride;
 741
 742         for (y = init_y; y < height; y++) {
 743             for (x = init_x; x < width; x++) {
 744                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
 745                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
 746                 int offset_val    = edge_idx[2 + diff0 + diff1];
 747                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
 748             }
 749             y_stride     += stride;
 750             y_stride_0_1 += stride;
 751             y_stride_1_1 += stride;
 752         }
 753     }
 754
 755     {
 756         // Restore pixels that can't be modified
 757         int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
 758         if(vert_edge && sao_eo_class != SAO_EO_VERT)
 759             for(y = init_y; y< height-save_lower_right; y++)
 760                 dst[y*stride+width-1] = src[y*stride+width-1];
 761         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
 762             for(x = init_x; x<width-save_lower_right; x++)
 763                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
 764         if(diag_edge && sao_eo_class == SAO_EO_135D)
 765             dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
 766     }
 767 #undef CMP
 768 }
 769
 770 #undef SET
 771 #undef SCALE
 772 #undef ADD_AND_SCALE
 773 #undef TR_4
 774 #undef TR_8
 775 #undef TR_16
 776 #undef TR_32
 777
 778 static av_always_inline void
 779 FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
 780                            uint8_t *_src, ptrdiff_t _srcstride,
 781                            int width, int height, int mx, int my,
 782                            int16_t* mcbuffer)
 783 {
 784     int x, y;
 785     pixel *src          = (pixel *)_src;
 786     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 787
 788     dststride /= sizeof(*dst);
 789     for (y = 0; y < height; y++) {
 790         for (x = 0; x < width; x++)
 791             dst[x] = src[x] << (14 - BIT_DEPTH);
 792         src += srcstride;
 793         dst += dststride;
 794     }
 795 }
 796
 797 #define QPEL_FILTER_1(src, stride)      \
 798     (1 * -src[x - 3 * stride] +         \
 799      4 *  src[x - 2 * stride] -         \
 800     10 *  src[x -     stride] +         \
 801     58 *  src[x]              +         \
 802     17 *  src[x +     stride] -         \
 803      5 *  src[x + 2 * stride] +         \
 804      1 *  src[x + 3 * stride])
 805
 806 #define QPEL_FILTER_2(src, stride)      \
 807     (1  * -src[x - 3 * stride] +        \
 808      4  *  src[x - 2 * stride] -        \
 809     11  *  src[x -     stride] +        \
 810     40  *  src[x]              +        \
 811     40  *  src[x +     stride] -        \
 812     11  *  src[x + 2 * stride] +        \
 813      4  *  src[x + 3 * stride] -        \
 814      1  *  src[x + 4 * stride])
 815
 816 #define QPEL_FILTER_3(src, stride)      \
 817     (1  * src[x - 2 * stride] -         \
 818      5  * src[x -     stride] +         \
 819     17  * src[x]              +         \
 820     58  * src[x + stride]     -         \
 821     10  * src[x + 2 * stride] +         \
 822      4  * src[x + 3 * stride] -         \
 823      1  * src[x + 4 * stride])
 824
 825
 826 #define PUT_HEVC_QPEL_H(H)                                                     \
 827 static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
 828                                        uint8_t *_src, ptrdiff_t _srcstride,    \
 829                                        int width, int height,                  \
 830                                        int16_t* mcbuffer)                      \
 831 {                                                                              \
 832     int x, y;                                                                  \
 833     pixel *src = (pixel*)_src;                                                 \
 834     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 835                                                                                \
 836     dststride /= sizeof(*dst);                                                 \
 837     for (y = 0; y < height; y++) {                                             \
 838         for (x = 0; x < width; x++)                                            \
 839             dst[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
 840         src += srcstride;                                                      \
 841         dst += dststride;                                                      \
 842     }                                                                          \
 843 }
 844
 845 #define PUT_HEVC_QPEL_V(V)                                                     \
 846 static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride,     \
 847                                        uint8_t *_src, ptrdiff_t _srcstride,    \
 848                                        int width, int height,                  \
 849                                        int16_t* mcbuffer)                      \
 850 {                                                                              \
 851     int x, y;                                                                  \
 852     pixel *src = (pixel*)_src;                                                 \
 853     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 854                                                                                \
 855     dststride /= sizeof(*dst);                                                 \
 856     for (y = 0; y < height; y++)  {                                            \
 857         for (x = 0; x < width; x++)                                            \
 858             dst[x] = QPEL_FILTER_ ## V(src, srcstride) >> (BIT_DEPTH - 8);     \
 859         src += srcstride;                                                      \
 860         dst += dststride;                                                      \
 861     }                                                                          \
 862 }
 863
 864 #define PUT_HEVC_QPEL_HV(H, V)                                                 \
 865 static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
 866                                                  ptrdiff_t dststride,          \
 867                                                  uint8_t *_src,                \
 868                                                  ptrdiff_t _srcstride,         \
 869                                                  int width, int height,        \
 870                                                  int16_t* mcbuffer)            \
 871 {                                                                              \
 872     int x, y;                                                                  \
 873     pixel *src = (pixel*)_src;                                                 \
 874     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
 875                                                                                \
 876     int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
 877     int16_t *tmp = tmp_array;                                                  \
 878                                                                                \
 879     dststride /= sizeof(*dst);                                                 \
 880     src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
 881                                                                                \
 882     for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
 883         for (x = 0; x < width; x++)                                            \
 884             tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
 885         src += srcstride;                                                      \
 886         tmp += MAX_PB_SIZE;                                                    \
 887     }                                                                          \
 888                                                                                \
 889     tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
 890                                                                                \
 891     for (y = 0; y < height; y++) {                                             \
 892         for (x = 0; x < width; x++)                                            \
 893             dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
 894         tmp += MAX_PB_SIZE;                                                    \
 895         dst += dststride;                                                      \
 896     }                                                                          \
 897 }
 898
/* Instantiate the horizontal-only, vertical-only and combined luma
 * interpolation functions for every fractional position 1..3. */
PUT_HEVC_QPEL_H(1)
PUT_HEVC_QPEL_H(2)
PUT_HEVC_QPEL_H(3)
PUT_HEVC_QPEL_V(1)
PUT_HEVC_QPEL_V(2)
PUT_HEVC_QPEL_V(3)
PUT_HEVC_QPEL_HV(1, 1)
PUT_HEVC_QPEL_HV(1, 2)
PUT_HEVC_QPEL_HV(1, 3)
PUT_HEVC_QPEL_HV(2, 1)
PUT_HEVC_QPEL_HV(2, 2)
PUT_HEVC_QPEL_HV(2, 3)
PUT_HEVC_QPEL_HV(3, 1)
PUT_HEVC_QPEL_HV(3, 2)
PUT_HEVC_QPEL_HV(3, 3)
 914
 915 #define QPEL(W)                                                                             \
 916 static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
 917                                              uint8_t *src, ptrdiff_t srcstride,             \
 918                                              int height, int mx, int my,                    \
 919                                              int16_t *mcbuffer)                             \
 920 {                                                                                           \
 921     FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
 922                                mx, my, mcbuffer);                                           \
 923 }                                                                                           \
 924                                                                                             \
 925 static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
 926                                         uint8_t *src, ptrdiff_t srcstride,                  \
 927                                         int height, int mx, int my,                         \
 928                                         int16_t *mcbuffer)                                  \
 929 {                                                                                           \
 930     if (mx == 1)                                                                            \
 931         FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 932     else if (mx == 2)                                                                       \
 933         FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 934     else                                                                                    \
 935         FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 936 }                                                                                           \
 937                                                                                             \
 938 static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
 939                                              uint8_t *src, ptrdiff_t srcstride,             \
 940                                              int height, int mx, int my,                    \
 941                                              int16_t *mcbuffer)                             \
 942 {                                                                                           \
 943     if (my == 1)                                                                            \
 944         FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 945     else if (my == 2)                                                                       \
 946         FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 947     else                                                                                    \
 948         FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
 949 }                                                                                           \
 950                                                                                             \
 951 static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
 952                                              uint8_t *src, ptrdiff_t srcstride,             \
 953                                              int height, int mx, int my,                    \
 954                                              int16_t *mcbuffer)                             \
 955 {                                                                                           \
 956     if (my == 1) {                                                                          \
 957         if (mx == 1)                                                                        \
 958             FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 959         else if (mx == 2)                                                                   \
 960             FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 961         else                                                                                \
 962             FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 963     } else if (my == 2) {                                                                   \
 964         if (mx == 1)                                                                        \
 965             FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 966         else if (mx == 2)                                                                   \
 967             FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 968         else                                                                                \
 969             FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 970     } else {                                                                                \
 971         if (mx == 1)                                                                        \
 972             FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 973         else if (mx == 2)                                                                   \
 974             FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 975         else                                                                                \
 976             FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
 977     }                                                                                       \
 978 }
 979
 980 QPEL(64)
 981 QPEL(48)
 982 QPEL(32)
 983 QPEL(24)
 984 QPEL(16)
 985 QPEL(12)
 986 QPEL(8)
 987 QPEL(4)
 988
 989 static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
 990                                               uint8_t *_src, ptrdiff_t _srcstride,
 991                                               int width, int height, int mx, int my,
 992                                               int16_t* mcbuffer)
 993 {
 994     int x, y;
 995     pixel *src          = (pixel *)_src;
 996     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
 997
 998     dststride /= sizeof(*dst);
 999     for (y = 0; y < height; y++) {
1000         for (x = 0; x < width; x++)
1001             dst[x] = src[x] << (14 - BIT_DEPTH);
1002         src += srcstride;
1003         dst += dststride;
1004     }
1005 }
1006
/*
 * 4-tap interpolation kernel used by the epel functions.
 *
 * Evaluates filter_0..filter_3 against the samples at offsets
 * -stride, 0, +stride and +2*stride from index x of src. The result is not
 * normalised; callers shift it themselves.
 *
 * src and stride may be arbitrary expressions (both are parenthesised in the
 * expansion). x and filter_0..filter_3 are NOT parameters: variables with
 * exactly these names must be in scope where the macro is expanded.
 */
#define EPEL_FILTER(src, stride)                    \
    (filter_0 * (src)[x -     (stride)] +           \
     filter_1 * (src)[x]                +           \
     filter_2 * (src)[x +     (stride)] +           \
     filter_3 * (src)[x + 2 * (stride)])
1012
1013 static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
1014                                          uint8_t *_src, ptrdiff_t _srcstride,
1015                                          int width, int height, int mx, int my,
1016                                          int16_t* mcbuffer)
1017 {
1018     int x, y;
1019     pixel *src = (pixel *)_src;
1020     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
1021     const int8_t *filter = ff_hevc_epel_filters[mx - 1];
1022     int8_t filter_0 = filter[0];
1023     int8_t filter_1 = filter[1];
1024     int8_t filter_2 = filter[2];
1025     int8_t filter_3 = filter[3];
1026     dststride /= sizeof(*dst);
1027     for (y = 0; y < height; y++) {
1028         for (x = 0; x < width; x++)
1029             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1030         src += srcstride;
1031         dst += dststride;
1032     }
1033 }
1034
1035 static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
1036                                          uint8_t *_src, ptrdiff_t _srcstride,
1037                                          int width, int height, int mx, int my,
1038                                          int16_t* mcbuffer)
1039 {
1040     int x, y;
1041     pixel *src = (pixel *)_src;
1042     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1043     const int8_t *filter = ff_hevc_epel_filters[my - 1];
1044     int8_t filter_0 = filter[0];
1045     int8_t filter_1 = filter[1];
1046     int8_t filter_2 = filter[2];
1047     int8_t filter_3 = filter[3];
1048
1049     dststride /= sizeof(*dst);
1050     for (y = 0; y < height; y++) {
1051         for (x = 0; x < width; x++)
1052             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1053         src += srcstride;
1054         dst += dststride;
1055     }
1056 }
1057
1058 static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
1059                                           uint8_t *_src, ptrdiff_t _srcstride,
1060                                           int width, int height, int mx, int my,
1061                                           int16_t* mcbuffer)
1062 {
1063     int x, y;
1064     pixel *src = (pixel *)_src;
1065     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1066     const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
1067     const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
1068     int8_t filter_0 = filter_h[0];
1069     int8_t filter_1 = filter_h[1];
1070     int8_t filter_2 = filter_h[2];
1071     int8_t filter_3 = filter_h[3];
1072     int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
1073     int16_t *tmp = tmp_array;
1074
1075     dststride /= sizeof(*dst);
1076     src -= EPEL_EXTRA_BEFORE * srcstride;
1077
1078     for (y = 0; y < height + EPEL_EXTRA; y++) {
1079         for (x = 0; x < width; x++)
1080             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1081         src += srcstride;
1082         tmp += MAX_PB_SIZE;
1083     }
1084
1085     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1086     filter_0 = filter_v[0];
1087     filter_1 = filter_v[1];
1088     filter_2 = filter_v[2];
1089     filter_3 = filter_v[3];
1090     for (y = 0; y < height; y++) {
1091         for (x = 0; x < width; x++)
1092             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1093         tmp += MAX_PB_SIZE;
1094         dst += dststride;
1095     }
1096 }
1097
/*
 * Generates the fixed-width epel entry points for block width W.
 * Each generated function (put_hevc_epel_pixels_<W>, _h_<W>, _v_<W>,
 * _hv_<W>) simply forwards all of its arguments to the generic function of
 * the same kind, inserting W as the width argument.
 */
#define EPEL(W)                                                                 \
static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
                                             uint8_t *src, ptrdiff_t srcstride, \
                                             int height, int mx, int my,        \
                                             int16_t *mcbuffer)                 \
{                                                                               \
    FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
                               W, height, mx, my, mcbuffer);                    \
}                                                                               \
static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
                                        uint8_t *src, ptrdiff_t srcstride,      \
                                        int height, int mx, int my,             \
                                        int16_t *mcbuffer)                      \
{                                                                               \
    FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
                          W, height, mx, my, mcbuffer);                         \
}                                                                               \
static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
                                        uint8_t *src, ptrdiff_t srcstride,      \
                                        int height, int mx, int my,             \
                                        int16_t *mcbuffer)                      \
{                                                                               \
    FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
                          W, height, mx, my, mcbuffer);                         \
}                                                                               \
static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
                                         uint8_t *src, ptrdiff_t srcstride,     \
                                         int height, int mx, int my,            \
                                         int16_t *mcbuffer)                     \
{                                                                               \
    FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
                           W, height, mx, my, mcbuffer);                        \
}

/* One set of entry points per supported block width. */
EPEL(32)
EPEL(24)
EPEL(16)
EPEL(12)
EPEL(8)
EPEL(6)
EPEL(4)
EPEL(2)
1140
1141 static av_always_inline void
1142 FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
1143                           int16_t *src, ptrdiff_t srcstride,
1144                           int width, int height)
1145 {
1146     int x, y;
1147     pixel *dst          = (pixel *)_dst;
1148     ptrdiff_t dststride = _dststride / sizeof(pixel);
1149
1150     int shift = 14 - BIT_DEPTH;
1151 #if BIT_DEPTH < 14
1152     int offset = 1 << (shift - 1);
1153 #else
1154     int offset = 0;
1155 #endif
1156     srcstride /= sizeof(*src);
1157     for (y = 0; y < height; y++) {
1158         for (x = 0; x < width; x++)
1159             dst[x] = av_clip_pixel((src[x] + offset) >> shift);
1160         dst += dststride;
1161         src += srcstride;
1162     }
1163 }
1164
1165 static av_always_inline void
1166 FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
1167                               int16_t *src1, int16_t *src2,
1168                               ptrdiff_t srcstride,
1169                               int width, int height)
1170 {
1171     int x, y;
1172     pixel *dst          = (pixel *)_dst;
1173     ptrdiff_t dststride = _dststride / sizeof(pixel);
1174
1175     int shift = 14 + 1 - BIT_DEPTH;
1176 #if BIT_DEPTH < 14
1177     int offset = 1 << (shift - 1);
1178 #else
1179     int offset = 0;
1180 #endif
1181
1182     srcstride /= sizeof(*src1);
1183     for (y = 0; y < height; y++) {
1184         for (x = 0; x < width; x++)
1185             dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
1186         dst  += dststride;
1187         src1 += srcstride;
1188         src2 += srcstride;
1189     }
1190 }
1191
1192 static av_always_inline void
1193 FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
1194                     uint8_t *_dst, ptrdiff_t _dststride,
1195                     int16_t *src, ptrdiff_t srcstride,
1196                     int width, int height)
1197 {
1198     int shift, log2Wd, wx, ox, x, y, offset;
1199     pixel *dst          = (pixel *)_dst;
1200     ptrdiff_t dststride = _dststride / sizeof(pixel);
1201
1202     shift  = 14 - BIT_DEPTH;
1203     log2Wd = denom + shift;
1204     offset = 1 << (log2Wd - 1);
1205     wx     = wlxFlag;
1206     ox     = olxFlag * (1 << (BIT_DEPTH - 8));
1207
1208     srcstride /= sizeof(*src);
1209     for (y = 0; y < height; y++) {
1210         for (x = 0; x < width; x++) {
1211             if (log2Wd >= 1) {
1212                 dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
1213             } else {
1214                 dst[x] = av_clip_pixel(src[x] * wx + ox);
1215             }
1216         }
1217         dst += dststride;
1218         src += srcstride;
1219     }
1220 }
1221
1222 static av_always_inline void
1223 FUNC(weighted_pred_avg)(uint8_t denom,
1224                         int16_t wl0Flag, int16_t wl1Flag,
1225                         int16_t ol0Flag, int16_t ol1Flag,
1226                         uint8_t *_dst, ptrdiff_t _dststride,
1227                         int16_t *src1, int16_t *src2,
1228                         ptrdiff_t srcstride,
1229                         int width, int height)
1230 {
1231     int shift, log2Wd, w0, w1, o0, o1, x, y;
1232     pixel *dst = (pixel *)_dst;
1233     ptrdiff_t dststride = _dststride / sizeof(pixel);
1234
1235     shift  = 14 - BIT_DEPTH;
1236     log2Wd = denom + shift;
1237     w0     = wl0Flag;
1238     w1     = wl1Flag;
1239     o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
1240     o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
1241
1242     srcstride /= sizeof(*src1);
1243     for (y = 0; y < height; y++) {
1244         for (x = 0; x < width; x++)
1245             dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
1246                                     ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
1247         dst  += dststride;
1248         src1 += srcstride;
1249         src2 += srcstride;
1250     }
1251 }
1252
/*
 * Generates the fixed-width prediction output functions for block width w.
 * Each generated function forwards its arguments unchanged to the generic
 * implementation, inserting w as the width argument:
 *   put_unweighted_pred_<w>      -> put_unweighted_pred
 *   put_unweighted_pred_avg_<w>  -> put_unweighted_pred_avg
 *   put_weighted_pred_<w>        -> weighted_pred
 *   put_weighted_pred_avg_<w>    -> weighted_pred_avg
 */
#define PUT_PRED(w)                                                                            \
static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
                                            int16_t *src, ptrdiff_t srcstride,                 \
                                            int height)                                        \
{                                                                                              \
    FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
}                                                                                              \
static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
                                                int16_t *src1, int16_t *src2,                  \
                                                ptrdiff_t srcstride, int height)               \
{                                                                                              \
    FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
}                                                                                              \
static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
                                          uint8_t *dst, ptrdiff_t dststride,                   \
                                          int16_t *src, ptrdiff_t srcstride, int height)       \
{                                                                                              \
    FUNC(weighted_pred)(denom, weight, offset,                                                 \
                        dst, dststride, src, srcstride, w, height);                            \
}                                                                                              \
static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
                                              int16_t offset0, int16_t offset1,                \
                                              uint8_t *dst, ptrdiff_t dststride,               \
                                              int16_t *src1, int16_t *src2,                    \
                                              ptrdiff_t srcstride, int height)                 \
{                                                                                              \
    FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
                            dst, dststride, src1, src2, srcstride, w, height);                 \
}

/* One set of output functions per supported block width. */
PUT_PRED(64)
PUT_PRED(48)
PUT_PRED(32)
PUT_PRED(24)
PUT_PRED(16)
PUT_PRED(12)
PUT_PRED(8)
PUT_PRED(6)
PUT_PRED(4)
PUT_PRED(2)
1293
/*
 * Sample accessors for the deblocking code. They expand to lvalues built
 * from variables named pix, xstride and ystride, which must be in scope at
 * the point of use. Q0 is pix[0]; Q1..Q3 lie at positive multiples of
 * xstride and P0..P3 at negative multiples (P0 being the nearest).
 */
// line zero
#define P3 pix[-4 * xstride]
#define P2 pix[-3 * xstride]
#define P1 pix[-2 * xstride]
#define P0 pix[-1 * xstride]
#define Q0 pix[0 * xstride]
#define Q1 pix[1 * xstride]
#define Q2 pix[2 * xstride]
#define Q3 pix[3 * xstride]

// line three. used only for deblocking decision
// (same positions as above, offset by 3 * ystride)
#define TP3 pix[-4 * xstride + 3 * ystride]
#define TP2 pix[-3 * xstride + 3 * ystride]
#define TP1 pix[-2 * xstride + 3 * ystride]
#define TP0 pix[-1 * xstride + 3 * ystride]
#define TQ0 pix[0  * xstride + 3 * ystride]
#define TQ1 pix[1  * xstride + 3 * ystride]
#define TQ2 pix[2  * xstride + 3 * ystride]
#define TQ3 pix[3  * xstride + 3 * ystride]
1313
1314 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1315                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1316                                         int beta, int *_tc,
1317                                         uint8_t *_no_p, uint8_t *_no_q)
1318 {
1319     int d, j;
1320     pixel *pix        = (pixel *)_pix;
1321     ptrdiff_t xstride = _xstride / sizeof(pixel);
1322     ptrdiff_t ystride = _ystride / sizeof(pixel);
1323
1324     beta <<= BIT_DEPTH - 8;
1325
1326     for (j = 0; j < 2; j++) {
1327         const int dp0  = abs(P2  - 2 * P1  + P0);
1328         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1329         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1330         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1331         const int d0   = dp0 + dq0;
1332         const int d3   = dp3 + dq3;
1333         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1334         const int no_p = _no_p[j];
1335         const int no_q = _no_q[j];
1336
1337         if (d0 + d3 >= beta) {
1338             pix += 4 * ystride;
1339             continue;
1340         } else {
1341             const int beta_3 = beta >> 3;
1342             const int beta_2 = beta >> 2;
1343             const int tc25   = ((tc * 5 + 1) >> 1);
1344
1345             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1346                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1347                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1348                 // strong filtering
1349                 const int tc2 = tc << 1;
1350                 for (d = 0; d < 4; d++) {
1351                     const int p3 = P3;
1352                     const int p2 = P2;
1353                     const int p1 = P1;
1354                     const int p0 = P0;
1355                     const int q0 = Q0;
1356                     const int q1 = Q1;
1357                     const int q2 = Q2;
1358                     const int q3 = Q3;
1359                     if (!no_p) {
1360                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1361                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1362                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1363                     }
1364                     if (!no_q) {
1365                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1366                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1367                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1368                     }
1369                     pix += ystride;
1370                 }
1371             } else { // normal filtering
1372                 int nd_p = 1;
1373                 int nd_q = 1;
1374                 const int tc_2 = tc >> 1;
1375                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1376                     nd_p = 2;
1377                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1378                     nd_q = 2;
1379
1380                 for (d = 0; d < 4; d++) {
1381                     const int p2 = P2;
1382                     const int p1 = P1;
1383                     const int p0 = P0;
1384                     const int q0 = Q0;
1385                     const int q1 = Q1;
1386                     const int q2 = Q2;
1387                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1388                     if (abs(delta0) < 10 * tc) {
1389                         delta0 = av_clip(delta0, -tc, tc);
1390                         if (!no_p)
1391                             P0 = av_clip_pixel(p0 + delta0);
1392                         if (!no_q)
1393                             Q0 = av_clip_pixel(q0 - delta0);
1394                         if (!no_p && nd_p > 1) {
1395                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1396                             P1 = av_clip_pixel(p1 + deltap1);
1397                         }
1398                         if (!no_q && nd_q > 1) {
1399                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1400                             Q1 = av_clip_pixel(q1 + deltaq1);
1401                         }
1402                     }
1403                     pix += ystride;
1404                 }
1405             }
1406         }
1407     }
1408 }
1409
1410 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1411                                           ptrdiff_t _ystride, int *_tc,
1412                                           uint8_t *_no_p, uint8_t *_no_q)
1413 {
1414     int d, j, no_p, no_q;
1415     pixel *pix        = (pixel *)_pix;
1416     ptrdiff_t xstride = _xstride / sizeof(pixel);
1417     ptrdiff_t ystride = _ystride / sizeof(pixel);
1418
1419     for (j = 0; j < 2; j++) {
1420         const int tc = _tc[j] << (BIT_DEPTH - 8);
1421         if (tc <= 0) {
1422             pix += 4 * ystride;
1423             continue;
1424         }
1425         no_p = _no_p[j];
1426         no_q = _no_q[j];
1427
1428         for (d = 0; d < 4; d++) {
1429             int delta0;
1430             const int p1 = P1;
1431             const int p0 = P0;
1432             const int q0 = Q0;
1433             const int q1 = Q1;
1434             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1435             if (!no_p)
1436                 P0 = av_clip_pixel(p0 + delta0);
1437             if (!no_q)
1438                 Q0 = av_clip_pixel(q0 - delta0);
1439             pix += ystride;
1440         }
1441     }
1442 }
1443
1444 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1445                                             int *tc, uint8_t *no_p,
1446                                             uint8_t *no_q)
1447 {
1448     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1449 }
1450
1451 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1452                                             int *tc, uint8_t *no_p,
1453                                             uint8_t *no_q)
1454 {
1455     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1456 }
1457
1458 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1459                                           int beta, int *tc, uint8_t *no_p,
1460                                           uint8_t *no_q)
1461 {
1462     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1463                                 beta, tc, no_p, no_q);
1464 }
1465
1466 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1467                                           int beta, int *tc, uint8_t *no_p,
1468                                           uint8_t *no_q)
1469 {
1470     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1471                                 beta, tc, no_p, no_q);
1472 }
1473
/* Remove the P3..Q3 sample accessor macros used by the deblocking filters
 * above so the short names do not remain defined past this point. */
1474 #undef P3
1475 #undef P2
1476 #undef P1
1477 #undef P0
1478 #undef Q0
1479 #undef Q1
1480 #undef Q2
1481 #undef Q3
1482
/* Likewise for the TP3..TQ3 accessors, which the luma filter reads alongside
 * the P and Q taps when choosing between strong and normal filtering. */
1483 #undef TP3
1484 #undef TP2
1485 #undef TP1
1486 #undef TP0
1487 #undef TQ0
1488 #undef TQ1
1489 #undef TQ2
1490 #undef TQ3