]> git.sesse.net Git - ffmpeg/blob - libavcodec/hevcdsp_template.c
hevc: Separate adding residual to prediction from IDCT
[ffmpeg] / libavcodec / hevcdsp_template.c
1 /*
2  * HEVC video decoder
3  *
4  * Copyright (C) 2012 - 2013 Guillaume Martres
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "get_bits.h"
24 #include "hevc.h"
25
26 #include "bit_depth_template.c"
27
28 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int size,
29                           GetBitContext *gb, int pcm_bit_depth)
30 {
31     int x, y;
32     pixel *dst = (pixel *)_dst;
33
34     stride /= sizeof(pixel);
35
36     for (y = 0; y < size; y++) {
37         for (x = 0; x < size; x++)
38             dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
39         dst += stride;
40     }
41 }
42
43 static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
44                                                 ptrdiff_t stride, int size)
45 {
46     int x, y;
47     pixel *dst = (pixel *)_dst;
48
49     stride /= sizeof(pixel);
50
51     for (y = 0; y < size; y++) {
52         for (x = 0; x < size; x++) {
53             dst[x] = av_clip_pixel(dst[x] + *res);
54             res++;
55         }
56         dst += stride;
57     }
58 }
59
60 static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
61                                   ptrdiff_t stride)
62 {
63     FUNC(add_residual)(_dst, res, stride, 4);
64 }
65
66 static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
67                                   ptrdiff_t stride)
68 {
69     FUNC(add_residual)(_dst, res, stride, 8);
70 }
71
72 static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
73                                     ptrdiff_t stride)
74 {
75     FUNC(add_residual)(_dst, res, stride, 16);
76 }
77
78 static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
79                                     ptrdiff_t stride)
80 {
81     FUNC(add_residual)(_dst, res, stride, 32);
82 }
83
84 static void FUNC(dequant)(int16_t *coeffs)
85 {
86     int shift  = 13 - BIT_DEPTH;
87 #if BIT_DEPTH <= 13
88     int offset = 1 << (shift - 1);
89 #else
90     int offset = 0;
91 #endif
92     int x, y;
93
94     for (y = 0; y < 4 * 4; y += 4) {
95         for (x = 0; x < 4; x++)
96             coeffs[y + x] = (coeffs[y + x] + offset) >> shift;
97     }
98 }
99
/*
 * Store helpers passed as the "assign" argument of the TR_* macros.
 *
 * SET   stores the value unchanged.
 * SCALE adds the rounding term "add", shifts right by "shift" and clips the
 *       result with av_clip_int16(); "add" and "shift" are NOT parameters,
 *       they are taken from the scope in which the macro is expanded.
 *
 * Both expansions are fully parenthesised so that they behave as a single
 * expression wherever they are used.
 */
#define SET(dst, x)   ((dst) = (x))
#define SCALE(dst, x) ((dst) = av_clip_int16(((x) + add) >> shift))
102
/*
 * One 4-point pass of the alternative 4x4 luma transform (the one built from
 * the constants 29, 55 and 74).
 *
 * Reads src[0], src[step], src[2*step], src[3*step] and stores the four
 * results through assign() at the same offsets of dst.
 *
 * All four inputs are copied into locals before anything is stored, so dst
 * and src may be the same array regardless of the order of the stores.
 * Every macro parameter is parenthesised so that expressions such as
 * "base + n" or "2 * s" can be passed safely.
 *
 * assign must be a macro taking (lvalue, value), e.g. SET or SCALE.
 */
#define TR_4x4_LUMA(dst, src, step, assign)                             \
    do {                                                                \
        const int tl_s0 = (src)[0 * (step)];                            \
        const int tl_s1 = (src)[1 * (step)];                            \
        const int tl_s2 = (src)[2 * (step)];                            \
        const int tl_s3 = (src)[3 * (step)];                            \
        const int tl_c0 = tl_s0 + tl_s2;                                \
        const int tl_c1 = tl_s2 + tl_s3;                                \
        const int tl_c2 = tl_s0 - tl_s3;                                \
        const int tl_c3 = 74 * tl_s1;                                   \
                                                                        \
        assign((dst)[0 * (step)], 29 * tl_c0 + 55 * tl_c1 + tl_c3);     \
        assign((dst)[1 * (step)], 55 * tl_c2 - 29 * tl_c1 + tl_c3);     \
        assign((dst)[2 * (step)], 74 * (tl_s0 - tl_s2 + tl_s3));        \
        assign((dst)[3 * (step)], 55 * tl_c0 + 29 * tl_c2 - tl_c3);     \
    } while (0)
117
118 static void FUNC(transform_4x4_luma)(int16_t *coeffs)
119 {
120     int i;
121     int shift    = 7;
122     int add      = 1 << (shift - 1);
123     int16_t *src = coeffs;
124
125     for (i = 0; i < 4; i++) {
126         TR_4x4_LUMA(src, src, 4, SCALE);
127         src++;
128     }
129
130     shift = 20 - BIT_DEPTH;
131     add   = 1 << (shift - 1);
132     for (i = 0; i < 4; i++) {
133         TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
134         coeffs += 4;
135     }
136 }
137
138 #undef TR_4x4_LUMA
139
/*
 * 4-point pass of the inverse transform, written as an even/odd butterfly.
 *
 * Inputs are src[0], src[sstep], src[2*sstep], src[3*sstep]; the four outputs
 * are stored through assign() at dst[0], dst[dstep], dst[2*dstep],
 * dst[3*dstep]. The weights come from rows 0, 8, 16 and 24 (columns 0 and 1)
 * of the "transform" table, which must be visible where the macro is
 * expanded.
 *
 * All inputs are read before the first store, so dst may alias src.
 * Every macro parameter is parenthesised so that compound expressions
 * (e.g. "2 * s" or "a + b") can be passed as strides or pointers.
 *
 * assign must be a macro taking (lvalue, value), e.g. SET or SCALE.
 */
#define TR_4(dst, src, dstep, sstep, assign)                            \
    do {                                                                \
        const int tr4_s0 = (src)[0 * (sstep)];                          \
        const int tr4_s1 = (src)[1 * (sstep)];                          \
        const int tr4_s2 = (src)[2 * (sstep)];                          \
        const int tr4_s3 = (src)[3 * (sstep)];                          \
        /* even part: inputs 0 and 2 */                                 \
        const int tr4_e0 = transform[8 * 0][0] * tr4_s0 +               \
                           transform[8 * 2][0] * tr4_s2;                \
        const int tr4_e1 = transform[8 * 0][1] * tr4_s0 +               \
                           transform[8 * 2][1] * tr4_s2;                \
        /* odd part: inputs 1 and 3 */                                  \
        const int tr4_o0 = transform[8 * 1][0] * tr4_s1 +               \
                           transform[8 * 3][0] * tr4_s3;                \
        const int tr4_o1 = transform[8 * 1][1] * tr4_s1 +               \
                           transform[8 * 3][1] * tr4_s3;                \
                                                                        \
        assign((dst)[0 * (dstep)], tr4_e0 + tr4_o0);                    \
        assign((dst)[1 * (dstep)], tr4_e1 + tr4_o1);                    \
        assign((dst)[2 * (dstep)], tr4_e1 - tr4_o1);                    \
        assign((dst)[3 * (dstep)], tr4_e0 - tr4_o0);                    \
    } while (0)
156
157 static void FUNC(idct_4x4)(int16_t *coeffs)
158 {
159     int i;
160     int shift    = 7;
161     int add      = 1 << (shift - 1);
162     int16_t *src = coeffs;
163
164     for (i = 0; i < 4; i++) {
165         TR_4(src, src, 4, 4, SCALE);
166         src++;
167     }
168
169     shift = 20 - BIT_DEPTH;
170     add   = 1 << (shift - 1);
171     for (i = 0; i < 4; i++) {
172         TR_4(coeffs, coeffs, 1, 1, SCALE);
173         coeffs += 4;
174     }
175 }
176
/*
 * 8-point pass of the inverse transform.
 *
 * The odd-indexed inputs (1, 3, 5, 7) are weighted with rows 4*j of the
 * "transform" table to form four odd terms; the even-indexed inputs are
 * handed to TR_4 (with twice the source step) to form four even terms.
 * Output k is even[k] + odd[k] and output 7-k is even[k] - odd[k], stored
 * through assign() at dst[k * dstep].
 *
 * All inputs are consumed before the first store, so dst may alias src.
 * Parameters are parenthesised and the locals carry a tr8_ prefix so that
 * they cannot capture identifiers used in the caller's arguments.
 */
#define TR_8(dst, src, dstep, sstep, assign)                              \
    do {                                                                  \
        int tr8_even[4];                                                  \
        int tr8_odd[4] = { 0 };                                           \
        int tr8_k, tr8_j;                                                 \
                                                                          \
        for (tr8_k = 0; tr8_k < 4; tr8_k++)                               \
            for (tr8_j = 1; tr8_j < 8; tr8_j += 2)                        \
                tr8_odd[tr8_k] += transform[4 * tr8_j][tr8_k] *           \
                                  (src)[tr8_j * (sstep)];                 \
        TR_4(tr8_even, src, 1, 2 * (sstep), SET);                         \
                                                                          \
        for (tr8_k = 0; tr8_k < 4; tr8_k++) {                             \
            assign((dst)[tr8_k * (dstep)],                                \
                   tr8_even[tr8_k] + tr8_odd[tr8_k]);                     \
            assign((dst)[(7 - tr8_k) * (dstep)],                          \
                   tr8_even[tr8_k] - tr8_odd[tr8_k]);                     \
        }                                                                 \
    } while (0)
192
/*
 * 16-point pass of the inverse transform.
 *
 * The odd-indexed inputs (1, 3, ..., 15) are weighted with rows 2*j of the
 * "transform" table to form eight odd terms; the even-indexed inputs are
 * handed to TR_8 (with twice the source step) to form eight even terms.
 * Output k is even[k] + odd[k] and output 15-k is even[k] - odd[k], stored
 * through assign() at dst[k * dstep].
 *
 * All inputs are consumed before the first store, so dst may alias src.
 * Parameters are parenthesised and the locals carry a tr16_ prefix so that
 * they cannot capture identifiers used in the caller's arguments.
 */
#define TR_16(dst, src, dstep, sstep, assign)                             \
    do {                                                                  \
        int tr16_even[8];                                                 \
        int tr16_odd[8] = { 0 };                                          \
        int tr16_k, tr16_j;                                               \
                                                                          \
        for (tr16_k = 0; tr16_k < 8; tr16_k++)                            \
            for (tr16_j = 1; tr16_j < 16; tr16_j += 2)                    \
                tr16_odd[tr16_k] += transform[2 * tr16_j][tr16_k] *       \
                                    (src)[tr16_j * (sstep)];              \
        TR_8(tr16_even, src, 1, 2 * (sstep), SET);                        \
                                                                          \
        for (tr16_k = 0; tr16_k < 8; tr16_k++) {                          \
            assign((dst)[tr16_k * (dstep)],                               \
                   tr16_even[tr16_k] + tr16_odd[tr16_k]);                 \
            assign((dst)[(15 - tr16_k) * (dstep)],                        \
                   tr16_even[tr16_k] - tr16_odd[tr16_k]);                 \
        }                                                                 \
    } while (0)
208
/*
 * 32-point pass of the inverse transform.
 *
 * The odd-indexed inputs (1, 3, ..., 31) are weighted with rows j of the
 * "transform" table to form sixteen odd terms; the even-indexed inputs are
 * handed to TR_16 (with twice the source step) to form sixteen even terms.
 * Output k is even[k] + odd[k] and output 31-k is even[k] - odd[k], stored
 * through assign() at dst[k * dstep].
 *
 * All inputs are consumed before the first store, so dst may alias src.
 * Parameters are parenthesised and the locals carry a tr32_ prefix so that
 * they cannot capture identifiers used in the caller's arguments.
 */
#define TR_32(dst, src, dstep, sstep, assign)                             \
    do {                                                                  \
        int tr32_even[16];                                                \
        int tr32_odd[16] = { 0 };                                         \
        int tr32_k, tr32_j;                                               \
                                                                          \
        for (tr32_k = 0; tr32_k < 16; tr32_k++)                           \
            for (tr32_j = 1; tr32_j < 32; tr32_j += 2)                    \
                tr32_odd[tr32_k] += transform[tr32_j][tr32_k] *           \
                                    (src)[tr32_j * (sstep)];              \
        TR_16(tr32_even, src, 1, 2 * (sstep), SET);                       \
                                                                          \
        for (tr32_k = 0; tr32_k < 16; tr32_k++) {                         \
            assign((dst)[tr32_k * (dstep)],                               \
                   tr32_even[tr32_k] + tr32_odd[tr32_k]);                 \
            assign((dst)[(31 - tr32_k) * (dstep)],                        \
                   tr32_even[tr32_k] - tr32_odd[tr32_k]);                 \
        }                                                                 \
    } while (0)
224
225
226
227 static void FUNC(idct_8x8)(int16_t *coeffs)
228 {
229     int i;
230     int shift    = 7;
231     int add      = 1 << (shift - 1);
232     int16_t *src = coeffs;
233
234     for (i = 0; i < 8; i++) {
235         TR_8(src, src, 8, 8, SCALE);
236         src++;
237     }
238
239     shift = 20 - BIT_DEPTH;
240     add   = 1 << (shift - 1);
241     for (i = 0; i < 8; i++) {
242         TR_8(coeffs, coeffs, 1, 1, SCALE);
243         coeffs += 8;
244     }
245 }
246
247 static void FUNC(idct_16x16)(int16_t *coeffs)
248 {
249     int i;
250     int shift    = 7;
251     int add      = 1 << (shift - 1);
252     int16_t *src = coeffs;
253
254     for (i = 0; i < 16; i++) {
255         TR_16(src, src, 16, 16, SCALE);
256         src++;
257     }
258
259     shift = 20 - BIT_DEPTH;
260     add   = 1 << (shift - 1);
261     for (i = 0; i < 16; i++) {
262         TR_16(coeffs, coeffs, 1, 1, SCALE);
263         coeffs += 16;
264     }
265 }
266
267 static void FUNC(idct_32x32)(int16_t *coeffs)
268 {
269     int i;
270     int shift    = 7;
271     int add      = 1 << (shift - 1);
272     int16_t *src = coeffs;
273
274     for (i = 0; i < 32; i++) {
275         TR_32(src, src, 32, 32, SCALE);
276         src++;
277     }
278     src   = coeffs;
279     shift = 20 - BIT_DEPTH;
280     add   = 1 << (shift - 1);
281     for (i = 0; i < 32; i++) {
282         TR_32(coeffs, coeffs, 1, 1, SCALE);
283         coeffs += 32;
284     }
285 }
286
287 static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
288                                   ptrdiff_t stride, SAOParams *sao,
289                                   int *borders, int width, int height,
290                                   int c_idx, int class)
291 {
292     pixel *dst = (pixel *)_dst;
293     pixel *src = (pixel *)_src;
294     int offset_table[32] = { 0 };
295     int k, y, x;
296     int chroma = !!c_idx;
297     int shift  = BIT_DEPTH - 5;
298     int *sao_offset_val = sao->offset_val[c_idx];
299     int sao_left_class  = sao->band_position[c_idx];
300     int init_y = 0, init_x = 0;
301
302     stride /= sizeof(pixel);
303
304     switch (class) {
305     case 0:
306         if (!borders[2])
307             width -= (8 >> chroma) + 2;
308         if (!borders[3])
309             height -= (4 >> chroma) + 2;
310         break;
311     case 1:
312         init_y = -(4 >> chroma) - 2;
313         if (!borders[2])
314             width -= (8 >> chroma) + 2;
315         height = (4 >> chroma) + 2;
316         break;
317     case 2:
318         init_x = -(8 >> chroma) - 2;
319         width  =  (8 >> chroma) + 2;
320         if (!borders[3])
321             height -= (4 >> chroma) + 2;
322         break;
323     case 3:
324         init_y = -(4 >> chroma) - 2;
325         init_x = -(8 >> chroma) - 2;
326         width  =  (8 >> chroma) + 2;
327         height =  (4 >> chroma) + 2;
328         break;
329     }
330
331     dst = dst + (init_y * stride + init_x);
332     src = src + (init_y * stride + init_x);
333     for (k = 0; k < 4; k++)
334         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
335     for (y = 0; y < height; y++) {
336         for (x = 0; x < width; x++)
337             dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
338         dst += stride;
339         src += stride;
340     }
341 }
342
343 static void FUNC(sao_band_filter_0)(uint8_t *dst, uint8_t *src,
344                                     ptrdiff_t stride, SAOParams *sao,
345                                     int *borders, int width, int height,
346                                     int c_idx)
347 {
348     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
349                           width, height, c_idx, 0);
350 }
351
352 static void FUNC(sao_band_filter_1)(uint8_t *dst, uint8_t *src,
353                                     ptrdiff_t stride, SAOParams *sao,
354                                     int *borders, int width, int height,
355                                     int c_idx)
356 {
357     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
358                           width, height, c_idx, 1);
359 }
360
361 static void FUNC(sao_band_filter_2)(uint8_t *dst, uint8_t *src,
362                                     ptrdiff_t stride, SAOParams *sao,
363                                     int *borders, int width, int height,
364                                     int c_idx)
365 {
366     FUNC(sao_band_filter)(dst, src, stride, sao, borders,
367                           width, height, c_idx, 2);
368 }
369
370 static void FUNC(sao_band_filter_3)(uint8_t *_dst, uint8_t *_src,
371                                     ptrdiff_t stride, SAOParams *sao,
372                                     int *borders, int width, int height,
373                                     int c_idx)
374 {
375     FUNC(sao_band_filter)(_dst, _src, stride, sao, borders,
376                           width, height, c_idx, 3);
377 }
378
379 static void FUNC(sao_edge_filter_0)(uint8_t *_dst, uint8_t *_src,
380                                     ptrdiff_t stride, SAOParams *sao,
381                                     int *borders, int _width, int _height,
382                                     int c_idx, uint8_t vert_edge,
383                                     uint8_t horiz_edge, uint8_t diag_edge)
384 {
385     int x, y;
386     pixel *dst = (pixel *)_dst;
387     pixel *src = (pixel *)_src;
388     int chroma = !!c_idx;
389     int *sao_offset_val = sao->offset_val[c_idx];
390     int sao_eo_class    = sao->eo_class[c_idx];
391     int init_x = 0, init_y = 0, width = _width, height = _height;
392
393     static const int8_t pos[4][2][2] = {
394         { { -1,  0 }, {  1, 0 } }, // horizontal
395         { {  0, -1 }, {  0, 1 } }, // vertical
396         { { -1, -1 }, {  1, 1 } }, // 45 degree
397         { {  1, -1 }, { -1, 1 } }, // 135 degree
398     };
399     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
400
401 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
402
403     stride /= sizeof(pixel);
404
405     if (!borders[2])
406         width -= (8 >> chroma) + 2;
407     if (!borders[3])
408         height -= (4 >> chroma) + 2;
409
410     dst = dst + (init_y * stride + init_x);
411     src = src + (init_y * stride + init_x);
412     init_y = init_x = 0;
413     if (sao_eo_class != SAO_EO_VERT) {
414         if (borders[0]) {
415             int offset_val = sao_offset_val[0];
416             int y_stride   = 0;
417             for (y = 0; y < height; y++) {
418                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
419                 y_stride     += stride;
420             }
421             init_x = 1;
422         }
423         if (borders[2]) {
424             int offset_val = sao_offset_val[0];
425             int x_stride   = width - 1;
426             for (x = 0; x < height; x++) {
427                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
428                 x_stride     += stride;
429             }
430             width--;
431         }
432     }
433     if (sao_eo_class != SAO_EO_HORIZ) {
434         if (borders[1]) {
435             int offset_val = sao_offset_val[0];
436             for (x = init_x; x < width; x++)
437                 dst[x] = av_clip_pixel(src[x] + offset_val);
438             init_y = 1;
439         }
440         if (borders[3]) {
441             int offset_val = sao_offset_val[0];
442             int y_stride   = stride * (height - 1);
443             for (x = init_x; x < width; x++)
444                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
445             height--;
446         }
447     }
448     {
449         int y_stride = init_y * stride;
450         int pos_0_0  = pos[sao_eo_class][0][0];
451         int pos_0_1  = pos[sao_eo_class][0][1];
452         int pos_1_0  = pos[sao_eo_class][1][0];
453         int pos_1_1  = pos[sao_eo_class][1][1];
454
455         int y_stride_0_1 = (init_y + pos_0_1) * stride;
456         int y_stride_1_1 = (init_y + pos_1_1) * stride;
457         for (y = init_y; y < height; y++) {
458             for (x = init_x; x < width; x++) {
459                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
460                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
461                 int offset_val    = edge_idx[2 + diff0 + diff1];
462                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
463             }
464             y_stride     += stride;
465             y_stride_0_1 += stride;
466             y_stride_1_1 += stride;
467         }
468     }
469
470     {
471         // Restore pixels that can't be modified
472         int save_upper_left = !diag_edge && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
473         if (vert_edge && sao_eo_class != SAO_EO_VERT)
474             for (y = init_y+save_upper_left; y< height; y++)
475                 dst[y*stride] = src[y*stride];
476         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
477             for(x = init_x+save_upper_left; x<width; x++)
478                 dst[x] = src[x];
479         if(diag_edge && sao_eo_class == SAO_EO_135D)
480             dst[0] = src[0];
481     }
482
483 #undef CMP
484 }
485
486 static void FUNC(sao_edge_filter_1)(uint8_t *_dst, uint8_t *_src,
487                                     ptrdiff_t stride, SAOParams *sao,
488                                     int *borders, int _width, int _height,
489                                     int c_idx, uint8_t vert_edge,
490                                     uint8_t horiz_edge, uint8_t diag_edge)
491 {
492     int x, y;
493     pixel *dst = (pixel *)_dst;
494     pixel *src = (pixel *)_src;
495     int chroma = !!c_idx;
496     int *sao_offset_val = sao->offset_val[c_idx];
497     int sao_eo_class    = sao->eo_class[c_idx];
498     int init_x = 0, init_y = 0, width = _width, height = _height;
499
500     static const int8_t pos[4][2][2] = {
501         { { -1, 0  }, { 1,  0 } }, // horizontal
502         { { 0,  -1 }, { 0,  1 } }, // vertical
503         { { -1, -1 }, { 1,  1 } }, // 45 degree
504         { { 1,  -1 }, { -1, 1 } }, // 135 degree
505     };
506     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
507
508 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
509
510     stride /= sizeof(pixel);
511
512     init_y = -(4 >> chroma) - 2;
513     if (!borders[2])
514         width -= (8 >> chroma) + 2;
515     height = (4 >> chroma) + 2;
516
517     dst = dst + (init_y * stride + init_x);
518     src = src + (init_y * stride + init_x);
519     init_y = init_x = 0;
520     if (sao_eo_class != SAO_EO_VERT) {
521         if (borders[0]) {
522             int offset_val = sao_offset_val[0];
523             int y_stride   = 0;
524             for (y = 0; y < height; y++) {
525                 dst[y_stride] = av_clip_pixel(src[y_stride] + offset_val);
526                 y_stride     += stride;
527             }
528             init_x = 1;
529         }
530         if (borders[2]) {
531             int offset_val = sao_offset_val[0];
532             int x_stride   = width - 1;
533             for (x = 0; x < height; x++) {
534                 dst[x_stride] = av_clip_pixel(src[x_stride] + offset_val);
535                 x_stride     += stride;
536             }
537             width--;
538         }
539     }
540     {
541         int y_stride = init_y * stride;
542         int pos_0_0  = pos[sao_eo_class][0][0];
543         int pos_0_1  = pos[sao_eo_class][0][1];
544         int pos_1_0  = pos[sao_eo_class][1][0];
545         int pos_1_1  = pos[sao_eo_class][1][1];
546
547         int y_stride_0_1 = (init_y + pos_0_1) * stride;
548         int y_stride_1_1 = (init_y + pos_1_1) * stride;
549         for (y = init_y; y < height; y++) {
550             for (x = init_x; x < width; x++) {
551                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
552                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
553                 int offset_val    = edge_idx[2 + diff0 + diff1];
554                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
555             }
556             y_stride     += stride;
557             y_stride_0_1 += stride;
558             y_stride_1_1 += stride;
559         }
560     }
561
562     {
563         // Restore pixels that can't be modified
564         int save_lower_left = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[0];
565         if(vert_edge && sao_eo_class != SAO_EO_VERT)
566             for(y = init_y; y< height-save_lower_left; y++)
567                 dst[y*stride] = src[y*stride];
568         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
569             for(x = init_x+save_lower_left; x<width; x++)
570                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
571         if(diag_edge && sao_eo_class == SAO_EO_45D)
572             dst[stride*(height-1)] = src[stride*(height-1)];
573     }
574
575 #undef CMP
576 }
577
578 static void FUNC(sao_edge_filter_2)(uint8_t *_dst, uint8_t *_src,
579                                     ptrdiff_t stride, SAOParams *sao,
580                                     int *borders, int _width, int _height,
581                                     int c_idx, uint8_t vert_edge,
582                                     uint8_t horiz_edge, uint8_t diag_edge)
583 {
584     int x, y;
585     pixel *dst = (pixel *)_dst;
586     pixel *src = (pixel *)_src;
587     int chroma = !!c_idx;
588     int *sao_offset_val = sao->offset_val[c_idx];
589     int sao_eo_class    = sao->eo_class[c_idx];
590     int init_x = 0, init_y = 0, width = _width, height = _height;
591
592     static const int8_t pos[4][2][2] = {
593         { { -1,  0 }, {  1, 0 } }, // horizontal
594         { {  0, -1 }, {  0, 1 } }, // vertical
595         { { -1, -1 }, {  1, 1 } }, // 45 degree
596         { {  1, -1 }, { -1, 1 } }, // 135 degree
597     };
598     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
599
600 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
601
602     stride /= sizeof(pixel);
603
604     init_x = -(8 >> chroma) - 2;
605     width  =  (8 >> chroma) + 2;
606     if (!borders[3])
607         height -= (4 >> chroma) + 2;
608
609     dst = dst + (init_y * stride + init_x);
610     src = src + (init_y * stride + init_x);
611     init_y = init_x = 0;
612     if (sao_eo_class != SAO_EO_HORIZ) {
613         if (borders[1]) {
614             int offset_val = sao_offset_val[0];
615             for (x = init_x; x < width; x++)
616                 dst[x] = av_clip_pixel(src[x] + offset_val);
617             init_y = 1;
618         }
619         if (borders[3]) {
620             int offset_val = sao_offset_val[0];
621             int y_stride   = stride * (height - 1);
622             for (x = init_x; x < width; x++)
623                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + offset_val);
624             height--;
625         }
626     }
627     {
628         int y_stride = init_y * stride;
629         int pos_0_0  = pos[sao_eo_class][0][0];
630         int pos_0_1  = pos[sao_eo_class][0][1];
631         int pos_1_0  = pos[sao_eo_class][1][0];
632         int pos_1_1  = pos[sao_eo_class][1][1];
633
634         int y_stride_0_1 = (init_y + pos_0_1) * stride;
635         int y_stride_1_1 = (init_y + pos_1_1) * stride;
636         for (y = init_y; y < height; y++) {
637             for (x = init_x; x < width; x++) {
638                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
639                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
640                 int offset_val    = edge_idx[2 + diff0 + diff1];
641                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
642             }
643             y_stride     += stride;
644             y_stride_0_1 += stride;
645             y_stride_1_1 += stride;
646         }
647     }
648
649     {
650         // Restore pixels that can't be modified
651         int save_upper_right = !diag_edge && sao_eo_class == SAO_EO_45D && !borders[1];
652         if(vert_edge && sao_eo_class != SAO_EO_VERT)
653             for(y = init_y+save_upper_right; y< height; y++)
654                 dst[y*stride+width-1] = src[y*stride+width-1];
655         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
656             for(x = init_x; x<width-save_upper_right; x++)
657                 dst[x] = src[x];
658         if(diag_edge && sao_eo_class == SAO_EO_45D)
659             dst[width-1] = src[width-1];
660     }
661 #undef CMP
662 }
663
664 static void FUNC(sao_edge_filter_3)(uint8_t *_dst, uint8_t *_src,
665                                     ptrdiff_t stride, SAOParams *sao,
666                                     int *borders, int _width, int _height,
667                                     int c_idx, uint8_t vert_edge,
668                                     uint8_t horiz_edge, uint8_t diag_edge)
669 {
670     int x, y;
671     pixel *dst = (pixel *)_dst;
672     pixel *src = (pixel *)_src;
673     int chroma = !!c_idx;
674     int *sao_offset_val = sao->offset_val[c_idx];
675     int sao_eo_class    = sao->eo_class[c_idx];
676     int init_x = 0, init_y = 0, width = _width, height = _height;
677
678     static const int8_t pos[4][2][2] = {
679         { { -1,  0 }, {  1, 0 } }, // horizontal
680         { {  0, -1 }, {  0, 1 } }, // vertical
681         { { -1, -1 }, {  1, 1 } }, // 45 degree
682         { {  1, -1 }, { -1, 1 } }, // 135 degree
683     };
684     static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
685
686 #define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
687
688     stride /= sizeof(pixel);
689
690     init_y = -(4 >> chroma) - 2;
691     init_x = -(8 >> chroma) - 2;
692     width  =  (8 >> chroma) + 2;
693     height =  (4 >> chroma) + 2;
694
695
696     dst    = dst + (init_y * stride + init_x);
697     src    = src + (init_y * stride + init_x);
698     init_y = init_x = 0;
699
700     {
701         int y_stride = init_y * stride;
702         int pos_0_0  = pos[sao_eo_class][0][0];
703         int pos_0_1  = pos[sao_eo_class][0][1];
704         int pos_1_0  = pos[sao_eo_class][1][0];
705         int pos_1_1  = pos[sao_eo_class][1][1];
706
707         int y_stride_0_1 = (init_y + pos_0_1) * stride;
708         int y_stride_1_1 = (init_y + pos_1_1) * stride;
709
710         for (y = init_y; y < height; y++) {
711             for (x = init_x; x < width; x++) {
712                 int diff0         = CMP(src[x + y_stride], src[x + pos_0_0 + y_stride_0_1]);
713                 int diff1         = CMP(src[x + y_stride], src[x + pos_1_0 + y_stride_1_1]);
714                 int offset_val    = edge_idx[2 + diff0 + diff1];
715                 dst[x + y_stride] = av_clip_pixel(src[x + y_stride] + sao_offset_val[offset_val]);
716             }
717             y_stride     += stride;
718             y_stride_0_1 += stride;
719             y_stride_1_1 += stride;
720         }
721     }
722
723     {
724         // Restore pixels that can't be modified
725         int save_lower_right = !diag_edge && sao_eo_class == SAO_EO_135D;
726         if(vert_edge && sao_eo_class != SAO_EO_VERT)
727             for(y = init_y; y< height-save_lower_right; y++)
728                 dst[y*stride+width-1] = src[y*stride+width-1];
729         if(horiz_edge && sao_eo_class != SAO_EO_HORIZ)
730             for(x = init_x; x<width-save_lower_right; x++)
731                 dst[(height-1)*stride+x] = src[(height-1)*stride+x];
732         if(diag_edge && sao_eo_class == SAO_EO_135D)
733             dst[stride*(height-1)+width-1] = src[stride*(height-1)+width-1];
734     }
735 #undef CMP
736 }
737
738 #undef SET
739 #undef SCALE
740 #undef TR_4
741 #undef TR_8
742 #undef TR_16
743 #undef TR_32
744
745 static av_always_inline void
746 FUNC(put_hevc_qpel_pixels)(int16_t *dst, ptrdiff_t dststride,
747                            uint8_t *_src, ptrdiff_t _srcstride,
748                            int width, int height, int mx, int my,
749                            int16_t* mcbuffer)
750 {
751     int x, y;
752     pixel *src          = (pixel *)_src;
753     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
754
755     dststride /= sizeof(*dst);
756     for (y = 0; y < height; y++) {
757         for (x = 0; x < width; x++)
758             dst[x] = src[x] << (14 - BIT_DEPTH);
759         src += srcstride;
760         dst += dststride;
761     }
762 }
763
/*
 * 7-tap filter with weights (-1, 4, -10, 58, 17, -5, 1) applied to the
 * samples src[x - 3*stride] .. src[x + 3*stride].
 *
 * "x" is not a parameter: it is taken from the scope where the macro is
 * expanded. Both parameters are parenthesised so that compound expressions
 * (e.g. "base + n", "a + b") can be passed safely.
 */
#define QPEL_FILTER_1(src, stride)        \
    (     -(src)[x - 3 * (stride)] +      \
      4 *  (src)[x - 2 * (stride)] -      \
     10 *  (src)[x -     (stride)] +      \
     58 *  (src)[x]                +      \
     17 *  (src)[x +     (stride)] -      \
      5 *  (src)[x + 2 * (stride)] +      \
           (src)[x + 3 * (stride)])
772
/*
 * 8-tap filter with weights (-1, 4, -11, 40, 40, -11, 4, -1) applied to the
 * samples src[x - 3*stride] .. src[x + 4*stride].
 *
 * "x" is not a parameter: it is taken from the scope where the macro is
 * expanded. Both parameters are parenthesised so that compound expressions
 * (e.g. "base + n", "a + b") can be passed safely.
 */
#define QPEL_FILTER_2(src, stride)        \
    (     -(src)[x - 3 * (stride)] +      \
      4 *  (src)[x - 2 * (stride)] -      \
     11 *  (src)[x -     (stride)] +      \
     40 *  (src)[x]                +      \
     40 *  (src)[x +     (stride)] -      \
     11 *  (src)[x + 2 * (stride)] +      \
      4 *  (src)[x + 3 * (stride)] -      \
           (src)[x + 4 * (stride)])
782
/*
 * 7-tap filter with weights (1, -5, 17, 58, -10, 4, -1) applied to the
 * samples src[x - 2*stride] .. src[x + 4*stride] (QPEL_FILTER_1 mirrored).
 *
 * "x" is not a parameter: it is taken from the scope where the macro is
 * expanded. Both parameters are parenthesised so that compound expressions
 * (e.g. "base + n", "a + b") can be passed safely.
 */
#define QPEL_FILTER_3(src, stride)        \
    (      (src)[x - 2 * (stride)] -      \
      5 *  (src)[x -     (stride)] +      \
     17 *  (src)[x]                +      \
     58 *  (src)[x +     (stride)] -      \
     10 *  (src)[x + 2 * (stride)] +      \
      4 *  (src)[x + 3 * (stride)] -      \
           (src)[x + 4 * (stride)])
791
792
/*
 * Generates put_hevc_qpel_h<H>(): horizontal-only interpolation.
 *
 * For every pixel of a width x height block, QPEL_FILTER_<H> is evaluated
 * along the row (sample step 1) and the sum, shifted right by
 * (BIT_DEPTH - 8), is stored in the int16_t destination. Both strides are
 * given in bytes. mcbuffer is not read.
 *
 * The column index must be called "x" because the QPEL_FILTER_* macros pick
 * it up by name.
 */
#define PUT_HEVC_QPEL_H(H)                                                     \
static void FUNC(put_hevc_qpel_h ## H)(int16_t *dst,  ptrdiff_t dststride,     \
                                       uint8_t *_src, ptrdiff_t _srcstride,    \
                                       int width, int height,                  \
                                       int16_t* mcbuffer)                      \
{                                                                              \
    const pixel *line         = (const pixel *)_src;                           \
    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);                    \
    const ptrdiff_t out_pitch = dststride / sizeof(*dst);                      \
    int x, y;                                                                  \
                                                                               \
    for (y = 0; y < height; y++, line += in_pitch, dst += out_pitch)           \
        for (x = 0; x < width; x++)                                            \
            dst[x] = QPEL_FILTER_ ## H(line, 1) >> (BIT_DEPTH - 8);            \
}
811
/*
 * Generates put_hevc_qpel_v<V>(): filters a width x height block down each
 * column with QPEL_FILTER_<V> (tap spacing = one source row) and stores the
 * result, shifted right by BIT_DEPTH - 8, as int16_t.
 *
 * dst, dststride   - output samples and their row stride in bytes
 * _src, _srcstride - input samples (read as `pixel`) and row stride in bytes
 * mcbuffer         - not used by this function
 *
 * The filter taps reach a few rows above and below every output position,
 * so those rows must be readable around the block.
 */
#define PUT_HEVC_QPEL_V(V) \
static void FUNC(put_hevc_qpel_v ## V)(int16_t *dst,  ptrdiff_t dststride, \
                                       uint8_t *_src, ptrdiff_t _srcstride, \
                                       int width, int height, \
                                       int16_t *mcbuffer) \
{ \
    const ptrdiff_t in_step  = _srcstride / sizeof(pixel); \
    const ptrdiff_t out_step = dststride / sizeof(*dst); \
    const pixel *in          = (const pixel *)_src; \
    int x, row; \
    \
    for (row = 0; row < height; row++, in += in_step, dst += out_step) \
        for (x = 0; x < width; x++) \
            dst[x] = QPEL_FILTER_ ## V(in, in_step) >> (BIT_DEPTH - 8); \
}
830
831 #define PUT_HEVC_QPEL_HV(H, V)                                                 \
832 static void FUNC(put_hevc_qpel_h ## H ## v ## V)(int16_t *dst,                 \
833                                                  ptrdiff_t dststride,          \
834                                                  uint8_t *_src,                \
835                                                  ptrdiff_t _srcstride,         \
836                                                  int width, int height,        \
837                                                  int16_t* mcbuffer)            \
838 {                                                                              \
839     int x, y;                                                                  \
840     pixel *src = (pixel*)_src;                                                 \
841     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                          \
842                                                                                \
843     int16_t tmp_array[(MAX_PB_SIZE + 7) * MAX_PB_SIZE];                        \
844     int16_t *tmp = tmp_array;                                                  \
845                                                                                \
846     dststride /= sizeof(*dst);                                                 \
847     src -= ff_hevc_qpel_extra_before[V] * srcstride;                           \
848                                                                                \
849     for (y = 0; y < height + ff_hevc_qpel_extra[V]; y++) {                     \
850         for (x = 0; x < width; x++)                                            \
851             tmp[x] = QPEL_FILTER_ ## H(src, 1) >> (BIT_DEPTH - 8);             \
852         src += srcstride;                                                      \
853         tmp += MAX_PB_SIZE;                                                    \
854     }                                                                          \
855                                                                                \
856     tmp = tmp_array + ff_hevc_qpel_extra_before[V] * MAX_PB_SIZE;              \
857                                                                                \
858     for (y = 0; y < height; y++) {                                             \
859         for (x = 0; x < width; x++)                                            \
860             dst[x] = QPEL_FILTER_ ## V(tmp, MAX_PB_SIZE) >> 6;                 \
861         tmp += MAX_PB_SIZE;                                                    \
862         dst += dststride;                                                      \
863     }                                                                          \
864 }
865
866 PUT_HEVC_QPEL_H(1)
867 PUT_HEVC_QPEL_H(2)
868 PUT_HEVC_QPEL_H(3)
869 PUT_HEVC_QPEL_V(1)
870 PUT_HEVC_QPEL_V(2)
871 PUT_HEVC_QPEL_V(3)
872 PUT_HEVC_QPEL_HV(1, 1)
873 PUT_HEVC_QPEL_HV(1, 2)
874 PUT_HEVC_QPEL_HV(1, 3)
875 PUT_HEVC_QPEL_HV(2, 1)
876 PUT_HEVC_QPEL_HV(2, 2)
877 PUT_HEVC_QPEL_HV(2, 3)
878 PUT_HEVC_QPEL_HV(3, 1)
879 PUT_HEVC_QPEL_HV(3, 2)
880 PUT_HEVC_QPEL_HV(3, 3)
881
882 #define QPEL(W)                                                                             \
883 static void FUNC(put_hevc_qpel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride,             \
884                                              uint8_t *src, ptrdiff_t srcstride,             \
885                                              int height, int mx, int my,                    \
886                                              int16_t *mcbuffer)                             \
887 {                                                                                           \
888     FUNC(put_hevc_qpel_pixels)(dst, dststride, src, srcstride, W, height,                   \
889                                mx, my, mcbuffer);                                           \
890 }                                                                                           \
891                                                                                             \
892 static void FUNC(put_hevc_qpel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
893                                         uint8_t *src, ptrdiff_t srcstride,                  \
894                                         int height, int mx, int my,                         \
895                                         int16_t *mcbuffer)                                  \
896 {                                                                                           \
897     if (mx == 1)                                                                            \
898         FUNC(put_hevc_qpel_h1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
899     else if (mx == 2)                                                                       \
900         FUNC(put_hevc_qpel_h2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
901     else                                                                                    \
902         FUNC(put_hevc_qpel_h3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
903 }                                                                                           \
904                                                                                             \
905 static void FUNC(put_hevc_qpel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,                  \
906                                              uint8_t *src, ptrdiff_t srcstride,             \
907                                              int height, int mx, int my,                    \
908                                              int16_t *mcbuffer)                             \
909 {                                                                                           \
910     if (my == 1)                                                                            \
911         FUNC(put_hevc_qpel_v1)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
912     else if (my == 2)                                                                       \
913         FUNC(put_hevc_qpel_v2)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
914     else                                                                                    \
915         FUNC(put_hevc_qpel_v3)(dst, dststride, src, srcstride, W, height, mcbuffer);        \
916 }                                                                                           \
917                                                                                             \
918 static void FUNC(put_hevc_qpel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,                 \
919                                              uint8_t *src, ptrdiff_t srcstride,             \
920                                              int height, int mx, int my,                    \
921                                              int16_t *mcbuffer)                             \
922 {                                                                                           \
923     if (my == 1) {                                                                          \
924         if (mx == 1)                                                                        \
925             FUNC(put_hevc_qpel_h1v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
926         else if (mx == 2)                                                                   \
927             FUNC(put_hevc_qpel_h2v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
928         else                                                                                \
929             FUNC(put_hevc_qpel_h3v1)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
930     } else if (my == 2) {                                                                   \
931         if (mx == 1)                                                                        \
932             FUNC(put_hevc_qpel_h1v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
933         else if (mx == 2)                                                                   \
934             FUNC(put_hevc_qpel_h2v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
935         else                                                                                \
936             FUNC(put_hevc_qpel_h3v2)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
937     } else {                                                                                \
938         if (mx == 1)                                                                        \
939             FUNC(put_hevc_qpel_h1v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
940         else if (mx == 2)                                                                   \
941             FUNC(put_hevc_qpel_h2v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
942         else                                                                                \
943             FUNC(put_hevc_qpel_h3v3)(dst, dststride, src, srcstride, W, height, mcbuffer);  \
944     }                                                                                       \
945 }
946
947 QPEL(64)
948 QPEL(48)
949 QPEL(32)
950 QPEL(24)
951 QPEL(16)
952 QPEL(12)
953 QPEL(8)
954 QPEL(4)
955
956 static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride,
957                                               uint8_t *_src, ptrdiff_t _srcstride,
958                                               int width, int height, int mx, int my,
959                                               int16_t* mcbuffer)
960 {
961     int x, y;
962     pixel *src          = (pixel *)_src;
963     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
964
965     dststride /= sizeof(*dst);
966     for (y = 0; y < height; y++) {
967         for (x = 0; x < width; x++)
968             dst[x] = src[x] << (14 - BIT_DEPTH);
969         src += srcstride;
970         dst += dststride;
971     }
972 }
973
/*
 * 4-tap filter applied to src[x - stride] .. src[x + 2 * stride].
 *
 * The taps filter_0 .. filter_3 and the index x are not parameters: they
 * must be variables in scope where the macro is expanded. `stride` is the
 * distance, in elements of src, between neighbouring taps.
 *
 * Both arguments are parenthesised so that expressions such as `buf + off`
 * or `a + b` can be passed without being torn apart by operator precedence.
 */
#define EPEL_FILTER(src, stride)                    \
    (filter_0 * (src)[x -     (stride)] +           \
     filter_1 * (src)[x]                +           \
     filter_2 * (src)[x +     (stride)] +           \
     filter_3 * (src)[x + 2 * (stride)])
979
980 static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
981                                          uint8_t *_src, ptrdiff_t _srcstride,
982                                          int width, int height, int mx, int my,
983                                          int16_t* mcbuffer)
984 {
985     int x, y;
986     pixel *src = (pixel *)_src;
987     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
988     const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
989     int8_t filter_0 = filter[0];
990     int8_t filter_1 = filter[1];
991     int8_t filter_2 = filter[2];
992     int8_t filter_3 = filter[3];
993     dststride /= sizeof(*dst);
994     for (y = 0; y < height; y++) {
995         for (x = 0; x < width; x++)
996             dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
997         src += srcstride;
998         dst += dststride;
999     }
1000 }
1001
1002 static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
1003                                          uint8_t *_src, ptrdiff_t _srcstride,
1004                                          int width, int height, int mx, int my,
1005                                          int16_t* mcbuffer)
1006 {
1007     int x, y;
1008     pixel *src = (pixel *)_src;
1009     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1010     const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
1011     int8_t filter_0 = filter[0];
1012     int8_t filter_1 = filter[1];
1013     int8_t filter_2 = filter[2];
1014     int8_t filter_3 = filter[3];
1015
1016     dststride /= sizeof(*dst);
1017     for (y = 0; y < height; y++) {
1018         for (x = 0; x < width; x++)
1019             dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
1020         src += srcstride;
1021         dst += dststride;
1022     }
1023 }
1024
1025 static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
1026                                           uint8_t *_src, ptrdiff_t _srcstride,
1027                                           int width, int height, int mx, int my,
1028                                           int16_t* mcbuffer)
1029 {
1030     int x, y;
1031     pixel *src = (pixel *)_src;
1032     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
1033     const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
1034     const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
1035     int8_t filter_0 = filter_h[0];
1036     int8_t filter_1 = filter_h[1];
1037     int8_t filter_2 = filter_h[2];
1038     int8_t filter_3 = filter_h[3];
1039     int16_t tmp_array[(MAX_PB_SIZE + 3) * MAX_PB_SIZE];
1040     int16_t *tmp = tmp_array;
1041
1042     dststride /= sizeof(*dst);
1043     src -= EPEL_EXTRA_BEFORE * srcstride;
1044
1045     for (y = 0; y < height + EPEL_EXTRA; y++) {
1046         for (x = 0; x < width; x++)
1047             tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
1048         src += srcstride;
1049         tmp += MAX_PB_SIZE;
1050     }
1051
1052     tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
1053     filter_0 = filter_v[0];
1054     filter_1 = filter_v[1];
1055     filter_2 = filter_v[2];
1056     filter_3 = filter_v[3];
1057     for (y = 0; y < height; y++) {
1058         for (x = 0; x < width; x++)
1059             dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
1060         tmp += MAX_PB_SIZE;
1061         dst += dststride;
1062     }
1063 }
1064
1065 #define EPEL(W)                                                                 \
1066 static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \
1067                                              uint8_t *src, ptrdiff_t srcstride, \
1068                                              int height, int mx, int my,        \
1069                                              int16_t *mcbuffer)                 \
1070 {                                                                               \
1071     FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride,                  \
1072                                W, height, mx, my, mcbuffer);                    \
1073 }                                                                               \
1074 static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
1075                                         uint8_t *src, ptrdiff_t srcstride,      \
1076                                         int height, int mx, int my,             \
1077                                         int16_t *mcbuffer)                      \
1078 {                                                                               \
1079     FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride,                       \
1080                           W, height, mx, my, mcbuffer);                         \
1081 }                                                                               \
1082 static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride,      \
1083                                         uint8_t *src, ptrdiff_t srcstride,      \
1084                                         int height, int mx, int my,             \
1085                                         int16_t *mcbuffer)                      \
1086 {                                                                               \
1087     FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride,                       \
1088                           W, height, mx, my, mcbuffer);                         \
1089 }                                                                               \
1090 static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride,     \
1091                                          uint8_t *src, ptrdiff_t srcstride,     \
1092                                          int height, int mx, int my,            \
1093                                          int16_t *mcbuffer)                     \
1094 {                                                                               \
1095     FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride,                      \
1096                            W, height, mx, my, mcbuffer);                        \
1097 }
1098
1099 EPEL(32)
1100 EPEL(24)
1101 EPEL(16)
1102 EPEL(12)
1103 EPEL(8)
1104 EPEL(6)
1105 EPEL(4)
1106 EPEL(2)
1107
1108 static av_always_inline void
1109 FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride,
1110                           int16_t *src, ptrdiff_t srcstride,
1111                           int width, int height)
1112 {
1113     int x, y;
1114     pixel *dst          = (pixel *)_dst;
1115     ptrdiff_t dststride = _dststride / sizeof(pixel);
1116
1117     int shift = 14 - BIT_DEPTH;
1118 #if BIT_DEPTH < 14
1119     int offset = 1 << (shift - 1);
1120 #else
1121     int offset = 0;
1122 #endif
1123     srcstride /= sizeof(*src);
1124     for (y = 0; y < height; y++) {
1125         for (x = 0; x < width; x++)
1126             dst[x] = av_clip_pixel((src[x] + offset) >> shift);
1127         dst += dststride;
1128         src += srcstride;
1129     }
1130 }
1131
1132 static av_always_inline void
1133 FUNC(put_unweighted_pred_avg)(uint8_t *_dst, ptrdiff_t _dststride,
1134                               int16_t *src1, int16_t *src2,
1135                               ptrdiff_t srcstride,
1136                               int width, int height)
1137 {
1138     int x, y;
1139     pixel *dst          = (pixel *)_dst;
1140     ptrdiff_t dststride = _dststride / sizeof(pixel);
1141
1142     int shift = 14 + 1 - BIT_DEPTH;
1143 #if BIT_DEPTH < 14
1144     int offset = 1 << (shift - 1);
1145 #else
1146     int offset = 0;
1147 #endif
1148
1149     srcstride /= sizeof(*src1);
1150     for (y = 0; y < height; y++) {
1151         for (x = 0; x < width; x++)
1152             dst[x] = av_clip_pixel((src1[x] + src2[x] + offset) >> shift);
1153         dst  += dststride;
1154         src1 += srcstride;
1155         src2 += srcstride;
1156     }
1157 }
1158
1159 static av_always_inline void
1160 FUNC(weighted_pred)(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
1161                     uint8_t *_dst, ptrdiff_t _dststride,
1162                     int16_t *src, ptrdiff_t srcstride,
1163                     int width, int height)
1164 {
1165     int shift, log2Wd, wx, ox, x, y, offset;
1166     pixel *dst          = (pixel *)_dst;
1167     ptrdiff_t dststride = _dststride / sizeof(pixel);
1168
1169     shift  = 14 - BIT_DEPTH;
1170     log2Wd = denom + shift;
1171     offset = 1 << (log2Wd - 1);
1172     wx     = wlxFlag;
1173     ox     = olxFlag * (1 << (BIT_DEPTH - 8));
1174
1175     srcstride /= sizeof(*src);
1176     for (y = 0; y < height; y++) {
1177         for (x = 0; x < width; x++) {
1178             if (log2Wd >= 1) {
1179                 dst[x] = av_clip_pixel(((src[x] * wx + offset) >> log2Wd) + ox);
1180             } else {
1181                 dst[x] = av_clip_pixel(src[x] * wx + ox);
1182             }
1183         }
1184         dst += dststride;
1185         src += srcstride;
1186     }
1187 }
1188
1189 static av_always_inline void
1190 FUNC(weighted_pred_avg)(uint8_t denom,
1191                         int16_t wl0Flag, int16_t wl1Flag,
1192                         int16_t ol0Flag, int16_t ol1Flag,
1193                         uint8_t *_dst, ptrdiff_t _dststride,
1194                         int16_t *src1, int16_t *src2,
1195                         ptrdiff_t srcstride,
1196                         int width, int height)
1197 {
1198     int shift, log2Wd, w0, w1, o0, o1, x, y;
1199     pixel *dst = (pixel *)_dst;
1200     ptrdiff_t dststride = _dststride / sizeof(pixel);
1201
1202     shift  = 14 - BIT_DEPTH;
1203     log2Wd = denom + shift;
1204     w0     = wl0Flag;
1205     w1     = wl1Flag;
1206     o0     = ol0Flag * (1 << (BIT_DEPTH - 8));
1207     o1     = ol1Flag * (1 << (BIT_DEPTH - 8));
1208
1209     srcstride /= sizeof(*src1);
1210     for (y = 0; y < height; y++) {
1211         for (x = 0; x < width; x++)
1212             dst[x] = av_clip_pixel((src1[x] * w0 + src2[x] * w1 +
1213                                     ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1));
1214         dst  += dststride;
1215         src1 += srcstride;
1216         src2 += srcstride;
1217     }
1218 }
1219
1220 #define PUT_PRED(w)                                                                            \
1221 static void FUNC(put_unweighted_pred_ ## w)(uint8_t *dst, ptrdiff_t dststride,                 \
1222                                             int16_t *src, ptrdiff_t srcstride,                 \
1223                                             int height)                                        \
1224 {                                                                                              \
1225     FUNC(put_unweighted_pred)(dst, dststride, src, srcstride, w, height);                      \
1226 }                                                                                              \
1227 static void FUNC(put_unweighted_pred_avg_ ## w)(uint8_t *dst, ptrdiff_t dststride,             \
1228                                                 int16_t *src1, int16_t *src2,                  \
1229                                                 ptrdiff_t srcstride, int height)               \
1230 {                                                                                              \
1231     FUNC(put_unweighted_pred_avg)(dst, dststride, src1, src2, srcstride, w, height);           \
1232 }                                                                                              \
1233 static void FUNC(put_weighted_pred_ ## w)(uint8_t denom, int16_t weight, int16_t offset,       \
1234                                           uint8_t *dst, ptrdiff_t dststride,                   \
1235                                           int16_t *src, ptrdiff_t srcstride, int height)       \
1236 {                                                                                              \
1237     FUNC(weighted_pred)(denom, weight, offset,                                                 \
1238                         dst, dststride, src, srcstride, w, height);                            \
1239 }                                                                                              \
1240 static void FUNC(put_weighted_pred_avg_ ## w)(uint8_t denom, int16_t weight0, int16_t weight1, \
1241                                               int16_t offset0, int16_t offset1,                \
1242                                               uint8_t *dst, ptrdiff_t dststride,               \
1243                                               int16_t *src1, int16_t *src2,                    \
1244                                               ptrdiff_t srcstride, int height)                 \
1245 {                                                                                              \
1246     FUNC(weighted_pred_avg)(denom, weight0, weight1, offset0, offset1,                         \
1247                             dst, dststride, src1, src2, srcstride, w, height);                 \
1248 }
1249
1250 PUT_PRED(64)
1251 PUT_PRED(48)
1252 PUT_PRED(32)
1253 PUT_PRED(24)
1254 PUT_PRED(16)
1255 PUT_PRED(12)
1256 PUT_PRED(8)
1257 PUT_PRED(6)
1258 PUT_PRED(4)
1259 PUT_PRED(2)
1260
/*
 * Sample accessors for the deblocking filters. They expand to elements of
 * `pix`, stepping by `xstride` from one sample to the next (P3..P0 before
 * pix, Q0..Q3 from pix onwards); the T* variants add three steps of
 * `ystride`. pix, xstride and ystride must be variables in scope where the
 * macros are used.
 */
#define LF_LINE0(k) pix[(k) * xstride]
#define LF_LINE3(k) pix[(k) * xstride + 3 * ystride]

/* line zero */
#define P3 LF_LINE0(-4)
#define P2 LF_LINE0(-3)
#define P1 LF_LINE0(-2)
#define P0 LF_LINE0(-1)
#define Q0 LF_LINE0(0)
#define Q1 LF_LINE0(1)
#define Q2 LF_LINE0(2)
#define Q3 LF_LINE0(3)

/* line three, used only for the deblocking decision */
#define TP3 LF_LINE3(-4)
#define TP2 LF_LINE3(-3)
#define TP1 LF_LINE3(-2)
#define TP0 LF_LINE3(-1)
#define TQ0 LF_LINE3(0)
#define TQ1 LF_LINE3(1)
#define TQ2 LF_LINE3(2)
#define TQ3 LF_LINE3(3)
1280
1281 static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
1282                                         ptrdiff_t _xstride, ptrdiff_t _ystride,
1283                                         int beta, int *_tc,
1284                                         uint8_t *_no_p, uint8_t *_no_q)
1285 {
1286     int d, j;
1287     pixel *pix        = (pixel *)_pix;
1288     ptrdiff_t xstride = _xstride / sizeof(pixel);
1289     ptrdiff_t ystride = _ystride / sizeof(pixel);
1290
1291     beta <<= BIT_DEPTH - 8;
1292
1293     for (j = 0; j < 2; j++) {
1294         const int dp0  = abs(P2  - 2 * P1  + P0);
1295         const int dq0  = abs(Q2  - 2 * Q1  + Q0);
1296         const int dp3  = abs(TP2 - 2 * TP1 + TP0);
1297         const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
1298         const int d0   = dp0 + dq0;
1299         const int d3   = dp3 + dq3;
1300         const int tc   = _tc[j]   << (BIT_DEPTH - 8);
1301         const int no_p = _no_p[j];
1302         const int no_q = _no_q[j];
1303
1304         if (d0 + d3 >= beta) {
1305             pix += 4 * ystride;
1306             continue;
1307         } else {
1308             const int beta_3 = beta >> 3;
1309             const int beta_2 = beta >> 2;
1310             const int tc25   = ((tc * 5 + 1) >> 1);
1311
1312             if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
1313                 abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
1314                                       (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
1315                 // strong filtering
1316                 const int tc2 = tc << 1;
1317                 for (d = 0; d < 4; d++) {
1318                     const int p3 = P3;
1319                     const int p2 = P2;
1320                     const int p1 = P1;
1321                     const int p0 = P0;
1322                     const int q0 = Q0;
1323                     const int q1 = Q1;
1324                     const int q2 = Q2;
1325                     const int q3 = Q3;
1326                     if (!no_p) {
1327                         P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
1328                         P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
1329                         P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
1330                     }
1331                     if (!no_q) {
1332                         Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
1333                         Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
1334                         Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
1335                     }
1336                     pix += ystride;
1337                 }
1338             } else { // normal filtering
1339                 int nd_p = 1;
1340                 int nd_q = 1;
1341                 const int tc_2 = tc >> 1;
1342                 if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
1343                     nd_p = 2;
1344                 if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
1345                     nd_q = 2;
1346
1347                 for (d = 0; d < 4; d++) {
1348                     const int p2 = P2;
1349                     const int p1 = P1;
1350                     const int p0 = P0;
1351                     const int q0 = Q0;
1352                     const int q1 = Q1;
1353                     const int q2 = Q2;
1354                     int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
1355                     if (abs(delta0) < 10 * tc) {
1356                         delta0 = av_clip(delta0, -tc, tc);
1357                         if (!no_p)
1358                             P0 = av_clip_pixel(p0 + delta0);
1359                         if (!no_q)
1360                             Q0 = av_clip_pixel(q0 - delta0);
1361                         if (!no_p && nd_p > 1) {
1362                             const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
1363                             P1 = av_clip_pixel(p1 + deltap1);
1364                         }
1365                         if (!no_q && nd_q > 1) {
1366                             const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
1367                             Q1 = av_clip_pixel(q1 + deltaq1);
1368                         }
1369                     }
1370                     pix += ystride;
1371                 }
1372             }
1373         }
1374     }
1375 }
1376
1377 static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
1378                                           ptrdiff_t _ystride, int *_tc,
1379                                           uint8_t *_no_p, uint8_t *_no_q)
1380 {
1381     int d, j, no_p, no_q;
1382     pixel *pix        = (pixel *)_pix;
1383     ptrdiff_t xstride = _xstride / sizeof(pixel);
1384     ptrdiff_t ystride = _ystride / sizeof(pixel);
1385
1386     for (j = 0; j < 2; j++) {
1387         const int tc = _tc[j] << (BIT_DEPTH - 8);
1388         if (tc <= 0) {
1389             pix += 4 * ystride;
1390             continue;
1391         }
1392         no_p = _no_p[j];
1393         no_q = _no_q[j];
1394
1395         for (d = 0; d < 4; d++) {
1396             int delta0;
1397             const int p1 = P1;
1398             const int p0 = P0;
1399             const int q0 = Q0;
1400             const int q1 = Q1;
1401             delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
1402             if (!no_p)
1403                 P0 = av_clip_pixel(p0 + delta0);
1404             if (!no_q)
1405                 Q0 = av_clip_pixel(q0 - delta0);
1406             pix += ystride;
1407         }
1408     }
1409 }
1410
1411 static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1412                                             int *tc, uint8_t *no_p,
1413                                             uint8_t *no_q)
1414 {
1415     FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
1416 }
1417
1418 static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
1419                                             int *tc, uint8_t *no_p,
1420                                             uint8_t *no_q)
1421 {
1422     FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
1423 }
1424
1425 static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1426                                           int beta, int *tc, uint8_t *no_p,
1427                                           uint8_t *no_q)
1428 {
1429     FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
1430                                 beta, tc, no_p, no_q);
1431 }
1432
1433 static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
1434                                           int beta, int *tc, uint8_t *no_p,
1435                                           uint8_t *no_q)
1436 {
1437     FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
1438                                 beta, tc, no_p, no_q);
1439 }
1440
1441 #undef P3
1442 #undef P2
1443 #undef P1
1444 #undef P0
1445 #undef Q0
1446 #undef Q1
1447 #undef Q2
1448 #undef Q3
1449
1450 #undef TP3
1451 #undef TP2
1452 #undef TP1
1453 #undef TP0
1454 #undef TQ0
1455 #undef TQ1
1456 #undef TQ2
1457 #undef TQ3