2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * H.264 / AVC / MPEG4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
29 #include "high_bit_depth.h"
// 4x4 "vertical" intra prediction: replicate the 4 pixels above the block
// (read as a single pixel4 word) into every row.
31 static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, int _stride){
32 pixel *src = (pixel*)_src;
33 int stride = _stride/sizeof(pixel); // byte stride -> stride in pixels
34 const pixel4 a= ((pixel4*)(src-stride))[0];
35 ((pixel4*)(src+0*stride))[0]= a;
36 ((pixel4*)(src+1*stride))[0]= a;
37 ((pixel4*)(src+2*stride))[0]= a;
38 ((pixel4*)(src+3*stride))[0]= a;

// 4x4 "horizontal" intra prediction: splat each row's left neighbour
// across the whole row.
41 static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, int _stride){
42 pixel *src = (pixel*)_src;
43 int stride = _stride/sizeof(pixel);
44 ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
45 ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
46 ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
47 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);

// 4x4 DC prediction: rounded average of the 4 top and 4 left neighbours,
// splatted over the whole block.
50 static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
51 pixel *src = (pixel*)_src;
52 int stride = _stride/sizeof(pixel);
53 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
54 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
56 ((pixel4*)(src+0*stride))[0]=
57 ((pixel4*)(src+1*stride))[0]=
58 ((pixel4*)(src+2*stride))[0]=
59 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);

// DC prediction from the 4 left neighbours only (top edge unavailable).
62 static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
63 pixel *src = (pixel*)_src;
64 int stride = _stride/sizeof(pixel);
65 const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
67 ((pixel4*)(src+0*stride))[0]=
68 ((pixel4*)(src+1*stride))[0]=
69 ((pixel4*)(src+2*stride))[0]=
70 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);

// DC prediction from the 4 top neighbours only (left edge unavailable).
73 static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
74 pixel *src = (pixel*)_src;
75 int stride = _stride/sizeof(pixel);
76 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
78 ((pixel4*)(src+0*stride))[0]=
79 ((pixel4*)(src+1*stride))[0]=
80 ((pixel4*)(src+2*stride))[0]=
81 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);

// Fixed-DC fallbacks used when no neighbours are available: fill the block
// with the bit-depth midpoint (1<<(BIT_DEPTH-1)); the 127/129 variants fill
// with midpoint-1 / midpoint+1.
84 static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
85 pixel *src = (pixel*)_src;
86 int stride = _stride/sizeof(pixel);
87 ((pixel4*)(src+0*stride))[0]=
88 ((pixel4*)(src+1*stride))[0]=
89 ((pixel4*)(src+2*stride))[0]=
90 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));

93 static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
94 pixel *src = (pixel*)_src;
95 int stride = _stride/sizeof(pixel);
96 ((pixel4*)(src+0*stride))[0]=
97 ((pixel4*)(src+1*stride))[0]=
98 ((pixel4*)(src+2*stride))[0]=
99 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);

102 static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright, int _stride){
103 pixel *src = (pixel*)_src;
104 int stride = _stride/sizeof(pixel);
105 ((pixel4*)(src+0*stride))[0]=
106 ((pixel4*)(src+1*stride))[0]=
107 ((pixel4*)(src+2*stride))[0]=
108 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
// Helper macros naming the neighbouring edge pixels consumed by the
// directional 4x4 predictors below.  Every name is av_unused because each
// predictor only reads the subset it needs.

// t4..t7: the four pixels above-right of the block (topright array).
112 #define LOAD_TOP_RIGHT_EDGE\
113 const int av_unused t4= topright[0];\
114 const int av_unused t5= topright[1];\
115 const int av_unused t6= topright[2];\
116 const int av_unused t7= topright[3];\

// l4..l7: the four left-neighbour pixels below row 3 (down-left edge).
118 #define LOAD_DOWN_LEFT_EDGE\
119 const int av_unused l4= src[-1+4*stride];\
120 const int av_unused l5= src[-1+5*stride];\
121 const int av_unused l6= src[-1+6*stride];\
122 const int av_unused l7= src[-1+7*stride];\

// l0..l3: the four pixels immediately left of the block.
124 #define LOAD_LEFT_EDGE\
125 const int av_unused l0= src[-1+0*stride];\
126 const int av_unused l1= src[-1+1*stride];\
127 const int av_unused l2= src[-1+2*stride];\
128 const int av_unused l3= src[-1+3*stride];\

// t0..t3: the four pixels immediately above the block.
130 #define LOAD_TOP_EDGE\
131 const int av_unused t0= src[ 0-1*stride];\
132 const int av_unused t1= src[ 1-1*stride];\
133 const int av_unused t2= src[ 2-1*stride];\
134 const int av_unused t3= src[ 3-1*stride];\
// Diagonal down-right prediction: each output is a (1,2,1)/4 filtered
// average of lt (top-left), the left edge (l0..l3) and the top edge
// (t0..t3), constant along the down-right diagonal.
136 static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, int _stride){
137 pixel *src = (pixel*)_src;
138 int stride = _stride/sizeof(pixel);
139 const int lt= src[-1-1*stride]; // top-left corner pixel
143 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
145 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
148 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
152 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
155 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
157 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
158 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;

// Diagonal down-left prediction: (1,2,1)/4 filtering over the top edge
// (t0..t3) and the top-right pixels (t4..t7), constant along the
// down-left diagonal; the last sample uses (t6 + 3*t7).
161 static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
162 pixel *src = (pixel*)_src;
163 const pixel *topright = (const pixel*)_topright;
164 int stride = _stride/sizeof(pixel);
169 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
171 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
174 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
178 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
181 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
183 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
184 src[3+3*stride]=(t6 + 3*t7 + 2)>>2;

// Vertical-right prediction: mixes two-tap (x+y+1)>>1 half-pel averages
// with three-tap (1,2,1)/4 filtered values of lt, the top edge and the
// upper left-edge pixels.
187 static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, const uint8_t *topright, int _stride){
188 pixel *src = (pixel*)_src;
189 int stride = _stride/sizeof(pixel);
190 const int lt= src[-1-1*stride]; // top-left corner pixel
195 src[1+2*stride]=(lt + t0 + 1)>>1;
197 src[2+2*stride]=(t0 + t1 + 1)>>1;
199 src[3+2*stride]=(t1 + t2 + 1)>>1;
200 src[3+0*stride]=(t2 + t3 + 1)>>1;
202 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
204 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
206 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
207 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
208 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
209 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;

// Vertical-left prediction: two-tap averages and (1,2,1)/4 filters over
// the top edge and the top-right pixels (t4..t6).
212 static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, const uint8_t *_topright, int _stride){
213 pixel *src = (pixel*)_src;
214 const pixel *topright = (const pixel*)_topright;
215 int stride = _stride/sizeof(pixel);
219 src[0+0*stride]=(t0 + t1 + 1)>>1;
221 src[0+2*stride]=(t1 + t2 + 1)>>1;
223 src[1+2*stride]=(t2 + t3 + 1)>>1;
225 src[2+2*stride]=(t3 + t4+ 1)>>1;
226 src[3+2*stride]=(t4 + t5+ 1)>>1;
227 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
229 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
231 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
233 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
234 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;

// Horizontal-up prediction: interpolates down the left edge (l0..l3);
// the doubled l3 in the last filter is the spec's edge replication,
// not a typo.
237 static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, int _stride){
238 pixel *src = (pixel*)_src;
239 int stride = _stride/sizeof(pixel);
242 src[0+0*stride]=(l0 + l1 + 1)>>1;
243 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
245 src[0+1*stride]=(l1 + l2 + 1)>>1;
247 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
249 src[0+2*stride]=(l2 + l3 + 1)>>1;
251 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;

// Horizontal-down prediction: mixes half-pel averages along the left edge
// with (1,2,1)/4 filters spanning lt and the top edge.
260 static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, const uint8_t *topright, int _stride){
261 pixel *src = (pixel*)_src;
262 int stride = _stride/sizeof(pixel);
263 const int lt= src[-1-1*stride]; // top-left corner pixel
268 src[2+1*stride]=(lt + l0 + 1)>>1;
270 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
271 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
272 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
274 src[2+2*stride]=(l0 + l1 + 1)>>1;
276 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
278 src[2+3*stride]=(l1 + l2+ 1)>>1;
280 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
281 src[0+3*stride]=(l2 + l3 + 1)>>1;
282 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
// 16x16 vertical prediction: copy the 16 pixels above (four pixel4 words)
// into every row of the macroblock.
285 static void FUNCC(pred16x16_vertical)(uint8_t *_src, int _stride){
287 pixel *src = (pixel*)_src;
288 int stride = _stride/sizeof(pixel);
289 const pixel4 a = ((pixel4*)(src-stride))[0];
290 const pixel4 b = ((pixel4*)(src-stride))[1];
291 const pixel4 c = ((pixel4*)(src-stride))[2];
292 const pixel4 d = ((pixel4*)(src-stride))[3];
295 ((pixel4*)(src+i*stride))[0] = a;
296 ((pixel4*)(src+i*stride))[1] = b;
297 ((pixel4*)(src+i*stride))[2] = c;
298 ((pixel4*)(src+i*stride))[3] = d;

// 16x16 horizontal prediction: splat each row's left neighbour across
// all 16 pixels of the row.
302 static void FUNCC(pred16x16_horizontal)(uint8_t *_src, int stride){
304 pixel *src = (pixel*)_src;
305 stride /= sizeof(pixel);
308 ((pixel4*)(src+i*stride))[0] =
309 ((pixel4*)(src+i*stride))[1] =
310 ((pixel4*)(src+i*stride))[2] =
311 ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);

// Fill all 16 rows of the macroblock with the pixel4 value v.
315 #define PREDICT_16x16_DC(v)\
316 for(i=0; i<16; i++){\

// Full 16x16 DC: sums the 16 left and 16 top neighbours, rounds by 16
// and divides by 32.
324 static void FUNCC(pred16x16_dc)(uint8_t *_src, int stride){
326 pixel *src = (pixel*)_src;
328 stride /= sizeof(pixel);
331 dc+= src[-1+i*stride];
338 dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
339 PREDICT_16x16_DC(dcsplat);

// Left-only 16x16 DC (top edge unavailable): 16 samples, (dc+8)>>4.
342 static void FUNCC(pred16x16_left_dc)(uint8_t *_src, int stride){
344 pixel *src = (pixel*)_src;
346 stride /= sizeof(pixel);
349 dc+= src[-1+i*stride];
352 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
353 PREDICT_16x16_DC(dcsplat);

// Top-only 16x16 DC (left edge unavailable): 16 samples, (dc+8)>>4.
356 static void FUNCC(pred16x16_top_dc)(uint8_t *_src, int stride){
358 pixel *src = (pixel*)_src;
360 stride /= sizeof(pixel);
366 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
367 PREDICT_16x16_DC(dcsplat);

// Generates the fixed-value 16x16 DC fallbacks (no neighbours available):
// 127/128/129 name the value at 8-bit depth; v scales with BIT_DEPTH.
370 #define PRED16x16_X(n, v) \
371 static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, int stride){\
373 pixel *src = (pixel*)_src;\
374 stride /= sizeof(pixel);\
375 PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\

378 PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
379 PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
380 PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
// 16x16 plane (planar gradient) prediction, shared by H.264 and the
// SVQ3/RV40 variants.  H and V accumulate weighted differences of
// mirrored border pixels; svq3/rv40 select alternative gradient scalings
// (the selecting if/else lines presumably sit between the scaling pairs —
// NOTE(review): confirm against the full file).
382 static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, int _stride, const int svq3, const int rv40){
386 pixel *src = (pixel*)_src;
387 int stride = _stride/sizeof(pixel);
388 const pixel * const src0 = src +7-stride;
389 const pixel * src1 = src +8*stride-1;
390 const pixel * src2 = src1-2*stride; // == src+6*stride-1;
391 int H = src0[1] - src0[-1];
392 int V = src1[0] - src2[ 0];
393 for(k=2; k<=8; ++k) {
394 src1 += stride; src2 -= stride;
395 H += k*(src0[k] - src0[-k]);
396 V += k*(src1[0] - src2[ 0]);
399 H = ( 5*(H/4) ) / 16;
400 V = ( 5*(V/4) ) / 16;
402 /* required for 100% accuracy */
405 H = ( H + (H>>2) ) >> 4;
406 V = ( V + (V>>2) ) >> 4;
412 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
413 for(j=16; j>0; --j) {
416 for(i=-16; i<0; i+=4) {
417 src[16+i] = CLIP((b ) >> 5);
418 src[17+i] = CLIP((b+ H) >> 5);
419 src[18+i] = CLIP((b+2*H) >> 5);
420 src[19+i] = CLIP((b+3*H) >> 5);

// Plain H.264 plane prediction: the compat helper with both variant
// flags disabled.
427 static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
428 FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
// 8x8 (chroma) vertical prediction: copy the 8 pixels above into each row.
431 static void FUNCC(pred8x8_vertical)(uint8_t *_src, int _stride){
433 pixel *src = (pixel*)_src;
434 int stride = _stride/sizeof(pixel);
435 const pixel4 a= ((pixel4*)(src-stride))[0];
436 const pixel4 b= ((pixel4*)(src-stride))[1];
439 ((pixel4*)(src+i*stride))[0]= a;
440 ((pixel4*)(src+i*stride))[1]= b;

// 8x8 horizontal prediction: splat each row's left neighbour across it.
444 static void FUNCC(pred8x8_horizontal)(uint8_t *_src, int stride){
446 pixel *src = (pixel*)_src;
447 stride /= sizeof(pixel);
450 ((pixel4*)(src+i*stride))[0]=
451 ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);

// Generates the fixed-value 8x8 DC fallbacks (127/128/129 at 8-bit depth).
455 #define PRED8x8_X(n, v)\
456 static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, int stride){\
458 pixel *src = (pixel*)_src;\
459 stride /= sizeof(pixel);\
461 ((pixel4*)(src+i*stride))[0]=\
462 ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\

466 PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
467 PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
468 PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);

// 8x8 left-only DC: separate DC values for the top half (dc0, from the
// upper 4 left neighbours) and bottom half (dc2, from the lower 4).
470 static void FUNCC(pred8x8_left_dc)(uint8_t *_src, int stride){
473 pixel4 dc0splat, dc2splat;
474 pixel *src = (pixel*)_src;
475 stride /= sizeof(pixel);
479 dc0+= src[-1+i*stride];
480 dc2+= src[-1+(i+4)*stride];
482 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
483 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
486 ((pixel4*)(src+i*stride))[0]=
487 ((pixel4*)(src+i*stride))[1]= dc0splat;
490 ((pixel4*)(src+i*stride))[0]=
491 ((pixel4*)(src+i*stride))[1]= dc2splat;

// 8x8 top-only DC: dc0 from the left 4 top neighbours, dc1 from the
// right 4; each fills its 4x8 column half.
495 static void FUNCC(pred8x8_top_dc)(uint8_t *_src, int stride){
498 pixel4 dc0splat, dc1splat;
499 pixel *src = (pixel*)_src;
500 stride /= sizeof(pixel);
505 dc1+= src[4+i-stride];
507 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
508 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
511 ((pixel4*)(src+i*stride))[0]= dc0splat;
512 ((pixel4*)(src+i*stride))[1]= dc1splat;
515 ((pixel4*)(src+i*stride))[0]= dc0splat;
516 ((pixel4*)(src+i*stride))[1]= dc1splat;

// Full 8x8 DC: per-quadrant DC values — dc0 (top-left: top+left edges),
// dc1 (top-right: top edge), dc2 (bottom-left: left edge),
// dc3 (bottom-right: average of dc1 and dc2).
520 static void FUNCC(pred8x8_dc)(uint8_t *_src, int stride){
523 pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
524 pixel *src = (pixel*)_src;
525 stride /= sizeof(pixel);
529 dc0+= src[-1+i*stride] + src[i-stride];
530 dc1+= src[4+i-stride];
531 dc2+= src[-1+(i+4)*stride];
533 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
534 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
535 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
536 dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
539 ((pixel4*)(src+i*stride))[0]= dc0splat;
540 ((pixel4*)(src+i*stride))[1]= dc1splat;
543 ((pixel4*)(src+i*stride))[0]= dc2splat;
544 ((pixel4*)(src+i*stride))[1]= dc3splat;
548 // The following 4 functions should not be optimized!
// Each builds a partial-availability 8x8 DC prediction by running a full
// 8x8 DC variant and then overwriting individual 4x4 quadrants with a
// 4x4 DC predictor, so the later calls must not be reordered/merged.
549 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
550 FUNCC(pred8x8_top_dc)(src, stride);
551 FUNCC(pred4x4_dc)(src, NULL, stride); // redo top-left quadrant with top+left DC
554 static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
555 FUNCC(pred8x8_dc)(src, stride);
556 FUNCC(pred4x4_top_dc)(src, NULL, stride); // redo top-left quadrant with top-only DC
559 static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
560 FUNCC(pred8x8_left_dc)(src, stride);
// bottom half has no usable neighbours: fill both quadrants with midpoint DC
561 FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
562 FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
565 static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
566 FUNCC(pred8x8_left_dc)(src, stride);
// top half has no usable neighbours: fill both quadrants with midpoint DC
567 FUNCC(pred4x4_128_dc)(src , NULL, stride);
568 FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
// 8x8 plane (planar gradient) prediction: H/V gradients from weighted
// differences of mirrored border pixels, scaled by (17*x+16)>>5, then a
// clipped linear ramp is written row by row.
571 static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
575 pixel *src = (pixel*)_src;
576 int stride = _stride/sizeof(pixel);
577 const pixel * const src0 = src +3-stride;
578 const pixel * src1 = src +4*stride-1;
579 const pixel * src2 = src1-2*stride; // == src+2*stride-1;
580 int H = src0[1] - src0[-1];
581 int V = src1[0] - src2[ 0];
582 for(k=2; k<=4; ++k) {
583 src1 += stride; src2 -= stride;
584 H += k*(src0[k] - src0[-k]);
585 V += k*(src1[0] - src2[ 0]);
587 H = ( 17*H+16 ) >> 5;
588 V = ( 17*V+16 ) >> 5;
590 a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
594 src[0] = CLIP((b ) >> 5);
595 src[1] = CLIP((b+ H) >> 5);
596 src[2] = CLIP((b+2*H) >> 5);
597 src[3] = CLIP((b+3*H) >> 5);
598 src[4] = CLIP((b+4*H) >> 5);
599 src[5] = CLIP((b+5*H) >> 5);
600 src[6] = CLIP((b+6*H) >> 5);
601 src[7] = CLIP((b+7*H) >> 5);
// Addressing helper for the 8x8 luma (8x8l) predictors.
606 #define SRC(x,y) src[(x)+(y)*stride]

// Body of the PL(y) helper: (1,2,1)/4-filtered left-edge sample l<y>.
608 const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

// Loads the filtered left edge l0..l7; l0 substitutes SRC(-1,0) for the
// top-left pixel when it is unavailable, l7 replicates the bottom sample.
609 #define PREDICT_8x8_LOAD_LEFT \
610 const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
611 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
612 PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
613 const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

// Body of the PT(x) helper: (1,2,1)/4-filtered top-edge sample t<x>.
616 const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

// Loads the filtered top edge t0..t7, handling missing top-left/top-right.
617 #define PREDICT_8x8_LOAD_TOP \
618 const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
619 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
620 PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
621 const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
622 + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

// Body of the PTR(x) helper: filtered top-right sample t<x>.
625 t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

// Loads t8..t15 from the top-right edge; when it is unavailable all eight
// are replicated from the last top pixel SRC(7,-1).
626 #define PREDICT_8x8_LOAD_TOPRIGHT \
627 int t8, t9, t10, t11, t12, t13, t14, t15; \
629 PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
630 t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
631 } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

// Filtered top-left corner sample.
633 #define PREDICT_8x8_LOAD_TOPLEFT \
634 const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

// Fills all 8 rows of the 8x8 block with the pixel4 value v.
636 #define PREDICT_8x8_DC(v) \
638 for( y = 0; y < 8; y++ ) { \
639 ((pixel4*)src)[0] = \
640 ((pixel4*)src)[1] = v; \
// 8x8 luma fixed DC: fill with the bit-depth midpoint.
644 static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
646 pixel *src = (pixel*)_src;
647 int stride = _stride/sizeof(pixel);
649 PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));

// 8x8 luma left-only DC: average of the 8 filtered left-edge samples.
651 static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
653 pixel *src = (pixel*)_src;
654 int stride = _stride/sizeof(pixel);
656 PREDICT_8x8_LOAD_LEFT;
657 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);

// 8x8 luma top-only DC: average of the 8 filtered top-edge samples.
660 static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
662 pixel *src = (pixel*)_src;
663 int stride = _stride/sizeof(pixel);
665 PREDICT_8x8_LOAD_TOP;
666 const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);

// 8x8 luma full DC: average of all 16 filtered edge samples.
669 static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
671 pixel *src = (pixel*)_src;
672 int stride = _stride/sizeof(pixel);
674 PREDICT_8x8_LOAD_LEFT;
675 PREDICT_8x8_LOAD_TOP;
676 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
677 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);

// 8x8 luma horizontal: splat the filtered left sample l<y> across row y.
680 static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
682 pixel *src = (pixel*)_src;
683 int stride = _stride/sizeof(pixel);
685 PREDICT_8x8_LOAD_LEFT;
686 #define ROW(y) ((pixel4*)(src+y*stride))[0] =\
687 ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
688 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);

// 8x8 luma vertical: write the filtered top edge into row 0, then copy
// row 0 into rows 1..7.
691 static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
694 pixel *src = (pixel*)_src;
695 int stride = _stride/sizeof(pixel);
697 PREDICT_8x8_LOAD_TOP;
706 for( y = 1; y < 8; y++ ) {
707 ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
708 ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
// 8x8 luma diagonal down-left: each anti-diagonal is a (1,2,1)/4 filter
// over consecutive filtered top/top-right samples t0..t15.
711 static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
713 pixel *src = (pixel*)_src;
714 int stride = _stride/sizeof(pixel);
715 PREDICT_8x8_LOAD_TOP;
716 PREDICT_8x8_LOAD_TOPRIGHT;
717 SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
718 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
719 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
720 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
721 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
722 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
723 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
724 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
725 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
726 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
727 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
728 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
729 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
730 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
731 SRC(7,7)= (t14 + 3*t15 + 2) >> 2;

// 8x8 luma diagonal down-right: diagonals run from the left edge through
// the corner (lt) into the top edge, all (1,2,1)/4 filtered.
733 static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
735 pixel *src = (pixel*)_src;
736 int stride = _stride/sizeof(pixel);
737 PREDICT_8x8_LOAD_TOP;
738 PREDICT_8x8_LOAD_LEFT;
739 PREDICT_8x8_LOAD_TOPLEFT;
740 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
741 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
742 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
743 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
744 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
745 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
746 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
747 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
748 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
749 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
750 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
751 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
752 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
753 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
754 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;

// 8x8 luma vertical-right: alternates half-pel (x+y+1)>>1 averages with
// (1,2,1)/4 filters along near-vertical diagonals.
756 static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
758 pixel *src = (pixel*)_src;
759 int stride = _stride/sizeof(pixel);
760 PREDICT_8x8_LOAD_TOP;
761 PREDICT_8x8_LOAD_LEFT;
762 PREDICT_8x8_LOAD_TOPLEFT;
763 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
764 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
765 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
766 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
767 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
768 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
769 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
770 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
771 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
772 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
773 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
774 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
775 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
776 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
777 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
778 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
779 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
780 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
781 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
782 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
783 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
784 SRC(7,0)= (t6 + t7 + 1) >> 1;

// 8x8 luma horizontal-down: alternates half-pel averages and (1,2,1)/4
// filters along near-horizontal diagonals through lt.
786 static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
788 pixel *src = (pixel*)_src;
789 int stride = _stride/sizeof(pixel);
790 PREDICT_8x8_LOAD_TOP;
791 PREDICT_8x8_LOAD_LEFT;
792 PREDICT_8x8_LOAD_TOPLEFT;
793 SRC(0,7)= (l6 + l7 + 1) >> 1;
794 SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
795 SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
796 SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
797 SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
798 SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
799 SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
800 SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
801 SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
802 SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
803 SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
804 SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
805 SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
806 SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
807 SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
808 SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
809 SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
810 SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
811 SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
812 SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
813 SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
814 SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;

// 8x8 luma vertical-left: half-pel averages and (1,2,1)/4 filters over
// the top edge extended into the top-right samples t8..t12.
816 static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
818 pixel *src = (pixel*)_src;
819 int stride = _stride/sizeof(pixel);
820 PREDICT_8x8_LOAD_TOP;
821 PREDICT_8x8_LOAD_TOPRIGHT;
822 SRC(0,0)= (t0 + t1 + 1) >> 1;
823 SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
824 SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
825 SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
826 SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
827 SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
828 SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
829 SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
830 SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
831 SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
832 SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
833 SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
834 SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
835 SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
836 SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
837 SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
838 SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
839 SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
840 SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
841 SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
842 SRC(7,6)= (t10 + t11 + 1) >> 1;
843 SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;

// 8x8 luma horizontal-up: interpolates down the filtered left edge;
// everything past the last interpolated diagonal is replicated from l7.
845 static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, int has_topright, int _stride)
847 pixel *src = (pixel*)_src;
848 int stride = _stride/sizeof(pixel);
849 PREDICT_8x8_LOAD_LEFT;
850 SRC(0,0)= (l0 + l1 + 1) >> 1;
851 SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
852 SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
853 SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
854 SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
855 SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
856 SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
857 SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
858 SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
859 SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
860 SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
861 SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
862 SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
863 SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
864 SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
865 SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
866 SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
867 SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
869 #undef PREDICT_8x8_LOAD_LEFT
870 #undef PREDICT_8x8_LOAD_TOP
871 #undef PREDICT_8x8_LOAD_TOPLEFT
872 #undef PREDICT_8x8_LOAD_TOPRIGHT
873 #undef PREDICT_8x8_DC
// DC-only residual propagation, 4x4 vertical: a running sum v of the
// first-column DCT coefficients (block[0], block[4], ...) is accumulated
// down each column and written over the pixels.
879 static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
881 pixel *pix = (pixel*)_pix;
882 const dctcoef *block = (const dctcoef*)_block;
883 stride /= sizeof(pixel); // byte stride -> pixel stride
887 pix[1*stride]= v += block[0];
888 pix[2*stride]= v += block[4];
889 pix[3*stride]= v += block[8];
890 pix[4*stride]= v + block[12]; // last row: no further accumulation needed

// DC-only residual propagation, 4x4 horizontal: running sum of the
// first-row coefficients accumulated along each row.
896 static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
898 pixel *pix = (pixel*)_pix;
899 const dctcoef *block = (const dctcoef*)_block;
900 stride /= sizeof(pixel);
903 pix[0]= v += block[0];
904 pix[1]= v += block[1];
905 pix[2]= v += block[2];
906 pix[3]= v + block[3];

// Same vertical propagation for the 8x8 luma transform (coefficient
// stride 8 between rows of the block).
912 static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
914 pixel *pix = (pixel*)_pix;
915 const dctcoef *block = (const dctcoef*)_block;
916 stride /= sizeof(pixel);
920 pix[1*stride]= v += block[0];
921 pix[2*stride]= v += block[8];
922 pix[3*stride]= v += block[16];
923 pix[4*stride]= v += block[24];
924 pix[5*stride]= v += block[32];
925 pix[6*stride]= v += block[40];
926 pix[7*stride]= v += block[48];
927 pix[8*stride]= v + block[56];

// Same horizontal propagation for the 8x8 luma transform.
933 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const DCTELEM *_block, int stride){
935 pixel *pix = (pixel*)_pix;
936 const dctcoef *block = (const dctcoef*)_block;
937 stride /= sizeof(pixel);
940 pix[0]= v += block[0];
941 pix[1]= v += block[1];
942 pix[2]= v += block[2];
943 pix[3]= v += block[3];
944 pix[4]= v += block[4];
945 pix[5]= v += block[5];
946 pix[6]= v += block[6];
947 pix[7]= v + block[7];
// Wrappers applying the 4x4 vertical/horizontal residual-add to every
// sub-block of a 16x16 or 8x8 region; block_offset gives each 4x4's
// pixel offset, and each sub-block's coefficients are 16 dctcoefs apart.
953 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
956 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);

959 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
962 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);

965 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
968 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);

971 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
974 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);