2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * H.264 / AVC / MPEG4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
29 #include "h264_high_depth.h"
31 static void FUNCC(pred4x4_vertical)(uint8_t *p_src, const uint8_t *topright, int p_stride){
32 pixel *src = (pixel*)p_src;
33 int stride = p_stride>>(sizeof(pixel)-1);
34 const pixel4 a= ((pixel4*)(src-stride))[0];
35 ((pixel4*)(src+0*stride))[0]= a;
36 ((pixel4*)(src+1*stride))[0]= a;
37 ((pixel4*)(src+2*stride))[0]= a;
38 ((pixel4*)(src+3*stride))[0]= a;
41 static void FUNCC(pred4x4_horizontal)(uint8_t *p_src, const uint8_t *topright, int p_stride){
42 pixel *src = (pixel*)p_src;
43 int stride = p_stride>>(sizeof(pixel)-1);
44 ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
45 ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
46 ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
47 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
50 static void FUNCC(pred4x4_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
51 pixel *src = (pixel*)p_src;
52 int stride = p_stride>>(sizeof(pixel)-1);
53 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
54 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
56 ((pixel4*)(src+0*stride))[0]=
57 ((pixel4*)(src+1*stride))[0]=
58 ((pixel4*)(src+2*stride))[0]=
59 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
62 static void FUNCC(pred4x4_left_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
63 pixel *src = (pixel*)p_src;
64 int stride = p_stride>>(sizeof(pixel)-1);
65 const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
67 ((pixel4*)(src+0*stride))[0]=
68 ((pixel4*)(src+1*stride))[0]=
69 ((pixel4*)(src+2*stride))[0]=
70 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
73 static void FUNCC(pred4x4_top_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
74 pixel *src = (pixel*)p_src;
75 int stride = p_stride>>(sizeof(pixel)-1);
76 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
78 ((pixel4*)(src+0*stride))[0]=
79 ((pixel4*)(src+1*stride))[0]=
80 ((pixel4*)(src+2*stride))[0]=
81 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
84 static void FUNCC(pred4x4_128_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
85 pixel *src = (pixel*)p_src;
86 int stride = p_stride>>(sizeof(pixel)-1);
87 ((pixel4*)(src+0*stride))[0]=
88 ((pixel4*)(src+1*stride))[0]=
89 ((pixel4*)(src+2*stride))[0]=
90 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
93 static void FUNCC(pred4x4_127_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
94 pixel *src = (pixel*)p_src;
95 int stride = p_stride>>(sizeof(pixel)-1);
96 ((pixel4*)(src+0*stride))[0]=
97 ((pixel4*)(src+1*stride))[0]=
98 ((pixel4*)(src+2*stride))[0]=
99 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
102 static void FUNCC(pred4x4_129_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
103 pixel *src = (pixel*)p_src;
104 int stride = p_stride>>(sizeof(pixel)-1);
105 ((pixel4*)(src+0*stride))[0]=
106 ((pixel4*)(src+1*stride))[0]=
107 ((pixel4*)(src+2*stride))[0]=
108 ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
/*
 * Neighbour-loading helpers for the 4x4 predictors.  Each macro declares
 * a set of int locals (t4..t7, l4..l7, l0..l3, t0..t3) copied from the
 * reconstructed pixels around the current block; they expect `src`,
 * `stride` (and `topright` where used) to be in scope at the point of
 * expansion.  av_unused silences warnings for predictors that only use
 * a subset of the values.
 */

/* t4..t7: the 4 pixels above-right of the block */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];\

/* l4..l7: the 4 pixels below-left of the block (left-edge rows 4..7) */
#define LOAD_DOWN_LEFT_EDGE\
    const int av_unused l4= src[-1+4*stride];\
    const int av_unused l5= src[-1+5*stride];\
    const int av_unused l6= src[-1+6*stride];\
    const int av_unused l7= src[-1+7*stride];\

/* l0..l3: the 4 pixels immediately left of the block */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];\

/* t0..t3: the 4 pixels immediately above the block */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];\

136 static void FUNCC(pred4x4_vertical_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
137 pixel *src = (pixel*)p_src;
138 const pixel *topright = (const pixel*)p_topright;
139 int stride = p_stride>>(sizeof(pixel)-1);
140 const int lt= src[-1-1*stride];
143 pixel4 v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
144 (t0 + 2*t1 + t2 + 2) >> 2,
145 (t1 + 2*t2 + t3 + 2) >> 2,
146 (t2 + 2*t3 + t4 + 2) >> 2);
148 AV_WN4PA(src+0*stride, v);
149 AV_WN4PA(src+1*stride, v);
150 AV_WN4PA(src+2*stride, v);
151 AV_WN4PA(src+3*stride, v);
154 static void FUNCC(pred4x4_horizontal_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
155 pixel *src = (pixel*)p_src;
156 int stride = p_stride>>(sizeof(pixel)-1);
157 const int lt= src[-1-1*stride];
160 AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4((lt + 2*l0 + l1 + 2) >> 2));
161 AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4((l0 + 2*l1 + l2 + 2) >> 2));
162 AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4((l1 + 2*l2 + l3 + 2) >> 2));
163 AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4((l2 + 2*l3 + l3 + 2) >> 2));
166 static void FUNCC(pred4x4_down_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
167 pixel *src = (pixel*)p_src;
168 int stride = p_stride>>(sizeof(pixel)-1);
169 const int lt= src[-1-1*stride];
173 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
175 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
178 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
182 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
185 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
187 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
188 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
191 static void FUNCC(pred4x4_down_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
192 pixel *src = (pixel*)p_src;
193 const pixel *topright = (const pixel*)p_topright;
194 int stride = p_stride>>(sizeof(pixel)-1);
199 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
201 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
204 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
208 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
211 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
213 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
214 src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
217 static void FUNCC(pred4x4_down_left_svq3)(uint8_t *p_src, const uint8_t *topright, int p_stride){
218 pixel *src = (pixel*)p_src;
219 int stride = p_stride>>(sizeof(pixel)-1);
222 const av_unused int unu0= t0;
223 const av_unused int unu1= l0;
225 src[0+0*stride]=(l1 + t1)>>1;
227 src[0+1*stride]=(l2 + t2)>>1;
240 src[3+3*stride]=(l3 + t3)>>1;
243 static void FUNCC(pred4x4_down_left_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
244 pixel *src = (pixel*)p_src;
245 const pixel *topright = (const pixel*)p_topright;
246 int stride = p_stride>>(sizeof(pixel)-1);
252 src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
254 src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
257 src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
261 src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
264 src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
266 src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
267 src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
270 static void FUNCC(pred4x4_down_left_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
271 pixel *src = (pixel*)p_src;
272 const pixel *topright = (const pixel*)p_topright;
273 int stride = p_stride>>(sizeof(pixel)-1);
278 src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
280 src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
283 src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
287 src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
290 src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
292 src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
293 src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
296 static void FUNCC(pred4x4_vertical_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
297 pixel *src = (pixel*)p_src;
298 int stride = p_stride>>(sizeof(pixel)-1);
299 const int lt= src[-1-1*stride];
304 src[1+2*stride]=(lt + t0 + 1)>>1;
306 src[2+2*stride]=(t0 + t1 + 1)>>1;
308 src[3+2*stride]=(t1 + t2 + 1)>>1;
309 src[3+0*stride]=(t2 + t3 + 1)>>1;
311 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
313 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
315 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
316 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
317 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
318 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
321 static void FUNCC(pred4x4_vertical_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
322 pixel *src = (pixel*)p_src;
323 const pixel *topright = (const pixel*)p_topright;
324 int stride = p_stride>>(sizeof(pixel)-1);
328 src[0+0*stride]=(t0 + t1 + 1)>>1;
330 src[0+2*stride]=(t1 + t2 + 1)>>1;
332 src[1+2*stride]=(t2 + t3 + 1)>>1;
334 src[2+2*stride]=(t3 + t4+ 1)>>1;
335 src[3+2*stride]=(t4 + t5+ 1)>>1;
336 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
338 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
340 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
342 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
343 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
346 static void FUNCC(pred4x4_vertical_left_rv40_internal)(uint8_t *p_src, const uint8_t *p_topright, int p_stride,
347 const int l0, const int l1, const int l2, const int l3, const int l4){
348 pixel *src = (pixel*)p_src;
349 const pixel *topright = (const pixel*)p_topright;
350 int stride = p_stride>>(sizeof(pixel)-1);
354 src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
356 src[0+2*stride]=(t1 + t2 + 1)>>1;
358 src[1+2*stride]=(t2 + t3 + 1)>>1;
360 src[2+2*stride]=(t3 + t4+ 1)>>1;
361 src[3+2*stride]=(t4 + t5+ 1)>>1;
362 src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
364 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
366 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
368 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
369 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
372 static void FUNCC(pred4x4_vertical_left_rv40)(uint8_t *p_src, const uint8_t *topright, int p_stride){
373 pixel *src = (pixel*)p_src;
374 int stride = p_stride>>(sizeof(pixel)-1);
378 FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l4);
381 static void FUNCC(pred4x4_vertical_left_rv40_nodown)(uint8_t *p_src, const uint8_t *topright, int p_stride){
382 pixel *src = (pixel*)p_src;
383 int stride = p_stride>>(sizeof(pixel)-1);
386 FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l3);
389 static void FUNCC(pred4x4_vertical_left_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
390 pixel *src = (pixel*)p_src;
391 const pixel *topright = (const pixel*)p_topright;
392 int stride = p_stride>>(sizeof(pixel)-1);
396 src[0+0*stride]=(t0 + t1 + 1)>>1;
398 src[0+2*stride]=(t1 + t2 + 1)>>1;
400 src[1+2*stride]=(t2 + t3 + 1)>>1;
402 src[2+2*stride]=(t3 + t4 + 1)>>1;
403 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
405 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
407 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
409 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
410 src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
411 src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
414 static void FUNCC(pred4x4_horizontal_up)(uint8_t *p_src, const uint8_t *topright, int p_stride){
415 pixel *src = (pixel*)p_src;
416 int stride = p_stride>>(sizeof(pixel)-1);
419 src[0+0*stride]=(l0 + l1 + 1)>>1;
420 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
422 src[0+1*stride]=(l1 + l2 + 1)>>1;
424 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
426 src[0+2*stride]=(l2 + l3 + 1)>>1;
428 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
437 static void FUNCC(pred4x4_horizontal_up_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
438 pixel *src = (pixel*)p_src;
439 const pixel *topright = (const pixel*)p_topright;
440 int stride = p_stride>>(sizeof(pixel)-1);
446 src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
447 src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
449 src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
451 src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
453 src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
455 src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
457 src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
459 src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
460 src[2+3*stride]=(l4 + l5 + 1)>>1;
461 src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
464 static void FUNCC(pred4x4_horizontal_up_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
465 pixel *src = (pixel*)p_src;
466 const pixel *topright = (const pixel*)p_topright;
467 int stride = p_stride>>(sizeof(pixel)-1);
472 src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
473 src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
475 src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
477 src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
479 src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
481 src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
485 src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
490 static void FUNCC(pred4x4_horizontal_down)(uint8_t *p_src, const uint8_t *topright, int p_stride){
491 pixel *src = (pixel*)p_src;
492 int stride = p_stride>>(sizeof(pixel)-1);
493 const int lt= src[-1-1*stride];
498 src[2+1*stride]=(lt + l0 + 1)>>1;
500 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
501 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
502 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
504 src[2+2*stride]=(l0 + l1 + 1)>>1;
506 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
508 src[2+3*stride]=(l1 + l2+ 1)>>1;
510 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
511 src[0+3*stride]=(l2 + l3 + 1)>>1;
512 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
515 static void FUNCC(pred4x4_tm_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
516 pixel *src = (pixel*)p_src;
517 int stride = p_stride>>(sizeof(pixel)-1);
518 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
519 pixel *top = src-stride;
522 for (y = 0; y < 4; y++) {
523 uint8_t *cm_in = cm + src[-1];
524 src[0] = cm_in[top[0]];
525 src[1] = cm_in[top[1]];
526 src[2] = cm_in[top[2]];
527 src[3] = cm_in[top[3]];
532 static void FUNCC(pred16x16_vertical)(uint8_t *p_src, int p_stride){
534 pixel *src = (pixel*)p_src;
535 int stride = p_stride>>(sizeof(pixel)-1);
536 const pixel4 a = ((pixel4*)(src-stride))[0];
537 const pixel4 b = ((pixel4*)(src-stride))[1];
538 const pixel4 c = ((pixel4*)(src-stride))[2];
539 const pixel4 d = ((pixel4*)(src-stride))[3];
542 ((pixel4*)(src+i*stride))[0] = a;
543 ((pixel4*)(src+i*stride))[1] = b;
544 ((pixel4*)(src+i*stride))[2] = c;
545 ((pixel4*)(src+i*stride))[3] = d;
549 static void FUNCC(pred16x16_horizontal)(uint8_t *p_src, int stride){
551 pixel *src = (pixel*)p_src;
552 stride >>= sizeof(pixel)-1;
555 ((pixel4*)(src+i*stride))[0] =
556 ((pixel4*)(src+i*stride))[1] =
557 ((pixel4*)(src+i*stride))[2] =
558 ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
/* Fill the whole 16x16 block with the pixel4 value v; expects `src`,
 * `stride` and an `int i` in scope.  Restored the macro body (the four
 * pixel4 stores per row) lost in extraction. */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        ((pixel4*)(src+i*stride))[0]=\
        ((pixel4*)(src+i*stride))[1]=\
        ((pixel4*)(src+i*stride))[2]=\
        ((pixel4*)(src+i*stride))[3]= v;\
    }
571 static void FUNCC(pred16x16_dc)(uint8_t *p_src, int stride){
573 pixel *src = (pixel*)p_src;
575 stride >>= sizeof(pixel)-1;
578 dc+= src[-1+i*stride];
585 dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
586 PREDICT_16x16_DC(dcsplat);
589 static void FUNCC(pred16x16_left_dc)(uint8_t *p_src, int stride){
591 pixel *src = (pixel*)p_src;
593 stride >>= sizeof(pixel)-1;
596 dc+= src[-1+i*stride];
599 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
600 PREDICT_16x16_DC(dcsplat);
603 static void FUNCC(pred16x16_top_dc)(uint8_t *p_src, int stride){
605 pixel *src = (pixel*)p_src;
607 stride >>= sizeof(pixel)-1;
613 dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
614 PREDICT_16x16_DC(dcsplat);
617 #define PRED16x16_X(n, v) \
618 static void FUNCC(pred16x16_##n##_dc)(uint8_t *p_src, int stride){\
620 pixel *src = (pixel*)p_src;\
621 stride >>= sizeof(pixel)-1;\
622 PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
625 PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
626 PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
627 PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
629 static inline void FUNCC(pred16x16_plane_compat)(uint8_t *p_src, int p_stride, const int svq3, const int rv40){
633 pixel *src = (pixel*)p_src;
634 int stride = p_stride>>(sizeof(pixel)-1);
635 const pixel * const src0 = src +7-stride;
636 const pixel * src1 = src +8*stride-1;
637 const pixel * src2 = src1-2*stride; // == src+6*stride-1;
638 int H = src0[1] - src0[-1];
639 int V = src1[0] - src2[ 0];
640 for(k=2; k<=8; ++k) {
641 src1 += stride; src2 -= stride;
642 H += k*(src0[k] - src0[-k]);
643 V += k*(src1[0] - src2[ 0]);
646 H = ( 5*(H/4) ) / 16;
647 V = ( 5*(V/4) ) / 16;
649 /* required for 100% accuracy */
652 H = ( H + (H>>2) ) >> 4;
653 V = ( V + (V>>2) ) >> 4;
659 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
660 for(j=16; j>0; --j) {
663 for(i=-16; i<0; i+=4) {
664 src[16+i] = CLIP((b ) >> 5);
665 src[17+i] = CLIP((b+ H) >> 5);
666 src[18+i] = CLIP((b+2*H) >> 5);
667 src[19+i] = CLIP((b+3*H) >> 5);
674 static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
675 FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
678 static void FUNCC(pred16x16_plane_svq3)(uint8_t *src, int stride){
679 FUNCC(pred16x16_plane_compat)(src, stride, 1, 0);
682 static void FUNCC(pred16x16_plane_rv40)(uint8_t *src, int stride){
683 FUNCC(pred16x16_plane_compat)(src, stride, 0, 1);
686 static void FUNCC(pred16x16_tm_vp8)(uint8_t *src, int stride){
687 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
688 uint8_t *top = src-stride;
691 for (y = 0; y < 16; y++) {
692 uint8_t *cm_in = cm + src[-1];
693 src[0] = cm_in[top[0]];
694 src[1] = cm_in[top[1]];
695 src[2] = cm_in[top[2]];
696 src[3] = cm_in[top[3]];
697 src[4] = cm_in[top[4]];
698 src[5] = cm_in[top[5]];
699 src[6] = cm_in[top[6]];
700 src[7] = cm_in[top[7]];
701 src[8] = cm_in[top[8]];
702 src[9] = cm_in[top[9]];
703 src[10] = cm_in[top[10]];
704 src[11] = cm_in[top[11]];
705 src[12] = cm_in[top[12]];
706 src[13] = cm_in[top[13]];
707 src[14] = cm_in[top[14]];
708 src[15] = cm_in[top[15]];
713 static void FUNCC(pred8x8_vertical)(uint8_t *p_src, int p_stride){
715 pixel *src = (pixel*)p_src;
716 int stride = p_stride>>(sizeof(pixel)-1);
717 const pixel4 a= ((pixel4*)(src-stride))[0];
718 const pixel4 b= ((pixel4*)(src-stride))[1];
721 ((pixel4*)(src+i*stride))[0]= a;
722 ((pixel4*)(src+i*stride))[1]= b;
726 static void FUNCC(pred8x8_horizontal)(uint8_t *p_src, int stride){
728 pixel *src = (pixel*)p_src;
729 stride >>= sizeof(pixel)-1;
732 ((pixel4*)(src+i*stride))[0]=
733 ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
737 #define PRED8x8_X(n, v)\
738 static void FUNCC(pred8x8_##n##_dc)(uint8_t *p_src, int stride){\
740 pixel *src = (pixel*)p_src;\
741 stride >>= sizeof(pixel)-1;\
743 ((pixel4*)(src+i*stride))[0]=\
744 ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
748 PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
749 PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
750 PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
752 static void FUNCC(pred8x8_left_dc)(uint8_t *p_src, int stride){
755 pixel4 dc0splat, dc2splat;
756 pixel *src = (pixel*)p_src;
757 stride >>= sizeof(pixel)-1;
761 dc0+= src[-1+i*stride];
762 dc2+= src[-1+(i+4)*stride];
764 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
765 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
768 ((pixel4*)(src+i*stride))[0]=
769 ((pixel4*)(src+i*stride))[1]= dc0splat;
772 ((pixel4*)(src+i*stride))[0]=
773 ((pixel4*)(src+i*stride))[1]= dc2splat;
777 static void FUNCC(pred8x8_left_dc_rv40)(uint8_t *p_src, int stride){
781 pixel *src = (pixel*)p_src;
782 stride >>= sizeof(pixel)-1;
786 dc0+= src[-1+i*stride];
787 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
790 ((pixel4*)(src+i*stride))[0]=
791 ((pixel4*)(src+i*stride))[1]= dc0splat;
795 static void FUNCC(pred8x8_top_dc)(uint8_t *p_src, int stride){
798 pixel4 dc0splat, dc1splat;
799 pixel *src = (pixel*)p_src;
800 stride >>= sizeof(pixel)-1;
805 dc1+= src[4+i-stride];
807 dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
808 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
811 ((pixel4*)(src+i*stride))[0]= dc0splat;
812 ((pixel4*)(src+i*stride))[1]= dc1splat;
815 ((pixel4*)(src+i*stride))[0]= dc0splat;
816 ((pixel4*)(src+i*stride))[1]= dc1splat;
820 static void FUNCC(pred8x8_top_dc_rv40)(uint8_t *p_src, int stride){
824 pixel *src = (pixel*)p_src;
825 stride >>= sizeof(pixel)-1;
830 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
833 ((pixel4*)(src+i*stride))[0]=
834 ((pixel4*)(src+i*stride))[1]= dc0splat;
839 static void FUNCC(pred8x8_dc)(uint8_t *p_src, int stride){
842 pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
843 pixel *src = (pixel*)p_src;
844 stride >>= sizeof(pixel)-1;
848 dc0+= src[-1+i*stride] + src[i-stride];
849 dc1+= src[4+i-stride];
850 dc2+= src[-1+(i+4)*stride];
852 dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
853 dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
854 dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
855 dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
858 ((pixel4*)(src+i*stride))[0]= dc0splat;
859 ((pixel4*)(src+i*stride))[1]= dc1splat;
862 ((pixel4*)(src+i*stride))[0]= dc2splat;
863 ((pixel4*)(src+i*stride))[1]= dc3splat;
867 //the following 4 function should not be optimized!
868 static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
869 FUNCC(pred8x8_top_dc)(src, stride);
870 FUNCC(pred4x4_dc)(src, NULL, stride);
873 static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
874 FUNCC(pred8x8_dc)(src, stride);
875 FUNCC(pred4x4_top_dc)(src, NULL, stride);
878 static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
879 FUNCC(pred8x8_left_dc)(src, stride);
880 FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride);
881 FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
884 static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
885 FUNCC(pred8x8_left_dc)(src, stride);
886 FUNCC(pred4x4_128_dc)(src , NULL, stride);
887 FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
890 static void FUNCC(pred8x8_dc_rv40)(uint8_t *p_src, int stride){
894 pixel *src = (pixel*)p_src;
895 stride >>= sizeof(pixel)-1;
898 dc0+= src[-1+i*stride] + src[i-stride];
899 dc0+= src[4+i-stride];
900 dc0+= src[-1+(i+4)*stride];
902 dc0splat = PIXEL_SPLAT_X4((dc0 + 8)>>4);
905 ((pixel4*)(src+i*stride))[0]= dc0splat;
906 ((pixel4*)(src+i*stride))[1]= dc0splat;
909 ((pixel4*)(src+i*stride))[0]= dc0splat;
910 ((pixel4*)(src+i*stride))[1]= dc0splat;
914 static void FUNCC(pred8x8_plane)(uint8_t *p_src, int p_stride){
918 pixel *src = (pixel*)p_src;
919 int stride = p_stride>>(sizeof(pixel)-1);
920 const pixel * const src0 = src +3-stride;
921 const pixel * src1 = src +4*stride-1;
922 const pixel * src2 = src1-2*stride; // == src+2*stride-1;
923 int H = src0[1] - src0[-1];
924 int V = src1[0] - src2[ 0];
925 for(k=2; k<=4; ++k) {
926 src1 += stride; src2 -= stride;
927 H += k*(src0[k] - src0[-k]);
928 V += k*(src1[0] - src2[ 0]);
930 H = ( 17*H+16 ) >> 5;
931 V = ( 17*V+16 ) >> 5;
933 a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
937 src[0] = CLIP((b ) >> 5);
938 src[1] = CLIP((b+ H) >> 5);
939 src[2] = CLIP((b+2*H) >> 5);
940 src[3] = CLIP((b+3*H) >> 5);
941 src[4] = CLIP((b+4*H) >> 5);
942 src[5] = CLIP((b+5*H) >> 5);
943 src[6] = CLIP((b+6*H) >> 5);
944 src[7] = CLIP((b+7*H) >> 5);
949 static void FUNCC(pred8x8_tm_vp8)(uint8_t *p_src, int p_stride){
950 pixel *src = (pixel*)p_src;
951 int stride = p_stride>>(sizeof(pixel)-1);
952 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
953 pixel *top = src-stride;
956 for (y = 0; y < 8; y++) {
957 uint8_t *cm_in = cm + src[-1];
958 src[0] = cm_in[top[0]];
959 src[1] = cm_in[top[1]];
960 src[2] = cm_in[top[2]];
961 src[3] = cm_in[top[3]];
962 src[4] = cm_in[top[4]];
963 src[5] = cm_in[top[5]];
964 src[6] = cm_in[top[6]];
965 src[7] = cm_in[top[7]];
/* Helpers for the 8x8 luma (pred8x8l) predictors.  SRC addresses a
 * pixel of the current block; the PREDICT_8x8_LOAD_* macros declare
 * 3-tap filtered edge samples (l0..l7, t0..t15, lt) as int locals,
 * honouring the has_topleft / has_topright availability flags.
 * Restored the `#define PL/PT/PTR` header lines, the
 * `if(has_topright)` guard and the PREDICT_8x8_DC body lost in
 * extraction. */
#define SRC(x,y) src[(x)+(y)*stride]

#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((pixel4*)src)[0] = \
        ((pixel4*)src)[1] = v; \
        src += stride; \
    }
1008 static void FUNCC(pred8x8l_128_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1010 pixel *src = (pixel*)p_src;
1011 int stride = p_stride>>(sizeof(pixel)-1);
1013 PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
1015 static void FUNCC(pred8x8l_left_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1017 pixel *src = (pixel*)p_src;
1018 int stride = p_stride>>(sizeof(pixel)-1);
1020 PREDICT_8x8_LOAD_LEFT;
1021 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
1024 static void FUNCC(pred8x8l_top_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1026 pixel *src = (pixel*)p_src;
1027 int stride = p_stride>>(sizeof(pixel)-1);
1029 PREDICT_8x8_LOAD_TOP;
1030 const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
1033 static void FUNCC(pred8x8l_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1035 pixel *src = (pixel*)p_src;
1036 int stride = p_stride>>(sizeof(pixel)-1);
1038 PREDICT_8x8_LOAD_LEFT;
1039 PREDICT_8x8_LOAD_TOP;
1040 const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
1041 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
1044 static void FUNCC(pred8x8l_horizontal)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1046 pixel *src = (pixel*)p_src;
1047 int stride = p_stride>>(sizeof(pixel)-1);
1049 PREDICT_8x8_LOAD_LEFT;
1050 #define ROW(y) ((pixel4*)(src+y*stride))[0] =\
1051 ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
1052 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
1055 static void FUNCC(pred8x8l_vertical)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1058 pixel *src = (pixel*)p_src;
1059 int stride = p_stride>>(sizeof(pixel)-1);
1061 PREDICT_8x8_LOAD_TOP;
1070 for( y = 1; y < 8; y++ ) {
1071 ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
1072 ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
1075 static void FUNCC(pred8x8l_down_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1077 pixel *src = (pixel*)p_src;
1078 int stride = p_stride>>(sizeof(pixel)-1);
1079 PREDICT_8x8_LOAD_TOP;
1080 PREDICT_8x8_LOAD_TOPRIGHT;
1081 SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
1082 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
1083 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
1084 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
1085 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
1086 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1087 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
1088 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
1089 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
1090 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
1091 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
1092 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
1093 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
1094 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
1095 SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
1097 static void FUNCC(pred8x8l_down_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1099 pixel *src = (pixel*)p_src;
1100 int stride = p_stride>>(sizeof(pixel)-1);
1101 PREDICT_8x8_LOAD_TOP;
1102 PREDICT_8x8_LOAD_LEFT;
1103 PREDICT_8x8_LOAD_TOPLEFT;
1104 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
1105 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1106 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
1107 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1108 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
1109 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1110 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1111 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1112 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1113 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1114 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1115 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1116 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1117 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1118 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1120 static void FUNCC(pred8x8l_vertical_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1122 pixel *src = (pixel*)p_src;
1123 int stride = p_stride>>(sizeof(pixel)-1);
1124 PREDICT_8x8_LOAD_TOP;
1125 PREDICT_8x8_LOAD_LEFT;
1126 PREDICT_8x8_LOAD_TOPLEFT;
1127 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1128 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1129 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1130 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1131 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1132 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1133 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1134 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1135 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1136 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1137 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1138 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1139 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1140 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1141 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1142 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1143 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1144 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1145 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1146 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1147 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1148 SRC(7,0)= (t6 + t7 + 1) >> 1;
/**
 * 8x8 luma intra prediction, horizontal-down mode.
 * Fills the 8x8 block by interpolating along a roughly horizontal,
 * downward-leaning direction from the left, top-left and top neighbours.
 * t0.., l0..l7 and lt are neighbour pixels produced by the
 * PREDICT_8x8_LOAD_* macros; SRC(x,y) addresses column x, row y.
 * Statements are grouped per predicted diagonal: alternating two-tap
 * (a+b+1)>>1 and three-tap (a+2b+c+2)>>2 rounded filters, as in the
 * H.264 Intra_8x8 horizontal-down specification.
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
{
    pixel *src = (pixel*)p_src;
    /* byte stride -> pixel stride */
    int stride = p_stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction, vertical-left mode.
 * Predicts the 8x8 block from the top and top-right neighbour rows only
 * (t0..t12 loaded by PREDICT_8x8_LOAD_TOP/TOPRIGHT); no left neighbours
 * are used in this mode. SRC(x,y) addresses column x, row y.
 * Even rows use the two-tap (a+b+1)>>1 filter, odd rows the three-tap
 * (a+2b+c+2)>>2 filter, shifted one sample right every two rows.
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
{
    pixel *src = (pixel*)p_src;
    /* byte stride -> pixel stride */
    int stride = p_stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction, horizontal-up mode.
 * Predicts the 8x8 block from the left neighbour column only
 * (l0..l7 loaded by PREDICT_8x8_LOAD_LEFT); SRC(x,y) addresses
 * column x, row y. Even columns along each up-right diagonal use the
 * two-tap (a+b+1)>>1 filter, odd columns the three-tap filter; the
 * lower-right region past the last left sample is flat-filled with l7,
 * as the H.264 Intra_8x8 horizontal-up rule specifies.
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
{
    pixel *src = (pixel*)p_src;
    /* byte stride -> pixel stride */
    int stride = p_stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything below/right of the last filtered diagonal replicates l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
1233 #undef PREDICT_8x8_LOAD_LEFT
1234 #undef PREDICT_8x8_LOAD_TOP
1235 #undef PREDICT_8x8_LOAD_TOPLEFT
1236 #undef PREDICT_8x8_LOAD_TOPRIGHT
1237 #undef PREDICT_8x8_DC
1243 static void FUNCC(pred4x4_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1245 pixel *pix = (pixel*)p_pix;
1246 const dctcoef *block = (const dctcoef*)p_block;
1247 stride >>= sizeof(pixel)-1;
1251 pix[1*stride]= v += block[0];
1252 pix[2*stride]= v += block[4];
1253 pix[3*stride]= v += block[8];
1254 pix[4*stride]= v + block[12];
1260 static void FUNCC(pred4x4_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1262 pixel *pix = (pixel*)p_pix;
1263 const dctcoef *block = (const dctcoef*)p_block;
1264 stride >>= sizeof(pixel)-1;
1267 pix[0]= v += block[0];
1268 pix[1]= v += block[1];
1269 pix[2]= v += block[2];
1270 pix[3]= v + block[3];
1276 static void FUNCC(pred8x8l_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1278 pixel *pix = (pixel*)p_pix;
1279 const dctcoef *block = (const dctcoef*)p_block;
1280 stride >>= sizeof(pixel)-1;
1284 pix[1*stride]= v += block[0];
1285 pix[2*stride]= v += block[8];
1286 pix[3*stride]= v += block[16];
1287 pix[4*stride]= v += block[24];
1288 pix[5*stride]= v += block[32];
1289 pix[6*stride]= v += block[40];
1290 pix[7*stride]= v += block[48];
1291 pix[8*stride]= v + block[56];
1297 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1299 pixel *pix = (pixel*)p_pix;
1300 const dctcoef *block = (const dctcoef*)p_block;
1301 stride >>= sizeof(pixel)-1;
1304 pix[0]= v += block[0];
1305 pix[1]= v += block[1];
1306 pix[2]= v += block[2];
1307 pix[3]= v += block[3];
1308 pix[4]= v += block[4];
1309 pix[5]= v += block[5];
1310 pix[6]= v += block[6];
1311 pix[7]= v + block[7];
1317 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1320 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1323 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1326 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1329 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1332 FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1335 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1338 FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);