git.sesse.net Git - ffmpeg/blob - libavcodec/h264pred_internal.h

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * H.264 / AVC / MPEG4 part10 prediction functions.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "mathops.h"
  29 #include "h264_high_depth.h"
  30
  31 static void FUNCC(pred4x4_vertical)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  32     pixel *src = (pixel*)p_src;
  33     int stride = p_stride>>(sizeof(pixel)-1);
  34     const pixel4 a= ((pixel4*)(src-stride))[0];
  35     ((pixel4*)(src+0*stride))[0]= a;
  36     ((pixel4*)(src+1*stride))[0]= a;
  37     ((pixel4*)(src+2*stride))[0]= a;
  38     ((pixel4*)(src+3*stride))[0]= a;
  39 }
  40
  41 static void FUNCC(pred4x4_horizontal)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  42     pixel *src = (pixel*)p_src;
  43     int stride = p_stride>>(sizeof(pixel)-1);
  44     ((pixel4*)(src+0*stride))[0]= PIXEL_SPLAT_X4(src[-1+0*stride]);
  45     ((pixel4*)(src+1*stride))[0]= PIXEL_SPLAT_X4(src[-1+1*stride]);
  46     ((pixel4*)(src+2*stride))[0]= PIXEL_SPLAT_X4(src[-1+2*stride]);
  47     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(src[-1+3*stride]);
  48 }
  49
  50 static void FUNCC(pred4x4_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  51     pixel *src = (pixel*)p_src;
  52     int stride = p_stride>>(sizeof(pixel)-1);
  53     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
  54                    + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
  55
  56     ((pixel4*)(src+0*stride))[0]=
  57     ((pixel4*)(src+1*stride))[0]=
  58     ((pixel4*)(src+2*stride))[0]=
  59     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
  60 }
  61
  62 static void FUNCC(pred4x4_left_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  63     pixel *src = (pixel*)p_src;
  64     int stride = p_stride>>(sizeof(pixel)-1);
  65     const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
  66
  67     ((pixel4*)(src+0*stride))[0]=
  68     ((pixel4*)(src+1*stride))[0]=
  69     ((pixel4*)(src+2*stride))[0]=
  70     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
  71 }
  72
  73 static void FUNCC(pred4x4_top_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  74     pixel *src = (pixel*)p_src;
  75     int stride = p_stride>>(sizeof(pixel)-1);
  76     const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
  77
  78     ((pixel4*)(src+0*stride))[0]=
  79     ((pixel4*)(src+1*stride))[0]=
  80     ((pixel4*)(src+2*stride))[0]=
  81     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(dc);
  82 }
  83
  84 static void FUNCC(pred4x4_128_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  85     pixel *src = (pixel*)p_src;
  86     int stride = p_stride>>(sizeof(pixel)-1);
  87     ((pixel4*)(src+0*stride))[0]=
  88     ((pixel4*)(src+1*stride))[0]=
  89     ((pixel4*)(src+2*stride))[0]=
  90     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
  91 }
  92
  93 static void FUNCC(pred4x4_127_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
  94     pixel *src = (pixel*)p_src;
  95     int stride = p_stride>>(sizeof(pixel)-1);
  96     ((pixel4*)(src+0*stride))[0]=
  97     ((pixel4*)(src+1*stride))[0]=
  98     ((pixel4*)(src+2*stride))[0]=
  99     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
 100 }
 101
 102 static void FUNCC(pred4x4_129_dc)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 103     pixel *src = (pixel*)p_src;
 104     int stride = p_stride>>(sizeof(pixel)-1);
 105     ((pixel4*)(src+0*stride))[0]=
 106     ((pixel4*)(src+1*stride))[0]=
 107     ((pixel4*)(src+2*stride))[0]=
 108     ((pixel4*)(src+3*stride))[0]= PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
 109 }
 110
 111
 112 #define LOAD_TOP_RIGHT_EDGE\
 113     const int av_unused t4= topright[0];\
 114     const int av_unused t5= topright[1];\
 115     const int av_unused t6= topright[2];\
 116     const int av_unused t7= topright[3];\
 117
 118 #define LOAD_DOWN_LEFT_EDGE\
 119     const int av_unused l4= src[-1+4*stride];\
 120     const int av_unused l5= src[-1+5*stride];\
 121     const int av_unused l6= src[-1+6*stride];\
 122     const int av_unused l7= src[-1+7*stride];\
 123
 124 #define LOAD_LEFT_EDGE\
 125     const int av_unused l0= src[-1+0*stride];\
 126     const int av_unused l1= src[-1+1*stride];\
 127     const int av_unused l2= src[-1+2*stride];\
 128     const int av_unused l3= src[-1+3*stride];\
 129
 130 #define LOAD_TOP_EDGE\
 131     const int av_unused t0= src[ 0-1*stride];\
 132     const int av_unused t1= src[ 1-1*stride];\
 133     const int av_unused t2= src[ 2-1*stride];\
 134     const int av_unused t3= src[ 3-1*stride];\
 135
 136 static void FUNCC(pred4x4_vertical_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 137     pixel *src = (pixel*)p_src;
 138     const pixel *topright = (const pixel*)p_topright;
 139     int stride = p_stride>>(sizeof(pixel)-1);
 140     const int lt= src[-1-1*stride];
 141     LOAD_TOP_EDGE
 142     LOAD_TOP_RIGHT_EDGE
 143     pixel4 v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2,
 144                             (t0 + 2*t1 + t2 + 2) >> 2,
 145                             (t1 + 2*t2 + t3 + 2) >> 2,
 146                             (t2 + 2*t3 + t4 + 2) >> 2);
 147
 148     AV_WN4PA(src+0*stride, v);
 149     AV_WN4PA(src+1*stride, v);
 150     AV_WN4PA(src+2*stride, v);
 151     AV_WN4PA(src+3*stride, v);
 152 }
 153
 154 static void FUNCC(pred4x4_horizontal_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 155     pixel *src = (pixel*)p_src;
 156     int stride = p_stride>>(sizeof(pixel)-1);
 157     const int lt= src[-1-1*stride];
 158     LOAD_LEFT_EDGE
 159
 160     AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4((lt + 2*l0 + l1 + 2) >> 2));
 161     AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4((l0 + 2*l1 + l2 + 2) >> 2));
 162     AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4((l1 + 2*l2 + l3 + 2) >> 2));
 163     AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4((l2 + 2*l3 + l3 + 2) >> 2));
 164 }
 165
 166 static void FUNCC(pred4x4_down_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 167     pixel *src = (pixel*)p_src;
 168     int stride = p_stride>>(sizeof(pixel)-1);
 169     const int lt= src[-1-1*stride];
 170     LOAD_TOP_EDGE
 171     LOAD_LEFT_EDGE
 172
 173     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
 174     src[0+2*stride]=
 175     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
 176     src[0+1*stride]=
 177     src[1+2*stride]=
 178     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
 179     src[0+0*stride]=
 180     src[1+1*stride]=
 181     src[2+2*stride]=
 182     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
 183     src[1+0*stride]=
 184     src[2+1*stride]=
 185     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
 186     src[2+0*stride]=
 187     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 188     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 189 }
 190
 191 static void FUNCC(pred4x4_down_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 192     pixel *src = (pixel*)p_src;
 193     const pixel *topright = (const pixel*)p_topright;
 194     int stride = p_stride>>(sizeof(pixel)-1);
 195     LOAD_TOP_EDGE
 196     LOAD_TOP_RIGHT_EDGE
 197 //    LOAD_LEFT_EDGE
 198
 199     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
 200     src[1+0*stride]=
 201     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
 202     src[2+0*stride]=
 203     src[1+1*stride]=
 204     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
 205     src[3+0*stride]=
 206     src[2+1*stride]=
 207     src[1+2*stride]=
 208     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
 209     src[3+1*stride]=
 210     src[2+2*stride]=
 211     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
 212     src[3+2*stride]=
 213     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
 214     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
 215 }
 216
 217 static void FUNCC(pred4x4_down_left_svq3)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 218     pixel *src = (pixel*)p_src;
 219     int stride = p_stride>>(sizeof(pixel)-1);
 220     LOAD_TOP_EDGE
 221     LOAD_LEFT_EDGE
 222     const av_unused int unu0= t0;
 223     const av_unused int unu1= l0;
 224
 225     src[0+0*stride]=(l1 + t1)>>1;
 226     src[1+0*stride]=
 227     src[0+1*stride]=(l2 + t2)>>1;
 228     src[2+0*stride]=
 229     src[1+1*stride]=
 230     src[0+2*stride]=
 231     src[3+0*stride]=
 232     src[2+1*stride]=
 233     src[1+2*stride]=
 234     src[0+3*stride]=
 235     src[3+1*stride]=
 236     src[2+2*stride]=
 237     src[1+3*stride]=
 238     src[3+2*stride]=
 239     src[2+3*stride]=
 240     src[3+3*stride]=(l3 + t3)>>1;
 241 }
 242
 243 static void FUNCC(pred4x4_down_left_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 244     pixel *src = (pixel*)p_src;
 245     const pixel *topright = (const pixel*)p_topright;
 246     int stride = p_stride>>(sizeof(pixel)-1);
 247     LOAD_TOP_EDGE
 248     LOAD_TOP_RIGHT_EDGE
 249     LOAD_LEFT_EDGE
 250     LOAD_DOWN_LEFT_EDGE
 251
 252     src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
 253     src[1+0*stride]=
 254     src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
 255     src[2+0*stride]=
 256     src[1+1*stride]=
 257     src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3;
 258     src[3+0*stride]=
 259     src[2+1*stride]=
 260     src[1+2*stride]=
 261     src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3;
 262     src[3+1*stride]=
 263     src[2+2*stride]=
 264     src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3;
 265     src[3+2*stride]=
 266     src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3;
 267     src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
 268 }
 269
 270 static void FUNCC(pred4x4_down_left_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 271     pixel *src = (pixel*)p_src;
 272     const pixel *topright = (const pixel*)p_topright;
 273     int stride = p_stride>>(sizeof(pixel)-1);
 274     LOAD_TOP_EDGE
 275     LOAD_TOP_RIGHT_EDGE
 276     LOAD_LEFT_EDGE
 277
 278     src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3;
 279     src[1+0*stride]=
 280     src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3;
 281     src[2+0*stride]=
 282     src[1+1*stride]=
 283     src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3;
 284     src[3+0*stride]=
 285     src[2+1*stride]=
 286     src[1+2*stride]=
 287     src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3;
 288     src[3+1*stride]=
 289     src[2+2*stride]=
 290     src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3;
 291     src[3+2*stride]=
 292     src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3;
 293     src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2;
 294 }
 295
 296 static void FUNCC(pred4x4_vertical_right)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 297     pixel *src = (pixel*)p_src;
 298     int stride = p_stride>>(sizeof(pixel)-1);
 299     const int lt= src[-1-1*stride];
 300     LOAD_TOP_EDGE
 301     LOAD_LEFT_EDGE
 302
 303     src[0+0*stride]=
 304     src[1+2*stride]=(lt + t0 + 1)>>1;
 305     src[1+0*stride]=
 306     src[2+2*stride]=(t0 + t1 + 1)>>1;
 307     src[2+0*stride]=
 308     src[3+2*stride]=(t1 + t2 + 1)>>1;
 309     src[3+0*stride]=(t2 + t3 + 1)>>1;
 310     src[0+1*stride]=
 311     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
 312     src[1+1*stride]=
 313     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
 314     src[2+1*stride]=
 315     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 316     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 317     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
 318     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 319 }
 320
 321 static void FUNCC(pred4x4_vertical_left)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 322     pixel *src = (pixel*)p_src;
 323     const pixel *topright = (const pixel*)p_topright;
 324     int stride = p_stride>>(sizeof(pixel)-1);
 325     LOAD_TOP_EDGE
 326     LOAD_TOP_RIGHT_EDGE
 327
 328     src[0+0*stride]=(t0 + t1 + 1)>>1;
 329     src[1+0*stride]=
 330     src[0+2*stride]=(t1 + t2 + 1)>>1;
 331     src[2+0*stride]=
 332     src[1+2*stride]=(t2 + t3 + 1)>>1;
 333     src[3+0*stride]=
 334     src[2+2*stride]=(t3 + t4+ 1)>>1;
 335     src[3+2*stride]=(t4 + t5+ 1)>>1;
 336     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 337     src[1+1*stride]=
 338     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 339     src[2+1*stride]=
 340     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
 341     src[3+1*stride]=
 342     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
 343     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 344 }
 345
 346 static void FUNCC(pred4x4_vertical_left_rv40_internal)(uint8_t *p_src, const uint8_t *p_topright, int p_stride,
 347                                       const int l0, const int l1, const int l2, const int l3, const int l4){
 348     pixel *src = (pixel*)p_src;
 349     const pixel *topright = (const pixel*)p_topright;
 350     int stride = p_stride>>(sizeof(pixel)-1);
 351     LOAD_TOP_EDGE
 352     LOAD_TOP_RIGHT_EDGE
 353
 354     src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3;
 355     src[1+0*stride]=
 356     src[0+2*stride]=(t1 + t2 + 1)>>1;
 357     src[2+0*stride]=
 358     src[1+2*stride]=(t2 + t3 + 1)>>1;
 359     src[3+0*stride]=
 360     src[2+2*stride]=(t3 + t4+ 1)>>1;
 361     src[3+2*stride]=(t4 + t5+ 1)>>1;
 362     src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3;
 363     src[1+1*stride]=
 364     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 365     src[2+1*stride]=
 366     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
 367     src[3+1*stride]=
 368     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
 369     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 370 }
 371
 372 static void FUNCC(pred4x4_vertical_left_rv40)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 373     pixel *src = (pixel*)p_src;
 374     int stride = p_stride>>(sizeof(pixel)-1);
 375     LOAD_LEFT_EDGE
 376     LOAD_DOWN_LEFT_EDGE
 377
 378     FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l4);
 379 }
 380
 381 static void FUNCC(pred4x4_vertical_left_rv40_nodown)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 382     pixel *src = (pixel*)p_src;
 383     int stride = p_stride>>(sizeof(pixel)-1);
 384     LOAD_LEFT_EDGE
 385
 386     FUNCC(pred4x4_vertical_left_rv40_internal)(p_src, topright, p_stride, l0, l1, l2, l3, l3);
 387 }
 388
 389 static void FUNCC(pred4x4_vertical_left_vp8)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 390     pixel *src = (pixel*)p_src;
 391     const pixel *topright = (const pixel*)p_topright;
 392     int stride = p_stride>>(sizeof(pixel)-1);
 393     LOAD_TOP_EDGE
 394     LOAD_TOP_RIGHT_EDGE
 395
 396     src[0+0*stride]=(t0 + t1 + 1)>>1;
 397     src[1+0*stride]=
 398     src[0+2*stride]=(t1 + t2 + 1)>>1;
 399     src[2+0*stride]=
 400     src[1+2*stride]=(t2 + t3 + 1)>>1;
 401     src[3+0*stride]=
 402     src[2+2*stride]=(t3 + t4 + 1)>>1;
 403     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 404     src[1+1*stride]=
 405     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 406     src[2+1*stride]=
 407     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
 408     src[3+1*stride]=
 409     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
 410     src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 411     src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2;
 412 }
 413
 414 static void FUNCC(pred4x4_horizontal_up)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 415     pixel *src = (pixel*)p_src;
 416     int stride = p_stride>>(sizeof(pixel)-1);
 417     LOAD_LEFT_EDGE
 418
 419     src[0+0*stride]=(l0 + l1 + 1)>>1;
 420     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 421     src[2+0*stride]=
 422     src[0+1*stride]=(l1 + l2 + 1)>>1;
 423     src[3+0*stride]=
 424     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
 425     src[2+1*stride]=
 426     src[0+2*stride]=(l2 + l3 + 1)>>1;
 427     src[3+1*stride]=
 428     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
 429     src[3+2*stride]=
 430     src[1+3*stride]=
 431     src[0+3*stride]=
 432     src[2+2*stride]=
 433     src[2+3*stride]=
 434     src[3+3*stride]=l3;
 435 }
 436
 437 static void FUNCC(pred4x4_horizontal_up_rv40)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 438     pixel *src = (pixel*)p_src;
 439     const pixel *topright = (const pixel*)p_topright;
 440     int stride = p_stride>>(sizeof(pixel)-1);
 441     LOAD_LEFT_EDGE
 442     LOAD_DOWN_LEFT_EDGE
 443     LOAD_TOP_EDGE
 444     LOAD_TOP_RIGHT_EDGE
 445
 446     src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
 447     src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
 448     src[2+0*stride]=
 449     src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
 450     src[3+0*stride]=
 451     src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
 452     src[2+1*stride]=
 453     src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
 454     src[3+1*stride]=
 455     src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
 456     src[3+2*stride]=
 457     src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2;
 458     src[0+3*stride]=
 459     src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2;
 460     src[2+3*stride]=(l4 + l5 + 1)>>1;
 461     src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2;
 462 }
 463
 464 static void FUNCC(pred4x4_horizontal_up_rv40_nodown)(uint8_t *p_src, const uint8_t *p_topright, int p_stride){
 465     pixel *src = (pixel*)p_src;
 466     const pixel *topright = (const pixel*)p_topright;
 467     int stride = p_stride>>(sizeof(pixel)-1);
 468     LOAD_LEFT_EDGE
 469     LOAD_TOP_EDGE
 470     LOAD_TOP_RIGHT_EDGE
 471
 472     src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3;
 473     src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3;
 474     src[2+0*stride]=
 475     src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3;
 476     src[3+0*stride]=
 477     src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3;
 478     src[2+1*stride]=
 479     src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3;
 480     src[3+1*stride]=
 481     src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3;
 482     src[3+2*stride]=
 483     src[1+3*stride]=l3;
 484     src[0+3*stride]=
 485     src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2;
 486     src[2+3*stride]=
 487     src[3+3*stride]=l3;
 488 }
 489
 490 static void FUNCC(pred4x4_horizontal_down)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 491     pixel *src = (pixel*)p_src;
 492     int stride = p_stride>>(sizeof(pixel)-1);
 493     const int lt= src[-1-1*stride];
 494     LOAD_TOP_EDGE
 495     LOAD_LEFT_EDGE
 496
 497     src[0+0*stride]=
 498     src[2+1*stride]=(lt + l0 + 1)>>1;
 499     src[1+0*stride]=
 500     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
 501     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
 502     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 503     src[0+1*stride]=
 504     src[2+2*stride]=(l0 + l1 + 1)>>1;
 505     src[1+1*stride]=
 506     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
 507     src[0+2*stride]=
 508     src[2+3*stride]=(l1 + l2+ 1)>>1;
 509     src[1+2*stride]=
 510     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 511     src[0+3*stride]=(l2 + l3 + 1)>>1;
 512     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
 513 }
 514
 515 static void FUNCC(pred4x4_tm_vp8)(uint8_t *p_src, const uint8_t *topright, int p_stride){
 516     pixel *src = (pixel*)p_src;
 517     int stride = p_stride>>(sizeof(pixel)-1);
 518     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
 519     pixel *top = src-stride;
 520     int y;
 521
 522     for (y = 0; y < 4; y++) {
 523         uint8_t *cm_in = cm + src[-1];
 524         src[0] = cm_in[top[0]];
 525         src[1] = cm_in[top[1]];
 526         src[2] = cm_in[top[2]];
 527         src[3] = cm_in[top[3]];
 528         src += stride;
 529     }
 530 }
 531
 532 static void FUNCC(pred16x16_vertical)(uint8_t *p_src, int p_stride){
 533     int i;
 534     pixel *src = (pixel*)p_src;
 535     int stride = p_stride>>(sizeof(pixel)-1);
 536     const pixel4 a = ((pixel4*)(src-stride))[0];
 537     const pixel4 b = ((pixel4*)(src-stride))[1];
 538     const pixel4 c = ((pixel4*)(src-stride))[2];
 539     const pixel4 d = ((pixel4*)(src-stride))[3];
 540
 541     for(i=0; i<16; i++){
 542         ((pixel4*)(src+i*stride))[0] = a;
 543         ((pixel4*)(src+i*stride))[1] = b;
 544         ((pixel4*)(src+i*stride))[2] = c;
 545         ((pixel4*)(src+i*stride))[3] = d;
 546     }
 547 }
 548
 549 static void FUNCC(pred16x16_horizontal)(uint8_t *p_src, int stride){
 550     int i;
 551     pixel *src = (pixel*)p_src;
 552     stride >>= sizeof(pixel)-1;
 553
 554     for(i=0; i<16; i++){
 555         ((pixel4*)(src+i*stride))[0] =
 556         ((pixel4*)(src+i*stride))[1] =
 557         ((pixel4*)(src+i*stride))[2] =
 558         ((pixel4*)(src+i*stride))[3] = PIXEL_SPLAT_X4(src[-1+i*stride]);
 559     }
 560 }
 561
 562 #define PREDICT_16x16_DC(v)\
 563     for(i=0; i<16; i++){\
 564         AV_WN4P(src+ 0, v);\
 565         AV_WN4P(src+ 4, v);\
 566         AV_WN4P(src+ 8, v);\
 567         AV_WN4P(src+12, v);\
 568         src += stride;\
 569     }
 570
 571 static void FUNCC(pred16x16_dc)(uint8_t *p_src, int stride){
 572     int i, dc=0;
 573     pixel *src = (pixel*)p_src;
 574     pixel4 dcsplat;
 575     stride >>= sizeof(pixel)-1;
 576
 577     for(i=0;i<16; i++){
 578         dc+= src[-1+i*stride];
 579     }
 580
 581     for(i=0;i<16; i++){
 582         dc+= src[i-stride];
 583     }
 584
 585     dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
 586     PREDICT_16x16_DC(dcsplat);
 587 }
 588
 589 static void FUNCC(pred16x16_left_dc)(uint8_t *p_src, int stride){
 590     int i, dc=0;
 591     pixel *src = (pixel*)p_src;
 592     pixel4 dcsplat;
 593     stride >>= sizeof(pixel)-1;
 594
 595     for(i=0;i<16; i++){
 596         dc+= src[-1+i*stride];
 597     }
 598
 599     dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
 600     PREDICT_16x16_DC(dcsplat);
 601 }
 602
 603 static void FUNCC(pred16x16_top_dc)(uint8_t *p_src, int stride){
 604     int i, dc=0;
 605     pixel *src = (pixel*)p_src;
 606     pixel4 dcsplat;
 607     stride >>= sizeof(pixel)-1;
 608
 609     for(i=0;i<16; i++){
 610         dc+= src[i-stride];
 611     }
 612
 613     dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
 614     PREDICT_16x16_DC(dcsplat);
 615 }
 616
 617 #define PRED16x16_X(n, v) \
 618 static void FUNCC(pred16x16_##n##_dc)(uint8_t *p_src, int stride){\
 619     int i;\
 620     pixel *src = (pixel*)p_src;\
 621     stride >>= sizeof(pixel)-1;\
 622     PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
 623 }
 624
 625 PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1);
 626 PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0);
 627 PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1);
 628
 629 static inline void FUNCC(pred16x16_plane_compat)(uint8_t *p_src, int p_stride, const int svq3, const int rv40){
 630   int i, j, k;
 631   int a;
 632   INIT_CLIP
 633   pixel *src = (pixel*)p_src;
 634   int stride = p_stride>>(sizeof(pixel)-1);
 635   const pixel * const src0 = src +7-stride;
 636   const pixel *       src1 = src +8*stride-1;
 637   const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
 638   int H = src0[1] - src0[-1];
 639   int V = src1[0] - src2[ 0];
 640   for(k=2; k<=8; ++k) {
 641     src1 += stride; src2 -= stride;
 642     H += k*(src0[k] - src0[-k]);
 643     V += k*(src1[0] - src2[ 0]);
 644   }
 645   if(svq3){
 646     H = ( 5*(H/4) ) / 16;
 647     V = ( 5*(V/4) ) / 16;
 648
 649     /* required for 100% accuracy */
 650     i = H; H = V; V = i;
 651   }else if(rv40){
 652     H = ( H + (H>>2) ) >> 4;
 653     V = ( V + (V>>2) ) >> 4;
 654   }else{
 655     H = ( 5*H+32 ) >> 6;
 656     V = ( 5*V+32 ) >> 6;
 657   }
 658
 659   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
 660   for(j=16; j>0; --j) {
 661     int b = a;
 662     a += V;
 663     for(i=-16; i<0; i+=4) {
 664       src[16+i] = CLIP((b    ) >> 5);
 665       src[17+i] = CLIP((b+  H) >> 5);
 666       src[18+i] = CLIP((b+2*H) >> 5);
 667       src[19+i] = CLIP((b+3*H) >> 5);
 668       b += 4*H;
 669     }
 670     src += stride;
 671   }
 672 }
 673
/**
 * 16x16 plane prediction with the default gradient scaling: calls the
 * shared plane predictor with both svq3 and rv40 set to 0.
 */
static void FUNCC(pred16x16_plane)(uint8_t *src, int stride){
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
}
 677
/**
 * 16x16 plane prediction, SVQ3 variant: calls the shared plane predictor
 * with svq3 set to 1.
 */
static void FUNCC(pred16x16_plane_svq3)(uint8_t *src, int stride){
    FUNCC(pred16x16_plane_compat)(src, stride, 1, 0);
}
 681
/**
 * 16x16 plane prediction, RV40 variant: calls the shared plane predictor
 * with rv40 set to 1.
 */
static void FUNCC(pred16x16_plane_rv40)(uint8_t *src, int stride){
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 1);
}
 685
 686 static void FUNCC(pred16x16_tm_vp8)(uint8_t *src, int stride){
 687     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
 688     uint8_t *top = src-stride;
 689     int y;
 690
 691     for (y = 0; y < 16; y++) {
 692         uint8_t *cm_in = cm + src[-1];
 693         src[0]  = cm_in[top[0]];
 694         src[1]  = cm_in[top[1]];
 695         src[2]  = cm_in[top[2]];
 696         src[3]  = cm_in[top[3]];
 697         src[4]  = cm_in[top[4]];
 698         src[5]  = cm_in[top[5]];
 699         src[6]  = cm_in[top[6]];
 700         src[7]  = cm_in[top[7]];
 701         src[8]  = cm_in[top[8]];
 702         src[9]  = cm_in[top[9]];
 703         src[10] = cm_in[top[10]];
 704         src[11] = cm_in[top[11]];
 705         src[12] = cm_in[top[12]];
 706         src[13] = cm_in[top[13]];
 707         src[14] = cm_in[top[14]];
 708         src[15] = cm_in[top[15]];
 709         src += stride;
 710     }
 711 }
 712
 713 static void FUNCC(pred8x8_vertical)(uint8_t *p_src, int p_stride){
 714     int i;
 715     pixel *src = (pixel*)p_src;
 716     int stride = p_stride>>(sizeof(pixel)-1);
 717     const pixel4 a= ((pixel4*)(src-stride))[0];
 718     const pixel4 b= ((pixel4*)(src-stride))[1];
 719
 720     for(i=0; i<8; i++){
 721         ((pixel4*)(src+i*stride))[0]= a;
 722         ((pixel4*)(src+i*stride))[1]= b;
 723     }
 724 }
 725
 726 static void FUNCC(pred8x8_horizontal)(uint8_t *p_src, int stride){
 727     int i;
 728     pixel *src = (pixel*)p_src;
 729     stride >>= sizeof(pixel)-1;
 730
 731     for(i=0; i<8; i++){
 732         ((pixel4*)(src+i*stride))[0]=
 733         ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(src[-1+i*stride]);
 734     }
 735 }
 736
 737 #define PRED8x8_X(n, v)\
 738 static void FUNCC(pred8x8_##n##_dc)(uint8_t *p_src, int stride){\
 739     int i;\
 740     pixel *src = (pixel*)p_src;\
 741     stride >>= sizeof(pixel)-1;\
 742     for(i=0; i<8; i++){\
 743         ((pixel4*)(src+i*stride))[0]=\
 744         ((pixel4*)(src+i*stride))[1]= PIXEL_SPLAT_X4(v);\
 745     }\
 746 }
 747
 748 PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1);
 749 PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0);
 750 PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1);
 751
 752 static void FUNCC(pred8x8_left_dc)(uint8_t *p_src, int stride){
 753     int i;
 754     int dc0, dc2;
 755     pixel4 dc0splat, dc2splat;
 756     pixel *src = (pixel*)p_src;
 757     stride >>= sizeof(pixel)-1;
 758
 759     dc0=dc2=0;
 760     for(i=0;i<4; i++){
 761         dc0+= src[-1+i*stride];
 762         dc2+= src[-1+(i+4)*stride];
 763     }
 764     dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
 765     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
 766
 767     for(i=0; i<4; i++){
 768         ((pixel4*)(src+i*stride))[0]=
 769         ((pixel4*)(src+i*stride))[1]= dc0splat;
 770     }
 771     for(i=4; i<8; i++){
 772         ((pixel4*)(src+i*stride))[0]=
 773         ((pixel4*)(src+i*stride))[1]= dc2splat;
 774     }
 775 }
 776
 777 static void FUNCC(pred8x8_left_dc_rv40)(uint8_t *p_src, int stride){
 778     int i;
 779     int dc0;
 780     pixel4 dc0splat;
 781     pixel *src = (pixel*)p_src;
 782     stride >>= sizeof(pixel)-1;
 783
 784     dc0=0;
 785     for(i=0;i<8; i++)
 786         dc0+= src[-1+i*stride];
 787     dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
 788
 789     for(i=0; i<8; i++){
 790         ((pixel4*)(src+i*stride))[0]=
 791         ((pixel4*)(src+i*stride))[1]= dc0splat;
 792     }
 793 }
 794
 795 static void FUNCC(pred8x8_top_dc)(uint8_t *p_src, int stride){
 796     int i;
 797     int dc0, dc1;
 798     pixel4 dc0splat, dc1splat;
 799     pixel *src = (pixel*)p_src;
 800     stride >>= sizeof(pixel)-1;
 801
 802     dc0=dc1=0;
 803     for(i=0;i<4; i++){
 804         dc0+= src[i-stride];
 805         dc1+= src[4+i-stride];
 806     }
 807     dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
 808     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
 809
 810     for(i=0; i<4; i++){
 811         ((pixel4*)(src+i*stride))[0]= dc0splat;
 812         ((pixel4*)(src+i*stride))[1]= dc1splat;
 813     }
 814     for(i=4; i<8; i++){
 815         ((pixel4*)(src+i*stride))[0]= dc0splat;
 816         ((pixel4*)(src+i*stride))[1]= dc1splat;
 817     }
 818 }
 819
 820 static void FUNCC(pred8x8_top_dc_rv40)(uint8_t *p_src, int stride){
 821     int i;
 822     int dc0;
 823     pixel4 dc0splat;
 824     pixel *src = (pixel*)p_src;
 825     stride >>= sizeof(pixel)-1;
 826
 827     dc0=0;
 828     for(i=0;i<8; i++)
 829         dc0+= src[i-stride];
 830     dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
 831
 832     for(i=0; i<8; i++){
 833         ((pixel4*)(src+i*stride))[0]=
 834         ((pixel4*)(src+i*stride))[1]= dc0splat;
 835     }
 836 }
 837
 838
 839 static void FUNCC(pred8x8_dc)(uint8_t *p_src, int stride){
 840     int i;
 841     int dc0, dc1, dc2;
 842     pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
 843     pixel *src = (pixel*)p_src;
 844     stride >>= sizeof(pixel)-1;
 845
 846     dc0=dc1=dc2=0;
 847     for(i=0;i<4; i++){
 848         dc0+= src[-1+i*stride] + src[i-stride];
 849         dc1+= src[4+i-stride];
 850         dc2+= src[-1+(i+4)*stride];
 851     }
 852     dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
 853     dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
 854     dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
 855     dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
 856
 857     for(i=0; i<4; i++){
 858         ((pixel4*)(src+i*stride))[0]= dc0splat;
 859         ((pixel4*)(src+i*stride))[1]= dc1splat;
 860     }
 861     for(i=4; i<8; i++){
 862         ((pixel4*)(src+i*stride))[0]= dc2splat;
 863         ((pixel4*)(src+i*stride))[1]= dc3splat;
 864     }
 865 }
 866
// The following 4 functions should not be optimized!
/**
 * Composite 8x8 DC predictor: runs the top-only 8x8 DC predictor over the
 * whole block, then overwrites the top-left 4x4 quadrant with the 4x4 DC
 * predictor (which uses both the top and the left samples).
 * The order of the two calls matters because the second one overwrites
 * part of the first one's output.
 */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
    FUNCC(pred8x8_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
 872
/**
 * Composite 8x8 DC predictor: runs the full 8x8 DC predictor over the
 * whole block, then overwrites the top-left 4x4 quadrant with the
 * top-only 4x4 DC predictor.
 * The order of the two calls matters because the second one overwrites
 * part of the first one's output.
 */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
    FUNCC(pred8x8_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
 877
/**
 * Composite 8x8 DC predictor: runs the left-only 8x8 DC predictor over
 * the whole block, then overwrites both 4x4 quadrants of the lower half
 * (rows 4..7) with the constant 1<<(BIT_DEPTH-1) predictor.
 * src is a byte pointer here, so the column offset of the right quadrant
 * is 4*sizeof(pixel) bytes and the row offset is 4*stride bytes.
 */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
 883
/**
 * Composite 8x8 DC predictor: runs the left-only 8x8 DC predictor over
 * the whole block, then overwrites both 4x4 quadrants of the upper half
 * (rows 0..3) with the constant 1<<(BIT_DEPTH-1) predictor.
 * src is a byte pointer here, so the column offset of the right quadrant
 * is 4*sizeof(pixel) bytes.
 */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
 889
 890 static void FUNCC(pred8x8_dc_rv40)(uint8_t *p_src, int stride){
 891     int i;
 892     int dc0=0;
 893     pixel4 dc0splat;
 894     pixel *src = (pixel*)p_src;
 895     stride >>= sizeof(pixel)-1;
 896
 897     for(i=0;i<4; i++){
 898         dc0+= src[-1+i*stride] + src[i-stride];
 899         dc0+= src[4+i-stride];
 900         dc0+= src[-1+(i+4)*stride];
 901     }
 902     dc0splat = PIXEL_SPLAT_X4((dc0 + 8)>>4);
 903
 904     for(i=0; i<4; i++){
 905         ((pixel4*)(src+i*stride))[0]= dc0splat;
 906         ((pixel4*)(src+i*stride))[1]= dc0splat;
 907     }
 908     for(i=4; i<8; i++){
 909         ((pixel4*)(src+i*stride))[0]= dc0splat;
 910         ((pixel4*)(src+i*stride))[1]= dc0splat;
 911     }
 912 }
 913
 914 static void FUNCC(pred8x8_plane)(uint8_t *p_src, int p_stride){
 915   int j, k;
 916   int a;
 917   INIT_CLIP
 918   pixel *src = (pixel*)p_src;
 919   int stride = p_stride>>(sizeof(pixel)-1);
 920   const pixel * const src0 = src +3-stride;
 921   const pixel *       src1 = src +4*stride-1;
 922   const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
 923   int H = src0[1] - src0[-1];
 924   int V = src1[0] - src2[ 0];
 925   for(k=2; k<=4; ++k) {
 926     src1 += stride; src2 -= stride;
 927     H += k*(src0[k] - src0[-k]);
 928     V += k*(src1[0] - src2[ 0]);
 929   }
 930   H = ( 17*H+16 ) >> 5;
 931   V = ( 17*V+16 ) >> 5;
 932
 933   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
 934   for(j=8; j>0; --j) {
 935     int b = a;
 936     a += V;
 937     src[0] = CLIP((b    ) >> 5);
 938     src[1] = CLIP((b+  H) >> 5);
 939     src[2] = CLIP((b+2*H) >> 5);
 940     src[3] = CLIP((b+3*H) >> 5);
 941     src[4] = CLIP((b+4*H) >> 5);
 942     src[5] = CLIP((b+5*H) >> 5);
 943     src[6] = CLIP((b+6*H) >> 5);
 944     src[7] = CLIP((b+7*H) >> 5);
 945     src += stride;
 946   }
 947 }
 948
 949 static void FUNCC(pred8x8_tm_vp8)(uint8_t *p_src, int p_stride){
 950     pixel *src = (pixel*)p_src;
 951     int stride = p_stride>>(sizeof(pixel)-1);
 952     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP - src[-1-stride];
 953     pixel *top = src-stride;
 954     int y;
 955
 956     for (y = 0; y < 8; y++) {
 957         uint8_t *cm_in = cm + src[-1];
 958         src[0] = cm_in[top[0]];
 959         src[1] = cm_in[top[1]];
 960         src[2] = cm_in[top[2]];
 961         src[3] = cm_in[top[3]];
 962         src[4] = cm_in[top[4]];
 963         src[5] = cm_in[top[5]];
 964         src[6] = cm_in[top[6]];
 965         src[7] = cm_in[top[7]];
 966         src += stride;
 967     }
 968 }
 969
 /* Helper macros for the pred8x8l_* functions below. They all expect a
  * `pixel *src` and an `int stride` (in pixel units) in the enclosing scope;
  * the LOAD_* macros additionally read `has_topleft` / `has_topright`. */
 /* SRC(x,y): sample at column x, row y relative to src. */
 970 #define SRC(x,y) src[(x)+(y)*stride]
 /* PL(y): declares l<y>, the (1,2,1)/4 rounded filter of the left-column
  * neighbours at rows y-1, y, y+1. */
 971 #define PL(y) \
 972     const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
 /* Declares l0..l7 (filtered left column). l0 uses the top-left sample when
  * has_topleft is set and repeats SRC(-1,0) otherwise; l7 weights SRC(-1,7)
  * three times. No trailing ';' -- the user of the macro supplies it. */
 973 #define PREDICT_8x8_LOAD_LEFT \
 974     const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
 975                      + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
 976     PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
 977     const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
 978
 /* PT(x): declares t<x>, the (1,2,1)/4 rounded filter of the row above at
  * columns x-1, x, x+1. */
 979 #define PT(x) \
 980     const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
 /* Declares t0..t7 (filtered row above). t0 uses the top-left sample when
  * has_topleft is set and repeats SRC(0,-1) otherwise; t7 uses SRC(8,-1) when
  * has_topright is set and repeats SRC(7,-1) otherwise. No trailing ';'. */
 981 #define PREDICT_8x8_LOAD_TOP \
 982     const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
 983                      + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
 984     PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
 985     const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
 986                      + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
 987
 /* PTR(x): like PT(x) but assigns to an already declared t<x>. */
 988 #define PTR(x) \
 989     t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
 /* Declares t8..t15. With has_topright they are filtered from columns 7..15
  * of the row above (t15 weights SRC(15,-1) three times); without it all
  * eight are set to the unfiltered SRC(7,-1). Contains an if statement, so it
  * must come after all other declarations of the enclosing block. */
 990 #define PREDICT_8x8_LOAD_TOPRIGHT \
 991     int t8, t9, t10, t11, t12, t13, t14, t15; \
 992     if(has_topright) { \
 993         PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
 994         t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
 995     } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
 996
 /* Declares lt, the filtered top-left corner sample. No trailing ';'. */
 997 #define PREDICT_8x8_LOAD_TOPLEFT \
 998     const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
 999
/* Declares `int y` and stores the pixel4 value v into both halves of eight
 * consecutive rows. src itself is advanced by stride after every row. */
1000 #define PREDICT_8x8_DC(v) \
1001     int y; \
1002     for( y = 0; y < 8; y++ ) { \
1003         ((pixel4*)src)[0] = \
1004         ((pixel4*)src)[1] = v; \
1005         src += stride; \
1006     }
1007
1008 static void FUNCC(pred8x8l_128_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1009 {
1010     pixel *src = (pixel*)p_src;
1011     int stride = p_stride>>(sizeof(pixel)-1);
1012
1013     PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
1014 }
1015 static void FUNCC(pred8x8l_left_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1016 {
1017     pixel *src = (pixel*)p_src;
1018     int stride = p_stride>>(sizeof(pixel)-1);
1019
1020     PREDICT_8x8_LOAD_LEFT;
1021     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
1022     PREDICT_8x8_DC(dc);
1023 }
1024 static void FUNCC(pred8x8l_top_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1025 {
1026     pixel *src = (pixel*)p_src;
1027     int stride = p_stride>>(sizeof(pixel)-1);
1028
1029     PREDICT_8x8_LOAD_TOP;
1030     const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
1031     PREDICT_8x8_DC(dc);
1032 }
1033 static void FUNCC(pred8x8l_dc)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1034 {
1035     pixel *src = (pixel*)p_src;
1036     int stride = p_stride>>(sizeof(pixel)-1);
1037
1038     PREDICT_8x8_LOAD_LEFT;
1039     PREDICT_8x8_LOAD_TOP;
1040     const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
1041                                      +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
1042     PREDICT_8x8_DC(dc);
1043 }
1044 static void FUNCC(pred8x8l_horizontal)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1045 {
1046     pixel *src = (pixel*)p_src;
1047     int stride = p_stride>>(sizeof(pixel)-1);
1048
1049     PREDICT_8x8_LOAD_LEFT;
1050 #define ROW(y) ((pixel4*)(src+y*stride))[0] =\
1051                ((pixel4*)(src+y*stride))[1] = PIXEL_SPLAT_X4(l##y)
1052     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
1053 #undef ROW
1054 }
1055 static void FUNCC(pred8x8l_vertical)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1056 {
1057     int y;
1058     pixel *src = (pixel*)p_src;
1059     int stride = p_stride>>(sizeof(pixel)-1);
1060
1061     PREDICT_8x8_LOAD_TOP;
1062     src[0] = t0;
1063     src[1] = t1;
1064     src[2] = t2;
1065     src[3] = t3;
1066     src[4] = t4;
1067     src[5] = t5;
1068     src[6] = t6;
1069     src[7] = t7;
1070     for( y = 1; y < 8; y++ ) {
1071         ((pixel4*)(src+y*stride))[0] = ((pixel4*)src)[0];
1072         ((pixel4*)(src+y*stride))[1] = ((pixel4*)src)[1];
1073     }
1074 }
1075 static void FUNCC(pred8x8l_down_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1076 {
1077     pixel *src = (pixel*)p_src;
1078     int stride = p_stride>>(sizeof(pixel)-1);
1079     PREDICT_8x8_LOAD_TOP;
1080     PREDICT_8x8_LOAD_TOPRIGHT;
1081     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
1082     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
1083     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
1084     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
1085     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
1086     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1087     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
1088     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
1089     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
1090     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
1091     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
1092     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
1093     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
1094     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
1095     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
1096 }
1097 static void FUNCC(pred8x8l_down_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1098 {
1099     pixel *src = (pixel*)p_src;
1100     int stride = p_stride>>(sizeof(pixel)-1);
1101     PREDICT_8x8_LOAD_TOP;
1102     PREDICT_8x8_LOAD_LEFT;
1103     PREDICT_8x8_LOAD_TOPLEFT;
1104     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
1105     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1106     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
1107     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1108     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
1109     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1110     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
1111     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
1112     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
1113     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
1114     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
1115     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
1116     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
1117     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1118     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
1119 }
1120 static void FUNCC(pred8x8l_vertical_right)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1121 {
1122     pixel *src = (pixel*)p_src;
1123     int stride = p_stride>>(sizeof(pixel)-1);
1124     PREDICT_8x8_LOAD_TOP;
1125     PREDICT_8x8_LOAD_LEFT;
1126     PREDICT_8x8_LOAD_TOPLEFT;
1127     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
1128     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
1129     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
1130     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
1131     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
1132     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
1133     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
1134     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
1135     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
1136     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
1137     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
1138     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
1139     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
1140     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
1141     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
1142     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
1143     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
1144     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
1145     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
1146     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
1147     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1148     SRC(7,0)= (t6 + t7 + 1) >> 1;
1149 }
1150 static void FUNCC(pred8x8l_horizontal_down)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1151 {
1152     pixel *src = (pixel*)p_src;
1153     int stride = p_stride>>(sizeof(pixel)-1);
1154     PREDICT_8x8_LOAD_TOP;
1155     PREDICT_8x8_LOAD_LEFT;
1156     PREDICT_8x8_LOAD_TOPLEFT;
1157     SRC(0,7)= (l6 + l7 + 1) >> 1;
1158     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
1159     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
1160     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
1161     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
1162     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
1163     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
1164     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
1165     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
1166     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
1167     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
1168     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
1169     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
1170     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
1171     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
1172     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
1173     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
1174     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
1175     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
1176     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
1177     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
1178     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
1179 }
1180 static void FUNCC(pred8x8l_vertical_left)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1181 {
1182     pixel *src = (pixel*)p_src;
1183     int stride = p_stride>>(sizeof(pixel)-1);
1184     PREDICT_8x8_LOAD_TOP;
1185     PREDICT_8x8_LOAD_TOPRIGHT;
1186     SRC(0,0)= (t0 + t1 + 1) >> 1;
1187     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
1188     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
1189     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
1190     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
1191     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
1192     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
1193     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
1194     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
1195     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
1196     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
1197     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
1198     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
1199     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
1200     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
1201     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
1202     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
1203     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
1204     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
1205     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
1206     SRC(7,6)= (t10 + t11 + 1) >> 1;
1207     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
1208 }
1209 static void FUNCC(pred8x8l_horizontal_up)(uint8_t *p_src, int has_topleft, int has_topright, int p_stride)
1210 {
1211     pixel *src = (pixel*)p_src;
1212     int stride = p_stride>>(sizeof(pixel)-1);
1213     PREDICT_8x8_LOAD_LEFT;
1214     SRC(0,0)= (l0 + l1 + 1) >> 1;
1215     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
1216     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
1217     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
1218     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
1219     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
1220     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
1221     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
1222     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
1223     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
1224     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
1225     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
1226     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
1227     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
1228     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
1229     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
1230     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
1231     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
1232 }
1233 #undef PREDICT_8x8_LOAD_LEFT
1234 #undef PREDICT_8x8_LOAD_TOP
1235 #undef PREDICT_8x8_LOAD_TOPLEFT
1236 #undef PREDICT_8x8_LOAD_TOPRIGHT
1237 #undef PREDICT_8x8_DC
1238 #undef PTR
1239 #undef PT
1240 #undef PL
1241 #undef SRC
1242
1243 static void FUNCC(pred4x4_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1244     int i;
1245     pixel *pix = (pixel*)p_pix;
1246     const dctcoef *block = (const dctcoef*)p_block;
1247     stride >>= sizeof(pixel)-1;
1248     pix -= stride;
1249     for(i=0; i<4; i++){
1250         pixel v = pix[0];
1251         pix[1*stride]= v += block[0];
1252         pix[2*stride]= v += block[4];
1253         pix[3*stride]= v += block[8];
1254         pix[4*stride]= v +  block[12];
1255         pix++;
1256         block++;
1257     }
1258 }
1259
1260 static void FUNCC(pred4x4_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1261     int i;
1262     pixel *pix = (pixel*)p_pix;
1263     const dctcoef *block = (const dctcoef*)p_block;
1264     stride >>= sizeof(pixel)-1;
1265     for(i=0; i<4; i++){
1266         pixel v = pix[-1];
1267         pix[0]= v += block[0];
1268         pix[1]= v += block[1];
1269         pix[2]= v += block[2];
1270         pix[3]= v +  block[3];
1271         pix+= stride;
1272         block+= 4;
1273     }
1274 }
1275
1276 static void FUNCC(pred8x8l_vertical_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1277     int i;
1278     pixel *pix = (pixel*)p_pix;
1279     const dctcoef *block = (const dctcoef*)p_block;
1280     stride >>= sizeof(pixel)-1;
1281     pix -= stride;
1282     for(i=0; i<8; i++){
1283         pixel v = pix[0];
1284         pix[1*stride]= v += block[0];
1285         pix[2*stride]= v += block[8];
1286         pix[3*stride]= v += block[16];
1287         pix[4*stride]= v += block[24];
1288         pix[5*stride]= v += block[32];
1289         pix[6*stride]= v += block[40];
1290         pix[7*stride]= v += block[48];
1291         pix[8*stride]= v +  block[56];
1292         pix++;
1293         block++;
1294     }
1295 }
1296
1297 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *p_pix, const DCTELEM *p_block, int stride){
1298     int i;
1299     pixel *pix = (pixel*)p_pix;
1300     const dctcoef *block = (const dctcoef*)p_block;
1301     stride >>= sizeof(pixel)-1;
1302     for(i=0; i<8; i++){
1303         pixel v = pix[-1];
1304         pix[0]= v += block[0];
1305         pix[1]= v += block[1];
1306         pix[2]= v += block[2];
1307         pix[3]= v += block[3];
1308         pix[4]= v += block[4];
1309         pix[5]= v += block[5];
1310         pix[6]= v += block[6];
1311         pix[7]= v +  block[7];
1312         pix+= stride;
1313         block+= 8;
1314     }
1315 }
1316
1317 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1318     int i;
1319     for(i=0; i<16; i++)
1320         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1321 }
1322
1323 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1324     int i;
1325     for(i=0; i<16; i++)
1326         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1327 }
1328
1329 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1330     int i;
1331     for(i=0; i<4; i++)
1332         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1333 }
1334
1335 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
1336     int i;
1337     for(i=0; i<4; i++)
1338         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1339 }