git.sesse.net Git - x264/blob - common/opencl/intra.cl

   1 /* Lookahead lowres intra analysis
   2  *
   3  * Each intra analysis function has been implemented twice, once for scalar GPUs
   4  * (NV) and once for vectorized GPUs (AMD pre-Southern Islands).  x264 detects
   5  * the GPU type and sets the -DVECTORIZE compile flag accordingly.
   6  *
   7  * All the intra analysis functions were based on their C versions in pixel.c
   8  * and produce the exact same results.
   9  */
  10
  11 /* force all clamp arguments and return value to int, prevent ambiguous types */
  12 #define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
  13
  14 #if VECTORIZE
  15 int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
  16 {
  17     int8 a_v, d_v;
  18     int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
  19     int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
  20
  21     d_v = convert_int8( vload8( 0, data ) );
  22     a_v.s01234567 = (d_v - pr0).s04152637;
  23     HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  24
  25     data += data_stride;
  26     d_v = convert_int8( vload8( 0, data ) );
  27     a_v.s01234567 = (d_v - pr1).s04152637;
  28     HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  29
  30     data += data_stride;
  31     d_v = convert_int8( vload8( 0, data ) );
  32     a_v.s01234567 = (d_v - pr2).s04152637;
  33     HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  34
  35     data += data_stride;
  36     d_v = convert_int8( vload8( 0, data ) );
  37     a_v.s01234567 = (d_v - pr3).s04152637;
  38     HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
  39
  40     uint8 sum_v;
  41
  42     HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
  43     sum_v = abs( a_v );
  44
  45     HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
  46     sum_v += abs( a_v );
  47
  48     HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
  49     sum_v += abs( a_v );
  50
  51     HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
  52     sum_v += abs( a_v );
  53
  54     uint4 sum2 = sum_v.hi + sum_v.lo;
  55     uint2 sum3 = sum2.hi + sum2.lo;
  56     return ( sum3.hi + sum3.lo ) >> 1;
  57 }
  58 #else
  59 SATD_C_8x4_Q( satd_8x4_lp, const local, private )
  60 #endif
  61
  62 /****************************************************************************
  63  * 8x8 prediction for intra luma block
  64  ****************************************************************************/
  65
  66 #define F1            rhadd
  67 #define F2( a, b, c ) ( a+2*b+c+2 )>>2
  68
  69 #if VECTORIZE
  70 int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
  71 {
  72     int8 pr0, pr1, pr2, pr3;
  73
  74     // Upper half of pred[]
  75     pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
  76     pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
  77     pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
  78     pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
  79     pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
  80     pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
  81     pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
  82     pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
  83
  84     pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
  85     pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
  86     pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
  87     pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
  88     pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
  89     pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
  90     pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
  91     pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
  92
  93     pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
  94     pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
  95     pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
  96     pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
  97     pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
  98     pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
  99     pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 100     pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 101
 102     pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
 103     pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
 104     pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
 105     pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
 106     pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
 107     pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 108     pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 109     pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
 110     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 111
 112     // Lower half of pred[]
 113     pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
 114     pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
 115     pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
 116     pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
 117     pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 118     pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 119     pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
 120     pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
 121
 122     pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
 123     pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
 124     pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
 125     pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 126     pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 127     pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
 128     pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
 129     pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
 130
 131     pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
 132     pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
 133     pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 134     pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 135     pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
 136     pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
 137     pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
 138     pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
 139
 140     pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
 141     pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
 142     pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
 143     pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
 144     pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
 145     pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
 146     pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
 147     pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
 148
 149     return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
 150 }
 151
 152 int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 153 {
 154     int8 pr0, pr1, pr2, pr3;
 155
 156     // Upper half of pred[]
 157     pr3.s0 = F2( left[1], left[2], left[3] );
 158     pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
 159     pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
 160     pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
 161     pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
 162     pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
 163     pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
 164     pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
 165     pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
 166     pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
 167     pr0.s7 = F2( top[5], top[6], top[7] );
 168     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 169
 170     // Lower half of pred[]
 171     pr3.s0 = F2( left[5], left[6], left[7] );
 172     pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
 173     pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
 174     pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
 175     pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
 176     pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
 177     pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
 178     pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
 179     pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
 180     pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
 181     pr0.s7 = F2( top[1], top[2], top[3] );
 182     return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
 183 }
 184
 185 int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 186 {
 187     int8 pr0, pr1, pr2, pr3;
 188
 189     // Upper half of pred[]
 190     pr2.s0 = F2( left[1], left[0], left_top );
 191     pr3.s0 = F2( left[2], left[1], left[0] );
 192     pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
 193     pr0.s0 = pr2.s1 = F1( left_top, top[0] );
 194     pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
 195     pr0.s1 = pr2.s2 = F1( top[0], top[1] );
 196     pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
 197     pr0.s2 = pr2.s3 = F1( top[1], top[2] );
 198     pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
 199     pr0.s3 = pr2.s4 = F1( top[2], top[3] );
 200     pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
 201     pr0.s4 = pr2.s5 = F1( top[3], top[4] );
 202     pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
 203     pr0.s5 = pr2.s6 = F1( top[4], top[5] );
 204     pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
 205     pr0.s6 = pr2.s7 = F1( top[5], top[6] );
 206     pr1.s7 = F2( top[5], top[6], top[7] );
 207     pr0.s7 = F1( top[6], top[7] );
 208     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 209
 210     // Lower half of pred[]
 211     pr2.s0 = F2( left[5], left[4], left[3] );
 212     pr3.s0 = F2( left[6], left[5], left[4] );
 213     pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
 214     pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
 215     pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
 216     pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
 217     pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
 218     pr0.s2 = pr2.s3 = F1( left_top, top[0] );
 219     pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
 220     pr0.s3 = pr2.s4 = F1( top[0], top[1] );
 221     pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
 222     pr0.s4 = pr2.s5 = F1( top[1], top[2] );
 223     pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
 224     pr0.s5 = pr2.s6 = F1( top[2], top[3] );
 225     pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
 226     pr0.s6 = pr2.s7 = F1( top[3], top[4] );
 227     pr1.s7 = F2( top[3], top[4], top[5] );
 228     pr0.s7 = F1( top[4], top[5] );
 229     return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
 230 #undef PRED
 231 }
 232
 233 int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 234 {
 235     int8 pr0, pr1, pr2, pr3;
 236
 237     // Upper half of pred[]
 238     pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
 239     pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
 240     pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
 241     pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
 242
 243     pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
 244     pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
 245     pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
 246     pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
 247
 248     pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
 249     pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
 250     pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
 251     pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
 252
 253     pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 254     pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
 255     pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
 256     pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
 257     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 258
 259     // Lower half of pred[]
 260     pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 261     pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 262     pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
 263     pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
 264
 265     pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 266     pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 267     pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 268     pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
 269
 270     pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 271     pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 272     pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 273     pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 274
 275     pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
 276     pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 277     pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 278     pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 279     return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
 280 }
 281
 282 int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
 283 {
 284     int8 pr0, pr1, pr2, pr3;
 285
 286     // Upper half of pred[]
 287     pr0.s0 = F1( top[0], top[1] );
 288     pr1.s0 = F2( top[0], top[1], top[2] );
 289     pr2.s0 = pr0.s1 = F1( top[1], top[2] );
 290     pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
 291     pr2.s1 = pr0.s2 = F1( top[2], top[3] );
 292     pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
 293     pr2.s2 = pr0.s3 = F1( top[3], top[4] );
 294     pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
 295     pr2.s3 = pr0.s4 = F1( top[4], top[5] );
 296     pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
 297     pr2.s4 = pr0.s5 = F1( top[5], top[6] );
 298     pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
 299     pr2.s5 = pr0.s6 = F1( top[6], top[7] );
 300     pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
 301     pr2.s6 = pr0.s7 = F1( top[7], top[8] );
 302     pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
 303     pr2.s7 = F1( top[8], top[9] );
 304     pr3.s7 = F2( top[8], top[9], top[10] );
 305     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 306
 307     // Lower half of pred[]
 308     pr0.s0 = F1( top[2], top[3] );
 309     pr1.s0 = F2( top[2], top[3], top[4] );
 310     pr2.s0 = pr0.s1 = F1( top[3], top[4] );
 311     pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
 312     pr2.s1 = pr0.s2 = F1( top[4], top[5] );
 313     pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
 314     pr2.s2 = pr0.s3 = F1( top[5], top[6] );
 315     pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
 316     pr2.s3 = pr0.s4 = F1( top[6], top[7] );
 317     pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
 318     pr2.s4 = pr0.s5 = F1( top[7], top[8] );
 319     pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
 320     pr2.s5 = pr0.s6 = F1( top[8], top[9] );
 321     pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
 322     pr2.s6 = pr0.s7 = F1( top[9], top[10] );
 323     pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
 324     pr2.s7 = F1( top[10], top[11] );
 325     pr3.s7 = F2( top[10], top[11], top[12] );
 326     return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
 327 }
 328
 329 int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
 330 {
 331     int8 pr0, pr1, pr2, pr3;
 332
 333     // Upper half of pred[]
 334     pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
 335     pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 336     pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 337     pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 338
 339     pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
 340     pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 341     pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 342     pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 343
 344     pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
 345     pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 346     pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 347     pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
 348
 349     pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
 350     pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 351     pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
 352     pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
 353     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 354
 355     // Lower half of pred[]
 356     pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
 357     pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
 358     pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
 359     pr0.s6 = left[7]; pr0.s7 = left[7];
 360
 361     pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
 362     pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
 363     pr1.s4 = left[7]; pr1.s5 = left[7];
 364     pr1.s6 = left[7]; pr1.s7 = left[7];
 365
 366     pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
 367     pr2.s2 = left[7]; pr2.s3 = left[7];
 368     pr2.s4 = left[7]; pr2.s5 = left[7];
 369     pr2.s6 = left[7]; pr2.s7 = left[7];
 370
 371     pr3 = (int8)left[7];
 372
 373     return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
 374 }
 375
 376 int x264_predict_8x8c_h( const local pixel *src, int src_stride )
 377 {
 378     const local pixel *src_l = src;
 379     int8 pr0, pr1, pr2, pr3;
 380
 381     // Upper half of pred[]
 382     pr0 = (int8)src[-1]; src += src_stride;
 383     pr1 = (int8)src[-1]; src += src_stride;
 384     pr2 = (int8)src[-1]; src += src_stride;
 385     pr3 = (int8)src[-1]; src += src_stride;
 386     int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
 387
 388     //Lower half of pred[]
 389     pr0 = (int8)src[-1]; src += src_stride;
 390     pr1 = (int8)src[-1]; src += src_stride;
 391     pr2 = (int8)src[-1]; src += src_stride;
 392     pr3 = (int8)src[-1];
 393     return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
 394 }
 395
 396 int x264_predict_8x8c_v( const local pixel *src, int src_stride )
 397 {
 398     int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
 399     return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
 400            satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
 401 }
 402
 403 int x264_predict_8x8c_p( const local pixel *src, int src_stride )
 404 {
 405     int H = 0, V = 0;
 406     for( int i = 0; i < 4; i++ )
 407     {
 408         H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
 409         V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
 410     }
 411
 412     int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
 413     int b = (17 * H + 16) >> 5;
 414     int c = (17 * V + 16) >> 5;
 415     int i00 = a - 3 * b - 3 * c + 16;
 416
 417     // Upper half of pred[]
 418     int pix = i00;
 419     int8 pr0, pr1, pr2, pr3;
 420     pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 421     pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 422     pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 423     pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 424     pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 425     pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 426     pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 427     pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 428
 429     pix = i00;
 430     pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 431     pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 432     pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 433     pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 434     pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 435     pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 436     pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 437     pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 438
 439     pix = i00;
 440     pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 441     pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 442     pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 443     pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 444     pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 445     pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 446     pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 447     pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 448
 449     pix = i00;
 450     pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 451     pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 452     pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 453     pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 454     pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 455     pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 456     pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 457     pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 458     int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
 459
 460     //Lower half of pred[]
 461     pix = i00;
 462     pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 463     pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 464     pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 465     pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 466     pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 467     pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 468     pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 469     pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 470
 471     pix = i00;
 472     pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 473     pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 474     pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 475     pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 476     pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 477     pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 478     pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 479     pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 480
 481     pix = i00;
 482     pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 483     pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 484     pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 485     pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 486     pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 487     pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 488     pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 489     pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 490
 491     pix = i00;
 492     pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
 493     pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
 494     pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
 495     pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
 496     pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
 497     pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
 498     pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
 499     pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
 500     return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
 501 }
 502
 503 int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
 504 {
 505     int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
 506     for( int i = 0; i < 4; i++ )
 507     {
 508         s0 += src[i - src_stride];
 509         s1 += src[i + 4 - src_stride];
 510         s2 += src[-1 + i * src_stride];
 511         s3 += src[-1 + (i+4)*src_stride];
 512     }
 513
 514     // Upper half of pred[]
 515     int8 dc0;
 516     dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
 517     dc0.hi = (int4)( (s1 + 2) >> 2 );
 518     int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
 519
 520     // Lower half of pred[]
 521     dc0.lo = (int4)( (s3 + 2) >> 2 );
 522     dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
 523     return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
 524 }
 525
 526 #else  /* not vectorized: private is cheap registers are scarce */
 527
 528 int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
 529 {
 530     private pixel pred[32];
 531
 532     // Upper half of pred[]
 533     for( int y = 0; y < 4; y++ )
 534     {
 535         for( int x = 0; x < 8; x++ )
 536         {
 537             pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
 538             pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
 539         }
 540     }
 541     int satd = satd_8x4_lp( src, src_stride, pred, 8 );
 542     //Lower half of pred[]
 543     for( int y = 4; y < 8; y++ )
 544     {
 545         for( int x = 0; x < 8; x++ )
 546         {
 547             pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
 548             pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
 549         }
 550     }
 551     pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
 552     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 553     return satd;
 554 }
 555
 556 int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 557 {
 558     private pixel pred[32];
 559 #define PRED( x, y ) pred[(x) + (y)*8]
 560     // Upper half of pred[]
 561     PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
 562     PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
 563     PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
 564     PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
 565     PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
 566     PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
 567     PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
 568     PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
 569     PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
 570     PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
 571     PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
 572     int satd = satd_8x4_lp( src, src_stride, pred, 8 );
 573
 574     // Lower half of pred[]
 575     PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
 576     PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
 577     PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
 578     PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
 579     PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
 580     PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
 581     PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
 582     PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
 583     PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
 584     PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
 585     PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
 586     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 587     return satd;
 588 #undef PRED
 589 }
 590
 591 int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 592 {
 593     private pixel pred[32];
 594 #define PRED( x, y ) pred[(x) + (y)*8]
 595     // Upper half of pred[]
 596     PRED( 0, 2 ) = F2( left[1], left[0], left_top );
 597     PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
 598     PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
 599     PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
 600     PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
 601     PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
 602     PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
 603     PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
 604     PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
 605     PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
 606     PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
 607     PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
 608     PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
 609     PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
 610     PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
 611     PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
 612     PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
 613     PRED( 7, 0 ) = F1( top[6], top[7] );
 614     int satd = satd_8x4_lp( src, src_stride, pred, 8 );
 615
 616     //Lower half of pred[]
 617     PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
 618     PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
 619     PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
 620     PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
 621     PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
 622     PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
 623     PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
 624     PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
 625     PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
 626     PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
 627     PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
 628     PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
 629     PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
 630     PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
 631     PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
 632     PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
 633     PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
 634     PRED( 7, 0 ) = F1( top[4], top[5] );
 635     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 636     return satd;
 637 #undef PRED
 638 }
 639
 640 inline uint32_t pack16to32( uint32_t a, uint32_t b )
 641 {
 642     return a + (b << 16);
 643 }
 644
 645 inline uint32_t pack8to16( uint32_t a, uint32_t b )
 646 {
 647     return a + (b << 8);
 648 }
 649
 650 int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
 651 {
 652     private pixel pred[32];
 653     int satd;
 654     int p1 =  pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
 655     int p2 =  pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
 656     int p3 =  pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
 657     int p4 =  pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
 658     int p5 =  pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
 659     int p6 =  pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
 660     int p7 =  pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
 661     int p8 =  pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
 662     int p9 =  pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
 663     int p10 =  pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
 664     int p11 =  pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
 665     // Upper half of pred[]
 666     vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
 667     vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
 668     vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
 669     vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
 670     vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
 671     vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
 672     vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
 673     vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
 674     satd = satd_8x4_lp( src, src_stride, pred, 8 );
 675     // Lower half of pred[]
 676     vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
 677     vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
 678     vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
 679     vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
 680     vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
 681     vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
 682     vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
 683     vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
 684     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 685     return satd;
 686 }
 687
 688 int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
 689 {
 690     private pixel pred[32];
 691     int satd;
 692 #define PRED( x, y ) pred[(x) + (y)*8]
 693     // Upper half of pred[]
 694     PRED( 0, 0 ) = F1( top[0], top[1] );
 695     PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
 696     PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
 697     PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
 698     PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
 699     PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
 700     PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
 701     PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
 702     PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
 703     PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
 704     PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
 705     PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
 706     PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
 707     PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
 708     PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
 709     PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
 710     PRED( 7, 2 ) = F1( top[8], top[9] );
 711     PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
 712     satd = satd_8x4_lp( src, src_stride, pred, 8 );
 713     // Lower half of pred[]
 714     PRED( 0, 0 ) = F1( top[2], top[3] );
 715     PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
 716     PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
 717     PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
 718     PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
 719     PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
 720     PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
 721     PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
 722     PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
 723     PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
 724     PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
 725     PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
 726     PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
 727     PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
 728     PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
 729     PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
 730     PRED( 7, 2 ) = F1( top[10], top[11] );
 731     PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
 732     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 733     return satd;
 734 #undef PRED
 735 }
 736
 737 int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
 738 {
 739     private pixel pred[32];
 740     int satd;
 741     int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
 742     int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
 743     int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
 744     int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
 745     int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
 746     int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
 747     int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
 748     int p8 = pack8to16( left[7], left[7] );
 749     // Upper half of pred[]
 750     vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
 751     vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
 752     vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
 753     vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
 754     vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
 755     vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
 756     vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
 757     vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
 758     satd = satd_8x4_lp( src, src_stride, pred, 8 );
 759     // Lower half of pred[]
 760     vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
 761     vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
 762     vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
 763     vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
 764     vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
 765     vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
 766     vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
 767     vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
 768     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 769     return satd;
 770 }
 771
 772 int x264_predict_8x8c_h( const local pixel *src, int src_stride )
 773 {
 774     private pixel pred[32];
 775     const local pixel *src_l = src;
 776
 777     // Upper half of pred[]
 778     vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
 779     vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
 780     vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
 781     vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
 782     int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
 783
 784     // Lower half of pred[]
 785     vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
 786     vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
 787     vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
 788     vstore8( (uchar8)(src[-1]), 3, pred );
 789     return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
 790 }
 791
 792 int x264_predict_8x8c_v( const local pixel *src, int src_stride )
 793 {
 794     private pixel pred[32];
 795     uchar16 v16;
 796     v16.lo = vload8( 0, &src[-src_stride] );
 797     v16.hi = vload8( 0, &src[-src_stride] );
 798
 799     vstore16( v16, 0, pred );
 800     vstore16( v16, 1, pred );
 801
 802     return satd_8x4_lp( src, src_stride, pred, 8 ) +
 803            satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
 804 }
 805
 806 int x264_predict_8x8c_p( const local pixel *src, int src_stride )
 807 {
 808     int H = 0, V = 0;
 809     private pixel pred[32];
 810     int satd;
 811
 812     for( int i = 0; i < 4; i++ )
 813     {
 814         H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
 815         V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
 816     }
 817
 818     int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
 819     int b = (17 * H + 16) >> 5;
 820     int c = (17 * V + 16) >> 5;
 821     int i00 = a - 3 * b - 3 * c + 16;
 822
 823     // Upper half of pred[]
 824     for( int y = 0; y < 4; y++ )
 825     {
 826         int pix = i00;
 827         for( int x = 0; x < 8; x++ )
 828         {
 829             pred[x + y*8] = x264_clip_pixel( pix >> 5 );
 830             pix += b;
 831         }
 832         i00 += c;
 833     }
 834     satd = satd_8x4_lp( src, src_stride, pred, 8 );
 835     // Lower half of pred[]
 836     for( int y = 0; y < 4; y++ )
 837     {
 838         int pix = i00;
 839         for( int x = 0; x < 8; x++ )
 840         {
 841             pred[x + y*8] = x264_clip_pixel( pix >> 5 );
 842             pix += b;
 843         }
 844         i00 += c;
 845     }
 846     satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 847     return satd;
 848 }
 849
 850 int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
 851 {
 852     private pixel pred[32];
 853     int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
 854     for( int i = 0; i < 4; i++ )
 855     {
 856         s0 += src[i - src_stride];
 857         s1 += src[i + 4 - src_stride];
 858         s2 += src[-1 + i * src_stride];
 859         s3 += src[-1 + (i+4)*src_stride];
 860     }
 861
 862     // Upper half of pred[]
 863     uchar8 dc0;
 864     dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
 865     dc0.hi = (uchar4)( (s1 + 2) >> 2 );
 866     vstore8( dc0, 0, pred );
 867     vstore8( dc0, 1, pred );
 868     vstore8( dc0, 2, pred );
 869     vstore8( dc0, 3, pred );
 870     int satd = satd_8x4_lp( src, src_stride, pred, 8 );
 871
 872     // Lower half of pred[]
 873     dc0.lo = (uchar4)( (s3 + 2) >> 2 );
 874     dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
 875     vstore8( dc0, 0, pred );
 876     vstore8( dc0, 1, pred );
 877     vstore8( dc0, 2, pred );
 878     vstore8( dc0, 3, pred );
 879     return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
 880 }
 881 #endif
 882
 883 /* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
 884  *
 885  * Loads 33 macroblocks plus the pixels directly above them into local memory,
 886  * padding where necessary with edge pixels.  It then cooperatively calculates
 887  * smoothed top and left pixels for use in some of the analysis.
 888  *
 889  * Then groups of 32 threads each calculate a single intra mode for each 8x8
 890  * block.  Since consecutive threads are calculating the same intra mode there
 891  * is no code-path divergence.  8 intra costs are calculated simultaneously.  If
 892  * the "slow" argument is not zero, the final two (least likely) intra modes are
 893  * tested in a second pass.  The slow mode is only enabled for presets slow,
 894  * slower, and placebo.
 895  *
 896  * This allows all of the pixels functions to read pixels from local memory, and
 897  * avoids re-fetching edge pixels from global memory.  And it allows us to
 898  * calculate all of the intra mode costs simultaneously without branch divergence.
 899  *
 900  * Local dimension:    [ 32, 8 ]
 901  * Global dimensions:  [ paddedWidth, height ] */
 902 kernel void mb_intra_cost_satd_8x8( read_only image2d_t  fenc,
 903                                     global uint16_t     *fenc_intra_cost,
 904                                     global int          *frame_stats,
 905                                     int                  lambda,
 906                                     int                  mb_width,
 907                                     int                  slow )
 908 {
 909 #define CACHE_STRIDE 265
 910 #define BLOCK_OFFSET 266
 911     local pixel cache[2385];
 912     local int cost_buf[32];
 913     local pixel top[32 * 16];
 914     local pixel left[32 * 8];
 915     local pixel left_top[32];
 916
 917     int lx = get_local_id( 0 );
 918     int ly = get_local_id( 1 );
 919     int gx = get_global_id( 0 );
 920     int gy = get_global_id( 1 );
 921     int gidx = get_group_id( 0 );
 922     int gidy = get_group_id( 1 );
 923     int linear_id = ly * get_local_size( 0 ) + lx;
 924     int satd = COST_MAX;
 925     int basex = gidx << 8;
 926     int basey = (gidy << 3) - 1;
 927
 928     /* Load 33 8x8 macroblocks and the pixels above them into local cache */
 929     for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
 930     {
 931         int x = linear_id << 2;
 932         uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
 933         cache[y * CACHE_STRIDE + 1 + x] = data.s0;
 934         cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
 935         cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
 936         cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
 937     }
 938     /* load pixels on left edge */
 939     if( linear_id < 9 )
 940         cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
 941
 942     barrier( CLK_LOCAL_MEM_FENCE );
 943
 944     // Cooperatively build the top edge for the macroblock using lowpass filter
 945     int j = ly;
 946     top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
 947                        2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
 948                        cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
 949     j += 8;
 950     top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
 951                        2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
 952                        cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
 953     // Cooperatively build the left edge for the macroblock using lowpass filter
 954     left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
 955                         2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
 956                         cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
 957     // One left_top per macroblock
 958     if( 0 == ly )
 959     {
 960         left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
 961                          cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
 962         cost_buf[lx] = COST_MAX;
 963     }
 964     barrier( CLK_LOCAL_MEM_FENCE );
 965
 966     // each warp/wavefront generates a different prediction type; no divergence
 967     switch( ly )
 968     {
 969         case 0:
 970             satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
 971             break;
 972         case 1:
 973             satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
 974             break;
 975         case 2:
 976             satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
 977             break;
 978         case 3:
 979             satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
 980             break;
 981         case 4:
 982             satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
 983             break;
 984         case 5:
 985             satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
 986             break;
 987         case 6:
 988             satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
 989             break;
 990         case 7:
 991             satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
 992             break;
 993         default:
 994             break;
 995     }
 996     atom_min( &cost_buf[lx], satd );
 997     if( slow )
 998     {
 999         // Do the remaining two (least likely) prediction modes
1000         switch( ly )
1001         {
1002             case 0: // DDL
1003                 satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
1004                 atom_min( &cost_buf[lx], satd );
1005                 break;
1006             case 1: // VL
1007                 satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
1008                 atom_min( &cost_buf[lx], satd );
1009                 break;
1010             default:
1011                 break;
1012         }
1013     }
1014     barrier( CLK_LOCAL_MEM_FENCE );
1015
1016     if( (0 == ly) && (gx < mb_width) )
1017         fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
1018
1019     // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
1020     if( gx < 2 && gy == 0 )
1021         frame_stats[gx] = 0;
1022 #undef CACHE_STRIDE
1023 #undef BLOCK_OFFSET
1024 }
1025
1026 /*
1027  * parallel sum intra costs
1028  *
1029  * global launch dimensions: [256, mb_height]
1030  */
1031 kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
1032                             const global uint16_t *inv_qscale_factor,
1033                             global int           *fenc_row_satds,
1034                             global int           *frame_stats,
1035                             int                   mb_width )
1036 {
1037     int y = get_global_id( 1 );
1038     int mb_height = get_global_size( 1 );
1039
1040     int row_satds = 0;
1041     int cost_est = 0;
1042     int cost_est_aq = 0;
1043
1044     for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
1045     {
1046         int mb_xy = x + y * mb_width;
1047         int cost = fenc_intra_cost[mb_xy];
1048         int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
1049         int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
1050
1051         row_satds += cost_aq;
1052         if( b_frame_score_mb )
1053         {
1054             cost_est += cost;
1055             cost_est_aq += cost_aq;
1056         }
1057     }
1058
1059     local int buffer[256];
1060     int x = get_global_id( 0 );
1061
1062     row_satds   = parallel_sum( row_satds, x, buffer );
1063     cost_est    = parallel_sum( cost_est, x, buffer );
1064     cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
1065
1066     if( get_global_id( 0 ) == 0 )
1067     {
1068         fenc_row_satds[y] = row_satds;
1069         atomic_add( frame_stats + COST_EST,    cost_est );
1070         atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
1071     }
1072 }