git.sesse.net Git - x264/blob - common/mc.c

   1 /*****************************************************************************
   2  * mc.c: motion compensation
   3  *****************************************************************************
   4  * Copyright (C) 2003-2016 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *
  23  * This program is also available under a commercial proprietary license.
  24  * For more information, contact us at licensing@x264.com.
  25  *****************************************************************************/
  26
  27 #include "common.h"
  28
  29 #if HAVE_MMX
  30 #include "x86/mc.h"
  31 #endif
  32 #if ARCH_PPC
  33 #include "ppc/mc.h"
  34 #endif
  35 #if ARCH_ARM
  36 #include "arm/mc.h"
  37 #endif
  38 #if ARCH_AARCH64
  39 #include "aarch64/mc.h"
  40 #endif
  41 #if ARCH_MIPS
  42 #include "mips/mc.h"
  43 #endif
  44
  45
  46 static inline void pixel_avg( pixel *dst,  intptr_t i_dst_stride,
  47                               pixel *src1, intptr_t i_src1_stride,
  48                               pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
  49 {
  50     for( int y = 0; y < i_height; y++ )
  51     {
  52         for( int x = 0; x < i_width; x++ )
  53             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
  54         dst  += i_dst_stride;
  55         src1 += i_src1_stride;
  56         src2 += i_src2_stride;
  57     }
  58 }
  59
  60 static inline void pixel_avg_wxh( pixel *dst,  intptr_t i_dst,
  61                                   pixel *src1, intptr_t i_src1,
  62                                   pixel *src2, intptr_t i_src2, int width, int height )
  63 {
  64     for( int y = 0; y < height; y++ )
  65     {
  66         for( int x = 0; x < width; x++ )
  67             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
  68         src1 += i_src1;
  69         src2 += i_src2;
  70         dst += i_dst;
  71     }
  72 }
  73
  74 /* Implicit weighted bipred only:
  75  * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
  76 static inline void pixel_avg_weight_wxh( pixel *dst,  intptr_t i_dst,
  77                                          pixel *src1, intptr_t i_src1,
  78                                          pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
  79 {
  80     int i_weight2 = 64 - i_weight1;
  81     for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
  82         for( int x = 0; x<width; x++ )
  83             dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
  84 }
  85 #undef op_scale2
  86
  87 #define PIXEL_AVG_C( name, width, height ) \
  88 static void name( pixel *pix1, intptr_t i_stride_pix1, \
  89                   pixel *pix2, intptr_t i_stride_pix2, \
  90                   pixel *pix3, intptr_t i_stride_pix3, int weight ) \
  91 { \
  92     if( weight == 32 ) \
  93         pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
  94     else \
  95         pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
  96 }
  97 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
  98 PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
  99 PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
 100 PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
 101 PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
 102 PIXEL_AVG_C( pixel_avg_4x16,  4, 16 )
 103 PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
 104 PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
 105 PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
 106 PIXEL_AVG_C( pixel_avg_2x8,   2, 8 )
 107 PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
 108 PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
 109
 110 static void x264_weight_cache( x264_t *h, x264_weight_t *w )
 111 {
 112     w->weightfn = h->mc.weight;
 113 }
 114 #define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
 115 #define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
 116 static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
 117                        const x264_weight_t *weight, int i_width, int i_height )
 118 {
 119     int offset = weight->i_offset << (BIT_DEPTH-8);
 120     int scale = weight->i_scale;
 121     int denom = weight->i_denom;
 122     if( denom >= 1 )
 123     {
 124         for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
 125             for( int x = 0; x < i_width; x++ )
 126                 opscale( x );
 127     }
 128     else
 129     {
 130         for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
 131             for( int x = 0; x < i_width; x++ )
 132                 opscale_noden( x );
 133     }
 134 }
 135
 136 #define MC_WEIGHT_C( name, width ) \
 137     static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
 138 { \
 139     mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
 140 }
 141
 142 MC_WEIGHT_C( mc_weight_w20, 20 )
 143 MC_WEIGHT_C( mc_weight_w16, 16 )
 144 MC_WEIGHT_C( mc_weight_w12, 12 )
 145 MC_WEIGHT_C( mc_weight_w8,   8 )
 146 MC_WEIGHT_C( mc_weight_w4,   4 )
 147 MC_WEIGHT_C( mc_weight_w2,   2 )
 148
 149 static weight_fn_t x264_mc_weight_wtab[6] =
 150 {
 151     mc_weight_w2,
 152     mc_weight_w4,
 153     mc_weight_w8,
 154     mc_weight_w12,
 155     mc_weight_w16,
 156     mc_weight_w20,
 157 };
 158 const x264_weight_t x264_weight_none[3] = { {{0}} };
 159 static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
 160 {
 161     for( int y = 0; y < i_height; y++ )
 162     {
 163         memcpy( dst, src, i_width * sizeof(pixel) );
 164
 165         src += i_src_stride;
 166         dst += i_dst_stride;
 167     }
 168 }
 169
 170 #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
 171 static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
 172                          intptr_t stride, int width, int height, int16_t *buf )
 173 {
 174     const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
 175     for( int y = 0; y < height; y++ )
 176     {
 177         for( int x = -2; x < width+3; x++ )
 178         {
 179             int v = TAPFILTER(src,stride);
 180             dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
 181             /* transform v for storage in a 16-bit integer */
 182             buf[x+2] = v + pad;
 183         }
 184         for( int x = 0; x < width; x++ )
 185             dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 );
 186         for( int x = 0; x < width; x++ )
 187             dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
 188         dsth += stride;
 189         dstv += stride;
 190         dstc += stride;
 191         src += stride;
 192     }
 193 }
 194
 195 const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 196 const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2};
 197
 198 static void mc_luma( pixel *dst,    intptr_t i_dst_stride,
 199                      pixel *src[4], intptr_t i_src_stride,
 200                      int mvx, int mvy,
 201                      int i_width, int i_height, const x264_weight_t *weight )
 202 {
 203     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
 204     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
 205     pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
 206
 207     if( qpel_idx & 5 ) /* qpel interpolation needed */
 208     {
 209         pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
 210         pixel_avg( dst, i_dst_stride, src1, i_src_stride,
 211                    src2, i_src_stride, i_width, i_height );
 212         if( weight->weightfn )
 213             mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
 214     }
 215     else if( weight->weightfn )
 216         mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
 217     else
 218         mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
 219 }
 220
 221 static pixel *get_ref( pixel *dst,   intptr_t *i_dst_stride,
 222                        pixel *src[4], intptr_t i_src_stride,
 223                        int mvx, int mvy,
 224                        int i_width, int i_height, const x264_weight_t *weight )
 225 {
 226     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
 227     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
 228     pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
 229
 230     if( qpel_idx & 5 ) /* qpel interpolation needed */
 231     {
 232         pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
 233         pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
 234                    src2, i_src_stride, i_width, i_height );
 235         if( weight->weightfn )
 236             mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
 237         return dst;
 238     }
 239     else if( weight->weightfn )
 240     {
 241         mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
 242         return dst;
 243     }
 244     else
 245     {
 246         *i_dst_stride = i_src_stride;
 247         return src1;
 248     }
 249 }
 250
 251 /* full chroma mc (ie until 1/8 pixel)*/
 252 static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
 253                        pixel *src, intptr_t i_src_stride,
 254                        int mvx, int mvy,
 255                        int i_width, int i_height )
 256 {
 257     pixel *srcp;
 258
 259     int d8x = mvx&0x07;
 260     int d8y = mvy&0x07;
 261     int cA = (8-d8x)*(8-d8y);
 262     int cB = d8x    *(8-d8y);
 263     int cC = (8-d8x)*d8y;
 264     int cD = d8x    *d8y;
 265
 266     src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
 267     srcp = &src[i_src_stride];
 268
 269     for( int y = 0; y < i_height; y++ )
 270     {
 271         for( int x = 0; x < i_width; x++ )
 272         {
 273             dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
 274                         cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
 275             dstv[x] = ( cA*src[2*x+1]  + cB*src[2*x+3] +
 276                         cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
 277         }
 278         dstu += i_dst_stride;
 279         dstv += i_dst_stride;
 280         src   = srcp;
 281         srcp += i_src_stride;
 282     }
 283 }
 284
 285 #define MC_COPY(W) \
 286 static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
 287 { \
 288     mc_copy( src, i_src, dst, i_dst, W, i_height ); \
 289 }
 290 MC_COPY( 16 )
 291 MC_COPY( 8 )
 292 MC_COPY( 4 )
 293
 294 void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
 295                         pixel *src, intptr_t i_src, int w, int h )
 296 {
 297     while( h-- )
 298     {
 299         memcpy( dst, src, w * sizeof(pixel) );
 300         dst += i_dst;
 301         src += i_src;
 302     }
 303 }
 304
 305 void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst,
 306                              pixel *src, intptr_t i_src, int w, int h )
 307 {
 308     for( int y=0; y<h; y++, dst+=i_dst, src+=i_src )
 309         for( int x=0; x<2*w; x+=2 )
 310         {
 311             dst[x]   = src[x+1];
 312             dst[x+1] = src[x];
 313         }
 314 }
 315
 316 void x264_plane_copy_interleave_c( pixel *dst,  intptr_t i_dst,
 317                                    pixel *srcu, intptr_t i_srcu,
 318                                    pixel *srcv, intptr_t i_srcv, int w, int h )
 319 {
 320     for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
 321         for( int x=0; x<w; x++ )
 322         {
 323             dst[2*x]   = srcu[x];
 324             dst[2*x+1] = srcv[x];
 325         }
 326 }
 327
 328 static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
 329                                             pixel *dstv, intptr_t i_dstv,
 330                                             pixel *src,  intptr_t i_src, int w, int h )
 331 {
 332     for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
 333         for( int x=0; x<w; x++ )
 334         {
 335             dstu[x] = src[2*x];
 336             dstv[x] = src[2*x+1];
 337         }
 338 }
 339
 340 static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
 341                                                 pixel *dstb, intptr_t i_dstb,
 342                                                 pixel *dstc, intptr_t i_dstc,
 343                                                 pixel *src,  intptr_t i_src, int pw, int w, int h )
 344 {
 345     for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
 346     {
 347         for( int x=0; x<w; x++ )
 348         {
 349             dsta[x] = src[x*pw];
 350             dstb[x] = src[x*pw+1];
 351             dstc[x] = src[x*pw+2];
 352         }
 353     }
 354 }
 355
 356 void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
 357                                           pixel *dstc, intptr_t i_dstc,
 358                                           uint32_t *src, intptr_t i_src, int w, int h )
 359 {
 360     for( int l = 0; l < h; l++ )
 361     {
 362         pixel *dsty0 = dsty;
 363         pixel *dstc0 = dstc;
 364         uint32_t *src0 = src;
 365
 366         for( int n = 0; n < w; n += 3 )
 367         {
 368             *(dstc0++) = *src0 & 0x03FF;
 369             *(dsty0++) = ( *src0 >> 10 ) & 0x03FF;
 370             *(dstc0++) = ( *src0 >> 20 ) & 0x03FF;
 371             src0++;
 372             *(dsty0++) = *src0 & 0x03FF;
 373             *(dstc0++) = ( *src0 >> 10 ) & 0x03FF;
 374             *(dsty0++) = ( *src0 >> 20 ) & 0x03FF;
 375             src0++;
 376         }
 377
 378         dsty += i_dsty;
 379         dstc += i_dstc;
 380         src  += i_src;
 381     }
 382 }
 383
 384 static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
 385 {
 386     for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
 387         for( int x=0; x<8; x++ )
 388         {
 389             dst[2*x]   = srcu[x];
 390             dst[2*x+1] = srcv[x];
 391         }
 392 }
 393
 394 static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
 395 {
 396     x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
 397 }
 398
 399 static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
 400 {
 401     x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
 402 }
 403
 404 static void prefetch_fenc_null( pixel *pix_y,  intptr_t stride_y,
 405                                 pixel *pix_uv, intptr_t stride_uv, int mb_x )
 406 {}
 407
 408 static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
 409 {}
 410
 411 static void memzero_aligned( void * dst, size_t n )
 412 {
 413     memset( dst, 0, n );
 414 }
 415
 416 static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
 417 {
 418     int v = pix[0]+pix[1]+pix[2]+pix[3];
 419     for( int x = 0; x < stride-4; x++ )
 420     {
 421         sum[x] = v + sum[x-stride];
 422         v += pix[x+4] - pix[x];
 423     }
 424 }
 425
 426 static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
 427 {
 428     int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
 429     for( int x = 0; x < stride-8; x++ )
 430     {
 431         sum[x] = v + sum[x-stride];
 432         v += pix[x+8] - pix[x];
 433     }
 434 }
 435
 436 static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
 437 {
 438     for( int x = 0; x < stride-8; x++ )
 439         sum4[x] = sum8[x+4*stride] - sum8[x];
 440     for( int x = 0; x < stride-8; x++ )
 441         sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
 442 }
 443
 444 static void integral_init8v( uint16_t *sum8, intptr_t stride )
 445 {
 446     for( int x = 0; x < stride-8; x++ )
 447         sum8[x] = sum8[x+8*stride] - sum8[x];
 448 }
 449
 450 void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
 451 {
 452     pixel *src = frame->plane[0];
 453     int i_stride = frame->i_stride[0];
 454     int i_height = frame->i_lines[0];
 455     int i_width  = frame->i_width[0];
 456
 457     // duplicate last row and column so that their interpolation doesn't have to be special-cased
 458     for( int y = 0; y < i_height; y++ )
 459         src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
 460     memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
 461     h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
 462                                   i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
 463     x264_frame_expand_border_lowres( frame );
 464
 465     memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) );
 466
 467     for( int y = 0; y < h->param.i_bframe + 2; y++ )
 468         for( int x = 0; x < h->param.i_bframe + 2; x++ )
 469             frame->i_row_satds[y][x][0] = -1;
 470
 471     for( int y = 0; y <= !!h->param.i_bframe; y++ )
 472         for( int x = 0; x <= h->param.i_bframe; x++ )
 473             frame->lowres_mvs[y][x][0][0] = 0x7FFF;
 474 }
 475
 476 static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
 477                                     intptr_t src_stride, intptr_t dst_stride, int width, int height )
 478 {
 479     for( int y = 0; y < height; y++ )
 480     {
 481         pixel *src1 = src0+src_stride;
 482         pixel *src2 = src1+src_stride;
 483         for( int x = 0; x<width; x++ )
 484         {
 485             // slower than naive bilinear, but matches asm
 486 #define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
 487             dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
 488             dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
 489             dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
 490             dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
 491 #undef FILTER
 492         }
 493         src0 += src_stride*2;
 494         dst0 += dst_stride;
 495         dsth += dst_stride;
 496         dstv += dst_stride;
 497         dstc += dst_stride;
 498     }
 499 }
 500
 501 /* Estimate the total amount of influence on future quality that could be had if we
 502  * were to improve the reference samples used to inter predict any given macroblock. */
 503 static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
 504                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 505 {
 506     float fps = *fps_factor;
 507     for( int i = 0; i < len; i++ )
 508     {
 509         int intra_cost = intra_costs[i];
 510         int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
 511         float propagate_intra  = intra_cost * inv_qscales[i];
 512         float propagate_amount = propagate_in[i] + propagate_intra*fps;
 513         float propagate_num    = intra_cost - inter_cost;
 514         float propagate_denom  = intra_cost;
 515         dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
 516     }
 517 }
 518
 519 static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
 520                                    int16_t *propagate_amount, uint16_t *lowres_costs,
 521                                    int bipred_weight, int mb_y, int len, int list )
 522 {
 523     unsigned stride = h->mb.i_mb_stride;
 524     unsigned width = h->mb.i_mb_width;
 525     unsigned height = h->mb.i_mb_height;
 526
 527     for( unsigned i = 0; i < len; i++ )
 528     {
 529         int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
 530
 531         if( !(lists_used & (1 << list)) )
 532             continue;
 533
 534         int listamount = propagate_amount[i];
 535         /* Apply bipred weighting. */
 536         if( lists_used == 3 )
 537             listamount = (listamount * bipred_weight + 32) >> 6;
 538
 539         /* Early termination for simple case of mv0. */
 540         if( !M32( mvs[i] ) )
 541         {
 542             MC_CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
 543             continue;
 544         }
 545
 546         int x = mvs[i][0];
 547         int y = mvs[i][1];
 548         unsigned mbx = (x>>5)+i;
 549         unsigned mby = (y>>5)+mb_y;
 550         unsigned idx0 = mbx + mby * stride;
 551         unsigned idx2 = idx0 + stride;
 552         x &= 31;
 553         y &= 31;
 554         int idx0weight = (32-y)*(32-x);
 555         int idx1weight = (32-y)*x;
 556         int idx2weight = y*(32-x);
 557         int idx3weight = y*x;
 558         idx0weight = (idx0weight * listamount + 512) >> 10;
 559         idx1weight = (idx1weight * listamount + 512) >> 10;
 560         idx2weight = (idx2weight * listamount + 512) >> 10;
 561         idx3weight = (idx3weight * listamount + 512) >> 10;
 562
 563         if( mbx < width-1 && mby < height-1 )
 564         {
 565             MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
 566             MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
 567             MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
 568             MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
 569         }
 570         else
 571         {
 572             /* Note: this takes advantage of unsigned representation to
 573              * catch negative mbx/mby. */
 574             if( mby < height )
 575             {
 576                 if( mbx < width )
 577                     MC_CLIP_ADD( ref_costs[idx0+0], idx0weight );
 578                 if( mbx+1 < width )
 579                     MC_CLIP_ADD( ref_costs[idx0+1], idx1weight );
 580             }
 581             if( mby+1 < height )
 582             {
 583                 if( mbx < width )
 584                     MC_CLIP_ADD( ref_costs[idx2+0], idx2weight );
 585                 if( mbx+1 < width )
 586                     MC_CLIP_ADD( ref_costs[idx2+1], idx3weight );
 587             }
 588         }
 589     }
 590 }
 591
 592 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 593 {
 594     pf->mc_luma   = mc_luma;
 595     pf->get_ref   = get_ref;
 596
 597     pf->mc_chroma = mc_chroma;
 598
 599     pf->avg[PIXEL_16x16]= pixel_avg_16x16;
 600     pf->avg[PIXEL_16x8] = pixel_avg_16x8;
 601     pf->avg[PIXEL_8x16] = pixel_avg_8x16;
 602     pf->avg[PIXEL_8x8]  = pixel_avg_8x8;
 603     pf->avg[PIXEL_8x4]  = pixel_avg_8x4;
 604     pf->avg[PIXEL_4x16] = pixel_avg_4x16;
 605     pf->avg[PIXEL_4x8]  = pixel_avg_4x8;
 606     pf->avg[PIXEL_4x4]  = pixel_avg_4x4;
 607     pf->avg[PIXEL_4x2]  = pixel_avg_4x2;
 608     pf->avg[PIXEL_2x8]  = pixel_avg_2x8;
 609     pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
 610     pf->avg[PIXEL_2x2]  = pixel_avg_2x2;
 611
 612     pf->weight    = x264_mc_weight_wtab;
 613     pf->offsetadd = x264_mc_weight_wtab;
 614     pf->offsetsub = x264_mc_weight_wtab;
 615     pf->weight_cache = x264_weight_cache;
 616
 617     pf->copy_16x16_unaligned = mc_copy_w16;
 618     pf->copy[PIXEL_16x16] = mc_copy_w16;
 619     pf->copy[PIXEL_8x8]   = mc_copy_w8;
 620     pf->copy[PIXEL_4x4]   = mc_copy_w4;
 621
 622     pf->store_interleave_chroma       = store_interleave_chroma;
 623     pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
 624     pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
 625
 626     pf->plane_copy = x264_plane_copy_c;
 627     pf->plane_copy_swap = x264_plane_copy_swap_c;
 628     pf->plane_copy_interleave = x264_plane_copy_interleave_c;
 629     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
 630     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
 631     pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
 632
 633     pf->hpel_filter = hpel_filter;
 634
 635     pf->prefetch_fenc_420 = prefetch_fenc_null;
 636     pf->prefetch_fenc_422 = prefetch_fenc_null;
 637     pf->prefetch_ref  = prefetch_ref_null;
 638     pf->memcpy_aligned = memcpy;
 639     pf->memzero_aligned = memzero_aligned;
 640     pf->frame_init_lowres_core = frame_init_lowres_core;
 641
 642     pf->integral_init4h = integral_init4h;
 643     pf->integral_init8h = integral_init8h;
 644     pf->integral_init4v = integral_init4v;
 645     pf->integral_init8v = integral_init8v;
 646
 647     pf->mbtree_propagate_cost = mbtree_propagate_cost;
 648     pf->mbtree_propagate_list = mbtree_propagate_list;
 649
 650 #if HAVE_MMX
 651     x264_mc_init_mmx( cpu, pf );
 652 #endif
 653 #if HAVE_ALTIVEC
 654     if( cpu&X264_CPU_ALTIVEC )
 655         x264_mc_altivec_init( pf );
 656 #endif
 657 #if HAVE_ARMV6
 658     x264_mc_init_arm( cpu, pf );
 659 #endif
 660 #if ARCH_AARCH64
 661     x264_mc_init_aarch64( cpu, pf );
 662 #endif
 663 #if HAVE_MSA
 664     if( cpu&X264_CPU_MSA )
 665         x264_mc_init_mips( cpu, pf );
 666 #endif
 667
 668     if( cpu_independent )
 669     {
 670         pf->mbtree_propagate_cost = mbtree_propagate_cost;
 671         pf->mbtree_propagate_list = mbtree_propagate_list;
 672     }
 673 }
 674
 675 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
 676 {
 677     const int b_interlaced = PARAM_INTERLACED;
 678     int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
 679     int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
 680
 681     if( mb_y & b_interlaced )
 682         return;
 683
 684     for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
 685     {
 686         int stride = frame->i_stride[p];
 687         const int width = frame->i_width[p];
 688         int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
 689
 690         if( !b_interlaced || h->mb.b_adaptive_mbaff )
 691             h->mc.hpel_filter(
 692                 frame->filtered[p][1] + offs,
 693                 frame->filtered[p][2] + offs,
 694                 frame->filtered[p][3] + offs,
 695                 frame->plane[p] + offs,
 696                 stride, width + 16, height - start,
 697                 h->scratch_buffer );
 698
 699         if( b_interlaced )
 700         {
 701             /* MC must happen between pixels in the same field. */
 702             stride = frame->i_stride[p] << 1;
 703             start = (mb_y*16 >> 1) - 8;
 704             int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
 705             offs = start*stride - 8;
 706             for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
 707             {
 708                 h->mc.hpel_filter(
 709                     frame->filtered_fld[p][1] + offs,
 710                     frame->filtered_fld[p][2] + offs,
 711                     frame->filtered_fld[p][3] + offs,
 712                     frame->plane_fld[p] + offs,
 713                     stride, width + 16, height_fld - start,
 714                     h->scratch_buffer );
 715             }
 716         }
 717     }
 718
 719     /* generate integral image:
 720      * frame->integral contains 2 planes. in the upper plane, each element is
 721      * the sum of an 8x8 pixel region with top-left corner on that point.
 722      * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
 723
 724     if( frame->integral )
 725     {
 726         int stride = frame->i_stride[0];
 727         if( start < 0 )
 728         {
 729             memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
 730             start = -PADV;
 731         }
 732         if( b_end )
 733             height += PADV-9;
 734         for( int y = start; y < height; y++ )
 735         {
 736             pixel    *pix  = frame->plane[0] + y * stride - PADH;
 737             uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
 738             uint16_t *sum4;
 739             if( h->frames.b_have_sub8x8_esa )
 740             {
 741                 h->mc.integral_init4h( sum8, pix, stride );
 742                 sum8 -= 8*stride;
 743                 sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
 744                 if( y >= 8-PADV )
 745                     h->mc.integral_init4v( sum8, sum4, stride );
 746             }
 747             else
 748             {
 749                 h->mc.integral_init8h( sum8, pix, stride );
 750                 if( y >= 8-PADV )
 751                     h->mc.integral_init8v( sum8-8*stride, stride );
 752             }
 753         }
 754     }
 755 }