+void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y )
+{ /* Thin wrapper: forwards h, mb_x and mb_y to x264_macroblock_cache_load() with the final argument fixed to 0. */
+ x264_macroblock_cache_load( h, mb_x, mb_y, 0 ); /* NOTE(review): 0 presumably selects the progressive (non-MBAFF) path -- confirm against the callee */
+}
+
+void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y )
+{ /* Thin wrapper: forwards h, mb_x and mb_y to x264_macroblock_cache_load() with the final argument fixed to 1. */
+ x264_macroblock_cache_load( h, mb_x, mb_y, 1 ); /* NOTE(review): 1 presumably selects the interlaced (MBAFF) path -- confirm against the callee */
+}
+
+static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] )
+{ /* Rewrites, in place, the entries of bs for the left and top edges when the neighbour's h->mb.field[] flag differs from the current MB_INTERLACED. */
+ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED ) /* left neighbour available and its field flag differs from the current MB */
+ {
+ static const uint8_t offset[2][2][8] = /* indexed as [MB_INTERLACED][h->mb.i_mb_y&1][i]; each value is used below as 3 + 4*value into the left MB's stored counts */
+ { { { 0, 0, 0, 0, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 3, 3, 3, 3 }, },
+ { { 0, 1, 2, 3, 0, 1, 2, 3 },
+ { 0, 1, 2, 3, 0, 1, 2, 3 }, }
+ };
+ ALIGNED_ARRAY_8( uint8_t, tmpbs, [8] ); /* scratch: one strength per loop index i, scattered into bs[0][0] and bs[0][4] after the loop */
+
+ const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1]; /* row of the table for this MB's interlaced flag and row parity */
+ uint8_t (*nnz)[48] = h->mb.non_zero_count; /* per-macroblock stored counts, 48 bytes per MB */
+
+ for( int i = 0; i < 8; i++ )
+ {
+ int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1]; /* interlaced: i = 0..3 use left_xy[0], i = 4..7 use left_xy[1]; otherwise the two alternate with i */
+ int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)]; /* current MB's cached count at x264_scan8[0], stepping 8 cache entries every two values of i */
+ int nnz_left = nnz[left][3 + 4*off[i]]; /* entry 3, 7, 11 or 15 of the chosen left MB's stored counts */
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode ) /* CABAC off and 8x8 transform enabled in the PPS */
+ {
+ int j = off[i]&~1; /* off[i] rounded down to an even value (0 or 2) */
+ if( h->mb.mb_transform_size[left] ) /* left MB has its transform-size flag set: replace nnz_left with an OR over a wider span */
+ nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] )); /* 1 if either M16 read (starting at entries 2+4*j and 2+4*(j+1)) is nonzero, else 0 */
+ }
+ tmpbs[i] = (nnz_left || nnz_this) ? 2 : 1; /* strength 2 when either side has a nonzero count, otherwise 1 */
+ }
+
+ if( MB_INTERLACED )
+ {
+ CP32( bs[0][0], &tmpbs[0] ); /* tmpbs[0..3] -> bs[0][0] */
+ CP32( bs[0][4], &tmpbs[4] ); /* tmpbs[4..7] -> bs[0][4] */
+ }
+ else
+ {
+ for( int i = 0; i < 4; i++ ) bs[0][0][i] = tmpbs[2*i]; /* even-indexed tmpbs entries -> bs[0][0] */
+ for( int i = 0; i < 4; i++ ) bs[0][4][i] = tmpbs[1+2*i]; /* odd-indexed tmpbs entries -> bs[0][4] */
+ }
+ }
+
+ if( (h->mb.i_neighbour & MB_TOP) && MB_INTERLACED != h->mb.field[h->mb.i_mb_top_xy] ) /* top neighbour available and its field flag differs from the current MB */
+ {
+ if( !(h->mb.i_mb_y&1) && !MB_INTERLACED ) /* current MB is on an even row and is not interlaced */
+ {
+ /* Need to filter both fields (even for frame macroblocks).
+ * Filter top two rows using the top macroblock of the above
+ * pair and then the bottom one. */
+ int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride; /* starts two MB rows above the current MB; the loop below advances it by one row per pass */
+ uint8_t *nnz_cur = &h->mb.cache.non_zero_count[x264_scan8[0]]; /* four consecutive cached counts of the current MB starting at x264_scan8[0] */
+
+ for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride ) /* j = 0 writes bs[1][0], j = 1 writes bs[1][4] */
+ {
+ uint8_t (*nnz)[48] = h->mb.non_zero_count;
+
+ ALIGNED_4( uint8_t nnz_top[4] );
+ CP32( nnz_top, &nnz[mbn_xy][3*4] ); /* entries 12..15 of the neighbour MB's stored counts */
+
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] ) /* same CABAC-off + 8x8-transform case as for the left edge, tested on this neighbour */
+ {
+ nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] ); /* 0/1 flag shared by the first two entries */
+ nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] ); /* 0/1 flag shared by the last two entries */
+ }
+
+ for( int i = 0; i < 4; i++ )
+ bs[1][4*j][i] = (nnz_cur[i] || nnz_top[i]) ? 2 : 1; /* strength 2 when either side has a nonzero count, otherwise 1 */
+ }
+ }
+ else
+ for( int i = 0; i < 4; i++ )
+ bs[1][0][i] = X264_MAX( bs[1][0][i], 1 ); /* keep the existing value but raise it to at least 1 */
+ }
+}
+
+void x264_macroblock_deblock_strength( x264_t *h )
+{ /* Fills h->mb.cache.deblock_strength for the current macroblock. Side effects: may overwrite h->mb.i_neighbour, h->mb.i_mb_left_xy[] and neighbour entries of h->mb.cache. */
+ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
+ if( IS_INTRA( h->mb.i_type ) ) /* intra MB: store 0x03 bytes starting at bs[d][1] (M32) and bs[d][2] (M64) for d = 0, 1; bs[d][0] is not written here */
+ {
+ M32( bs[0][1] ) = 0x03030303;
+ M64( bs[0][2] ) = 0x0303030303030303ULL;
+ M32( bs[1][1] ) = 0x03030303;
+ M64( bs[1][2] ) = 0x0303030303030303ULL;
+ return;
+ }
+
+ /* Early termination: in this case, nnz guarantees all edges use strength 2. */
+ if( h->mb.b_transform_8x8 && !CHROMA444 )
+ {
+ int cbp_mask = 0xf >> CHROMA_V_SHIFT; /* luma CBP bits that must all be set for the shortcut to apply */
+ if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
+ {
+ M32( bs[0][0] ) = 0x02020202;
+ M32( bs[0][2] ) = 0x02020202;
+ M32( bs[0][4] ) = 0x02020202;
+ M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] have to be set for 4:2:2 */
+ M64( bs[1][2] ) = 0x0202020202020202ULL;
+ M32( bs[1][4] ) = 0x02020202;
+ return;
+ }
+ }
+
+ int neighbour_changed = 0; /* neighbour bits present in i_neighbour_frame but absent from i_neighbour; stays 0 when the idc below equals 2 */
+ if( h->sh.i_disable_deblocking_filter_idc != 2 )
+ {
+ neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour;
+ h->mb.i_neighbour = h->mb.i_neighbour_frame; /* note: replaces the neighbour mask stored in h->mb for everything that follows */
+ }
+
+ /* MBAFF deblock uses different left neighbors from encoding */
+ if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) )
+ {
+ h->mb.i_mb_left_xy[1] =
+ h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1; /* both entries start at the MB directly to the left */
+ if( h->mb.i_mb_y&1 )
+ h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride; /* odd row: entry 0 moves up one MB row */
+ else
+ h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride; /* even row: entry 1 moves down one MB row */
+ }
+
+ /* If we have multiple slices and we're deblocking on slice edges, we
+ * have to reload neighbour data. */
+ if( neighbour_changed )
+ {
+ int top_y = h->mb.i_mb_top_y;
+ int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x; /* index into h->mb.ref[l]: second 8x8 row of MB row top_y, at this MB's column */
+ int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x; /* index into h->mb.mv[l]: fourth 4x4 row of MB row top_y, at this MB's column */
+ int s8x8 = h->mb.i_b8_stride;
+ int s4x4 = h->mb.i_b4_stride;
+
+ uint8_t (*nnz)[48] = h->mb.non_zero_count;
+ const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3]; /* non-MBAFF slices always use table entry 3 */
+
+ if( neighbour_changed & MB_TOP )
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] ); /* top MB's entries 12..15 -> the cache row just above the current MB */
+
+ if( neighbour_changed & MB_LEFT )
+ {
+ int *left = h->mb.i_mb_left_xy;
+ h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]]; /* cache column just left of the current MB, filled from left[0] (first two) and left[1] (last two) */
+ h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
+ h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
+ h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
+ }
+
+ for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) /* list 0 always; list 1 as well for B slices */
+ {
+ int16_t (*mv)[2] = h->mb.mv[l];
+ int8_t *ref = h->mb.ref[l];
+
+ int i8 = x264_scan8[0] - 8; /* cache index of the row just above the current MB */
+ if( neighbour_changed & MB_TOP )
+ {
+ h->mb.cache.ref[l][i8+0] =
+ h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0]; /* each stored ref covers two cache entries */
+ h->mb.cache.ref[l][i8+2] =
+ h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
+ CP128( h->mb.cache.mv[l][i8], mv[top_4x4] ); /* copies the top row's four motion vectors in one go */
+ }
+
+ i8 = x264_scan8[0] - 1; /* cache index of the column just left of the current MB */
+ if( neighbour_changed & MB_LEFT )
+ {
+ h->mb.cache.ref[l][i8+0*8] =
+ h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]]; /* upper two cache rows share one ref taken via left_b8[0] */
+ h->mb.cache.ref[l][i8+2*8] =
+ h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]]; /* lower two cache rows share one ref taken via left_b8[1] */
+
+ CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] ); /* one motion vector per cache row, rows chosen through left_index_table->mv[] */
+ CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
+ CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
+ CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
+ }
+ }
+ }
+
+ if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P ) /* only list 0 is remapped, and only in P slices with X264_WEIGHTP_SMART */
+ {
+ /* Handle reference frame duplicates */
+ int i8 = x264_scan8[0] - 8; /* top neighbour row: remap entries 0 and 2 through deblock_ref_table and duplicate each into its pair */
+ h->mb.cache.ref[0][i8+0] =
+ h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
+ h->mb.cache.ref[0][i8+2] =
+ h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);
+
+ i8 = x264_scan8[0] - 1; /* left neighbour column: same remap-and-duplicate for rows 0 and 2 */
+ h->mb.cache.ref[0][i8+0*8] =
+ h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
+ h->mb.cache.ref[0][i8+2*8] =
+ h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
+
+ int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]); /* current MB: one remapped ref read at each of scan8 positions 0, 4, 8 and 12 */
+ int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
+ int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
+ int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
+ uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101; /* NOTE(review): presumably yields each ref byte twice so one 32-bit store fills a 4-entry row -- confirm pack16to32 */
+ uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
+
+ M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop; /* cache rows 0 and 1 get reftop, rows 2 and 3 get refbot */
+ M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
+ M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
+ M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
+ }
+
+ /* Munge NNZ for cavlc + 8x8dct */
+ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
+ {
+ uint8_t (*nnz)[48] = h->mb.non_zero_count;
+ int top = h->mb.i_mb_top_xy;
+ int *left = h->mb.i_mb_left_xy;
+
+ if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] ) /* top neighbour available and has its transform-size flag set */
+ {
+ int i8 = x264_scan8[0] - 8;
+ int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] ); /* OR of the reads at entries 8 and 12 of the top MB */
+ int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] ); /* OR of the reads at entries 10 and 14 of the top MB */
+ M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0; /* two cache bytes become 1,1 or 0,0 */
+ M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
+ }
+
+ if( h->mb.i_neighbour & MB_LEFT )
+ {
+ int i8 = x264_scan8[0] - 1;
+ if( h->mb.mb_transform_size[left[0]] ) /* left[0] has its transform-size flag set: cache rows 0 and 1 share one 0/1 flag */
+ {
+ int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
+ h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
+ h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
+ }
+ if( h->mb.mb_transform_size[left[1]] ) /* left[1] has its transform-size flag set: cache rows 2 and 3 share one 0/1 flag */
+ {
+ int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
+ h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
+ h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
+ }
+ }
+
+ if( h->mb.b_transform_8x8 ) /* current MB uses the 8x8 transform: collapse its own cached counts to one 0/1 flag per group of four scan8 positions */
+ {
+ int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
+ int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
+ int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
+ uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101; /* NOTE(review): presumably yields each flag byte twice, as with the ref packing earlier in this function -- confirm pack16to32 */
+ uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
+
+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop; /* cache rows 0 and 1 get nnztop, rows 2 and 3 get nnzbot */
+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
+ }
+ }
+
+ h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, /* computes bs from the (possibly rewritten) cached counts, refs and motion vectors via the h->loopf function pointer */
+ bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B ); /* fifth argument is 4, or 2 when MB_INTERLACED is 1; last argument flags a B slice */
+
+ if( SLICE_MBAFF )
+ x264_macroblock_deblock_strength_mbaff( h, bs ); /* MBAFF slices: adjust the left/top edge strengths afterwards */
+}
+
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )