X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fh264.h;h=0c9402d19c6b9bb165d23c478d5149db83506849;hb=6c71d2c1357018f7f68a8bde773306298ce72057;hp=0d87f90a9eceee2985affeba0efb977ddeb11f7c;hpb=6d7e6b2657ae7ec577f3a55bc8c8b48075d51ba2;p=ffmpeg diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 0d87f90a9ec..0c9402d19c6 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -28,9 +28,11 @@ #ifndef AVCODEC_H264_H #define AVCODEC_H264_H +#include "libavutil/intreadwrite.h" #include "dsputil.h" #include "cabac.h" #include "mpegvideo.h" +#include "h264dsp.h" #include "h264pred.h" #include "rectangle.h" @@ -60,6 +62,8 @@ #define ALLOW_NOCHROMA +#define FMO 0 + /** * The maximum number of slices supported by the decoder. * must be a power of 2 @@ -259,18 +263,7 @@ typedef struct MMCO{ */ typedef struct H264Context{ MpegEncContext s; - int nal_ref_idc; - int nal_unit_type; - uint8_t *rbsp_buffer[2]; - unsigned int rbsp_buffer_size[2]; - - /** - * Used to parse AVC variant of h264 - */ - int is_avc; ///< this flag is != 0 if codec is avc1 - int got_avcC; ///< flag used to parse avcC data only once - int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) - + H264DSPContext h264dsp; int chroma_qp[2]; //QPc int qp_thresh; ///< QP threshold to skip loopfilter @@ -282,24 +275,33 @@ typedef struct H264Context{ int chroma_pred_mode; int intra16x16_pred_mode; + int topleft_mb_xy; int top_mb_xy; + int topright_mb_xy; int left_mb_xy[2]; + int topleft_type; + int top_type; + int topright_type; + int left_type[2]; + + const uint8_t * left_block; + int topleft_partition; + int8_t intra4x4_pred_mode_cache[5*8]; - int8_t (*intra4x4_pred_mode)[8]; + int8_t (*intra4x4_pred_mode); H264PredContext hpc; unsigned int topleft_samples_available; unsigned int top_samples_available; unsigned int topright_samples_available; unsigned int left_samples_available; uint8_t (*top_borders[2])[16+2*8]; - uint8_t left_border[2*(17+2*9)]; /** * non zero coeff count cache. * is 64 if not available. */ - DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; /* .UU.YYYY @@ -312,8 +314,8 @@ typedef struct H264Context{ /** * Motion vector cache. */ - DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]); - DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]); + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; #define LIST_NOT_USED -1 //FIXME rename? #define PART_NOT_AVAILABLE -2 @@ -334,9 +336,8 @@ typedef struct H264Context{ int block_offset[2*(16+8)]; uint32_t *mb2b_xy; //FIXME are these 4 a good idea? - uint32_t *mb2b8_xy; + uint32_t *mb2br_xy; int b_stride; //FIXME use s->b4_stride - int b8_stride; int mb_linesize; ///< may be equal to s->linesize or s->linesize*2, for mbaff int mb_uvlinesize; @@ -344,29 +345,19 @@ typedef struct H264Context{ int emu_edge_width; int emu_edge_height; - int halfpel_flag; - int thirdpel_flag; - - int unknown_svq3_flag; - int next_slice_index; - - SPS *sps_buffers[MAX_SPS_COUNT]; SPS sps; ///< current sps - PPS *pps_buffers[MAX_PPS_COUNT]; /** * current pps */ PPS pps; //FIXME move to Picture perhaps? (->no) do we need that? - uint32_t dequant4_buffer[6][52][16]; + uint32_t dequant4_buffer[6][52][16]; //FIXME should these be moved down? uint32_t dequant8_buffer[2][52][64]; uint32_t (*dequant4_coeff[6])[16]; uint32_t (*dequant8_coeff[2])[64]; - int dequant_coeff_pps; ///< reinit tables when pps changes int slice_num; - uint16_t *slice_table_base; uint16_t *slice_table; ///< slice_table_base + 2*mb_stride + 1 int slice_type; int slice_type_nos; ///< S free slice type (SI/SP are remapped to I/P) @@ -377,49 +368,21 @@ typedef struct H264Context{ int mb_field_decoding_flag; int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag - DECLARE_ALIGNED_8(uint16_t, sub_mb_type[4]); - - //POC stuff - int poc_lsb; - int poc_msb; - int delta_poc_bottom; - int delta_poc[2]; - int frame_num; - int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 - int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 - int frame_num_offset; ///< for POC type 2 - int prev_frame_num_offset; ///< for POC type 2 - int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 - - /** - * frame_num for frames or 2*frame_num+1 for field pics. - */ - int curr_pic_num; - - /** - * max_frame_num or 2*max_frame_num for field pics. - */ - int max_pic_num; + DECLARE_ALIGNED(8, uint16_t, sub_mb_type)[4]; //Weighted pred stuff int use_weight; int use_weight_chroma; int luma_log2_weight_denom; int chroma_log2_weight_denom; - int luma_weight[2][48]; - int luma_offset[2][48]; - int chroma_weight[2][48][2]; - int chroma_offset[2][48][2]; - int implicit_weight[48][48]; - - //deblock - int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0 - int slice_alpha_c0_offset; - int slice_beta_offset; - - int redundant_pic_count; + //The following 2 can be changed to int8_t but that causes 10cpu cycles speedloss + int luma_weight[48][2][2]; + int chroma_weight[48][2][2][2]; + int implicit_weight[48][48][2]; int direct_spatial_mv_pred; + int col_parity; + int col_fieldoff; int dist_scale_factor[16]; int dist_scale_factor_field[2][32]; int map_col_to_list0[2][16+32]; @@ -431,24 +394,10 @@ typedef struct H264Context{ unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode unsigned int list_count; uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type - Picture *short_ref[32]; - Picture *long_ref[32]; - Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture Picture ref_list[2][48]; /**< 0..15: frame refs, 16..47: mbaff field refs. Reordered version of default_ref_list according to picture reordering in slice header */ int ref2frm[MAX_SLICES][2][64]; ///< reference to frame number lists, used in the loop filter, the first 2 are for -2,-1 - Picture *delayed_pic[MAX_DELAYED_PIC_COUNT+2]; //FIXME size? - int outputed_poc; - - /** - * memory management control operations buffer. - */ - MMCO mmco[MAX_MMCO_COUNT]; - int mmco_index; - - int long_ref_count; ///< number of actual long term references - int short_ref_count; ///< number of actual short term references //data partitioning GetBitContext intra_gb; @@ -456,7 +405,7 @@ typedef struct H264Context{ GetBitContext *intra_gb_ptr; GetBitContext *inter_gb_ptr; - DECLARE_ALIGNED_16(DCTELEM, mb[16*24]); + DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb /** @@ -464,7 +413,6 @@ typedef struct H264Context{ */ CABACContext cabac; uint8_t cabac_state[460]; - int cabac_init_idc; /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */ uint16_t *cbp_table; @@ -474,8 +422,8 @@ typedef struct H264Context{ /* chroma_pred_mode for i4x4 or i16x16, else 0 */ uint8_t *chroma_pred_mode_table; int last_qscale_diff; - int16_t (*mvd_table[2])[2]; - DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]); + uint8_t (*mvd_table[2])[2]; + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; uint8_t *direct_table; uint8_t direct_cache[5*8]; @@ -494,6 +442,78 @@ typedef struct H264Context{ int x264_build; + int mb_xy; + + int is_complex; + + //deblock + int deblocking_filter; ///< disable_deblocking_filter_idc with 1<->0 + int slice_alpha_c0_offset; + int slice_beta_offset; + +//============================================================= + //Things below are not used in the MB or more inner code + + int nal_ref_idc; + int nal_unit_type; + uint8_t *rbsp_buffer[2]; + unsigned int rbsp_buffer_size[2]; + + /** + * Used to parse AVC variant of h264 + */ + int is_avc; ///< this flag is != 0 if codec is avc1 + int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) + + SPS *sps_buffers[MAX_SPS_COUNT]; + PPS *pps_buffers[MAX_PPS_COUNT]; + + int dequant_coeff_pps; ///< reinit tables when pps changes + + uint16_t *slice_table_base; + + + //POC stuff + int poc_lsb; + int poc_msb; + int delta_poc_bottom; + int delta_poc[2]; + int frame_num; + int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 + int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 + int frame_num_offset; ///< for POC type 2 + int prev_frame_num_offset; ///< for POC type 2 + int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 + + /** + * frame_num for frames or 2*frame_num+1 for field pics. + */ + int curr_pic_num; + + /** + * max_frame_num or 2*max_frame_num for field pics. + */ + int max_pic_num; + + int redundant_pic_count; + + Picture *short_ref[32]; + Picture *long_ref[32]; + Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture + Picture *delayed_pic[MAX_DELAYED_PIC_COUNT+2]; //FIXME size? + int outputed_poc; + + /** + * memory management control operations buffer. + */ + MMCO mmco[MAX_MMCO_COUNT]; + int mmco_index; + + int long_ref_count; ///< number of actual long term references + int short_ref_count; ///< number of actual short term references + + int cabac_init_idc; + /** * @defgroup multithreading Members for slice based multithreading * @{ @@ -522,10 +542,6 @@ typedef struct H264Context{ int last_slice_type; /** @} */ - int mb_xy; - - uint32_t svq3_watermark_key; - /** * pic_struct in picture timing SEI message */ @@ -565,19 +581,27 @@ typedef struct H264Context{ */ int sei_recovery_frame_cnt; - int is_complex; - int luma_weight_flag[2]; ///< 7.4.3.2 luma_weight_lX_flag int chroma_weight_flag[2]; ///< 7.4.3.2 chroma_weight_lX_flag // Timestamp stuff int sei_buffering_period_present; ///< Buffering period SEI flag int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs + + //SVQ3 specific fields + int halfpel_flag; + int thirdpel_flag; + int unknown_svq3_flag; + int next_slice_index; + uint32_t svq3_watermark_key; }H264Context; extern const uint8_t ff_h264_chroma_qp[52]; +void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); + +void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); /** * Decode SEI @@ -599,7 +623,7 @@ int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length); * @param consumed is the number of bytes used as input * @param length is the length of the array * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing? - * @returns decoded bytes, might be src+1 if no escapes + * @return decoded bytes, might be src+1 if no escapes */ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length); @@ -617,7 +641,7 @@ av_cold void ff_h264_free_context(H264Context *h); /** * reconstructs bitstream slice_type. */ -int ff_h264_get_slice_type(H264Context *h); +int ff_h264_get_slice_type(const H264Context *h); /** * allocates tables. @@ -661,13 +685,13 @@ av_cold void ff_h264_decode_init_vlc(void); /** * decodes a macroblock - * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed */ int ff_h264_decode_mb_cavlc(H264Context *h); /** * decodes a CABAC coded macroblock - * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed */ int ff_h264_decode_mb_cabac(H264Context *h); @@ -717,6 +741,14 @@ static av_always_inline uint32_t pack16to32(int a, int b){ #endif } +static av_always_inline uint16_t pack8to16(int a, int b){ +#if HAVE_BIGENDIAN + return (b&0xFF) + (a<<8); +#else + return (a&0xFF) + (b<<8); +#endif +} + /** * gets the chroma qp. */ @@ -726,14 +758,10 @@ static inline int get_chroma_qp(H264Context *h, int t, int qscale){ static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my); -static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deblock){ +static void fill_decode_neighbors(H264Context *h, int mb_type){ MpegEncContext * const s = &h->s; const int mb_xy= h->mb_xy; int topleft_xy, top_xy, topright_xy, left_xy[2]; - int topleft_type, top_type, topright_type, left_type[2]; - const uint8_t * left_block; - int topleft_partition= -1; - int i; static const uint8_t left_block_options[4][16]={ {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, @@ -741,11 +769,9 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} }; - top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); + h->topleft_partition= -1; - //FIXME deblocking could skip the intra and nnz parts. -// if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) -// return; + top_xy = mb_xy - (s->mb_stride << MB_FIELD); /* Wow, what a mess, why didn't they simplify the interlacing & intra * stuff, I can't imagine that these complex rules are worth it. */ @@ -753,197 +779,151 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb topleft_xy = top_xy - 1; topright_xy= top_xy + 1; left_xy[1] = left_xy[0] = mb_xy-1; - left_block = left_block_options[0]; + h->left_block = left_block_options[0]; if(FRAME_MBAFF){ - const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride; - const int top_pair_xy = pair_xy - s->mb_stride; - const int topleft_pair_xy = top_pair_xy - 1; - const int topright_pair_xy = top_pair_xy + 1; - const int topleft_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]); - const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]); - const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]); - const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]); + const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]); const int curr_mb_field_flag = IS_INTERLACED(mb_type); - const int bottom = (s->mb_y & 1); - tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag); - - if (curr_mb_field_flag && (bottom || top_mb_field_flag)){ - top_xy -= s->mb_stride; - } - if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){ - topleft_xy -= s->mb_stride; - } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) { - topleft_xy += s->mb_stride; - // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition - topleft_partition = 0; - } - if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){ - topright_xy -= s->mb_stride; - } - if (left_mb_field_flag != curr_mb_field_flag) { - left_xy[1] = left_xy[0] = pair_xy - 1; - if (curr_mb_field_flag) { - left_xy[1] += s->mb_stride; - left_block = left_block_options[3]; - } else { - left_block= left_block_options[2 - bottom]; + if(s->mb_y&1){ + if (left_mb_field_flag != curr_mb_field_flag) { + left_xy[1] = left_xy[0] = mb_xy - s->mb_stride - 1; + if (curr_mb_field_flag) { + left_xy[1] += s->mb_stride; + h->left_block = left_block_options[3]; + } else { + topleft_xy += s->mb_stride; + // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition + h->topleft_partition = 0; + h->left_block = left_block_options[1]; + } + } + }else{ + if(curr_mb_field_flag){ + topleft_xy += s->mb_stride & (((s->current_picture.mb_type[top_xy - 1]>>7)&1)-1); + topright_xy += s->mb_stride & (((s->current_picture.mb_type[top_xy + 1]>>7)&1)-1); + top_xy += s->mb_stride & (((s->current_picture.mb_type[top_xy ]>>7)&1)-1); + } + if (left_mb_field_flag != curr_mb_field_flag) { + if (curr_mb_field_flag) { + left_xy[1] += s->mb_stride; + h->left_block = left_block_options[3]; + } else { + h->left_block = left_block_options[2]; + } } } } - h->top_mb_xy = top_xy; + h->topleft_mb_xy = topleft_xy; + h->top_mb_xy = top_xy; + h->topright_mb_xy= topright_xy; h->left_mb_xy[0] = left_xy[0]; h->left_mb_xy[1] = left_xy[1]; - if(for_deblock){ - - //for sufficiently low qp, filtering wouldn't do anything - //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp - int qp_thresh = h->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice - int qp = s->current_picture.qscale_table[mb_xy]; - if(qp <= qp_thresh - && (left_xy[0]<0 || ((qp + s->current_picture.qscale_table[left_xy[0]] + 1)>>1) <= qp_thresh) - && (left_xy[1]<0 || ((qp + s->current_picture.qscale_table[left_xy[1]] + 1)>>1) <= qp_thresh) - && (top_xy < 0 || ((qp + s->current_picture.qscale_table[top_xy ] + 1)>>1) <= qp_thresh)){ - return 1; - } + //FIXME do we need all in the context? + + h->topleft_type = s->current_picture.mb_type[topleft_xy] ; + h->top_type = s->current_picture.mb_type[top_xy] ; + h->topright_type= s->current_picture.mb_type[topright_xy]; + h->left_type[0] = s->current_picture.mb_type[left_xy[0]] ; + h->left_type[1] = s->current_picture.mb_type[left_xy[1]] ; + + if(FMO){ + if(h->slice_table[topleft_xy ] != h->slice_num) h->topleft_type = 0; + if(h->slice_table[top_xy ] != h->slice_num) h->top_type = 0; + if(h->slice_table[left_xy[0] ] != h->slice_num) h->left_type[0] = h->left_type[1] = 0; + }else{ + if(h->slice_table[topleft_xy ] != h->slice_num){ + h->topleft_type = 0; + if(h->slice_table[top_xy ] != h->slice_num) h->top_type = 0; + if(h->slice_table[left_xy[0] ] != h->slice_num) h->left_type[0] = h->left_type[1] = 0; + } + } + if(h->slice_table[topright_xy] != h->slice_num) h->topright_type= 0; +} - *((uint64_t*)&h->non_zero_count_cache[0+8*1])= *((uint64_t*)&h->non_zero_count[mb_xy][ 0]); - *((uint64_t*)&h->non_zero_count_cache[0+8*2])= *((uint64_t*)&h->non_zero_count[mb_xy][ 8]); - *((uint32_t*)&h->non_zero_count_cache[0+8*5])= *((uint32_t*)&h->non_zero_count[mb_xy][16]); - *((uint32_t*)&h->non_zero_count_cache[4+8*3])= *((uint32_t*)&h->non_zero_count[mb_xy][20]); - *((uint64_t*)&h->non_zero_count_cache[0+8*4])= *((uint64_t*)&h->non_zero_count[mb_xy][24]); - - topleft_type = 0; - topright_type = 0; - top_type = h->slice_table[top_xy ] < 0xFFFF ? s->current_picture.mb_type[top_xy] : 0; - left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; - left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; - - if(!IS_INTRA(mb_type)){ - int list; - for(list=0; listlist_count; list++){ - int8_t *ref; - int y, b_xy; - if(!USES_LIST(mb_type, list)){ - fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); - *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = - *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = - *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = - *(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101; - continue; - } +static void fill_decode_caches(H264Context *h, int mb_type){ + MpegEncContext * const s = &h->s; + int topleft_xy, top_xy, topright_xy, left_xy[2]; + int topleft_type, top_type, topright_type, left_type[2]; + const uint8_t * left_block= h->left_block; + int i; - ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; - if(for_deblock){ - int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); - *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = - *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101; - ref += h->b8_stride; - *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = - *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101; + topleft_xy = h->topleft_mb_xy ; + top_xy = h->top_mb_xy ; + topright_xy = h->topright_mb_xy; + left_xy[0] = h->left_mb_xy[0] ; + left_xy[1] = h->left_mb_xy[1] ; + topleft_type = h->topleft_type ; + top_type = h->top_type ; + topright_type= h->topright_type ; + left_type[0] = h->left_type[0] ; + left_type[1] = h->left_type[1] ; + + if(!IS_SKIP(mb_type)){ + if(IS_INTRA(mb_type)){ + int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1; + h->topleft_samples_available= + h->top_samples_available= + h->left_samples_available= 0xFFFF; + h->topright_samples_available= 0xEEEA; + + if(!(top_type & type_mask)){ + h->topleft_samples_available= 0xB3FF; + h->top_samples_available= 0x33FF; + h->topright_samples_available= 0x26EA; + } + if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){ + if(IS_INTERLACED(mb_type)){ + if(!(left_type[0] & type_mask)){ + h->topleft_samples_available&= 0xDFFF; + h->left_samples_available&= 0x5FFF; + } + if(!(left_type[1] & type_mask)){ + h->topleft_samples_available&= 0xFF5F; + h->left_samples_available&= 0xFF5F; + } }else{ - *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = - *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; - ref += h->b8_stride; - *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = - *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; - } - - b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; - for(y=0; y<4; y++){ - *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]; - *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]; - } + int left_typei = s->current_picture.mb_type[left_xy[0] + s->mb_stride]; - } - } - }else{ - topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0; - top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0; - topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0; - left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0; - left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0; - - if(IS_INTRA(mb_type) && !for_deblock){ - int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1; - h->topleft_samples_available= - h->top_samples_available= - h->left_samples_available= 0xFFFF; - h->topright_samples_available= 0xEEEA; - - if(!(top_type & type_mask)){ - h->topleft_samples_available= 0xB3FF; - h->top_samples_available= 0x33FF; - h->topright_samples_available= 0x26EA; - } - if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){ - if(IS_INTERLACED(mb_type)){ - if(!(left_type[0] & type_mask)){ - h->topleft_samples_available&= 0xDFFF; - h->left_samples_available&= 0x5FFF; - } - if(!(left_type[1] & type_mask)){ - h->topleft_samples_available&= 0xFF5F; - h->left_samples_available&= 0xFF5F; + assert(left_xy[0] == left_xy[1]); + if(!((left_typei & type_mask) && (left_type[0] & type_mask))){ + h->topleft_samples_available&= 0xDF5F; + h->left_samples_available&= 0x5F5F; + } } }else{ - int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num - ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0; - assert(left_xy[0] == left_xy[1]); - if(!((left_typei & type_mask) && (left_type[0] & type_mask))){ + if(!(left_type[0] & type_mask)){ h->topleft_samples_available&= 0xDF5F; h->left_samples_available&= 0x5F5F; } } - }else{ - if(!(left_type[0] & type_mask)){ - h->topleft_samples_available&= 0xDF5F; - h->left_samples_available&= 0x5F5F; - } - } - if(!(topleft_type & type_mask)) - h->topleft_samples_available&= 0x7FFF; + if(!(topleft_type & type_mask)) + h->topleft_samples_available&= 0x7FFF; - if(!(topright_type & type_mask)) - h->topright_samples_available&= 0xFBFF; + if(!(topright_type & type_mask)) + h->topright_samples_available&= 0xFBFF; - if(IS_INTRA4x4(mb_type)){ - if(IS_INTRA4x4(top_type)){ - h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4]; - h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5]; - h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6]; - h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3]; - }else{ - int pred; - if(!(top_type & type_mask)) - pred= -1; - else{ - pred= 2; - } - h->intra4x4_pred_mode_cache[4+8*0]= - h->intra4x4_pred_mode_cache[5+8*0]= - h->intra4x4_pred_mode_cache[6+8*0]= - h->intra4x4_pred_mode_cache[7+8*0]= pred; - } - for(i=0; i<2; i++){ - if(IS_INTRA4x4(left_type[i])){ - h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]]; - h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]]; + if(IS_INTRA4x4(mb_type)){ + if(IS_INTRA4x4(top_type)){ + AV_COPY32(h->intra4x4_pred_mode_cache+4+8*0, h->intra4x4_pred_mode + h->mb2br_xy[top_xy]); }else{ - int pred; - if(!(left_type[i] & type_mask)) - pred= -1; - else{ - pred= 2; + h->intra4x4_pred_mode_cache[4+8*0]= + h->intra4x4_pred_mode_cache[5+8*0]= + h->intra4x4_pred_mode_cache[6+8*0]= + h->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); + } + for(i=0; i<2; i++){ + if(IS_INTRA4x4(left_type[i])){ + int8_t *mode= h->intra4x4_pred_mode + h->mb2br_xy[left_xy[i]]; + h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]]; + h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]]; + }else{ + h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= + h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type[i] & type_mask); } - h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= - h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred; } } } - } - } /* @@ -956,72 +936,58 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb */ //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) if(top_type){ - *(uint32_t*)&h->non_zero_count_cache[4+8*0]= *(uint32_t*)&h->non_zero_count[top_xy][4+3*8]; - - h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8]; - h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8]; - - h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8]; - h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8]; - - }else{ - h->non_zero_count_cache[4+8*0]= - h->non_zero_count_cache[5+8*0]= - h->non_zero_count_cache[6+8*0]= - h->non_zero_count_cache[7+8*0]= - - h->non_zero_count_cache[1+8*0]= - h->non_zero_count_cache[2+8*0]= - - h->non_zero_count_cache[1+8*3]= - h->non_zero_count_cache[2+8*3]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; - + AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]); + h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][1+1*8]; + h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][2+1*8]; + + h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][1+2*8]; + h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][2+2*8]; + }else { + h->non_zero_count_cache[1+8*0]= + h->non_zero_count_cache[2+8*0]= + + h->non_zero_count_cache[1+8*3]= + h->non_zero_count_cache[2+8*3]= + AV_WN32A(&h->non_zero_count_cache[4+8*0], CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040); } for (i=0; i<2; i++) { if(left_type[i]){ h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+0+2*i]]; h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[8+1+2*i]]; - h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]]; - h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]]; + h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+4+2*i]]; + h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[8+5+2*i]]; }else{ - h->non_zero_count_cache[3+8*1 + 2*8*i]= - h->non_zero_count_cache[3+8*2 + 2*8*i]= - h->non_zero_count_cache[0+8*1 + 8*i]= - h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; + h->non_zero_count_cache[3+8*1 + 2*8*i]= + h->non_zero_count_cache[3+8*2 + 2*8*i]= + h->non_zero_count_cache[0+8*1 + 8*i]= + h->non_zero_count_cache[0+8*4 + 8*i]= CABAC && !IS_INTRA(mb_type) ? 0 : 64; } } - if( CABAC && !for_deblock) { + if( CABAC ) { // top_cbp if(top_type) { h->top_cbp = h->cbp_table[top_xy]; - } else if(IS_INTRA(mb_type)) { - h->top_cbp = 0x1C0; } else { - h->top_cbp = 0; + h->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; } // left_cbp if (left_type[0]) { - h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0; - } else if(IS_INTRA(mb_type)) { - h->left_cbp = 0x1C0; + h->left_cbp = (h->cbp_table[left_xy[0]] & 0x1f0) + | ((h->cbp_table[left_xy[0]]>>(left_block[0]&(~1)))&2) + | (((h->cbp_table[left_xy[1]]>>(left_block[2]&(~1)))&2) << 2); } else { - h->left_cbp = 0; - } - if (left_type[0]) { - h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1; - } - if (left_type[1]) { - h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3; + h->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; } } + } #if 1 - if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ + if(IS_INTER(mb_type) || (IS_DIRECT(mb_type) && h->direct_spatial_mv_pred)){ int list; for(list=0; listlist_count; list++){ - if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){ + if(!USES_LIST(mb_type, list)){ /*if(!h->mv_cache_clean[list]){ memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all? memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t)); @@ -1029,160 +995,135 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb }*/ continue; } + assert(!(IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)); + h->mv_cache_clean[list]= 0; if(USES_LIST(top_type, list)){ const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; - const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride; - *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0]; - *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1]; - *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2]; - *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3]; - if(for_deblock){ - int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); + AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); h->ref_cache[list][scan8[0] + 0 - 1*8]= - h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; + h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 2]; h->ref_cache[list][scan8[0] + 2 - 1*8]= - h->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 1]]; - }else{ - h->ref_cache[list][scan8[0] + 0 - 1*8]= - h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0]; - h->ref_cache[list][scan8[0] + 2 - 1*8]= - h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1]; - } + h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][4*top_xy + 3]; }else{ - *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]= - *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]= - *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]= - *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0; - *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= (((for_deblock||top_type) ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101; + AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]); + AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); } + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ for(i=0; i<2; i++){ int cache_idx = scan8[0] - 1 + i*2*8; if(USES_LIST(left_type[i], list)){ const int b_xy= h->mb2b_xy[left_xy[i]] + 3; - const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1; - *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]; - *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]; - if(for_deblock){ - int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[i]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); - h->ref_cache[list][cache_idx ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)]]; - h->ref_cache[list][cache_idx+8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)]]; - }else{ - h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)]; - h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)]; - } + const int b8_xy= 4*left_xy[i] + 1; + AV_COPY32(h->mv_cache[list][cache_idx ], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]]); + AV_COPY32(h->mv_cache[list][cache_idx+8], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]]); + h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + (left_block[0+i*2]&~1)]; + h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + (left_block[1+i*2]&~1)]; }else{ - *(uint32_t*)h->mv_cache [list][cache_idx ]= - *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0; + AV_ZERO32(h->mv_cache [list][cache_idx ]); + AV_ZERO32(h->mv_cache [list][cache_idx+8]); h->ref_cache[list][cache_idx ]= - h->ref_cache[list][cache_idx+8]= (for_deblock||left_type[i]) ? LIST_NOT_USED : PART_NOT_AVAILABLE; + h->ref_cache[list][cache_idx+8]= (left_type[i]) ? LIST_NOT_USED : PART_NOT_AVAILABLE; } } - - if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF)) - continue; - - if(USES_LIST(topleft_type, list)){ - const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride); - const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride); - *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy]; - h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; }else{ - *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0; - h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + if(USES_LIST(left_type[0], list)){ + const int b_xy= h->mb2b_xy[left_xy[0]] + 3; + const int b8_xy= 4*left_xy[0] + 1; + AV_COPY32(h->mv_cache[list][scan8[0] - 1], s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]]); + h->ref_cache[list][scan8[0] - 1]= s->current_picture.ref_index[list][b8_xy + (left_block[0]&~1)]; + }else{ + AV_ZERO32(h->mv_cache [list][scan8[0] - 1]); + h->ref_cache[list][scan8[0] - 1]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } } if(USES_LIST(topright_type, list)){ const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride; - const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride; - *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy]; - h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy]; + AV_COPY32(h->mv_cache[list][scan8[0] + 4 - 1*8], s->current_picture.motion_val[list][b_xy]); + h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][4*topright_xy + 2]; }else{ - *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0; + AV_ZERO32(h->mv_cache [list][scan8[0] + 4 - 1*8]); h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; } + if(h->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ + if(USES_LIST(topleft_type, list)){ + const int b_xy = h->mb2b_xy [topleft_xy] + 3 + h->b_stride + (h->topleft_partition & 2*h->b_stride); + const int b8_xy= 4*topleft_xy + 1 + (h->topleft_partition & 2); + AV_COPY32(h->mv_cache[list][scan8[0] - 1 - 1*8], s->current_picture.motion_val[list][b_xy]); + h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy]; + }else{ + AV_ZERO32(h->mv_cache[list][scan8[0] - 1 - 1*8]); + h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } + } - if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF) + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)) && !FRAME_MBAFF) continue; - h->ref_cache[list][scan8[5 ]+1] = - h->ref_cache[list][scan8[7 ]+1] = - h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else) + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { h->ref_cache[list][scan8[4 ]] = h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; - *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]= - *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]= - *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else) - *(uint32_t*)h->mv_cache [list][scan8[4 ]]= - *(uint32_t*)h->mv_cache [list][scan8[12]]= 0; + AV_ZERO32(h->mv_cache [list][scan8[4 ]]); + AV_ZERO32(h->mv_cache [list][scan8[12]]); if( CABAC ) { /* XXX beurk, Load mvd */ if(USES_LIST(top_type, list)){ - const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; - *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0]; - *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1]; - *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2]; - *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3]; + const int b_xy= h->mb2br_xy[top_xy]; + AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); }else{ - *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]= - *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]= - *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]= - *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0; + AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]); } if(USES_LIST(left_type[0], list)){ - const int b_xy= h->mb2b_xy[left_xy[0]] + 3; - *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]]; - *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]]; + const int b_xy= h->mb2br_xy[left_xy[0]] + 6; + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy - left_block[0]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy - left_block[1]]); }else{ - *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]= - *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0; + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]); } if(USES_LIST(left_type[1], list)){ - const int b_xy= h->mb2b_xy[left_xy[1]] + 3; - *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]]; - *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]]; + const int b_xy= h->mb2br_xy[left_xy[1]] + 6; + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy - left_block[2]]); + AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy - left_block[3]]); }else{ - *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]= - *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0; + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]); + AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]); } - *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]= - *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]= - *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else) - *(uint32_t*)h->mvd_cache [list][scan8[4 ]]= - *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0; - + AV_ZERO16(h->mvd_cache [list][scan8[4 ]]); + AV_ZERO16(h->mvd_cache [list][scan8[12]]); if(h->slice_type_nos == FF_B_TYPE){ - fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1); + fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); if(IS_DIRECT(top_type)){ - *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101; + AV_WN32A(&h->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); }else if(IS_8X8(top_type)){ - int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride; - h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy]; - h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1]; + int b8_xy = 4*top_xy; + h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy + 2]; + h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 3]; }else{ - *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0; + AV_WN32A(&h->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); } if(IS_DIRECT(left_type[0])) - h->direct_cache[scan8[0] - 1 + 0*8]= 1; + h->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; else if(IS_8X8(left_type[0])) - h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)]; + h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[4*left_xy[0] + 1 + (left_block[0]&~1)]; else - h->direct_cache[scan8[0] - 1 + 0*8]= 0; + h->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; if(IS_DIRECT(left_type[1])) - h->direct_cache[scan8[0] - 1 + 2*8]= 1; + h->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; else if(IS_8X8(left_type[1])) - h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)]; + h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[4*left_xy[1] + 1 + (left_block[2]&~1)]; else - h->direct_cache[scan8[0] - 1 + 2*8]= 0; + h->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; } } - + } if(FRAME_MBAFF){ #define MAP_MVS\ MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\ @@ -1200,7 +1141,7 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\ h->ref_cache[list][idx] <<= 1;\ h->mv_cache[list][idx][1] /= 2;\ - h->mvd_cache[list][idx][1] /= 2;\ + h->mvd_cache[list][idx][1] >>=1;\ } MAP_MVS #undef MAP_F2F @@ -1219,21 +1160,223 @@ static av_always_inline int fill_caches(H264Context *h, int mb_type, int for_deb } #endif - if(!for_deblock) - h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]); - return 0; -} - -static void fill_decode_caches(H264Context *h, int mb_type){ - fill_caches(h, mb_type, 0); + h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]); } /** * - * @returns non zero if the loop filter can be skiped + * @return non zero if the loop filter can be skiped */ static int fill_filter_caches(H264Context *h, int mb_type){ - return fill_caches(h, mb_type, 1); + MpegEncContext * const s = &h->s; + const int mb_xy= h->mb_xy; + int top_xy, left_xy[2]; + int top_type, left_type[2]; + + top_xy = mb_xy - (s->mb_stride << MB_FIELD); + + //FIXME deblocking could skip the intra and nnz parts. + + /* Wow, what a mess, why didn't they simplify the interlacing & intra + * stuff, I can't imagine that these complex rules are worth it. */ + + left_xy[1] = left_xy[0] = mb_xy-1; + if(FRAME_MBAFF){ + const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]); + const int curr_mb_field_flag = IS_INTERLACED(mb_type); + if(s->mb_y&1){ + if (left_mb_field_flag != curr_mb_field_flag) { + left_xy[0] -= s->mb_stride; + } + }else{ + if(curr_mb_field_flag){ + top_xy += s->mb_stride & (((s->current_picture.mb_type[top_xy ]>>7)&1)-1); + } + if (left_mb_field_flag != curr_mb_field_flag) { + left_xy[1] += s->mb_stride; + } + } + } + + h->top_mb_xy = top_xy; + h->left_mb_xy[0] = left_xy[0]; + h->left_mb_xy[1] = left_xy[1]; + { + //for sufficiently low qp, filtering wouldn't do anything + //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp + int qp_thresh = h->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice + int qp = s->current_picture.qscale_table[mb_xy]; + if(qp <= qp_thresh + && (left_xy[0]<0 || ((qp + s->current_picture.qscale_table[left_xy[0]] + 1)>>1) <= qp_thresh) + && (top_xy < 0 || ((qp + s->current_picture.qscale_table[top_xy ] + 1)>>1) <= qp_thresh)){ + if(!FRAME_MBAFF) + return 1; + if( (left_xy[0]< 0 || ((qp + s->current_picture.qscale_table[left_xy[1] ] + 1)>>1) <= qp_thresh) + && (top_xy < s->mb_stride || ((qp + s->current_picture.qscale_table[top_xy -s->mb_stride] + 1)>>1) <= qp_thresh)) + return 1; + } + } + + top_type = s->current_picture.mb_type[top_xy] ; + left_type[0] = s->current_picture.mb_type[left_xy[0]]; + left_type[1] = s->current_picture.mb_type[left_xy[1]]; + if(h->deblocking_filter == 2){ + if(h->slice_table[top_xy ] != h->slice_num) top_type= 0; + if(h->slice_table[left_xy[0] ] != h->slice_num) left_type[0]= left_type[1]= 0; + }else{ + if(h->slice_table[top_xy ] == 0xFFFF) top_type= 0; + if(h->slice_table[left_xy[0] ] == 0xFFFF) left_type[0]= left_type[1] =0; + } + h->top_type = top_type ; + h->left_type[0]= left_type[0]; + h->left_type[1]= left_type[1]; + + if(IS_INTRA(mb_type)) + return 0; + + AV_COPY64(&h->non_zero_count_cache[0+8*1], &h->non_zero_count[mb_xy][ 0]); + AV_COPY64(&h->non_zero_count_cache[0+8*2], &h->non_zero_count[mb_xy][ 8]); + AV_COPY32(&h->non_zero_count_cache[0+8*5], &h->non_zero_count[mb_xy][16]); + AV_COPY32(&h->non_zero_count_cache[4+8*3], &h->non_zero_count[mb_xy][20]); + AV_COPY64(&h->non_zero_count_cache[0+8*4], &h->non_zero_count[mb_xy][24]); + + h->cbp= h->cbp_table[mb_xy]; + + { + int list; + for(list=0; listlist_count; list++){ + int8_t *ref; + int y, b_stride; + int16_t (*mv_dst)[2]; + int16_t (*mv_src)[2]; + + if(!USES_LIST(mb_type, list)){ + fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); + AV_WN32A(&h->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&h->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&h->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&h->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + continue; + } + + ref = &s->current_picture.ref_index[list][4*mb_xy]; + { + int (*ref2frm)[64] = h->ref2frm[ h->slice_num&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); + AV_WN32A(&h->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + AV_WN32A(&h->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + ref += 2; + AV_WN32A(&h->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + AV_WN32A(&h->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + } + + b_stride = h->b_stride; + mv_dst = &h->mv_cache[list][scan8[0]]; + mv_src = &s->current_picture.motion_val[list][4*s->mb_x + 4*s->mb_y*b_stride]; + for(y=0; y<4; y++){ + AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride); + } + + } + } + + +/* +0 . T T. T T T T +1 L . .L . . . . +2 L . .L . . . . +3 . T TL . . . . +4 L . .L . . . . +5 L . .. . . . . +*/ +//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) + if(top_type){ + AV_COPY32(&h->non_zero_count_cache[4+8*0], &h->non_zero_count[top_xy][4+3*8]); + } + + if(left_type[0]){ + h->non_zero_count_cache[3+8*1]= h->non_zero_count[left_xy[0]][7+0*8]; + h->non_zero_count_cache[3+8*2]= h->non_zero_count[left_xy[0]][7+1*8]; + h->non_zero_count_cache[3+8*3]= h->non_zero_count[left_xy[0]][7+2*8]; + h->non_zero_count_cache[3+8*4]= h->non_zero_count[left_xy[0]][7+3*8]; + } + + // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs + if(!CABAC && h->pps.transform_8x8_mode){ + if(IS_8x8DCT(top_type)){ + h->non_zero_count_cache[4+8*0]= + h->non_zero_count_cache[5+8*0]= h->cbp_table[top_xy] & 4; + h->non_zero_count_cache[6+8*0]= + h->non_zero_count_cache[7+8*0]= h->cbp_table[top_xy] & 8; + } + if(IS_8x8DCT(left_type[0])){ + h->non_zero_count_cache[3+8*1]= + h->non_zero_count_cache[3+8*2]= h->cbp_table[left_xy[0]]&2; //FIXME check MBAFF + } + if(IS_8x8DCT(left_type[1])){ + h->non_zero_count_cache[3+8*3]= + h->non_zero_count_cache[3+8*4]= h->cbp_table[left_xy[1]]&8; //FIXME check MBAFF + } + + if(IS_8x8DCT(mb_type)){ + h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]= + h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1; + + h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]= + h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2; + + h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]= + h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4; + + h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]= + h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8; + } + } + + if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ + int list; + for(list=0; listlist_count; list++){ + if(USES_LIST(top_type, list)){ + const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; + const int b8_xy= 4*top_xy + 2; + int (*ref2frm)[64] = h->ref2frm[ h->slice_table[top_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); + AV_COPY128(h->mv_cache[list][scan8[0] + 0 - 1*8], s->current_picture.motion_val[list][b_xy + 0]); + h->ref_cache[list][scan8[0] + 0 - 1*8]= + h->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 0]]; + h->ref_cache[list][scan8[0] + 2 - 1*8]= + h->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 1]]; + }else{ + AV_ZERO128(h->mv_cache[list][scan8[0] + 0 - 1*8]); + AV_WN32A(&h->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); + } + + if(!IS_INTERLACED(mb_type^left_type[0])){ + if(USES_LIST(left_type[0], list)){ + const int b_xy= h->mb2b_xy[left_xy[0]] + 3; + const int b8_xy= 4*left_xy[0] + 1; + int (*ref2frm)[64] = h->ref2frm[ h->slice_table[left_xy[0]]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2); + AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 0 ], s->current_picture.motion_val[list][b_xy + h->b_stride*0]); + AV_COPY32(h->mv_cache[list][scan8[0] - 1 + 8 ], s->current_picture.motion_val[list][b_xy + h->b_stride*1]); + AV_COPY32(h->mv_cache[list][scan8[0] - 1 +16 ], s->current_picture.motion_val[list][b_xy + h->b_stride*2]); + AV_COPY32(h->mv_cache[list][scan8[0] - 1 +24 ], s->current_picture.motion_val[list][b_xy + h->b_stride*3]); + h->ref_cache[list][scan8[0] - 1 + 0 ]= + h->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*0]]; + h->ref_cache[list][scan8[0] - 1 +16 ]= + h->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][s->current_picture.ref_index[list][b8_xy + 2*1]]; + }else{ + AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 0 ]); + AV_ZERO32(h->mv_cache [list][scan8[0] - 1 + 8 ]); + AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +16 ]); + AV_ZERO32(h->mv_cache [list][scan8[0] - 1 +24 ]); + h->ref_cache[list][scan8[0] - 1 + 0 ]= + h->ref_cache[list][scan8[0] - 1 + 8 ]= + h->ref_cache[list][scan8[0] - 1 + 16 ]= + h->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; + } + } + } + } + + return 0; } /** @@ -1254,76 +1397,73 @@ static inline int pred_intra_mode(H264Context *h, int n){ static inline void write_back_non_zero_count(H264Context *h){ const int mb_xy= h->mb_xy; - *((uint64_t*)&h->non_zero_count[mb_xy][ 0]) = *((uint64_t*)&h->non_zero_count_cache[0+8*1]); - *((uint64_t*)&h->non_zero_count[mb_xy][ 8]) = *((uint64_t*)&h->non_zero_count_cache[0+8*2]); - *((uint32_t*)&h->non_zero_count[mb_xy][16]) = *((uint32_t*)&h->non_zero_count_cache[0+8*5]); - *((uint32_t*)&h->non_zero_count[mb_xy][20]) = *((uint32_t*)&h->non_zero_count_cache[4+8*3]); - *((uint64_t*)&h->non_zero_count[mb_xy][24]) = *((uint64_t*)&h->non_zero_count_cache[0+8*4]); + AV_COPY64(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[0+8*1]); + AV_COPY64(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[0+8*2]); + AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[0+8*5]); + AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8*3]); + AV_COPY64(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[0+8*4]); } static inline void write_back_motion(H264Context *h, int mb_type){ MpegEncContext * const s = &h->s; - const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; - const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride; + const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy + const int b8_xy= 4*h->mb_xy; int list; if(!USES_LIST(mb_type, 0)) - fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1); + fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); for(list=0; listlist_count; list++){ - int y; + int y, b_stride; + int16_t (*mv_dst)[2]; + int16_t (*mv_src)[2]; + if(!USES_LIST(mb_type, list)) continue; + b_stride = h->b_stride; + mv_dst = &s->current_picture.motion_val[list][b_xy]; + mv_src = &h->mv_cache[list][scan8[0]]; for(y=0; y<4; y++){ - *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]; - *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]; + AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); } if( CABAC ) { + uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]]; + uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; if(IS_SKIP(mb_type)) - fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4); - else - for(y=0; y<4; y++){ - *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y]; - *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y]; + AV_ZERO128(mvd_dst); + else{ + AV_COPY64(mvd_dst, mvd_src + 8*3); + AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); + AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); + AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); } } { int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; - ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]]; - ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]]; - ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]]; - ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]]; + ref_index[0+0*2]= h->ref_cache[list][scan8[0]]; + ref_index[1+0*2]= h->ref_cache[list][scan8[4]]; + ref_index[0+1*2]= h->ref_cache[list][scan8[8]]; + ref_index[1+1*2]= h->ref_cache[list][scan8[12]]; } } if(h->slice_type_nos == FF_B_TYPE && CABAC){ if(IS_8X8(mb_type)){ - uint8_t *direct_table = &h->direct_table[b8_xy]; - direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0; - direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0; - direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0; + uint8_t *direct_table = &h->direct_table[4*h->mb_xy]; + direct_table[1] = h->sub_mb_type[1]>>1; + direct_table[2] = h->sub_mb_type[2]>>1; + direct_table[3] = h->sub_mb_type[3]>>1; } } } static inline int get_dct8x8_allowed(H264Context *h){ if(h->sps.direct_8x8_inference_flag) - return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); + return !(AV_RN64A(h->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); else - return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); -} - -static void predict_field_decoding_flag(H264Context *h){ - MpegEncContext * const s = &h->s; - const int mb_xy= h->mb_xy; - int mb_type = (h->slice_table[mb_xy-1] == h->slice_num) - ? s->current_picture.mb_type[mb_xy-1] - : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num) - ? s->current_picture.mb_type[mb_xy-s->mb_stride] - : 0; - h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0; + return !(AV_RN64A(h->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); } /** @@ -1343,9 +1483,11 @@ static void decode_mb_skip(H264Context *h){ if( h->slice_type_nos == FF_B_TYPE ) { // just for fill_caches. pred_direct_motion will set the real mb_type - mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; - + mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; + if(h->direct_spatial_mv_pred){ + fill_decode_neighbors(h, mb_type); fill_decode_caches(h, mb_type); //FIXME check what is needed and what not ... + } ff_h264_pred_direct_motion(h, &mb_type); mb_type|= MB_TYPE_SKIP; } @@ -1354,6 +1496,7 @@ static void decode_mb_skip(H264Context *h){ int mx, my; mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; + fill_decode_neighbors(h, mb_type); fill_decode_caches(h, mb_type); //FIXME check what is needed and what not ... pred_pskip_motion(h, &mx, &my); fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);