git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38
  39 //#undef NDEBUG
  40 #include <assert.h>
  41
  42 /**
  43  * Value of Picture.reference when Picture is not a reference picture, but
  44  * is held for delayed output.
  45  */
  46 #define DELAYED_PIC_REF 4
  47
     /*
      * File-scope VLC tables. Nothing in this part of the file initialises or
      * reads them; going by their names they presumably serve the CAVLC residual
      * decoder (coeff_token, total_zeros, run_before) -- TODO confirm against the
      * code that initialises them.
      */
  48 static VLC coeff_token_vlc[4];
  49 static VLC chroma_dc_coeff_token_vlc;
  50
  51 static VLC total_zeros_vlc[15];
  52 static VLC chroma_dc_total_zeros_vlc[3];
  53
  54 static VLC run_vlc[6];
  55 static VLC run7_vlc;
  56
     /*
      * Forward declarations of static functions whose definitions are not
      * visible in this part of the file.
      */
  57 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  58 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  59 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  61
  62 static av_always_inline uint32_t pack16to32(int a, int b){
  63 #ifdef WORDS_BIGENDIAN
  64    return (b&0xFFFF) + (a<<16);
  65 #else
  66    return (a&0xFFFF) + (b<<16);
  67 #endif
  68 }
  69
  70 const uint8_t ff_rem6[52]={
  71 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  72 };
  73
  74 const uint8_t ff_div6[52]={
  75 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  76 };
  77
  78
     /**
      * Loads the neighbour information needed for the current macroblock
      * (s->mb_x, s->mb_y) into the per-macroblock caches of H264Context.
      *
      * The function first works out the indices of the top-left, top, top-right
      * and left neighbour macroblocks (adjusted for macroblock pairs when
      * FRAME_MBAFF is set) and records top and left in h->top_mb_xy and
      * h->left_mb_xy[]. It then fills, as applicable:
      *  - the intra *_samples_available masks and intra4x4_pred_mode_cache,
      *  - the border entries of non_zero_count_cache,
      *  - top_cbp / left_cbp (only when h->pps.cabac is set),
      *  - the border entries of mv_cache / ref_cache,
      *  - mvd_cache and, for B slices, direct_cache (only when h->pps.cabac is set),
      *  - h->neighbor_transform_size.
      *
      * @param mb_type     type flags of the current macroblock
      * @param for_deblock non-zero when the caches are wanted for deblocking: the
      *                    function may return early without touching anything,
      *                    neighbours are accepted whenever their slice_table
      *                    entry is < 255 (instead of only from the current slice)
      *                    and the top-left / top-right neighbours are treated as
      *                    absent
      */
  79 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  80     MpegEncContext * const s = &h->s;
  81     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
  82     int topleft_xy, top_xy, topright_xy, left_xy[2];
  83     int topleft_type, top_type, topright_type, left_type[2];
  84     int left_block[8];
  85     int i;
  86
         /* macroblock above; the step is s->mb_stride shifted left by FIELD_PICTURE */
  87     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  88
  89     //FIXME deblocking could skip the intra and nnz parts.
         /* Deblocking shortcut (non-MBAFF only): return without touching any cache
          * when h->slice_num is 1 or when the top neighbour carries the same
          * slice_table entry as the current macroblock. */
  90     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  91         return;
  92
  93     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
  94
         /* Default neighbour positions; the FRAME_MBAFF block below may adjust them. */
  95     topleft_xy = top_xy - 1;
  96     topright_xy= top_xy + 1;
  97     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[] selects which entries of the left neighbour(s) feed the
          * cache: [0..3] are used below as indices into the left macroblock's
          * intra4x4_pred_mode / non_zero_count entries and as 4x4 row numbers for
          * motion_val / mvd_table; [4..7] are used only as non_zero_count indices
          * (presumably the chroma blocks -- TODO confirm). */
  98     left_block[0]= 0;
  99     left_block[1]= 1;
 100     left_block[2]= 2;
 101     left_block[3]= 3;
 102     left_block[4]= 7;
 103     left_block[5]= 10;
 104     left_block[6]= 8;
 105     left_block[7]= 11;
 106     if(FRAME_MBAFF){
             /* Neighbours are addressed through macroblock pairs: pair_xy is the
              * macroblock of the current pair on the even row, and each *_frame_flag
              * is set when the corresponding macroblock is not IS_INTERLACED. */
 107         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 108         const int top_pair_xy      = pair_xy     - s->mb_stride;
 109         const int topleft_pair_xy  = top_pair_xy - 1;
 110         const int topright_pair_xy = top_pair_xy + 1;
 111         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 112         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 113         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 114         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 115         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 116         const int bottom = (s->mb_y & 1);
 117         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             /* For an interlaced current macroblock the neighbour above is taken one
              * further row up: always for the bottom macroblock of the pair, and for
              * the top macroblock only when the respective neighbour pair is
              * interlaced as well. The same rule is applied to top, top-left and
              * top-right independently. */
 118         if (bottom
 119                 ? !curr_mb_frame_flag // bottom macroblock
 120                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 121                 ) {
 122             top_xy -= s->mb_stride;
 123         }
 124         if (bottom
 125                 ? !curr_mb_frame_flag // bottom macroblock
 126                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 127                 ) {
 128             topleft_xy -= s->mb_stride;
 129         }
 130         if (bottom
 131                 ? !curr_mb_frame_flag // bottom macroblock
 132                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 133                 ) {
 134             topright_xy -= s->mb_stride;
 135         }
             /* Left pair and current pair differ in frame/field coding: both left
              * entries start at the first macroblock of the left pair and left_block[]
              * is remapped. For an interlaced current macroblock left_xy[1] moves on
              * to the second macroblock of the left pair. */
 136         if (left_mb_frame_flag != curr_mb_frame_flag) {
 137             left_xy[1] = left_xy[0] = pair_xy - 1;
 138             if (curr_mb_frame_flag) {
 139                 if (bottom) {
 140                     left_block[0]= 2;
 141                     left_block[1]= 2;
 142                     left_block[2]= 3;
 143                     left_block[3]= 3;
 144                     left_block[4]= 8;
 145                     left_block[5]= 11;
 146                     left_block[6]= 8;
 147                     left_block[7]= 11;
 148                 } else {
 149                     left_block[0]= 0;
 150                     left_block[1]= 0;
 151                     left_block[2]= 1;
 152                     left_block[3]= 1;
 153                     left_block[4]= 7;
 154                     left_block[5]= 10;
 155                     left_block[6]= 7;
 156                     left_block[7]= 10;
 157                 }
 158             } else {
 159                 left_xy[1] += s->mb_stride;
 160                 //left_block[0]= 0;
 161                 left_block[1]= 2;
 162                 left_block[2]= 0;
 163                 left_block[3]= 2;
 164                 //left_block[4]= 7;
 165                 left_block[5]= 10;
 166                 left_block[6]= 7;
 167                 left_block[7]= 10;
 168             }
 169         }
 170     }
 171
         /* Publish the resolved neighbour positions in the context. */
 172     h->top_mb_xy = top_xy;
 173     h->left_mb_xy[0] = left_xy[0];
 174     h->left_mb_xy[1] = left_xy[1];
         /* Neighbour types; a type of 0 means "neighbour not usable" in the rest of
          * this function. */
 175     if(for_deblock){
 176         topleft_type = 0;
 177         topright_type = 0;
 178         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 179         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 180         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 181
             /* MBAFF deblocking of a non-intra macroblock: rebuild the macroblock's
              * own cache contents. The luma non_zero_count_cache entries become 0/1
              * flags taken from the 16-bit mask that write_back_non_zero_count()
              * stores at non_zero_count[mb_xy][14]; mv_cache / ref_cache are
              * reloaded from the current picture for every list in use. */
 182         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 183             int list;
 184             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 185             for(i=0; i<16; i++)
 186                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 187             for(list=0; list<h->list_count; list++){
 188                 if(USES_LIST(mb_type,list)){
 189                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 190                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 191                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 192                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 193                         dst[0] = src[0];
 194                         dst[1] = src[1];
 195                         dst[2] = src[2];
 196                         dst[3] = src[3];
 197                     }
 198                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 199                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 200                     ref += h->b8_stride;
 201                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 202                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 203                 }else{
 204                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 205                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 206                 }
 207             }
 208         }
 209     }else{
             /* Decoding: a neighbour only counts when it belongs to the current slice. */
 210         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 211         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 212         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 213         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 214         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 215     }
 216
         /* Intra macroblocks: set up the sample-availability bit masks. They start
          * from fixed "available" patterns and bits are cleared for every neighbour
          * that is not intra and is either missing or excluded by
          * h->pps.constrained_intra_pred. */
 217     if(IS_INTRA(mb_type)){
 218         h->topleft_samples_available=
 219         h->top_samples_available=
 220         h->left_samples_available= 0xFFFF;
 221         h->topright_samples_available= 0xEEEA;
 222
 223         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 224             h->topleft_samples_available= 0xB3FF;
 225             h->top_samples_available= 0x33FF;
 226             h->topright_samples_available= 0x26EA;
 227         }
 228         for(i=0; i<2; i++){
 229             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 230                 h->topleft_samples_available&= 0xDF5F;
 231                 h->left_samples_available&= 0x5F5F;
 232             }
 233         }
 234
 235         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 236             h->topleft_samples_available&= 0x7FFF;
 237
 238         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 239             h->topright_samples_available&= 0xFBFF;
 240
             /* Intra4x4: load the neighbours' prediction modes into the cache border.
              * Neighbours that are not intra4x4 contribute -1 when they are missing
              * (or inter with constrained_intra_pred) and the fixed mode 2 otherwise
              * (presumably DC_PRED -- TODO confirm). */
 241         if(IS_INTRA4x4(mb_type)){
 242             if(IS_INTRA4x4(top_type)){
 243                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 244                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 245                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 246                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 247             }else{
 248                 int pred;
 249                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 250                     pred= -1;
 251                 else{
 252                     pred= 2;
 253                 }
 254                 h->intra4x4_pred_mode_cache[4+8*0]=
 255                 h->intra4x4_pred_mode_cache[5+8*0]=
 256                 h->intra4x4_pred_mode_cache[6+8*0]=
 257                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 258             }
 259             for(i=0; i<2; i++){
 260                 if(IS_INTRA4x4(left_type[i])){
 261                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 262                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 263                 }else{
 264                     int pred;
 265                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 266                         pred= -1;
 267                     else{
 268                         pred= 2;
 269                     }
 270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 272                 }
 273             }
 274         }
 275     }
 276
 277
 278 /*
 279 0 . T T. T T T T
 280 1 L . .L . . . .
 281 2 L . .L . . . .
 282 3 . T TL . . . .
 283 4 L . .L . . . .
 284 5 L . .. . . . .
 285 */
 286 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* non_zero_count border, top row: copied from the top neighbour, or a
          * default when there is none (0 for non-intra macroblocks with CABAC,
          * 64 otherwise). */
 287     if(top_type){
 288         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 289         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 290         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 291         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 292
 293         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 294         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 295
 296         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 297         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 298
 299     }else{
 300         h->non_zero_count_cache[4+8*0]=
 301         h->non_zero_count_cache[5+8*0]=
 302         h->non_zero_count_cache[6+8*0]=
 303         h->non_zero_count_cache[7+8*0]=
 304
 305         h->non_zero_count_cache[1+8*0]=
 306         h->non_zero_count_cache[2+8*0]=
 307
 308         h->non_zero_count_cache[1+8*3]=
 309         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 310
 311     }
 312
         /* non_zero_count border, left column: same scheme, once per left
          * neighbour, with the source entries selected through left_block[]. */
 313     for (i=0; i<2; i++) {
 314         if(left_type[i]){
 315             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 316             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 317             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 318             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 319         }else{
 320             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 321             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 322             h->non_zero_count_cache[0+8*1 +   8*i]=
 323             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 324         }
 325     }
 326
         /* CABAC only: neighbour coded-block-pattern values. A missing neighbour
          * yields 0x1C0 for an intra macroblock and 0 otherwise. left_cbp takes
          * the bits selected by mask 0x1f0 from left neighbour 0, plus one bit
          * from each left neighbour chosen through left_block[0] / left_block[2]. */
 327     if( h->pps.cabac ) {
 328         // top_cbp
 329         if(top_type) {
 330             h->top_cbp = h->cbp_table[top_xy];
 331         } else if(IS_INTRA(mb_type)) {
 332             h->top_cbp = 0x1C0;
 333         } else {
 334             h->top_cbp = 0;
 335         }
 336         // left_cbp
 337         if (left_type[0]) {
 338             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 339         } else if(IS_INTRA(mb_type)) {
 340             h->left_cbp = 0x1C0;
 341         } else {
 342             h->left_cbp = 0;
 343         }
 344         if (left_type[0]) {
 345             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 346         }
 347         if (left_type[1]) {
 348             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 349         }
 350     }
 351
 352 #if 1
         /* Inter / direct macroblocks: load neighbouring motion vectors and
          * reference indices into the cache border, one list at a time. */
 353     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 354         int list;
 355         for(list=0; list<h->list_count; list++){
                 /* A list the macroblock does not use is skipped, unless the
                  * macroblock is direct or h->deblocking_filter is set. */
 356             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 357                 /*if(!h->mv_cache_clean[list]){
 358                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 359                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 360                     h->mv_cache_clean[list]= 1;
 361                 }*/
 362                 continue;
 363             }
 364             h->mv_cache_clean[list]= 0;
 365
                 /* Top neighbour: its bottom row of 4 motion vectors and 2 reference
                  * indices. Without this list the vectors are zeroed and ref is
                  * LIST_NOT_USED (neighbour present) or PART_NOT_AVAILABLE (absent). */
 366             if(USES_LIST(top_type, list)){
 367                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 368                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 369                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 370                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 371                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 372                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 373                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 374                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 375                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 376                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 377             }else{
 378                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 379                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 380                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 381                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 382                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 383             }
 384
                 /* Left neighbours: two cache rows per neighbour, taken from the
                  * neighbour's right-most column at the rows given by left_block[]. */
 385             for(i=0; i<2; i++){
 386                 int cache_idx = scan8[0] - 1 + i*2*8;
 387                 if(USES_LIST(left_type[i], list)){
 388                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 389                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 390                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 391                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 392                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 393                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 394                 }else{
 395                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 396                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 397                     h->ref_cache[list][cache_idx  ]=
 398                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 399                 }
 400             }
 401
                 /* Outside MBAFF, the rest of this iteration is skipped for
                  * deblocking and for direct macroblocks when
                  * h->direct_spatial_mv_pred is not set. */
 402             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 403                 continue;
 404
                 /* Top-left neighbour: its bottom-right motion vector / reference. */
 405             if(USES_LIST(topleft_type, list)){
 406                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 407                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 408                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 409                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 410             }else{
 411                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 412                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 413             }
 414
                 /* Top-right neighbour: its bottom-left motion vector / reference. */
 415             if(USES_LIST(topright_type, list)){
 416                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 417                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 418                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 419                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 420             }else{
 421                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 422                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 423             }
 424
                 /* Outside MBAFF, skip and direct macroblocks need nothing further. */
 425             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 426                 continue;
 427
                 /* Mark a fixed set of cache positions as not available (ref) with
                  * zero motion vectors. */
 428             h->ref_cache[list][scan8[5 ]+1] =
 429             h->ref_cache[list][scan8[7 ]+1] =
 430             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 431             h->ref_cache[list][scan8[4 ]] =
 432             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 433             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 434             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 435             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 436             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 437             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 438
                 /* CABAC only: the same top / left border is loaded from mvd_table
                  * into mvd_cache (zero where the neighbour does not use the list). */
 439             if( h->pps.cabac ) {
 440                 /* XXX beurk, Load mvd */
 441                 if(USES_LIST(top_type, list)){
 442                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 443                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 444                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 445                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 446                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 447                 }else{
 448                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 449                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 450                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 451                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 452                 }
 453                 if(USES_LIST(left_type[0], list)){
 454                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 457                 }else{
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 460                 }
 461                 if(USES_LIST(left_type[1], list)){
 462                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 463                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 465                 }else{
 466                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 468                 }
 469                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 470                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 471                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 472                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 473                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 474
                     /* B slices: direct_cache is cleared for the macroblock itself
                      * and its top / left border is set to 1 next to direct
                      * neighbours, to the direct_table entries next to 8x8
                      * neighbours, and to 0 otherwise. */
 475                 if(h->slice_type == B_TYPE){
 476                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 477
 478                     if(IS_DIRECT(top_type)){
 479                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 480                     }else if(IS_8X8(top_type)){
 481                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 482                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 483                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 484                     }else{
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 486                     }
 487
 488                     if(IS_DIRECT(left_type[0]))
 489                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 490                     else if(IS_8X8(left_type[0]))
 491                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 492                     else
 493                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 494
 495                     if(IS_DIRECT(left_type[1]))
 496                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 497                     else if(IS_8X8(left_type[1]))
 498                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 499                     else
 500                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 501                 }
 502             }
 503
                 /* MBAFF: convert border entries that come from neighbours coded in
                  * the other mode. MAP_MVS applies MAP_F2F to all ten border
                  * positions; MAP_F2F is defined differently per branch. For a field
                  * macroblock (MB_FIELD) entries from non-interlaced neighbours get
                  * their reference index doubled and their vertical mv / mvd halved;
                  * otherwise entries from interlaced neighbours get the reference
                  * index halved and the vertical mv / mvd doubled. Entries with a
                  * negative reference index are left alone. Note that the macro
                  * parameter named mb_type shadows the function argument inside
                  * MAP_F2F. */
 504             if(FRAME_MBAFF){
 505 #define MAP_MVS\
 506                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 507                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 508                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 509                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 510                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 511                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 512                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 513                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 514                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 515                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 516                 if(MB_FIELD){
 517 #define MAP_F2F(idx, mb_type)\
 518                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 519                         h->ref_cache[list][idx] <<= 1;\
 520                         h->mv_cache[list][idx][1] /= 2;\
 521                         h->mvd_cache[list][idx][1] /= 2;\
 522                     }
 523                     MAP_MVS
 524 #undef MAP_F2F
 525                 }else{
 526 #define MAP_F2F(idx, mb_type)\
 527                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 528                         h->ref_cache[list][idx] >>= 1;\
 529                         h->mv_cache[list][idx][1] <<= 1;\
 530                         h->mvd_cache[list][idx][1] <<= 1;\
 531                     }
 532                     MAP_MVS
 533 #undef MAP_F2F
 534                 }
 535             }
 536         }
 537     }
 538 #endif
 539
         /* Number (0..2) of the top / first-left neighbours that are IS_8x8DCT. */
 540     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 541 }
 542
 543 static inline void write_back_intra_pred_mode(H264Context *h){
 544     MpegEncContext * const s = &h->s;
 545     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 546
 547     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 548     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 549     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 550     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 551     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 552     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 553     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 554 }
 555
 556 /**
 557  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 558  */
 559 static inline int check_intra4x4_pred_mode(H264Context *h){
 560     MpegEncContext * const s = &h->s;
 561     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 562     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 563     int i;
 564
 565     if(!(h->top_samples_available&0x8000)){
 566         for(i=0; i<4; i++){
 567             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 568             if(status<0){
 569                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 570                 return -1;
 571             } else if(status){
 572                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 573             }
 574         }
 575     }
 576
 577     if(!(h->left_samples_available&0x8000)){
 578         for(i=0; i<4; i++){
 579             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 580             if(status<0){
 581                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 582                 return -1;
 583             } else if(status){
 584                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 585             }
 586         }
 587     }
 588
 589     return 0;
 590 } //FIXME cleanup like next
 591
 592 /**
 593  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 594  */
 595 static inline int check_intra_pred_mode(H264Context *h, int mode){
 596     MpegEncContext * const s = &h->s;
 597     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 598     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 599
 600     if(mode > 6U) {
 601         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 602         return -1;
 603     }
 604
 605     if(!(h->top_samples_available&0x8000)){
 606         mode= top[ mode ];
 607         if(mode<0){
 608             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 609             return -1;
 610         }
 611     }
 612
 613     if(!(h->left_samples_available&0x8000)){
 614         mode= left[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     return mode;
 622 }
 623
 624 /**
 625  * gets the predicted intra4x4 prediction mode.
 626  */
 627 static inline int pred_intra_mode(H264Context *h, int n){
 628     const int index8= scan8[n];
 629     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 630     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 631     const int min= FFMIN(left, top);
 632
 633     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 634
 635     if(min<0) return DC_PRED;
 636     else      return min;
 637 }
 638
 639 static inline void write_back_non_zero_count(H264Context *h){
 640     MpegEncContext * const s = &h->s;
 641     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 642
 643     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 644     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 645     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 646     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 647     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 648     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 649     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 650
 651     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 652     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 653     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 654
 655     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 656     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 657     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 658
 659     if(FRAME_MBAFF){
 660         // store all luma nnzs, for deblocking
 661         int v = 0, i;
 662         for(i=0; i<16; i++)
 663             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 664         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 665     }
 666 }
 667
 668 /**
 669  * gets the predicted number of non zero coefficients.
 670  * @param n block index
 671  */
 672 static inline int pred_non_zero_count(H264Context *h, int n){
 673     const int index8= scan8[n];
 674     const int left= h->non_zero_count_cache[index8 - 1];
 675     const int top = h->non_zero_count_cache[index8 - 8];
 676     int i= left + top;
 677
 678     if(i<64) i= (i+1)>>1;
 679
 680     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 681
 682     return i&31;
 683 }
 684
     /**
      * Fetches the diagonal neighbour used as third candidate for MV prediction.
      * Normally this is the top right entry of the caches (i - 8 + part_width),
      * or the top left one (i - 8 - 1) if the top right is PART_NOT_AVAILABLE.
      * In MBAFF frames some neighbours are instead read straight from the
      * current picture and copied into a scratch cache slot (scan8[0]-2).
      * @param C set to point at the selected motion vector
      * @param i cache position of the block (a scan8[] value)
      * @param list reference list
      * @param part_width width of the partition in cache columns
      * @return the reference index that belongs to *C (may be LIST_NOT_USED
      *         or PART_NOT_AVAILABLE)
      */
 685 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 686     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 687     MpegEncContext *s = &h->s;
 688
 689     /* there is no consistent mapping of mvs to neighboring locations that will
 690      * make mbaff happy, so we can't move all this logic to fill_caches */
 691     if(FRAME_MBAFF){
 692         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 693         const int16_t *mv;
             /* clear the scratch slot and let C point at it; SET_DIAG_MV fills it */
 694         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 695         *C = h->mv_cache[list][scan8[0]-2];
 696
 697         if(!MB_FIELD
 698            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 699             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 700             if(IS_INTERLACED(mb_types[topright_xy])){
     /* SET_DIAG_MV copies the MV of the 4x4 block at (X4,Y4) of the current
      * picture into the scratch slot, applying MV_OP to its vertical component,
      * and RETURNS from fetch_diagonal_mv() with the stored ref index modified
      * by REF_OP (or with LIST_NOT_USED if that macroblock does not use list). */
 701 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 702                 const int x4 = X4, y4 = Y4;\
 703                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 704                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 705                     return LIST_NOT_USED;\
 706                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 707                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 708                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 709                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 710
 711                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 712             }
 713         }
             /* top right missing, block sits in cache column 4 and the left neighbour exists */
 714         if(topright_ref == PART_NOT_AVAILABLE
 715            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 716            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 717             if(!MB_FIELD
 718                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 720             }
 721             if(MB_FIELD
 722                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 723                && i >= scan8[0]+8){
 724                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 725                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 726             }
 727         }
 728 #undef SET_DIAG_MV
 729     }
 730
         /* regular case: take the top right neighbour from the caches, else the top left one */
 731     if(topright_ref != PART_NOT_AVAILABLE){
 732         *C= h->mv_cache[list][ i - 8 + part_width ];
 733         return topright_ref;
 734     }else{
 735         tprintf(s->avctx, "topright MV not available\n");
 736
 737         *C= h->mv_cache[list][ i - 8 - 1 ];
 738         return h->ref_cache[list][ i - 8 - 1 ];
 739     }
 740 }
 741
 742 /**
 743  * gets the predicted MV.
 744  * @param n the block index
 745  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 746  * @param mx the x component of the predicted motion vector
 747  * @param my the y component of the predicted motion vector
 748  */
 749 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 750     const int index8= scan8[n];
 751     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 752     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 753     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 754     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 755     const int16_t * C;
 756     int diagonal_ref, match_count;
 757
 758     assert(part_width==1 || part_width==2 || part_width==4);
 759
 760 /* mv_cache
 761   B . . A T T T T
 762   U . . L . . , .
 763   U . . L . . . .
 764   U . . L . . , .
 765   . . . L . . . .
 766 */
 767
 768     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 769     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 770     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 771     if(match_count > 1){ //most common
 772         *mx= mid_pred(A[0], B[0], C[0]);
 773         *my= mid_pred(A[1], B[1], C[1]);
 774     }else if(match_count==1){
 775         if(left_ref==ref){
 776             *mx= A[0];
 777             *my= A[1];
 778         }else if(top_ref==ref){
 779             *mx= B[0];
 780             *my= B[1];
 781         }else{
 782             *mx= C[0];
 783             *my= C[1];
 784         }
 785     }else{
 786         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 787             *mx= A[0];
 788             *my= A[1];
 789         }else{
 790             *mx= mid_pred(A[0], B[0], C[0]);
 791             *my= mid_pred(A[1], B[1], C[1]);
 792         }
 793     }
 794
 795     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 796 }
 797
 798 /**
 799  * gets the directionally predicted 16x8 MV.
 800  * @param n the block index
 801  * @param mx the x component of the predicted motion vector
 802  * @param my the y component of the predicted motion vector
 803  */
 804 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 805     if(n==0){
 806         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 807         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 808
 809         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 810
 811         if(top_ref == ref){
 812             *mx= B[0];
 813             *my= B[1];
 814             return;
 815         }
 816     }else{
 817         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 818         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 819
 820         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 821
 822         if(left_ref == ref){
 823             *mx= A[0];
 824             *my= A[1];
 825             return;
 826         }
 827     }
 828
 829     //RARE
 830     pred_motion(h, n, 4, list, ref, mx, my);
 831 }
 832
 833 /**
 834  * gets the directionally predicted 8x16 MV.
 835  * @param n the block index
 836  * @param mx the x component of the predicted motion vector
 837  * @param my the y component of the predicted motion vector
 838  */
 839 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 840     if(n==0){
 841         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 842         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 843
 844         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 845
 846         if(left_ref == ref){
 847             *mx= A[0];
 848             *my= A[1];
 849             return;
 850         }
 851     }else{
 852         const int16_t * C;
 853         int diagonal_ref;
 854
 855         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 856
 857         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 858
 859         if(diagonal_ref == ref){
 860             *mx= C[0];
 861             *my= C[1];
 862             return;
 863         }
 864     }
 865
 866     //RARE
 867     pred_motion(h, n, 2, list, ref, mx, my);
 868 }
 869
 870 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 871     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 872     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 873
 874     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 875
 876     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 877        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 878        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 879
 880         *mx = *my = 0;
 881         return;
 882     }
 883
 884     pred_motion(h, 0, 4, 0, 0, mx, my);
 885
 886     return;
 887 }
 888
 889 static inline void direct_dist_scale_factor(H264Context * const h){
 890     const int poc = h->s.current_picture_ptr->poc;
 891     const int poc1 = h->ref_list[1][0].poc;
 892     int i;
 893     for(i=0; i<h->ref_count[0]; i++){
 894         int poc0 = h->ref_list[0][i].poc;
 895         int td = av_clip(poc1 - poc0, -128, 127);
 896         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 897             h->dist_scale_factor[i] = 256;
 898         }else{
 899             int tb = av_clip(poc - poc0, -128, 127);
 900             int tx = (16384 + (FFABS(td) >> 1)) / td;
 901             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 902         }
 903     }
 904     if(FRAME_MBAFF){
 905         for(i=0; i<h->ref_count[0]; i++){
 906             h->dist_scale_factor_field[2*i] =
 907             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 908         }
 909     }
 910 }
 911 static inline void direct_ref_list_init(H264Context * const h){
 912     MpegEncContext * const s = &h->s;
 913     Picture * const ref1 = &h->ref_list[1][0];
 914     Picture * const cur = s->current_picture_ptr;
 915     int list, i, j;
 916     if(cur->pict_type == I_TYPE)
 917         cur->ref_count[0] = 0;
 918     if(cur->pict_type != B_TYPE)
 919         cur->ref_count[1] = 0;
 920     for(list=0; list<2; list++){
 921         cur->ref_count[list] = h->ref_count[list];
 922         for(j=0; j<h->ref_count[list]; j++)
 923             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 924     }
 925     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
 926         return;
 927     for(list=0; list<2; list++){
 928         for(i=0; i<ref1->ref_count[list]; i++){
 929             const int poc = ref1->ref_poc[list][i];
 930             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 931             for(j=0; j<h->ref_count[list]; j++)
 932                 if(h->ref_list[list][j].poc == poc){
 933                     h->map_col_to_list0[list][i] = j;
 934                     break;
 935                 }
 936         }
 937     }
 938     if(FRAME_MBAFF){
 939         for(list=0; list<2; list++){
 940             for(i=0; i<ref1->ref_count[list]; i++){
 941                 j = h->map_col_to_list0[list][i];
 942                 h->map_col_to_list0_field[list][2*i] = 2*j;
 943                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 944             }
 945         }
 946     }
 947 }
 948
     /**
      * Derives the reference indices and motion vectors of a direct predicted
      * macroblock (or of its direct 8x8 sub blocks) and stores them in
      * h->ref_cache / h->mv_cache.
      * Spatial prediction is used if h->direct_spatial_mv_pred is set,
      * otherwise the MVs are scaled from the colocated macroblock of
      * h->ref_list[1][0] (temporal prediction).
      * @param mb_type in: type of the current macroblock; out: the type with the
      *                partitioning and list usage selected here
      */
 949 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 950     MpegEncContext * const s = &h->s;
 951     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
 952     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 953     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, MVs and refs of the macroblock at the same position in ref_list[1][0] */
 954     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 955     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 956     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 957     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 958     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 959     const int is_b8x8 = IS_8X8(*mb_type);
 960     unsigned int sub_mb_type;
 961     int i8, i4;
 962
 963 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
         /* choose the partitioning of the direct block from the colocated macroblock type */
 964     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 965         /* FIXME save sub mb types from previous frames (or derive from MVs)
 966          * so we know exactly what block size to use */
 967         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 968         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 969     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 970         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 971         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 972     }else{
 973         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 974         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 975     }
 976     if(!is_b8x8)
 977         *mb_type |= MB_TYPE_DIRECT2;
 978     if(MB_FIELD)
 979         *mb_type |= MB_TYPE_INTERLACED;
 980
 981     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 982
 983     if(h->direct_spatial_mv_pred){
 984         int ref[2];
 985         int mv[2][2];
 986         int list;
 987
 988         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 989
 990         /* ref = min(neighbors) */
 991         for(list=0; list<2; list++){
 992             int refa = h->ref_cache[list][scan8[0] - 1];
 993             int refb = h->ref_cache[list][scan8[0] - 8];
 994             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* top right neighbour flagged with -2: use the top left one instead */
 995             if(refc == -2)
 996                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
 997             ref[list] = refa;
 998             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
 999                 ref[list] = refb;
1000             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1001                 ref[list] = refc;
1002             if(ref[list] < 0)
1003                 ref[list] = -1;
1004         }
1005
             /* no neighbour provides a non negative ref in either list: use ref 0 and zero MVs for both lists */
1006         if(ref[0] < 0 && ref[1] < 0){
1007             ref[0] = ref[1] = 0;
1008             mv[0][0] = mv[0][1] =
1009             mv[1][0] = mv[1][1] = 0;
1010         }else{
1011             for(list=0; list<2; list++){
1012                 if(ref[list] >= 0)
1013                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1014                 else
1015                     mv[list][0] = mv[list][1] = 0;
1016             }
1017         }
1018
             /* clear the prediction flag of a list that ended up without a ref */
1019         if(ref[1] < 0){
1020             *mb_type &= ~MB_TYPE_P0L1;
1021             sub_mb_type &= ~MB_TYPE_P0L1;
1022         }else if(ref[0] < 0){
1023             *mb_type &= ~MB_TYPE_P0L0;
1024             sub_mb_type &= ~MB_TYPE_P0L0;
1025         }
1026
1027         if(IS_16X16(*mb_type)){
1028             int a=0, b=0;
1029
1030             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1031             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* colocated block uses ref 0 with an MV within +-1: the predicted MV is
                  * kept only for lists whose ref is > 0, the other lists get a zero MV */
1032             if(!IS_INTRA(mb_type_col)
1033                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1034                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1035                        && (h->x264_build>33 || !h->x264_build)))){
1036                 if(ref[0] > 0)
1037                     a= pack16to32(mv[0][0],mv[0][1]);
1038                 if(ref[1] > 0)
1039                     b= pack16to32(mv[1][0],mv[1][1]);
1040             }else{
1041                 a= pack16to32(mv[0][0],mv[0][1]);
1042                 b= pack16to32(mv[1][0],mv[1][1]);
1043             }
1044             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1045             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1046         }else{
1047             for(i8=0; i8<4; i8++){
1048                 const int x8 = i8&1;
1049                 const int y8 = i8>>1;
1050
                     /* in a B_8x8 macroblock only the sub blocks coded as direct are handled */
1051                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1052                     continue;
1053                 h->sub_mb_type[i8] = sub_mb_type;
1054
1055                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1056                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1057                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1058                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1059
1060                 /* col_zero_flag */
1061                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1062                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1063                                                   && (h->x264_build>33 || !h->x264_build)))){
1064                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1065                     if(IS_SUB_8X8(sub_mb_type)){
1066                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1067                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1068                             if(ref[0] == 0)
1069                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1070                             if(ref[1] == 0)
1071                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1072                         }
1073                     }else
1074                     for(i4=0; i4<4; i4++){
1075                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1076                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1077                             if(ref[0] == 0)
1078                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1079                             if(ref[1] == 0)
1080                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1081                         }
1082                     }
1083                 }
1084             }
1085         }
1086     }else{ /* direct temporal mv pred */
1087         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1088         const int *dist_scale_factor = h->dist_scale_factor;
1089
1090         if(FRAME_MBAFF){
                 /* field macroblocks use the per field mapping and scale tables */
1091             if(IS_INTERLACED(*mb_type)){
1092                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1093                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1094                 dist_scale_factor = h->dist_scale_factor_field;
1095             }
                 /* current and colocated macroblock differ in frame/field coding */
1096             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1097                 /* FIXME assumes direct_8x8_inference == 1 */
1098                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1099                 int mb_types_col[2];
1100                 int y_shift;
1101
1102                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1103                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1104                          | (*mb_type & MB_TYPE_INTERLACED);
1105                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1106
1107                 if(IS_INTERLACED(*mb_type)){
1108                     /* frame to field scaling */
1109                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1110                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                         /* for the bottom macroblock step the colocated pointers back to the top macroblock of the pair */
1111                     if(s->mb_y&1){
1112                         l1ref0 -= 2*h->b8_stride;
1113                         l1ref1 -= 2*h->b8_stride;
1114                         l1mv0 -= 4*h->b_stride;
1115                         l1mv1 -= 4*h->b_stride;
1116                     }
1117                     y_shift = 0;
1118
1119                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1120                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1121                        && !is_b8x8)
1122                         *mb_type |= MB_TYPE_16x8;
1123                     else
1124                         *mb_type |= MB_TYPE_8x8;
1125                 }else{
1126                     /* field to frame scaling */
1127                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1128                      * but in MBAFF, top and bottom POC are equal */
1129                     int dy = (s->mb_y&1) ? 1 : 2;
1130                     mb_types_col[0] =
1131                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1132                     l1ref0 += dy*h->b8_stride;
1133                     l1ref1 += dy*h->b8_stride;
1134                     l1mv0 += 2*dy*h->b_stride;
1135                     l1mv1 += 2*dy*h->b_stride;
1136                     y_shift = 2;
1137
1138                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1139                        && !is_b8x8)
1140                         *mb_type |= MB_TYPE_16x16;
1141                     else
1142                         *mb_type |= MB_TYPE_8x8;
1143                 }
1144
1145                 for(i8=0; i8<4; i8++){
1146                     const int x8 = i8&1;
1147                     const int y8 = i8>>1;
1148                     int ref0, scale;
1149                     const int16_t (*l1mv)[2]= l1mv0;
1150
1151                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1152                         continue;
1153                     h->sub_mb_type[i8] = sub_mb_type;
1154
1155                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra colocated macroblock: ref 0 and zero MVs in both lists */
1156                     if(IS_INTRA(mb_types_col[y8])){
1157                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1158                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1159                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1160                         continue;
1161                     }
1162
                         /* take the colocated list 0 ref if there is one, else its list 1 ref (and list 1 MVs) */
1163                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1164                     if(ref0 >= 0)
1165                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1166                     else{
1167                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1168                         l1mv= l1mv1;
1169                     }
1170                     scale = dist_scale_factor[ref0];
1171                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1172
1173                     {
                             /* list 0 MV = scaled colocated MV, list 1 MV = list 0 MV - colocated MV;
                              * the vertical component is first converted with y_shift */
1174                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1175                         int my_col = (mv_col[1]<<y_shift)/2;
1176                         int mx = (scale * mv_col[0] + 128) >> 8;
1177                         int my = (scale * my_col + 128) >> 8;
1178                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1179                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1180                     }
1181                 }
1182                 return;
1183             }
1184         }
1185
1186         /* one-to-one mv scaling */
1187
1188         if(IS_16X16(*mb_type)){
1189             int ref, mv0, mv1;
1190
1191             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1192             if(IS_INTRA(mb_type_col)){
1193                 ref=mv0=mv1=0;
1194             }else{
1195                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1196                                                 : map_col_to_list0[1][l1ref1[0]];
1197                 const int scale = dist_scale_factor[ref0];
1198                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1199                 int mv_l0[2];
1200                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1201                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1202                 ref= ref0;
1203                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1204                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1205             }
1206             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1207             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1208             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1209         }else{
1210             for(i8=0; i8<4; i8++){
1211                 const int x8 = i8&1;
1212                 const int y8 = i8>>1;
1213                 int ref0, scale;
1214                 const int16_t (*l1mv)[2]= l1mv0;
1215
1216                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1217                     continue;
1218                 h->sub_mb_type[i8] = sub_mb_type;
1219                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                 if(IS_INTRA(mb_type_col)){
1221                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1223                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1224                     continue;
1225                 }
1226
1227                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1228                 if(ref0 >= 0)
1229                     ref0 = map_col_to_list0[0][ref0];
1230                 else{
1231                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1232                     l1mv= l1mv1;
1233                 }
1234                 scale = dist_scale_factor[ref0];
1235
1236                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one colocated MV per 8x8 block, or one per 4x4 block otherwise */
1237                 if(IS_SUB_8X8(sub_mb_type)){
1238                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1239                     int mx = (scale * mv_col[0] + 128) >> 8;
1240                     int my = (scale * mv_col[1] + 128) >> 8;
1241                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1242                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1243                 }else
1244                 for(i4=0; i4<4; i4++){
1245                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1246                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1247                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1248                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1249                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1250                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1251                 }
1252             }
1253         }
1254     }
1255 }
1256
1257 static inline void write_back_motion(H264Context *h, int mb_type){
1258     MpegEncContext * const s = &h->s;
1259     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1260     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1261     int list;
1262
1263     if(!USES_LIST(mb_type, 0))
1264         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1265
1266     for(list=0; list<h->list_count; list++){
1267         int y;
1268         if(!USES_LIST(mb_type, list))
1269             continue;
1270
1271         for(y=0; y<4; y++){
1272             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1273             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1274         }
1275         if( h->pps.cabac ) {
1276             if(IS_SKIP(mb_type))
1277                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1278             else
1279             for(y=0; y<4; y++){
1280                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1281                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1282             }
1283         }
1284
1285         {
1286             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1287             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1288             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1289             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1290             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1291         }
1292     }
1293
1294     if(h->slice_type == B_TYPE && h->pps.cabac){
1295         if(IS_8X8(mb_type)){
1296             uint8_t *direct_table = &h->direct_table[b8_xy];
1297             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1298             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1299             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1300         }
1301     }
1302 }
1303
1304 /**
1305  * Decodes a network abstraction layer unit.
1306  * @param consumed is the number of bytes used as input
1307  * @param length is the length of the array
1308  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1309  * @returns decoded bytes, might be src+1 if no escapes
1310  */
1311 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1312     int i, si, di;
1313     uint8_t *dst;
1314     int bufidx;
1315
1316 //    src[0]&0x80;                //forbidden bit
1317     h->nal_ref_idc= src[0]>>5;
1318     h->nal_unit_type= src[0]&0x1F;
1319
1320     src++; length--;
1321 #if 0
1322     for(i=0; i<length; i++)
1323         printf("%2X ", src[i]);
1324 #endif
1325     for(i=0; i+1<length; i+=2){
1326         if(src[i]) continue;
1327         if(i>0 && src[i-1]==0) i--;
1328         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1329             if(src[i+2]!=3){
1330                 /* startcode, so we must be past the end */
1331                 length=i;
1332             }
1333             break;
1334         }
1335     }
1336
1337     if(i>=length-1){ //no escaped 0
1338         *dst_length= length;
1339         *consumed= length+1; //+1 for the header
1340         return src;
1341     }
1342
1343     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1344     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1345     dst= h->rbsp_buffer[bufidx];
1346
1347     if (dst == NULL){
1348         return NULL;
1349     }
1350
1351 //printf("decoding esc\n");
1352     si=di=0;
1353     while(si<length){
1354         //remove escapes (very rare 1:2^22)
1355         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1356             if(src[si+2]==3){ //escape
1357                 dst[di++]= 0;
1358                 dst[di++]= 0;
1359                 si+=3;
1360                 continue;
1361             }else //next start code
1362                 break;
1363         }
1364
1365         dst[di++]= src[si++];
1366     }
1367
1368     *dst_length= di;
1369     *consumed= si + 1;//+1 for the header
1370 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1371     return dst;
1372 }
1373
1374 /**
1375  * identifies the exact end of the bitstream
1376  * @return the length of the trailing, or 0 if damaged
1377  */
1378 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1379     int v= *src;
1380     int r;
1381
1382     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1383
1384     for(r=1; r<9; r++){
1385         if(v&1) return r;
1386         v>>=1;
1387     }
1388     return 0;
1389 }
1390
1391 /**
1392  * idct tranforms the 16 dc values and dequantize them.
1393  * @param qp quantization parameter
1394  */
1395 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1396 #define stride 16
1397     int i;
1398     int temp[16]; //FIXME check if this is a good idea
1399     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1400     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1401
1402 //memset(block, 64, 2*256);
1403 //return;
1404     for(i=0; i<4; i++){
1405         const int offset= y_offset[i];
1406         const int z0= block[offset+stride*0] + block[offset+stride*4];
1407         const int z1= block[offset+stride*0] - block[offset+stride*4];
1408         const int z2= block[offset+stride*1] - block[offset+stride*5];
1409         const int z3= block[offset+stride*1] + block[offset+stride*5];
1410
1411         temp[4*i+0]= z0+z3;
1412         temp[4*i+1]= z1+z2;
1413         temp[4*i+2]= z1-z2;
1414         temp[4*i+3]= z0-z3;
1415     }
1416
1417     for(i=0; i<4; i++){
1418         const int offset= x_offset[i];
1419         const int z0= temp[4*0+i] + temp[4*2+i];
1420         const int z1= temp[4*0+i] - temp[4*2+i];
1421         const int z2= temp[4*1+i] - temp[4*3+i];
1422         const int z3= temp[4*1+i] + temp[4*3+i];
1423
1424         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1425         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1426         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1427         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1428     }
1429 }
1430
#if 0
/**
 * Forward transforms the 16 luma DC values (disabled code).
 * Same butterfly structure as h264_luma_dc_dequant_idct_c(), but the
 * results are halved instead of being scaled by a multiplier.
 * Relies on the stride macro defined inside h264_luma_dc_dequant_idct_c().
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        temp[4*i+0]= sum04  + sum15;
        temp[4*i+1]= diff04 + diff15;
        temp[4*i+2]= diff04 - diff15;
        temp[4*i+3]= sum04  - sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1470
1471 #undef xStride
1472 #undef stride
1473
1474 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1475     const int stride= 16*2;
1476     const int xStride= 16;
1477     int a,b,c,d,e;
1478
1479     a= block[stride*0 + xStride*0];
1480     b= block[stride*0 + xStride*1];
1481     c= block[stride*1 + xStride*0];
1482     d= block[stride*1 + xStride*1];
1483
1484     e= a-b;
1485     a= a+b;
1486     b= c-d;
1487     c= c+d;
1488
1489     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1490     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1491     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1492     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1493 }
1494
#if 0
/**
 * Forward transforms the 2x2 chroma DC values (disabled code).
 * Same butterflies as chroma_dc_dequant_idct_c(), without any scaling.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top=    block;
    DCTELEM * const bottom= block + row_step;
    const int top_sum=     top[0]    + top[col_step];
    const int top_diff=    top[0]    - top[col_step];
    const int bottom_sum=  bottom[0] + bottom[col_step];
    const int bottom_diff= bottom[0] - bottom[col_step];

    top   [0]       = (top_sum  + bottom_sum );
    top   [col_step]= (top_diff + bottom_diff);
    bottom[0]       = (top_sum  - bottom_sum );
    bottom[col_step]= (top_diff - bottom_diff);
}
#endif
1517
1518 /**
1519  * gets the chroma qp.
1520  */
1521 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1522     return h->pps.chroma_qp_table[t][qscale & 0xff];
1523 }
1524
1525 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1526 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1527 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1528     int i;
1529     const int * const quant_table= quant_coeff[qscale];
1530     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1531     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1532     const unsigned int threshold2= (threshold1<<1);
1533     int last_non_zero;
1534
1535     if(separate_dc){
1536         if(qscale<=18){
1537             //avoid overflows
1538             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1539             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1540             const unsigned int dc_threshold2= (dc_threshold1<<1);
1541
1542             int level= block[0]*quant_coeff[qscale+18][0];
1543             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1544                 if(level>0){
1545                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1546                     block[0]= level;
1547                 }else{
1548                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1549                     block[0]= -level;
1550                 }
1551 //                last_non_zero = i;
1552             }else{
1553                 block[0]=0;
1554             }
1555         }else{
1556             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1557             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1558             const unsigned int dc_threshold2= (dc_threshold1<<1);
1559
1560             int level= block[0]*quant_table[0];
1561             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1562                 if(level>0){
1563                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1564                     block[0]= level;
1565                 }else{
1566                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1567                     block[0]= -level;
1568                 }
1569 //                last_non_zero = i;
1570             }else{
1571                 block[0]=0;
1572             }
1573         }
1574         last_non_zero= 0;
1575         i=1;
1576     }else{
1577         last_non_zero= -1;
1578         i=0;
1579     }
1580
1581     for(; i<16; i++){
1582         const int j= scantable[i];
1583         int level= block[j]*quant_table[j];
1584
1585 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1586 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1587         if(((unsigned)(level+threshold1))>threshold2){
1588             if(level>0){
1589                 level= (bias + level)>>QUANT_SHIFT;
1590                 block[j]= level;
1591             }else{
1592                 level= (bias - level)>>QUANT_SHIFT;
1593                 block[j]= -level;
1594             }
1595             last_non_zero = i;
1596         }else{
1597             block[j]=0;
1598         }
1599     }
1600
1601     return last_non_zero;
1602 }
1603
1604 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1605                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1606                            int src_x_offset, int src_y_offset,
1607                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1608     MpegEncContext * const s = &h->s;
1609     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1610     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1611     const int luma_xy= (mx&3) + ((my&3)<<2);
1612     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1613     uint8_t * src_cb, * src_cr;
1614     int extra_width= h->emu_edge_width;
1615     int extra_height= h->emu_edge_height;
1616     int emu=0;
1617     const int full_mx= mx>>2;
1618     const int full_my= my>>2;
1619     const int pic_width  = 16*s->mb_width;
1620     const int pic_height = 16*s->mb_height >> MB_FIELD;
1621
1622     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1623         return;
1624
1625     if(mx&7) extra_width -= 3;
1626     if(my&7) extra_height -= 3;
1627
1628     if(   full_mx < 0-extra_width
1629        || full_my < 0-extra_height
1630        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1631        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1632         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1633             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1634         emu=1;
1635     }
1636
1637     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1638     if(!square){
1639         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1640     }
1641
1642     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1643
1644     if(MB_FIELD){
1645         // chroma offset when predicting from a field of opposite parity
1646         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1647         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1648     }
1649     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1650     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1651
1652     if(emu){
1653         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1654             src_cb= s->edge_emu_buffer;
1655     }
1656     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1657
1658     if(emu){
1659         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1660             src_cr= s->edge_emu_buffer;
1661     }
1662     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1663 }
1664
1665 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1666                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1667                            int x_offset, int y_offset,
1668                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1669                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1670                            int list0, int list1){
1671     MpegEncContext * const s = &h->s;
1672     qpel_mc_func *qpix_op=  qpix_put;
1673     h264_chroma_mc_func chroma_op= chroma_put;
1674
1675     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1676     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1677     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1678     x_offset += 8*s->mb_x;
1679     y_offset += 8*(s->mb_y >> MB_FIELD);
1680
1681     if(list0){
1682         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1683         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1684                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1685                            qpix_op, chroma_op);
1686
1687         qpix_op=  qpix_avg;
1688         chroma_op= chroma_avg;
1689     }
1690
1691     if(list1){
1692         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1693         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1694                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1695                            qpix_op, chroma_op);
1696     }
1697 }
1698
1699 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1700                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1701                            int x_offset, int y_offset,
1702                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1703                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1704                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1705                            int list0, int list1){
1706     MpegEncContext * const s = &h->s;
1707
1708     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1709     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1710     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1711     x_offset += 8*s->mb_x;
1712     y_offset += 8*(s->mb_y >> MB_FIELD);
1713
1714     if(list0 && list1){
1715         /* don't optimize for luma-only case, since B-frames usually
1716          * use implicit weights => chroma too. */
1717         uint8_t *tmp_cb = s->obmc_scratchpad;
1718         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1719         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1720         int refn0 = h->ref_cache[0][ scan8[n] ];
1721         int refn1 = h->ref_cache[1][ scan8[n] ];
1722
1723         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1724                     dest_y, dest_cb, dest_cr,
1725                     x_offset, y_offset, qpix_put, chroma_put);
1726         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1727                     tmp_y, tmp_cb, tmp_cr,
1728                     x_offset, y_offset, qpix_put, chroma_put);
1729
1730         if(h->use_weight == 2){
1731             int weight0 = h->implicit_weight[refn0][refn1];
1732             int weight1 = 64 - weight0;
1733             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1734             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1735             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1736         }else{
1737             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1738                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1739                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1740             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1741                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1742                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1743             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1745                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1746         }
1747     }else{
1748         int list = list1 ? 1 : 0;
1749         int refn = h->ref_cache[list][ scan8[n] ];
1750         Picture *ref= &h->ref_list[list][refn];
1751         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1752                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1753                     qpix_put, chroma_put);
1754
1755         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1756                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1757         if(h->use_weight_chroma){
1758             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1759                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1760             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1761                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1762         }
1763     }
1764 }
1765
1766 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1767                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1768                            int x_offset, int y_offset,
1769                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1770                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1771                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1772                            int list0, int list1){
1773     if((h->use_weight==2 && list0 && list1
1774         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1775        || h->use_weight==1)
1776         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1777                          x_offset, y_offset, qpix_put, chroma_put,
1778                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1779     else
1780         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1781                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1782 }
1783
1784 static inline void prefetch_motion(H264Context *h, int list){
1785     /* fetch pixels for estimated mv 4 macroblocks ahead
1786      * optimized for 64byte cache lines */
1787     MpegEncContext * const s = &h->s;
1788     const int refn = h->ref_cache[list][scan8[0]];
1789     if(refn >= 0){
1790         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1791         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1792         uint8_t **src= h->ref_list[list][refn].data;
1793         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1794         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1795         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1796         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1797     }
1798 }
1799
1800 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1801                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1802                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1803                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1804     MpegEncContext * const s = &h->s;
1805     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1806     const int mb_type= s->current_picture.mb_type[mb_xy];
1807
1808     assert(IS_INTER(mb_type));
1809
1810     prefetch_motion(h, 0);
1811
1812     if(IS_16X16(mb_type)){
1813         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1815                 &weight_op[0], &weight_avg[0],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817     }else if(IS_16X8(mb_type)){
1818         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1819                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1820                 &weight_op[1], &weight_avg[1],
1821                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1822         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1823                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1824                 &weight_op[1], &weight_avg[1],
1825                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1826     }else if(IS_8X16(mb_type)){
1827         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1828                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1829                 &weight_op[2], &weight_avg[2],
1830                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1831         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1832                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1833                 &weight_op[2], &weight_avg[2],
1834                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1835     }else{
1836         int i;
1837
1838         assert(IS_8X8(mb_type));
1839
1840         for(i=0; i<4; i++){
1841             const int sub_mb_type= h->sub_mb_type[i];
1842             const int n= 4*i;
1843             int x_offset= (i&1)<<2;
1844             int y_offset= (i&2)<<1;
1845
1846             if(IS_SUB_8X8(sub_mb_type)){
1847                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1849                     &weight_op[3], &weight_avg[3],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851             }else if(IS_SUB_8X4(sub_mb_type)){
1852                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1853                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1854                     &weight_op[4], &weight_avg[4],
1855                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1856                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1857                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1858                     &weight_op[4], &weight_avg[4],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860             }else if(IS_SUB_4X8(sub_mb_type)){
1861                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1862                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1863                     &weight_op[5], &weight_avg[5],
1864                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1865                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1866                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1867                     &weight_op[5], &weight_avg[5],
1868                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1869             }else{
1870                 int j;
1871                 assert(IS_SUB_4X4(sub_mb_type));
1872                 for(j=0; j<4; j++){
1873                     int sub_x_offset= x_offset + 2*(j&1);
1874                     int sub_y_offset= y_offset +   (j&2);
1875                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1876                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1877                         &weight_op[6], &weight_avg[6],
1878                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1879                 }
1880             }
1881         }
1882     }
1883
1884     prefetch_motion(h, 1);
1885 }
1886
1887 static void decode_init_vlc(void){
1888     static int done = 0;
1889
1890     if (!done) {
1891         int i;
1892         done = 1;
1893
1894         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1895                  &chroma_dc_coeff_token_len [0], 1, 1,
1896                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1897
1898         for(i=0; i<4; i++){
1899             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1900                      &coeff_token_len [i][0], 1, 1,
1901                      &coeff_token_bits[i][0], 1, 1, 1);
1902         }
1903
1904         for(i=0; i<3; i++){
1905             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1906                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1907                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1908         }
1909         for(i=0; i<15; i++){
1910             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1911                      &total_zeros_len [i][0], 1, 1,
1912                      &total_zeros_bits[i][0], 1, 1, 1);
1913         }
1914
1915         for(i=0; i<6; i++){
1916             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1917                      &run_len [i][0], 1, 1,
1918                      &run_bits[i][0], 1, 1, 1);
1919         }
1920         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1921                  &run_len [6][0], 1, 1,
1922                  &run_bits[6][0], 1, 1, 1);
1923     }
1924 }
1925
1926 static void free_tables(H264Context *h){
1927     int i;
1928     H264Context *hx;
1929     av_freep(&h->intra4x4_pred_mode);
1930     av_freep(&h->chroma_pred_mode_table);
1931     av_freep(&h->cbp_table);
1932     av_freep(&h->mvd_table[0]);
1933     av_freep(&h->mvd_table[1]);
1934     av_freep(&h->direct_table);
1935     av_freep(&h->non_zero_count);
1936     av_freep(&h->slice_table_base);
1937     h->slice_table= NULL;
1938
1939     av_freep(&h->mb2b_xy);
1940     av_freep(&h->mb2b8_xy);
1941
1942     for(i = 0; i < MAX_SPS_COUNT; i++)
1943         av_freep(h->sps_buffers + i);
1944
1945     for(i = 0; i < MAX_PPS_COUNT; i++)
1946         av_freep(h->pps_buffers + i);
1947
1948     for(i = 0; i < h->s.avctx->thread_count; i++) {
1949         hx = h->thread_context[i];
1950         if(!hx) continue;
1951         av_freep(&hx->top_borders[1]);
1952         av_freep(&hx->top_borders[0]);
1953         av_freep(&hx->s.obmc_scratchpad);
1954     }
1955 }
1956
1957 static void init_dequant8_coeff_table(H264Context *h){
1958     int i,q,x;
1959     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1960     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1961     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1962
1963     for(i=0; i<2; i++ ){
1964         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1965             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1966             break;
1967         }
1968
1969         for(q=0; q<52; q++){
1970             int shift = ff_div6[q];
1971             int idx = ff_rem6[q];
1972             for(x=0; x<64; x++)
1973                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1974                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1975                     h->pps.scaling_matrix8[i][x]) << shift;
1976         }
1977     }
1978 }
1979
1980 static void init_dequant4_coeff_table(H264Context *h){
1981     int i,j,q,x;
1982     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1983     for(i=0; i<6; i++ ){
1984         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1985         for(j=0; j<i; j++){
1986             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1987                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1988                 break;
1989             }
1990         }
1991         if(j<i)
1992             continue;
1993
1994         for(q=0; q<52; q++){
1995             int shift = ff_div6[q] + 2;
1996             int idx = ff_rem6[q];
1997             for(x=0; x<16; x++)
1998                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
1999                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2000                     h->pps.scaling_matrix4[i][x]) << shift;
2001         }
2002     }
2003 }
2004
2005 static void init_dequant_tables(H264Context *h){
2006     int i,x;
2007     init_dequant4_coeff_table(h);
2008     if(h->pps.transform_8x8_mode)
2009         init_dequant8_coeff_table(h);
2010     if(h->sps.transform_bypass){
2011         for(i=0; i<6; i++)
2012             for(x=0; x<16; x++)
2013                 h->dequant4_coeff[i][0][x] = 1<<6;
2014         if(h->pps.transform_8x8_mode)
2015             for(i=0; i<2; i++)
2016                 for(x=0; x<64; x++)
2017                     h->dequant8_coeff[i][0][x] = 1<<6;
2018     }
2019 }
2020
2021
2022 /**
2023  * allocates tables.
2024  * needs width/height
2025  */
2026 static int alloc_tables(H264Context *h){
2027     MpegEncContext * const s = &h->s;
2028     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2029     int x,y;
2030
2031     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2032
2033     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2034     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2035     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2036
2037     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2038     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2039     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2040     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2041
2042     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2043     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2044
2045     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2046     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2047     for(y=0; y<s->mb_height; y++){
2048         for(x=0; x<s->mb_width; x++){
2049             const int mb_xy= x + y*s->mb_stride;
2050             const int b_xy = 4*x + 4*y*h->b_stride;
2051             const int b8_xy= 2*x + 2*y*h->b8_stride;
2052
2053             h->mb2b_xy [mb_xy]= b_xy;
2054             h->mb2b8_xy[mb_xy]= b8_xy;
2055         }
2056     }
2057
2058     s->obmc_scratchpad = NULL;
2059
2060     if(!h->dequant4_coeff[0])
2061         init_dequant_tables(h);
2062
2063     return 0;
2064 fail:
2065     free_tables(h);
2066     return -1;
2067 }
2068
2069 /**
2070  * Mimic alloc_tables(), but for every context thread.
2071  */
2072 static void clone_tables(H264Context *dst, H264Context *src){
2073     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2074     dst->non_zero_count           = src->non_zero_count;
2075     dst->slice_table              = src->slice_table;
2076     dst->cbp_table                = src->cbp_table;
2077     dst->mb2b_xy                  = src->mb2b_xy;
2078     dst->mb2b8_xy                 = src->mb2b8_xy;
2079     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2080     dst->mvd_table[0]             = src->mvd_table[0];
2081     dst->mvd_table[1]             = src->mvd_table[1];
2082     dst->direct_table             = src->direct_table;
2083
2084     dst->s.obmc_scratchpad = NULL;
2085     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2086 }
2087
2088 /**
2089  * Init context
2090  * Allocate buffers which are not shared amongst multiple threads.
2091  */
2092 static int context_init(H264Context *h){
2093     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2094     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2095
2096     return 0;
2097 fail:
2098     return -1; // free_tables will clean up for us
2099 }
2100
2101 static void common_init(H264Context *h){
2102     MpegEncContext * const s = &h->s;
2103
2104     s->width = s->avctx->width;
2105     s->height = s->avctx->height;
2106     s->codec_id= s->avctx->codec->id;
2107
2108     ff_h264_pred_init(&h->hpc, s->codec_id);
2109
2110     h->dequant_coeff_pps= -1;
2111     s->unrestricted_mv=1;
2112     s->decode=1; //FIXME
2113
2114     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2115     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2116 }
2117
2118 static int decode_init(AVCodecContext *avctx){
2119     H264Context *h= avctx->priv_data;
2120     MpegEncContext * const s = &h->s;
2121
2122     MPV_decode_defaults(s);
2123
2124     s->avctx = avctx;
2125     common_init(h);
2126
2127     s->out_format = FMT_H264;
2128     s->workaround_bugs= avctx->workaround_bugs;
2129
2130     // set defaults
2131 //    s->decode_mb= ff_h263_decode_mb;
2132     s->quarter_sample = 1;
2133     s->low_delay= 1;
2134     avctx->pix_fmt= PIX_FMT_YUV420P;
2135
2136     decode_init_vlc();
2137
2138     if(avctx->extradata_size > 0 && avctx->extradata &&
2139        *(char *)avctx->extradata == 1){
2140         h->is_avc = 1;
2141         h->got_avcC = 0;
2142     } else {
2143         h->is_avc = 0;
2144     }
2145
2146     h->thread_context[0] = h;
2147     return 0;
2148 }
2149
2150 static int frame_start(H264Context *h){
2151     MpegEncContext * const s = &h->s;
2152     int i;
2153
2154     if(MPV_frame_start(s, s->avctx) < 0)
2155         return -1;
2156     ff_er_frame_start(s);
2157     /*
2158      * MPV_frame_start uses pict_type to derive key_frame.
2159      * This is incorrect for H.264; IDR markings must be used.
2160      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2161      * See decode_nal_units().
2162      */
2163     s->current_picture_ptr->key_frame= 0;
2164
2165     assert(s->linesize && s->uvlinesize);
2166
2167     for(i=0; i<16; i++){
2168         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2169         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2170     }
2171     for(i=0; i<4; i++){
2172         h->block_offset[16+i]=
2173         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2174         h->block_offset[24+16+i]=
2175         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2176     }
2177
2178     /* can't be in alloc_tables because linesize isn't known there.
2179      * FIXME: redo bipred weight to not require extra buffer? */
2180     for(i = 0; i < s->avctx->thread_count; i++)
2181         if(!h->thread_context[i]->s.obmc_scratchpad)
2182             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2183
2184     /* some macroblocks will be accessed before they're available */
2185     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2186         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2187
2188 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2189     return 0;
2190 }
2191
2192 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2193     MpegEncContext * const s = &h->s;
2194     int i;
2195
2196     src_y  -=   linesize;
2197     src_cb -= uvlinesize;
2198     src_cr -= uvlinesize;
2199
2200     // There are two lines saved, the line above the the top macroblock of a pair,
2201     // and the line above the bottom macroblock
2202     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2203     for(i=1; i<17; i++){
2204         h->left_border[i]= src_y[15+i*  linesize];
2205     }
2206
2207     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2208     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2209
2210     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2211         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2212         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2213         for(i=1; i<9; i++){
2214             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2215             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2216         }
2217         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2218         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2219     }
2220 }
2221
2222 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2223     MpegEncContext * const s = &h->s;
2224     int temp8, i;
2225     uint64_t temp64;
2226     int deblock_left;
2227     int deblock_top;
2228     int mb_xy;
2229
2230     if(h->deblocking_filter == 2) {
2231         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2232         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2233         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2234     } else {
2235         deblock_left = (s->mb_x > 0);
2236         deblock_top =  (s->mb_y > 0);
2237     }
2238
2239     src_y  -=   linesize + 1;
2240     src_cb -= uvlinesize + 1;
2241     src_cr -= uvlinesize + 1;
2242
2243 #define XCHG(a,b,t,xchg)\
2244 t= a;\
2245 if(xchg)\
2246     a= b;\
2247 b= t;
2248
2249     if(deblock_left){
2250         for(i = !deblock_top; i<17; i++){
2251             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2252         }
2253     }
2254
2255     if(deblock_top){
2256         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2257         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2258         if(s->mb_x+1 < s->mb_width){
2259             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2260         }
2261     }
2262
2263     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2264         if(deblock_left){
2265             for(i = !deblock_top; i<9; i++){
2266                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2267                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2268             }
2269         }
2270         if(deblock_top){
2271             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2272             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2273         }
2274     }
2275 }
2276
2277 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2278     MpegEncContext * const s = &h->s;
2279     int i;
2280
2281     src_y  -= 2 *   linesize;
2282     src_cb -= 2 * uvlinesize;
2283     src_cr -= 2 * uvlinesize;
2284
2285     // There are two lines saved, the line above the the top macroblock of a pair,
2286     // and the line above the bottom macroblock
2287     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2288     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2289     for(i=2; i<34; i++){
2290         h->left_border[i]= src_y[15+i*  linesize];
2291     }
2292
2293     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2294     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2295     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2296     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2297
2298     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2299         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2300         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2301         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2302         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2303         for(i=2; i<18; i++){
2304             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2305             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2306         }
2307         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2308         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2309         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2310         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2311     }
2312 }
2313
2314 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2315     MpegEncContext * const s = &h->s;
2316     int temp8, i;
2317     uint64_t temp64;
2318     int deblock_left = (s->mb_x > 0);
2319     int deblock_top  = (s->mb_y > 1);
2320
2321     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2322
2323     src_y  -= 2 *   linesize + 1;
2324     src_cb -= 2 * uvlinesize + 1;
2325     src_cr -= 2 * uvlinesize + 1;
2326
2327 #define XCHG(a,b,t,xchg)\
2328 t= a;\
2329 if(xchg)\
2330     a= b;\
2331 b= t;
2332
2333     if(deblock_left){
2334         for(i = (!deblock_top)<<1; i<34; i++){
2335             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2336         }
2337     }
2338
2339     if(deblock_top){
2340         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2341         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2342         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2343         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2344         if(s->mb_x+1 < s->mb_width){
2345             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2346             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2347         }
2348     }
2349
2350     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2351         if(deblock_left){
2352             for(i = (!deblock_top) << 1; i<18; i++){
2353                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2354                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2355             }
2356         }
2357         if(deblock_top){
2358             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2359             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2360             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2361             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2362         }
2363     }
2364 }
2365
/**
 * Reconstruct the macroblock at (s->mb_x, s->mb_y) into the current picture:
 * intra prediction or motion compensation, adding the luma and chroma
 * residual, and, when h->deblocking_filter is set, the border
 * backup/exchange and the loop filter calls.
 *
 * @param h      decoder context
 * @param simple nonzero selects the reduced path: the MB_FIELD / FRAME_MBAFF,
 *               intra PCM, CODEC_FLAG_GRAY and s->encoding checks are
 *               short-circuited and is_h264 is forced to true
 */
2366 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2367     MpegEncContext * const s = &h->s;
2368     const int mb_x= s->mb_x;
2369     const int mb_y= s->mb_y;
2370     const int mb_xy= mb_x + mb_y*s->mb_stride;
2371     const int mb_type= s->current_picture.mb_type[mb_xy];
2372     uint8_t  *dest_y, *dest_cb, *dest_cr;
2373     int linesize, uvlinesize /*dct_offset*/;
2374     int i;
2375     int *block_offset = &h->block_offset[0];
2376     const unsigned int bottom = mb_y & 1;
2377     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2378     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2379     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2380
         // top-left sample of this macroblock in the luma and the two chroma planes
2381     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2382     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2383     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2384
2385     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2386     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2387
         // field macroblock: doubled line strides and the second set of
         // block offsets (block_offset[24..]) are used
2388     if (!simple && MB_FIELD) {
2389         linesize   = h->mb_linesize   = s->linesize * 2;
2390         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2391         block_offset = &h->block_offset[24];
2392         if(mb_y&1){ //FIXME move out of this func?
2393             dest_y -= s->linesize*15;
2394             dest_cb-= s->uvlinesize*7;
2395             dest_cr-= s->uvlinesize*7;
2396         }
2397         if(FRAME_MBAFF) {
2398             int list;
                 // rewrite the cached reference indices of every used list
                 // as (16+ref)^(s->mb_y&1)
2399             for(list=0; list<h->list_count; list++){
2400                 if(!USES_LIST(mb_type, list))
2401                     continue;
2402                 if(IS_16X16(mb_type)){
2403                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2404                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2405                 }else{
2406                     for(i=0; i<16; i+=4){
2407                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2408                         int ref = h->ref_cache[list][scan8[i]];
2409                         if(ref >= 0)
2410                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2411                     }
2412                 }
2413             }
2414         }
2415     } else {
2416         linesize   = h->mb_linesize   = s->linesize;
2417         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2418 //        dct_offset = s->linesize * 16;
2419     }
2420
         // select the luma residual add functions: plain pixel add under
         // transform bypass, otherwise the 8x8 or 4x4 IDCT and its DC-only variant
2421     if(transform_bypass){
2422         idct_dc_add =
2423         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2424     }else if(IS_8x8DCT(mb_type)){
2425         idct_dc_add = s->dsp.h264_idct8_dc_add;
2426         idct_add = s->dsp.h264_idct8_add;
2427     }else{
2428         idct_dc_add = s->dsp.h264_idct_dc_add;
2429         idct_add = s->dsp.h264_idct_add;
2430     }
2431
         // MBAFF with deblocking: exchange the pair borders before an intra
         // macroblock is predicted; skipped for the bottom macroblock when the
         // one above it is intra as well
2432     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2433        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2434         int mbt_y = mb_y&~1;
2435         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2436         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2437         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2438         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2439     }
2440
2441     if (!simple && IS_INTRA_PCM(mb_type)) {
2442         unsigned int x, y;
2443
2444         // The pixels are stored in h->mb array in the same order as levels,
2445         // copy them in output in the correct order.
2446         for(i=0; i<16; i++) {
2447             for (y=0; y<4; y++) {
2448                 for (x=0; x<4; x++) {
2449                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2450                 }
2451             }
2452         }
2453         for(i=16; i<16+4; i++) {
2454             for (y=0; y<4; y++) {
2455                 for (x=0; x<4; x++) {
2456                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2457                 }
2458             }
2459         }
2460         for(i=20; i<20+4; i++) {
2461             for (y=0; y<4; y++) {
2462                 for (x=0; x<4; x++) {
2463                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2464                 }
2465             }
2466         }
2467     } else {
2468         if(IS_INTRA(mb_type)){
                 // non-MBAFF: the border exchange brackets the intra prediction,
                 // see the matching xchg_mb_border() call with xchg=0 below
2469             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2470                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2471
2472             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2473                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2474                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2475             }
2476
2477             if(IS_INTRA4x4(mb_type)){
2478                 if(simple || !s->encoding){
2479                     if(IS_8x8DCT(mb_type)){
                             // 8x8 intra: predict, then add the residual, one
                             // 8x8 block at a time (i = 0, 4, 8, 12)
2480                         for(i=0; i<16; i+=4){
2481                             uint8_t * const ptr= dest_y + block_offset[i];
2482                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2483                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2484                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2485                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2486                             if(nnz){
2487                                 if(nnz == 1 && h->mb[i*16])
2488                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2489                                 else
2490                                     idct_add(ptr, h->mb + i*16, linesize);
2491                             }
2492                         }
2493                     }else
2494                     for(i=0; i<16; i++){
2495                         uint8_t * const ptr= dest_y + block_offset[i];
2496                         uint8_t *topright;
2497                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2498                         int nnz, tr;
2499
2500                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2501                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2502                             assert(mb_y || linesize <= block_offset[i]);
2503                             if(!topright_avail){
                                     // no top-right samples: replicate the last
                                     // sample of the row above into four bytes
2504                                 tr= ptr[3 - linesize]*0x01010101;
2505                                 topright= (uint8_t*) &tr;
2506                             }else
2507                                 topright= ptr + 4 - linesize;
2508                         }else
2509                             topright= NULL;
2510
2511                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2512                         nnz = h->non_zero_count_cache[ scan8[i] ];
2513                         if(nnz){
2514                             if(is_h264){
2515                                 if(nnz == 1 && h->mb[i*16])
2516                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2517                                 else
2518                                     idct_add(ptr, h->mb + i*16, linesize);
2519                             }else
2520                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2521                         }
2522                     }
2523                 }
2524             }else{
2525                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2526                 if(is_h264){
2527                     if(!transform_bypass)
2528                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2529                 }else
2530                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2531             }
2532             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2533                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2534         }else if(is_h264){
2535             hl_motion(h, dest_y, dest_cb, dest_cr,
2536                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2537                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2538                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2539         }
2540
2541
             // luma residual for everything except intra 4x4, whose residual
             // was already added block by block above
2542         if(!IS_INTRA4x4(mb_type)){
2543             if(is_h264){
2544                 if(IS_INTRA16x16(mb_type)){
2545                     for(i=0; i<16; i++){
2546                         if(h->non_zero_count_cache[ scan8[i] ])
2547                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2548                         else if(h->mb[i*16])
2549                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2550                     }
2551                 }else{
2552                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2553                     for(i=0; i<16; i+=di){
2554                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2555                         if(nnz){
2556                             if(nnz==1 && h->mb[i*16])
2557                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2558                             else
2559                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2560                         }
2561                     }
2562                 }
2563             }else{
2564                 for(i=0; i<16; i++){
2565                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2566                         uint8_t * const ptr= dest_y + block_offset[i];
2567                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2568                     }
2569                 }
2570             }
2571         }
2572
             // chroma residual: blocks 16..19 go to dest_cb, 20..23 to dest_cr
2573         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2574             uint8_t *dest[2] = {dest_cb, dest_cr};
2575             if(transform_bypass){
2576                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2577             }else{
2578                 idct_add = s->dsp.h264_idct_add;
2579                 idct_dc_add = s->dsp.h264_idct_dc_add;
2580                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2581                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2582             }
2583             if(is_h264){
2584                 for(i=16; i<16+8; i++){
2585                     if(h->non_zero_count_cache[ scan8[i] ])
2586                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2587                     else if(h->mb[i*16])
2588                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2589                 }
2590             }else{
2591                 for(i=16; i<16+8; i++){
2592                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2593                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2594                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2595                     }
2596                 }
2597             }
2598         }
2599     }
2600     if(h->deblocking_filter) {
2601         if (!simple && FRAME_MBAFF) {
2602             //FIXME try deblocking one mb at a time?
2603             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 // NOTE: mb_y and mb_xy below shadow the outer variables and
                 // refer to the row above the current one (s->mb_y - 1)
2604             const int mb_y = s->mb_y - 1;
2605             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2606             const int mb_xy= mb_x + mb_y*s->mb_stride;
2607             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2608             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                 // the pair is filtered only once, when this function runs
                 // for its bottom macroblock (odd s->mb_y)
2609             if (!bottom) return;
2610             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2611             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2612             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2613
2614             if(IS_INTRA(mb_type_top | mb_type_bottom))
2615                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2616
2617             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2618             // deblock a pair
2619             // top
                 // s->mb_y is temporarily decremented while the top macroblock is
                 // filtered and restored by the s->mb_y++ below
2620             s->mb_y--;
2621             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2622             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2623             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2624             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2625             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2626             // bottom
2627             s->mb_y++;
2628             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2629             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2630             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2631             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2632             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2633         } else {
2634             tprintf(h->s.avctx, "call filter_mb\n");
2635             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2636             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2637             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2638         }
2639     }
2640 }
2641
2642 /**
2643  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2644  */
2645 static void hl_decode_mb_simple(H264Context *h){
2646     hl_decode_mb_internal(h, 1);
2647 }
2648
2649 /**
2650  * Process a macroblock; this handles edge cases, such as interlacing.
2651  */
2652 static void av_noinline hl_decode_mb_complex(H264Context *h){
2653     hl_decode_mb_internal(h, 0);
2654 }
2655
2656 static void hl_decode_mb(H264Context *h){
2657     MpegEncContext * const s = &h->s;
2658     const int mb_x= s->mb_x;
2659     const int mb_y= s->mb_y;
2660     const int mb_xy= mb_x + mb_y*s->mb_stride;
2661     const int mb_type= s->current_picture.mb_type[mb_xy];
2662     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2663
2664     if(!s->decode)
2665         return;
2666
2667     if (is_complex)
2668         hl_decode_mb_complex(h);
2669     else hl_decode_mb_simple(h);
2670 }
2671
2672 static void pic_as_field(Picture *pic, const int parity){
2673     int i;
2674     for (i = 0; i < 4; ++i) {
2675         if (parity == PICT_BOTTOM_FIELD)
2676             pic->data[i] += pic->linesize[i];
2677         pic->reference = parity;
2678         pic->linesize[i] *= 2;
2679     }
2680 }
2681
2682 static int split_field_copy(Picture *dest, Picture *src,
2683                             int parity, int id_add){
2684     int match = !!(src->reference & parity);
2685
2686     if (match) {
2687         *dest = *src;
2688         pic_as_field(dest, parity);
2689         dest->pic_id *= 2;
2690         dest->pic_id += id_add;
2691     }
2692
2693     return match;
2694 }
2695
2696 /**
2697  * Split one reference list into field parts, interleaving by parity
2698  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2699  * set to look at the actual start of data for that field.
2700  *
2701  * @param dest output list
2702  * @param dest_len maximum number of fields to put in dest
2703  * @param src the source reference list containing fields and/or field pairs
2704  *            (aka short_ref/long_ref, or
2705  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2706  * @param src_len number of Picture's in source (pairs and unmatched fields)
2707  * @param parity the parity of the picture being decoded/needing
2708  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2709  * @return number of fields placed in dest
2710  */
2711 static int split_field_half_ref_list(Picture *dest, int dest_len,
2712                                      Picture *src,  int src_len,  int parity){
2713     int same_parity   = 1;
2714     int same_i        = 0;
2715     int opp_i         = 0;
2716     int out_i;
2717     int field_output;
2718
2719     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2720         if (same_parity && same_i < src_len) {
2721             field_output = split_field_copy(dest + out_i, src + same_i,
2722                                             parity, 1);
2723             same_parity = !field_output;
2724             same_i++;
2725
2726         } else if (opp_i < src_len) {
2727             field_output = split_field_copy(dest + out_i, src + opp_i,
2728                                             PICT_FRAME - parity, 0);
2729             same_parity = field_output;
2730             opp_i++;
2731
2732         } else {
2733             break;
2734         }
2735     }
2736
2737     return out_i;
2738 }
2739
2740 /**
2741  * Split the reference frame list into a reference field list.
2742  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2743  * The input list contains both reference field pairs and
2744  * unmatched reference fields; it is ordered as spec describes
2745  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2746  * unmatched field pairs are also present. Conceptually this is equivalent
2747  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2748  *
2749  * @param dest output reference list where ordered fields are to be placed
2750  * @param dest_len max number of fields to place at dest
2751  * @param src source reference list, as described above
2752  * @param src_len number of pictures (pairs and unmatched fields) in src
2753  * @param parity parity of field being currently decoded
2754  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2755  * @param long_i index into src array that holds first long reference picture,
2756  *        or src_len if no long refs present.
2757  */
2758 static int split_field_ref_list(Picture *dest, int dest_len,
2759                                 Picture *src,  int src_len,
2760                                 int parity,    int long_i){
2761
2762     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2763     dest += i;
2764     dest_len -= i;
2765
2766     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2767                                    src_len - long_i, parity);
2768     return i;
2769 }
2770
2771 /**
2772  * fills the default_ref_list.
2773  */
2774 static int fill_default_ref_list(H264Context *h){
2775     MpegEncContext * const s = &h->s;
2776     int i;
2777     int smallest_poc_greater_than_current = -1;
2778     int structure_sel;
2779     Picture sorted_short_ref[32];
2780     Picture field_entry_list[2][32];
2781     Picture *frame_list[2];
2782
2783     if (FIELD_PICTURE) {
2784         structure_sel = PICT_FRAME;
2785         frame_list[0] = field_entry_list[0];
2786         frame_list[1] = field_entry_list[1];
2787     } else {
2788         structure_sel = 0;
2789         frame_list[0] = h->default_ref_list[0];
2790         frame_list[1] = h->default_ref_list[1];
2791     }
2792
2793     if(h->slice_type==B_TYPE){
2794         int list;
2795         int len[2];
2796         int short_len[2];
2797         int out_i;
2798         int limit= INT_MIN;
2799
2800         /* sort frame according to poc in B slice */
2801         for(out_i=0; out_i<h->short_ref_count; out_i++){
2802             int best_i=INT_MIN;
2803             int best_poc=INT_MAX;
2804
2805             for(i=0; i<h->short_ref_count; i++){
2806                 const int poc= h->short_ref[i]->poc;
2807                 if(poc > limit && poc < best_poc){
2808                     best_poc= poc;
2809                     best_i= i;
2810                 }
2811             }
2812
2813             assert(best_i != INT_MIN);
2814
2815             limit= best_poc;
2816             sorted_short_ref[out_i]= *h->short_ref[best_i];
2817             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2818             if (-1 == smallest_poc_greater_than_current) {
2819                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2820                     smallest_poc_greater_than_current = out_i;
2821                 }
2822             }
2823         }
2824
2825         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2826
2827         // find the largest poc
2828         for(list=0; list<2; list++){
2829             int index = 0;
2830             int j= -99;
2831             int step= list ? -1 : 1;
2832
2833             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2834                 int sel;
2835                 while(j<0 || j>= h->short_ref_count){
2836                     if(j != -99 && step == (list ? -1 : 1))
2837                         return -1;
2838                     step = -step;
2839                     j= smallest_poc_greater_than_current + (step>>1);
2840                 }
2841                 sel = sorted_short_ref[j].reference | structure_sel;
2842                 if(sel != PICT_FRAME) continue;
2843                 frame_list[list][index  ]= sorted_short_ref[j];
2844                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2845             }
2846             short_len[list] = index;
2847
2848             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2849                 int sel;
2850                 if(h->long_ref[i] == NULL) continue;
2851                 sel = h->long_ref[i]->reference | structure_sel;
2852                 if(sel != PICT_FRAME) continue;
2853
2854                 frame_list[ list ][index  ]= *h->long_ref[i];
2855                 frame_list[ list ][index++].pic_id= i;;
2856             }
2857             len[list] = index;
2858
2859             if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2860                 // swap the two first elements of L1 when
2861                 // L0 and L1 are identical
2862                 Picture temp= frame_list[1][0];
2863                 frame_list[1][0] = frame_list[1][1];
2864                 frame_list[1][1] = temp;
2865             }
2866
2867         }
2868
2869         for(list=0; list<2; list++){
2870             if (FIELD_PICTURE)
2871                 len[list] = split_field_ref_list(h->default_ref_list[list],
2872                                                  h->ref_count[list],
2873                                                  frame_list[list],
2874                                                  len[list],
2875                                                  s->picture_structure,
2876                                                  short_len[list]);
2877
2878             if(len[list] < h->ref_count[ list ])
2879                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2880         }
2881
2882
2883     }else{
2884         int index=0;
2885         int short_len;
2886         for(i=0; i<h->short_ref_count; i++){
2887             int sel;
2888             sel = h->short_ref[i]->reference | structure_sel;
2889             if(sel != PICT_FRAME) continue;
2890             frame_list[0][index  ]= *h->short_ref[i];
2891             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2892         }
2893         short_len = index;
2894         for(i = 0; i < 16; i++){
2895             int sel;
2896             if(h->long_ref[i] == NULL) continue;
2897             sel = h->long_ref[i]->reference | structure_sel;
2898             if(sel != PICT_FRAME) continue;
2899             frame_list[0][index  ]= *h->long_ref[i];
2900             frame_list[0][index++].pic_id= i;;
2901         }
2902
2903         if (FIELD_PICTURE)
2904             index = split_field_ref_list(h->default_ref_list[0],
2905                                          h->ref_count[0], frame_list[0],
2906                                          index, s->picture_structure,
2907                                          short_len);
2908
2909         if(index < h->ref_count[0])
2910             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2911     }
2912 #ifdef TRACE
2913     for (i=0; i<h->ref_count[0]; i++) {
2914         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2915     }
2916     if(h->slice_type==B_TYPE){
2917         for (i=0; i<h->ref_count[1]; i++) {
2918             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
2919         }
2920     }
2921 #endif
2922     return 0;
2923 }
2924
2925 static void print_short_term(H264Context *h);
2926 static void print_long_term(H264Context *h);
2927
2928 /**
2929  * Extract structure information about the picture described by pic_num in
2930  * the current decoding context (frame or field). Note that pic_num is
2931  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2932  * @param pic_num picture number for which to extract structure information
2933  * @param structure one of PICT_XXX describing structure of picture
2934  *                      with pic_num
2935  * @return frame number (short term) or long term index of picture
2936  *         described by pic_num
2937  */
2938 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2939     MpegEncContext * const s = &h->s;
2940
2941     *structure = s->picture_structure;
2942     if(FIELD_PICTURE){
2943         if (!(pic_num & 1))
2944             /* opposite field */
2945             *structure ^= PICT_FRAME;
2946         pic_num >>= 1;
2947     }
2948
2949     return pic_num;
2950 }
2951
2952 static int decode_ref_pic_list_reordering(H264Context *h){
2953     MpegEncContext * const s = &h->s;
2954     int list, index, pic_structure;
2955
2956     print_short_term(h);
2957     print_long_term(h);
2958     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
2959
2960     for(list=0; list<h->list_count; list++){
2961         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2962
2963         if(get_bits1(&s->gb)){
2964             int pred= h->curr_pic_num;
2965
2966             for(index=0; ; index++){
2967                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2968                 unsigned int pic_id;
2969                 int i;
2970                 Picture *ref = NULL;
2971
2972                 if(reordering_of_pic_nums_idc==3)
2973                     break;
2974
2975                 if(index >= h->ref_count[list]){
2976                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2977                     return -1;
2978                 }
2979
2980                 if(reordering_of_pic_nums_idc<3){
2981                     if(reordering_of_pic_nums_idc<2){
2982                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2983                         int frame_num;
2984
2985                         if(abs_diff_pic_num > h->max_pic_num){
2986                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2987                             return -1;
2988                         }
2989
2990                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2991                         else                                pred+= abs_diff_pic_num;
2992                         pred &= h->max_pic_num - 1;
2993
2994                         frame_num = pic_num_extract(h, pred, &pic_structure);
2995
2996                         for(i= h->short_ref_count-1; i>=0; i--){
2997                             ref = h->short_ref[i];
2998                             assert(ref->reference);
2999                             assert(!ref->long_ref);
3000                             if(ref->data[0] != NULL &&
3001                                    ref->frame_num == frame_num &&
3002                                    (ref->reference & pic_structure) &&
3003                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3004                                 break;
3005                         }
3006                         if(i>=0)
3007                             ref->pic_id= pred;
3008                     }else{
3009                         int long_idx;
3010                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3011
3012                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3013
3014                         if(long_idx>31){
3015                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3016                             return -1;
3017                         }
3018                         ref = h->long_ref[long_idx];
3019                         assert(!(ref && !ref->reference));
3020                         if(ref && (ref->reference & pic_structure)){
3021                             ref->pic_id= pic_id;
3022                             assert(ref->long_ref);
3023                             i=0;
3024                         }else{
3025                             i=-1;
3026                         }
3027                     }
3028
3029                     if (i < 0) {
3030                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3031                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3032                     } else {
3033                         for(i=index; i+1<h->ref_count[list]; i++){
3034                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3035                                 break;
3036                         }
3037                         for(; i > index; i--){
3038                             h->ref_list[list][i]= h->ref_list[list][i-1];
3039                         }
3040                         h->ref_list[list][index]= *ref;
3041                         if (FIELD_PICTURE){
3042                             pic_as_field(&h->ref_list[list][index], pic_structure);
3043                         }
3044                     }
3045                 }else{
3046                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3047                     return -1;
3048                 }
3049             }
3050         }
3051     }
3052     for(list=0; list<h->list_count; list++){
3053         for(index= 0; index < h->ref_count[list]; index++){
3054             if(!h->ref_list[list][index].data[0])
3055                 h->ref_list[list][index]= s->current_picture;
3056         }
3057     }
3058
3059     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3060         direct_dist_scale_factor(h);
3061     direct_ref_list_init(h);
3062     return 0;
3063 }
3064
3065 static void fill_mbaff_ref_list(H264Context *h){
3066     int list, i, j;
3067     for(list=0; list<2; list++){ //FIXME try list_count
3068         for(i=0; i<h->ref_count[list]; i++){
3069             Picture *frame = &h->ref_list[list][i];
3070             Picture *field = &h->ref_list[list][16+2*i];
3071             field[0] = *frame;
3072             for(j=0; j<3; j++)
3073                 field[0].linesize[j] <<= 1;
3074             field[0].reference = PICT_TOP_FIELD;
3075             field[1] = field[0];
3076             for(j=0; j<3; j++)
3077                 field[1].data[j] += frame->linesize[j];
3078             field[1].reference = PICT_BOTTOM_FIELD;
3079
3080             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3081             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3082             for(j=0; j<2; j++){
3083                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3084                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3085             }
3086         }
3087     }
3088     for(j=0; j<h->ref_count[1]; j++){
3089         for(i=0; i<h->ref_count[0]; i++)
3090             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3091         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3092         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3093     }
3094 }
3095
3096 static int pred_weight_table(H264Context *h){
3097     MpegEncContext * const s = &h->s;
3098     int list, i;
3099     int luma_def, chroma_def;
3100
3101     h->use_weight= 0;
3102     h->use_weight_chroma= 0;
3103     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3104     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3105     luma_def = 1<<h->luma_log2_weight_denom;
3106     chroma_def = 1<<h->chroma_log2_weight_denom;
3107
3108     for(list=0; list<2; list++){
3109         for(i=0; i<h->ref_count[list]; i++){
3110             int luma_weight_flag, chroma_weight_flag;
3111
3112             luma_weight_flag= get_bits1(&s->gb);
3113             if(luma_weight_flag){
3114                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3115                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3116                 if(   h->luma_weight[list][i] != luma_def
3117                    || h->luma_offset[list][i] != 0)
3118                     h->use_weight= 1;
3119             }else{
3120                 h->luma_weight[list][i]= luma_def;
3121                 h->luma_offset[list][i]= 0;
3122             }
3123
3124             chroma_weight_flag= get_bits1(&s->gb);
3125             if(chroma_weight_flag){
3126                 int j;
3127                 for(j=0; j<2; j++){
3128                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3129                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3130                     if(   h->chroma_weight[list][i][j] != chroma_def
3131                        || h->chroma_offset[list][i][j] != 0)
3132                         h->use_weight_chroma= 1;
3133                 }
3134             }else{
3135                 int j;
3136                 for(j=0; j<2; j++){
3137                     h->chroma_weight[list][i][j]= chroma_def;
3138                     h->chroma_offset[list][i][j]= 0;
3139                 }
3140             }
3141         }
3142         if(h->slice_type != B_TYPE) break;
3143     }
3144     h->use_weight= h->use_weight || h->use_weight_chroma;
3145     return 0;
3146 }
3147
3148 static void implicit_weight_table(H264Context *h){
3149     MpegEncContext * const s = &h->s;
3150     int ref0, ref1;
3151     int cur_poc = s->current_picture_ptr->poc;
3152
3153     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3154        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3155         h->use_weight= 0;
3156         h->use_weight_chroma= 0;
3157         return;
3158     }
3159
3160     h->use_weight= 2;
3161     h->use_weight_chroma= 2;
3162     h->luma_log2_weight_denom= 5;
3163     h->chroma_log2_weight_denom= 5;
3164
3165     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3166         int poc0 = h->ref_list[0][ref0].poc;
3167         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3168             int poc1 = h->ref_list[1][ref1].poc;
3169             int td = av_clip(poc1 - poc0, -128, 127);
3170             if(td){
3171                 int tb = av_clip(cur_poc - poc0, -128, 127);
3172                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3173                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3174                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3175                     h->implicit_weight[ref0][ref1] = 32;
3176                 else
3177                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3178             }else
3179                 h->implicit_weight[ref0][ref1] = 32;
3180         }
3181     }
3182 }
3183
3184 /**
3185  * Mark a picture as no longer needed for reference. The refmask
3186  * argument allows unreferencing of individual fields or the whole frame.
3187  * If the picture becomes entirely unreferenced, but is being held for
3188  * display purposes, it is marked as such.
3189  * @param refmask mask of fields to unreference; the mask is bitwise
3190  *                anded with the reference marking of pic
3191  * @return non-zero if pic becomes entirely unreferenced (except possibly
3192  *         for display purposes) zero if one of the fields remains in
3193  *         reference
3194  */
3195 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3196     int i;
3197     if (pic->reference &= refmask) {
3198         return 0;
3199     } else {
3200         if(pic == h->delayed_output_pic)
3201             pic->reference=DELAYED_PIC_REF;
3202         else{
3203             for(i = 0; h->delayed_pic[i]; i++)
3204                 if(pic == h->delayed_pic[i]){
3205                     pic->reference=DELAYED_PIC_REF;
3206                     break;
3207                 }
3208         }
3209         return 1;
3210     }
3211 }
3212
3213 /**
3214  * instantaneous decoder refresh.
3215  */
3216 static void idr(H264Context *h){
3217     int i;
3218
3219     for(i=0; i<16; i++){
3220         if (h->long_ref[i] != NULL) {
3221             unreference_pic(h, h->long_ref[i], 0);
3222             h->long_ref[i]= NULL;
3223         }
3224     }
3225     h->long_ref_count=0;
3226
3227     for(i=0; i<h->short_ref_count; i++){
3228         unreference_pic(h, h->short_ref[i], 0);
3229         h->short_ref[i]= NULL;
3230     }
3231     h->short_ref_count=0;
3232 }
3233
3234 /* forget old pics after a seek */
3235 static void flush_dpb(AVCodecContext *avctx){
3236     H264Context *h= avctx->priv_data;
3237     int i;
3238     for(i=0; i<16; i++) {
3239         if(h->delayed_pic[i])
3240             h->delayed_pic[i]->reference= 0;
3241         h->delayed_pic[i]= NULL;
3242     }
3243     if(h->delayed_output_pic)
3244         h->delayed_output_pic->reference= 0;
3245     h->delayed_output_pic= NULL;
3246     idr(h);
3247     if(h->s.current_picture_ptr)
3248         h->s.current_picture_ptr->reference= 0;
3249     h->s.first_field= 0;
3250     ff_mpeg_flush(avctx);
3251 }
3252
3253 /**
3254  * Find a Picture in the short term reference list by frame number.
3255  * @param frame_num frame number to search for
3256  * @param idx the index into h->short_ref where returned picture is found
3257  *            undefined if no picture found.
3258  * @return pointer to the found picture, or NULL if no pic with the provided
3259  *                 frame number is found
3260  */
3261 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3262     MpegEncContext * const s = &h->s;
3263     int i;
3264
3265     for(i=0; i<h->short_ref_count; i++){
3266         Picture *pic= h->short_ref[i];
3267         if(s->avctx->debug&FF_DEBUG_MMCO)
3268             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3269         if(pic->frame_num == frame_num) {
3270             *idx = i;
3271             return pic;
3272         }
3273     }
3274     return NULL;
3275 }
3276
3277 /**
3278  * Remove a picture from the short term reference list by its index in
3279  * that list.  This does no checking on the provided index; it is assumed
3280  * to be valid. Other list entries are shifted down.
3281  * @param i index into h->short_ref of picture to remove.
3282  */
3283 static void remove_short_at_index(H264Context *h, int i){
3284     assert(i > 0 && i < h->short_ref_count);
3285     h->short_ref[i]= NULL;
3286     if (--h->short_ref_count)
3287         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3288 }
3289
3290 /**
3291  *
3292  * @return the removed picture or NULL if an error occurs
3293  */
3294 static Picture * remove_short(H264Context *h, int frame_num){
3295     MpegEncContext * const s = &h->s;
3296     Picture *pic;
3297     int i;
3298
3299     if(s->avctx->debug&FF_DEBUG_MMCO)
3300         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3301
3302     pic = find_short(h, frame_num, &i);
3303     if (pic)
3304         remove_short_at_index(h, i);
3305
3306     return pic;
3307 }
3308
3309 /**
3310  * Remove a picture from the long term reference list by its index in
3311  * that list.  This does no checking on the provided index; it is assumed
3312  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3313  * @param i index into h->long_ref of picture to remove.
3314  */
3315 static void remove_long_at_index(H264Context *h, int i){
3316     h->long_ref[i]= NULL;
3317     h->long_ref_count--;
3318 }
3319
3320 /**
3321  *
3322  * @return the removed picture or NULL if an error occurs
3323  */
3324 static Picture * remove_long(H264Context *h, int i){
3325     Picture *pic;
3326
3327     pic= h->long_ref[i];
3328     if (pic)
3329         remove_long_at_index(h, i);
3330
3331     return pic;
3332 }
3333
3334 /**
3335  * print short term list
3336  */
3337 static void print_short_term(H264Context *h) {
3338     uint32_t i;
3339     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3340         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3341         for(i=0; i<h->short_ref_count; i++){
3342             Picture *pic= h->short_ref[i];
3343             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3344         }
3345     }
3346 }
3347
3348 /**
3349  * print long term list
3350  */
3351 static void print_long_term(H264Context *h) {
3352     uint32_t i;
3353     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3354         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3355         for(i = 0; i < 16; i++){
3356             Picture *pic= h->long_ref[i];
3357             if (pic) {
3358                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3359             }
3360         }
3361     }
3362 }
3363
3364 /**
3365  * Executes the reference picture marking (memory management control operations).
3366  */
3367 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3368     MpegEncContext * const s = &h->s;
3369     int i, j;
3370     int current_ref_assigned=0;
3371     Picture *pic;
3372
3373     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3374         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3375
3376     for(i=0; i<mmco_count; i++){
3377         int structure, frame_num, unref_pic;
3378         if(s->avctx->debug&FF_DEBUG_MMCO)
3379             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3380
3381         switch(mmco[i].opcode){
3382         case MMCO_SHORT2UNUSED:
3383             if(s->avctx->debug&FF_DEBUG_MMCO)
3384                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3385             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3386             pic = find_short(h, frame_num, &j);
3387             if (pic) {
3388                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3389                     remove_short_at_index(h, j);
3390             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3391                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3392             break;
3393         case MMCO_SHORT2LONG:
3394             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3395                     h->long_ref[mmco[i].long_arg]->frame_num ==
3396                                               mmco[i].short_pic_num / 2) {
3397                 /* do nothing, we've already moved this field pair. */
3398             } else {
3399                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3400
3401                 pic= remove_long(h, mmco[i].long_arg);
3402                 if(pic) unreference_pic(h, pic, 0);
3403
3404                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3405                 if (h->long_ref[ mmco[i].long_arg ]){
3406                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3407                     h->long_ref_count++;
3408                 }
3409             }
3410             break;
3411         case MMCO_LONG2UNUSED:
3412             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3413             pic = h->long_ref[j];
3414             if (pic) {
3415                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3416                     remove_long_at_index(h, j);
3417             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3418                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3419             break;
3420         case MMCO_LONG:
3421             unref_pic = 1;
3422             if (FIELD_PICTURE && !s->first_field) {
3423                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3424                     /* Just mark second field as referenced */
3425                     unref_pic = 0;
3426                 } else if (s->current_picture_ptr->reference) {
3427                     /* First field in pair is in short term list or
3428                      * at a different long term index.
3429                      * This is not allowed; see 7.4.3, notes 2 and 3.
3430                      * Report the problem and keep the pair where it is,
3431                      * and mark this field valid.
3432                      */
3433                     av_log(h->s.avctx, AV_LOG_ERROR,
3434                         "illegal long term reference assignment for second "
3435                         "field in complementary field pair (first field is "
3436                         "short term or has non-matching long index)\n");
3437                     unref_pic = 0;
3438                 }
3439             }
3440
3441             if (unref_pic) {
3442                 pic= remove_long(h, mmco[i].long_arg);
3443                 if(pic) unreference_pic(h, pic, 0);
3444
3445                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3446                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3447                 h->long_ref_count++;
3448             }
3449
3450             s->current_picture_ptr->reference |= s->picture_structure;
3451             current_ref_assigned=1;
3452             break;
3453         case MMCO_SET_MAX_LONG:
3454             assert(mmco[i].long_arg <= 16);
3455             // just remove the long term which index is greater than new max
3456             for(j = mmco[i].long_arg; j<16; j++){
3457                 pic = remove_long(h, j);
3458                 if (pic) unreference_pic(h, pic, 0);
3459             }
3460             break;
3461         case MMCO_RESET:
3462             while(h->short_ref_count){
3463                 pic= remove_short(h, h->short_ref[0]->frame_num);
3464                 if(pic) unreference_pic(h, pic, 0);
3465             }
3466             for(j = 0; j < 16; j++) {
3467                 pic= remove_long(h, j);
3468                 if(pic) unreference_pic(h, pic, 0);
3469             }
3470             break;
3471         default: assert(0);
3472         }
3473     }
3474
3475     if (!current_ref_assigned && FIELD_PICTURE &&
3476             !s->first_field && s->current_picture_ptr->reference) {
3477
3478         /* Second field of complementary field pair; the first field of
3479          * which is already referenced. If short referenced, it
3480          * should be first entry in short_ref. If not, it must exist
3481          * in long_ref; trying to put it on the short list here is an
3482          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3483          */
3484         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3485             /* Just mark the second field valid */
3486             s->current_picture_ptr->reference = PICT_FRAME;
3487         } else if (s->current_picture_ptr->long_ref) {
3488             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3489                                              "assignment for second field "
3490                                              "in complementary field pair "
3491                                              "(first field is long term)\n");
3492         } else {
3493             /*
3494              * First field in reference, but not in any sensible place on our
3495              * reference lists. This shouldn't happen unless reference
3496              * handling somewhere else is wrong.
3497              */
3498             assert(0);
3499         }
3500         current_ref_assigned = 1;
3501     }
3502
3503     if(!current_ref_assigned){
3504         pic= remove_short(h, s->current_picture_ptr->frame_num);
3505         if(pic){
3506             unreference_pic(h, pic, 0);
3507             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3508         }
3509
3510         if(h->short_ref_count)
3511             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3512
3513         h->short_ref[0]= s->current_picture_ptr;
3514         h->short_ref[0]->long_ref=0;
3515         h->short_ref_count++;
3516         s->current_picture_ptr->reference |= s->picture_structure;
3517     }
3518
3519     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3520
3521         /* We have too many reference frames, probably due to corrupted
3522          * stream. Need to discard one frame. Prevents overrun of the
3523          * short_ref and long_ref buffers.
3524          */
3525         av_log(h->s.avctx, AV_LOG_ERROR,
3526                "number of reference frames exceeds max (probably "
3527                "corrupt input), discarding one\n");
3528
3529         if (h->long_ref_count) {
3530             for (i = 0; i < 16; ++i)
3531                 if (h->long_ref[i])
3532                     break;
3533
3534             assert(i < 16);
3535             pic = h->long_ref[i];
3536             remove_long_at_index(h, i);
3537         } else {
3538             pic = h->short_ref[h->short_ref_count - 1];
3539             remove_short_at_index(h, h->short_ref_count - 1);
3540         }
3541         unreference_pic(h, pic, 0);
3542     }
3543
3544     print_short_term(h);
3545     print_long_term(h);
3546     return 0;
3547 }
3548
3549 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3550     MpegEncContext * const s = &h->s;
3551     int i;
3552
3553     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3554         s->broken_link= get_bits1(gb) -1;
3555         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3556         if(h->mmco[0].long_arg == -1)
3557             h->mmco_index= 0;
3558         else{
3559             h->mmco[0].opcode= MMCO_LONG;
3560             h->mmco_index= 1;
3561         }
3562     }else{
3563         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3564             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3565                 MMCOOpcode opcode= get_ue_golomb(gb);
3566
3567                 h->mmco[i].opcode= opcode;
3568                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3569                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3570 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3571                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3572                         return -1;
3573                     }*/
3574                 }
3575                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3576                     unsigned int long_arg= get_ue_golomb(gb);
3577                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3578                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3579                         return -1;
3580                     }
3581                     h->mmco[i].long_arg= long_arg;
3582                 }
3583
3584                 if(opcode > (unsigned)MMCO_LONG){
3585                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3586                     return -1;
3587                 }
3588                 if(opcode == MMCO_END)
3589                     break;
3590             }
3591             h->mmco_index= i;
3592         }else{
3593             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3594
3595             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3596                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3597                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3598                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3599                 h->mmco_index= 1;
3600                 if (FIELD_PICTURE) {
3601                     h->mmco[0].short_pic_num *= 2;
3602                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3603                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3604                     h->mmco_index= 2;
3605                 }
3606             }else
3607                 h->mmco_index= 0;
3608         }
3609     }
3610
3611     return 0;
3612 }
3613
3614 static int init_poc(H264Context *h){
3615     MpegEncContext * const s = &h->s;
3616     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3617     int field_poc[2];
3618
3619     if(h->nal_unit_type == NAL_IDR_SLICE){
3620         h->frame_num_offset= 0;
3621     }else{
3622         if(h->frame_num < h->prev_frame_num)
3623             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3624         else
3625             h->frame_num_offset= h->prev_frame_num_offset;
3626     }
3627
3628     if(h->sps.poc_type==0){
3629         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3630
3631         if(h->nal_unit_type == NAL_IDR_SLICE){
3632              h->prev_poc_msb=
3633              h->prev_poc_lsb= 0;
3634         }
3635
3636         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3637             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3638         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3639             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3640         else
3641             h->poc_msb = h->prev_poc_msb;
3642 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3643         field_poc[0] =
3644         field_poc[1] = h->poc_msb + h->poc_lsb;
3645         if(s->picture_structure == PICT_FRAME)
3646             field_poc[1] += h->delta_poc_bottom;
3647     }else if(h->sps.poc_type==1){
3648         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3649         int i;
3650
3651         if(h->sps.poc_cycle_length != 0)
3652             abs_frame_num = h->frame_num_offset + h->frame_num;
3653         else
3654             abs_frame_num = 0;
3655
3656         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3657             abs_frame_num--;
3658
3659         expected_delta_per_poc_cycle = 0;
3660         for(i=0; i < h->sps.poc_cycle_length; i++)
3661             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3662
3663         if(abs_frame_num > 0){
3664             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3665             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3666
3667             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3668             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3669                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3670         } else
3671             expectedpoc = 0;
3672
3673         if(h->nal_ref_idc == 0)
3674             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3675
3676         field_poc[0] = expectedpoc + h->delta_poc[0];
3677         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3678
3679         if(s->picture_structure == PICT_FRAME)
3680             field_poc[1] += h->delta_poc[1];
3681     }else{
3682         int poc;
3683         if(h->nal_unit_type == NAL_IDR_SLICE){
3684             poc= 0;
3685         }else{
3686             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3687             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3688         }
3689         field_poc[0]= poc;
3690         field_poc[1]= poc;
3691     }
3692
3693     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3694         s->current_picture_ptr->field_poc[0]= field_poc[0];
3695         s->current_picture_ptr->poc = field_poc[0];
3696     }
3697     if(s->picture_structure != PICT_TOP_FIELD) {
3698         s->current_picture_ptr->field_poc[1]= field_poc[1];
3699         s->current_picture_ptr->poc = field_poc[1];
3700     }
3701     if(!FIELD_PICTURE || !s->first_field) {
3702         Picture *cur = s->current_picture_ptr;
3703         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3704     }
3705
3706     return 0;
3707 }
3708
3709
3710 /**
3711  * initialize scan tables
3712  */
3713 static void init_scan_tables(H264Context *h){
3714     MpegEncContext * const s = &h->s;
3715     int i;
3716     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3717         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3718         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3719     }else{
3720         for(i=0; i<16; i++){
3721 #define T(x) (x>>2) | ((x<<2) & 0xF)
3722             h->zigzag_scan[i] = T(zigzag_scan[i]);
3723             h-> field_scan[i] = T( field_scan[i]);
3724 #undef T
3725         }
3726     }
3727     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3728         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3729         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3730         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3731         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3732     }else{
3733         for(i=0; i<64; i++){
3734 #define T(x) (x>>3) | ((x&7)<<3)
3735             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3736             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3737             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3738             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3739 #undef T
3740         }
3741     }
3742     if(h->sps.transform_bypass){ //FIXME same ugly
3743         h->zigzag_scan_q0          = zigzag_scan;
3744         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3745         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3746         h->field_scan_q0           = field_scan;
3747         h->field_scan8x8_q0        = field_scan8x8;
3748         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3749     }else{
3750         h->zigzag_scan_q0          = h->zigzag_scan;
3751         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3752         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3753         h->field_scan_q0           = h->field_scan;
3754         h->field_scan8x8_q0        = h->field_scan8x8;
3755         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3756     }
3757 }
3758
3759 /**
3760  * Replicates H264 "master" context to thread contexts.
3761  */
3762 static void clone_slice(H264Context *dst, H264Context *src)
3763 {
3764     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3765     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3766     dst->s.current_picture      = src->s.current_picture;
3767     dst->s.linesize             = src->s.linesize;
3768     dst->s.uvlinesize           = src->s.uvlinesize;
3769     dst->s.first_field          = src->s.first_field;
3770
3771     dst->prev_poc_msb           = src->prev_poc_msb;
3772     dst->prev_poc_lsb           = src->prev_poc_lsb;
3773     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3774     dst->prev_frame_num         = src->prev_frame_num;
3775     dst->short_ref_count        = src->short_ref_count;
3776
3777     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3778     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3779     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3780     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3781
3782     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3783     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3784 }
3785
3786 /**
3787  * decodes a slice header.
3788  * this will allso call MPV_common_init() and frame_start() as needed
3789  *
3790  * @param h h264context
3791  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3792  *
3793  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3794  */
3795 static int decode_slice_header(H264Context *h, H264Context *h0){
3796     MpegEncContext * const s = &h->s;
3797     MpegEncContext * const s0 = &h0->s;
3798     unsigned int first_mb_in_slice;
3799     unsigned int pps_id;
3800     int num_ref_idx_active_override_flag;
3801     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3802     unsigned int slice_type, tmp, i;
3803     int default_ref_list_done = 0;
3804     int last_pic_structure;
3805
3806     s->dropable= h->nal_ref_idc == 0;
3807
3808     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3809         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3810         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3811     }else{
3812         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3813         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3814     }
3815
3816     first_mb_in_slice= get_ue_golomb(&s->gb);
3817
3818     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3819         h0->current_slice = 0;
3820         if (!s0->first_field)
3821             s->current_picture_ptr= NULL;
3822     }
3823
3824     slice_type= get_ue_golomb(&s->gb);
3825     if(slice_type > 9){
3826         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3827         return -1;
3828     }
3829     if(slice_type > 4){
3830         slice_type -= 5;
3831         h->slice_type_fixed=1;
3832     }else
3833         h->slice_type_fixed=0;
3834
3835     slice_type= slice_type_map[ slice_type ];
3836     if (slice_type == I_TYPE
3837         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3838         default_ref_list_done = 1;
3839     }
3840     h->slice_type= slice_type;
3841
3842     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3843     if (s->pict_type == B_TYPE && s0->last_picture_ptr == NULL) {
3844         av_log(h->s.avctx, AV_LOG_ERROR,
3845                "B picture before any references, skipping\n");
3846         return -1;
3847     }
3848
3849     pps_id= get_ue_golomb(&s->gb);
3850     if(pps_id>=MAX_PPS_COUNT){
3851         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3852         return -1;
3853     }
3854     if(!h0->pps_buffers[pps_id]) {
3855         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3856         return -1;
3857     }
3858     h->pps= *h0->pps_buffers[pps_id];
3859
3860     if(!h0->sps_buffers[h->pps.sps_id]) {
3861         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3862         return -1;
3863     }
3864     h->sps = *h0->sps_buffers[h->pps.sps_id];
3865
3866     if(h == h0 && h->dequant_coeff_pps != pps_id){
3867         h->dequant_coeff_pps = pps_id;
3868         init_dequant_tables(h);
3869     }
3870
3871     s->mb_width= h->sps.mb_width;
3872     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3873
3874     h->b_stride=  s->mb_width*4;
3875     h->b8_stride= s->mb_width*2;
3876
3877     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3878     if(h->sps.frame_mbs_only_flag)
3879         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3880     else
3881         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3882
3883     if (s->context_initialized
3884         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3885         if(h != h0)
3886             return -1;   // width / height changed during parallelized decoding
3887         free_tables(h);
3888         MPV_common_end(s);
3889     }
3890     if (!s->context_initialized) {
3891         if(h != h0)
3892             return -1;  // we cant (re-)initialize context during parallel decoding
3893         if (MPV_common_init(s) < 0)
3894             return -1;
3895         s->first_field = 0;
3896
3897         init_scan_tables(h);
3898         alloc_tables(h);
3899
3900         for(i = 1; i < s->avctx->thread_count; i++) {
3901             H264Context *c;
3902             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3903             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3904             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3905             c->sps = h->sps;
3906             c->pps = h->pps;
3907             init_scan_tables(c);
3908             clone_tables(c, h);
3909         }
3910
3911         for(i = 0; i < s->avctx->thread_count; i++)
3912             if(context_init(h->thread_context[i]) < 0)
3913                 return -1;
3914
3915         s->avctx->width = s->width;
3916         s->avctx->height = s->height;
3917         s->avctx->sample_aspect_ratio= h->sps.sar;
3918         if(!s->avctx->sample_aspect_ratio.den)
3919             s->avctx->sample_aspect_ratio.den = 1;
3920
3921         if(h->sps.timing_info_present_flag){
3922             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3923             if(h->x264_build > 0 && h->x264_build < 44)
3924                 s->avctx->time_base.den *= 2;
3925             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3926                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3927         }
3928     }
3929
3930     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3931
3932     h->mb_mbaff = 0;
3933     h->mb_aff_frame = 0;
3934     last_pic_structure = s0->picture_structure;
3935     if(h->sps.frame_mbs_only_flag){
3936         s->picture_structure= PICT_FRAME;
3937     }else{
3938         if(get_bits1(&s->gb)) { //field_pic_flag
3939             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3940         } else {
3941             s->picture_structure= PICT_FRAME;
3942             h->mb_aff_frame = h->sps.mb_aff;
3943         }
3944     }
3945
3946     if(h0->current_slice == 0){
3947         /* See if we have a decoded first field looking for a pair... */
3948         if (s0->first_field) {
3949             assert(s0->current_picture_ptr);
3950             assert(s0->current_picture_ptr->data[0]);
3951             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3952
3953             /* figure out if we have a complementary field pair */
3954             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3955                 /*
3956                  * Previous field is unmatched. Don't display it, but let it
3957                  * remain for reference if marked as such.
3958                  */
3959                 s0->current_picture_ptr = NULL;
3960                 s0->first_field = FIELD_PICTURE;
3961
3962             } else {
3963                 if (h->nal_ref_idc &&
3964                         s0->current_picture_ptr->reference &&
3965                         s0->current_picture_ptr->frame_num != h->frame_num) {
3966                     /*
3967                      * This and previous field were reference, but had
3968                      * different frame_nums. Consider this field first in
3969                      * pair. Throw away previous field except for reference
3970                      * purposes.
3971                      */
3972                     s0->first_field = 1;
3973                     s0->current_picture_ptr = NULL;
3974
3975                 } else {
3976                     /* Second field in complementary pair */
3977                     s0->first_field = 0;
3978                 }
3979             }
3980
3981         } else {
3982             /* Frame or first field in a potentially complementary pair */
3983             assert(!s0->current_picture_ptr);
3984             s0->first_field = FIELD_PICTURE;
3985         }
3986
3987         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3988             s0->first_field = 0;
3989             return -1;
3990         }
3991     }
3992     if(h != h0)
3993         clone_slice(h, h0);
3994
3995     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3996
3997     assert(s->mb_num == s->mb_width * s->mb_height);
3998     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3999        first_mb_in_slice                    >= s->mb_num){
4000         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4001         return -1;
4002     }
4003     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4004     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4005     if (s->picture_structure == PICT_BOTTOM_FIELD)
4006         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4007     assert(s->mb_y < s->mb_height);
4008
4009     if(s->picture_structure==PICT_FRAME){
4010         h->curr_pic_num=   h->frame_num;
4011         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4012     }else{
4013         h->curr_pic_num= 2*h->frame_num + 1;
4014         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4015     }
4016
4017     if(h->nal_unit_type == NAL_IDR_SLICE){
4018         get_ue_golomb(&s->gb); /* idr_pic_id */
4019     }
4020
4021     if(h->sps.poc_type==0){
4022         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4023
4024         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4025             h->delta_poc_bottom= get_se_golomb(&s->gb);
4026         }
4027     }
4028
4029     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4030         h->delta_poc[0]= get_se_golomb(&s->gb);
4031
4032         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4033             h->delta_poc[1]= get_se_golomb(&s->gb);
4034     }
4035
4036     init_poc(h);
4037
4038     if(h->pps.redundant_pic_cnt_present){
4039         h->redundant_pic_count= get_ue_golomb(&s->gb);
4040     }
4041
4042     //set defaults, might be overriden a few line later
4043     h->ref_count[0]= h->pps.ref_count[0];
4044     h->ref_count[1]= h->pps.ref_count[1];
4045
4046     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4047         if(h->slice_type == B_TYPE){
4048             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4049             if(FIELD_OR_MBAFF_PICTURE && h->direct_spatial_mv_pred)
4050                 av_log(h->s.avctx, AV_LOG_ERROR, "Interlaced pictures + spatial direct mode is not implemented\n");
4051         }
4052         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4053
4054         if(num_ref_idx_active_override_flag){
4055             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4056             if(h->slice_type==B_TYPE)
4057                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4058
4059             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4060                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4061                 h->ref_count[0]= h->ref_count[1]= 1;
4062                 return -1;
4063             }
4064         }
4065         if(h->slice_type == B_TYPE)
4066             h->list_count= 2;
4067         else
4068             h->list_count= 1;
4069     }else
4070         h->list_count= 0;
4071
4072     if(!default_ref_list_done){
4073         fill_default_ref_list(h);
4074     }
4075
4076     if(decode_ref_pic_list_reordering(h) < 0)
4077         return -1;
4078
4079     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4080        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4081         pred_weight_table(h);
4082     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4083         implicit_weight_table(h);
4084     else
4085         h->use_weight = 0;
4086
4087     if(h->nal_ref_idc)
4088         decode_ref_pic_marking(h0, &s->gb);
4089
4090     if(FRAME_MBAFF)
4091         fill_mbaff_ref_list(h);
4092
4093     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4094         tmp = get_ue_golomb(&s->gb);
4095         if(tmp > 2){
4096             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4097             return -1;
4098         }
4099         h->cabac_init_idc= tmp;
4100     }
4101
4102     h->last_qscale_diff = 0;
4103     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4104     if(tmp>51){
4105         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4106         return -1;
4107     }
4108     s->qscale= tmp;
4109     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4110     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4111     //FIXME qscale / qp ... stuff
4112     if(h->slice_type == SP_TYPE){
4113         get_bits1(&s->gb); /* sp_for_switch_flag */
4114     }
4115     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4116         get_se_golomb(&s->gb); /* slice_qs_delta */
4117     }
4118
4119     h->deblocking_filter = 1;
4120     h->slice_alpha_c0_offset = 0;
4121     h->slice_beta_offset = 0;
4122     if( h->pps.deblocking_filter_parameters_present ) {
4123         tmp= get_ue_golomb(&s->gb);
4124         if(tmp > 2){
4125             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4126             return -1;
4127         }
4128         h->deblocking_filter= tmp;
4129         if(h->deblocking_filter < 2)
4130             h->deblocking_filter^= 1; // 1<->0
4131
4132         if( h->deblocking_filter ) {
4133             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4134             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4135         }
4136     }
4137
4138     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4139        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4140        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4141        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4142         h->deblocking_filter= 0;
4143
4144     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4145         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4146             /* Cheat slightly for speed:
4147                Do not bother to deblock across slices. */
4148             h->deblocking_filter = 2;
4149         } else {
4150             h0->max_contexts = 1;
4151             if(!h0->single_decode_warning) {
4152                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4153                 h0->single_decode_warning = 1;
4154             }
4155             if(h != h0)
4156                 return 1; // deblocking switched inside frame
4157         }
4158     }
4159
4160 #if 0 //FMO
4161     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4162         slice_group_change_cycle= get_bits(&s->gb, ?);
4163 #endif
4164
4165     h0->last_slice_type = slice_type;
4166     h->slice_num = ++h0->current_slice;
4167
4168     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4169     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4170
4171     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4172         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4173                h->slice_num,
4174                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4175                first_mb_in_slice,
4176                av_get_pict_type_char(h->slice_type),
4177                pps_id, h->frame_num,
4178                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4179                h->ref_count[0], h->ref_count[1],
4180                s->qscale,
4181                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4182                h->use_weight,
4183                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4184                );
4185     }
4186
4187     return 0;
4188 }
4189
4190 /**
4191  *
4192  */
4193 static inline int get_level_prefix(GetBitContext *gb){
4194     unsigned int buf;
4195     int log;
4196
4197     OPEN_READER(re, gb);
4198     UPDATE_CACHE(re, gb);
4199     buf=GET_CACHE(re, gb);
4200
4201     log= 32 - av_log2(buf);
4202 #ifdef TRACE
4203     print_bin(buf>>(32-log), log);
4204     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4205 #endif
4206
4207     LAST_SKIP_BITS(re, gb, log);
4208     CLOSE_READER(re, gb);
4209
4210     return log-1;
4211 }
4212
4213 static inline int get_dct8x8_allowed(H264Context *h){
4214     int i;
4215     for(i=0; i<4; i++){
4216         if(!IS_SUB_8X8(h->sub_mb_type[i])
4217            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4218             return 0;
4219     }
4220     return 1;
4221 }
4222
4223 /**
4224  * decodes a residual block.
4225  * @param n block index
4226  * @param scantable scantable
4227  * @param max_coeff number of coefficients in the block
4228  * @return <0 if an error occurred
4229  */
4230 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4231     MpegEncContext * const s = &h->s;
4232     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4233     int level[16];
4234     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4235
4236     //FIXME put trailing_onex into the context
4237
4238     if(n == CHROMA_DC_BLOCK_INDEX){
4239         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4240         total_coeff= coeff_token>>2;
4241     }else{
4242         if(n == LUMA_DC_BLOCK_INDEX){
4243             total_coeff= pred_non_zero_count(h, 0);
4244             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4245             total_coeff= coeff_token>>2;
4246         }else{
4247             total_coeff= pred_non_zero_count(h, n);
4248             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4249             total_coeff= coeff_token>>2;
4250             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4251         }
4252     }
4253
4254     //FIXME set last_non_zero?
4255
4256     if(total_coeff==0)
4257         return 0;
4258     if(total_coeff > (unsigned)max_coeff) {
4259         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4260         return -1;
4261     }
4262
4263     trailing_ones= coeff_token&3;
4264     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4265     assert(total_coeff<=16);
4266
4267     for(i=0; i<trailing_ones; i++){
4268         level[i]= 1 - 2*get_bits1(gb);
4269     }
4270
4271     if(i<total_coeff) {
4272         int level_code, mask;
4273         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4274         int prefix= get_level_prefix(gb);
4275
4276         //first coefficient has suffix_length equal to 0 or 1
4277         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4278             if(suffix_length)
4279                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4280             else
4281                 level_code= (prefix<<suffix_length); //part
4282         }else if(prefix==14){
4283             if(suffix_length)
4284                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4285             else
4286                 level_code= prefix + get_bits(gb, 4); //part
4287         }else if(prefix==15){
4288             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4289             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4290         }else{
4291             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4292             return -1;
4293         }
4294
4295         if(trailing_ones < 3) level_code += 2;
4296
4297         suffix_length = 1;
4298         if(level_code > 5)
4299             suffix_length++;
4300         mask= -(level_code&1);
4301         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4302         i++;
4303
4304         //remaining coefficients have suffix_length > 0
4305         for(;i<total_coeff;i++) {
4306             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4307             prefix = get_level_prefix(gb);
4308             if(prefix<15){
4309                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4310             }else if(prefix==15){
4311                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4312             }else{
4313                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4314                 return -1;
4315             }
4316             mask= -(level_code&1);
4317             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4318             if(level_code > suffix_limit[suffix_length])
4319                 suffix_length++;
4320         }
4321     }
4322
4323     if(total_coeff == max_coeff)
4324         zeros_left=0;
4325     else{
4326         if(n == CHROMA_DC_BLOCK_INDEX)
4327             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4328         else
4329             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4330     }
4331
4332     coeff_num = zeros_left + total_coeff - 1;
4333     j = scantable[coeff_num];
4334     if(n > 24){
4335         block[j] = level[0];
4336         for(i=1;i<total_coeff;i++) {
4337             if(zeros_left <= 0)
4338                 run_before = 0;
4339             else if(zeros_left < 7){
4340                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4341             }else{
4342                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4343             }
4344             zeros_left -= run_before;
4345             coeff_num -= 1 + run_before;
4346             j= scantable[ coeff_num ];
4347
4348             block[j]= level[i];
4349         }
4350     }else{
4351         block[j] = (level[0] * qmul[j] + 32)>>6;
4352         for(i=1;i<total_coeff;i++) {
4353             if(zeros_left <= 0)
4354                 run_before = 0;
4355             else if(zeros_left < 7){
4356                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4357             }else{
4358                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4359             }
4360             zeros_left -= run_before;
4361             coeff_num -= 1 + run_before;
4362             j= scantable[ coeff_num ];
4363
4364             block[j]= (level[i] * qmul[j] + 32)>>6;
4365         }
4366     }
4367
4368     if(zeros_left<0){
4369         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4370         return -1;
4371     }
4372
4373     return 0;
4374 }
4375
/**
 * Infers the field decoding flag of the current macroblock from already
 * decoded neighbours.
 *
 * The left neighbour (mb_xy-1) is consulted first; if it is not part of the
 * current slice the macroblock one mb_stride earlier (mb_xy-mb_stride) is
 * used; if neither lies in the current slice the flag becomes 0.
 * The result is stored in both h->mb_mbaff and h->mb_field_decoding_flag.
 */
4376 static void predict_field_decoding_flag(H264Context *h){
4377     MpegEncContext * const s = &h->s;
4378     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
         // slice_table[x] == slice_num means macroblock x belongs to the current slice
4379     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4380                 ? s->current_picture.mb_type[mb_xy-1]
4381                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4382                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4383                 : 0;
         // normalise the interlaced test to exactly 0 or 1
4384     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4385 }
4386
4387 /**
4388  * decodes a P_SKIP or B_SKIP macroblock
      *
      * No bits are read here. The non-zero-count entries of the macroblock are
      * cleared, the motion data is predicted (pred_direct_motion() in B slices,
      * pred_pskip_motion() otherwise) and written back, and the per-macroblock
      * tables (mb_type, qscale_table, slice_table) are updated.
      * Sets h->prev_mb_skipped to 1.
4389  */
4390 static void decode_mb_skip(H264Context *h){
4391     MpegEncContext * const s = &h->s;
4392     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4393     int mb_type=0;
4394
         // a skipped macroblock carries no residual: zero its coefficient counts
4395     memset(h->non_zero_count[mb_xy], 0, 16);
4396     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4397
4398     if(MB_FIELD)
4399         mb_type|= MB_TYPE_INTERLACED;
4400
4401     if( h->slice_type == B_TYPE )
4402     {
4403         // just for fill_caches. pred_direct_motion will set the real mb_type
4404         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4405
4406         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4407         pred_direct_motion(h, &mb_type);
             // pred_direct_motion() may rewrite mb_type, so the skip flag is set again
4408         mb_type|= MB_TYPE_SKIP;
4409     }
4410     else
4411     {
4412         int mx, my;
4413         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4414
4415         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4416         pred_pskip_motion(h, &mx, &my);
             // whole macroblock: list 0 reference index 0 and the predicted vector
4417         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4418         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4419     }
4420
4421     write_back_motion(h, mb_type);
4422     s->current_picture.mb_type[mb_xy]= mb_type;
4423     s->current_picture.qscale_table[mb_xy]= s->qscale;
4424     h->slice_table[ mb_xy ]= h->slice_num;
4425     h->prev_mb_skipped= 1;
4426 }
4427
4428 /**
4429  * decodes a macroblock
      *
      * Processing order, as read from s->gb:
      *  - in non I/SI slices the skip run (skipped macroblocks are handed to
      *    decode_mb_skip() and the function returns early),
      *  - the field decoding flag (frame MBAFF only) and the macroblock type,
      *  - raw samples for intra PCM macroblocks (early return),
      *  - intra prediction modes, or reference indices and motion vectors,
      *  - the coded block pattern, the optional 8x8 transform flag and dquant,
      *  - the residual blocks through decode_residual().
      * Results are stored in the caches of h and in s->current_picture.
4430  * @returns 0 if ok, -1 if an error is noticed
4431  */
4432 static int decode_mb_cavlc(H264Context *h){
4433     MpegEncContext * const s = &h->s;
4434     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4435     int partition_count;
4436     unsigned int mb_type, cbp;
4437     int dct8x8_allowed= h->pps.transform_8x8_mode;
4438
4439     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?
4440
4441     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4442     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4443                 down the code */
4444     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
             // -1 marks "no run pending": a new skip run length has to be read
4445         if(s->mb_skip_run==-1)
4446             s->mb_skip_run= get_ue_golomb(&s->gb);
4447
             // post-decrement: a run of 0 drops to -1 here and the macroblock is decoded normally
4448         if (s->mb_skip_run--) {
4449             if(FRAME_MBAFF && (s->mb_y&1) == 0){
                     // counter just reached 0: the flag is read from the bitstream,
                     // otherwise it is inferred from the neighbours
4450                 if(s->mb_skip_run==0)
4451                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4452                 else
4453                     predict_field_decoding_flag(h);
4454             }
4455             decode_mb_skip(h);
4456             return 0;
4457         }
4458     }
4459     if(FRAME_MBAFF){
             // the flag is only read for macroblocks on even rows
4460         if( (s->mb_y&1) == 0 )
4461             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4462     }else
4463         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4464
4465     h->prev_mb_skipped= 0;
4466
         // mb_type is first an index into the per slice type tables and is then
         // replaced by the MB_TYPE_* flags taken from the matching table entry
4467     mb_type= get_ue_golomb(&s->gb);
4468     if(h->slice_type == B_TYPE){
4469         if(mb_type < 23){
4470             partition_count= b_mb_type_info[mb_type].partition_count;
4471             mb_type=         b_mb_type_info[mb_type].type;
4472         }else{
                 // values from 23 upwards select an intra type
4473             mb_type -= 23;
4474             goto decode_intra_mb;
4475         }
4476     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4477         if(mb_type < 5){
4478             partition_count= p_mb_type_info[mb_type].partition_count;
4479             mb_type=         p_mb_type_info[mb_type].type;
4480         }else{
                 // values from 5 upwards select an intra type
4481             mb_type -= 5;
4482             goto decode_intra_mb;
4483         }
4484     }else{
4485        assert(h->slice_type == I_TYPE);
4486 decode_intra_mb:
             // mb_type is unsigned, so this single test rejects every index outside 0..25
4487         if(mb_type > 25){
4488             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4489             return -1;
4490         }
4491         partition_count=0;
4492         cbp= i_mb_type_info[mb_type].cbp;
4493         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4494         mb_type= i_mb_type_info[mb_type].type;
4495     }
4496
4497     if(MB_FIELD)
4498         mb_type |= MB_TYPE_INTERLACED;
4499
4500     h->slice_table[ mb_xy ]= h->slice_num;
4501
4502     if(IS_INTRA_PCM(mb_type)){
4503         unsigned int x, y;
4504
4505         // We assume these blocks are very rare so we do not optimize it.
4506         align_get_bits(&s->gb);
4507
4508         // The pixels are stored in the same order as levels in h->mb array.
             // 16x16 luma samples, 8 bits each
4509         for(y=0; y<16; y++){
4510             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4511             for(x=0; x<16; x++){
4512                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4513                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4514             }
4515         }
             // first 8x8 chroma plane, stored from h->mb[256]
4516         for(y=0; y<8; y++){
4517             const int index= 256 + 4*(y&3) + 32*(y>>2);
4518             for(x=0; x<8; x++){
4519                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4520                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4521             }
4522         }
             // second 8x8 chroma plane, stored from h->mb[256 + 64]
4523         for(y=0; y<8; y++){
4524             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4525             for(x=0; x<8; x++){
4526                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4527                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4528             }
4529         }
4530
4531         // In deblocking, the quantizer is 0
4532         s->current_picture.qscale_table[mb_xy]= 0;
4533         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4534         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4535         // All coeffs are present
4536         memset(h->non_zero_count[mb_xy], 16, 16);
4537
4538         s->current_picture.mb_type[mb_xy]= mb_type;
4539         return 0;
4540     }
4541
         // ref_count is doubled here and halved again right before the final return 0.
         // NOTE(review): every "return -1" between the two leaves ref_count doubled;
         // confirm that the caller re-initialises it before it is used again.
4542     if(MB_MBAFF){
4543         h->ref_count[0] <<= 1;
4544         h->ref_count[1] <<= 1;
4545     }
4546
4547     fill_caches(h, mb_type, 0);
4548
4549     //mb_pred
4550     if(IS_INTRA(mb_type)){
4551             int pred_mode;
4552 //            init_top_left_availability(h);
4553             if(IS_INTRA4x4(mb_type)){
4554                 int i;
4555                 int di = 1;
                     // with the 8x8 transform one mode is read per 8x8 block (step 4)
                     // instead of one per 4x4 block (step 1)
4556                 if(dct8x8_allowed && get_bits1(&s->gb)){
4557                     mb_type |= MB_TYPE_8x8DCT;
4558                     di = 4;
4559                 }
4560
4561 //                fill_intra4x4_pred_table(h);
4562                 for(i=0; i<16; i+=di){
4563                     int mode= pred_intra_mode(h, i);
4564
                         // a 1 bit keeps the predicted mode; otherwise a 3 bit value
                         // follows that is bumped by one when it is >= the predicted mode
4565                     if(!get_bits1(&s->gb)){
4566                         const int rem_mode= get_bits(&s->gb, 3);
4567                         mode = rem_mode + (rem_mode >= mode);
4568                     }
4569
4570                     if(di==4)
4571                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4572                     else
4573                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4574                 }
4575                 write_back_intra_pred_mode(h);
4576                 if( check_intra4x4_pred_mode(h) < 0)
4577                     return -1;
4578             }else{
4579                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4580                 if(h->intra16x16_pred_mode < 0)
4581                     return -1;
4582             }
4583
                 // chroma prediction mode, validated by the same helper as the 16x16 luma mode
4584             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4585             if(pred_mode < 0)
4586                 return -1;
4587             h->chroma_pred_mode= pred_mode;
4588     }else if(partition_count==4){
             // four 8x8 partitions: sub types first, then all reference indices,
             // then all motion vectors
4589         int i, j, sub_partition_count[4], list, ref[2][4];
4590
4591         if(h->slice_type == B_TYPE){
4592             for(i=0; i<4; i++){
4593                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4594                 if(h->sub_mb_type[i] >=13){
4595                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4596                     return -1;
4597                 }
4598                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4599                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4600             }
4601             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4602                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4603                 pred_direct_motion(h, &mb_type);
4604                 h->ref_cache[0][scan8[4]] =
4605                 h->ref_cache[1][scan8[4]] =
4606                 h->ref_cache[0][scan8[12]] =
4607                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4608             }
4609         }else{
4610             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4611             for(i=0; i<4; i++){
4612                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4613                 if(h->sub_mb_type[i] >=4){
4614                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4615                     return -1;
4616                 }
4617                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4618                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4619             }
4620         }
4621
             // reference indices: one per 8x8 partition and list, -1 when the list is unused
4622         for(list=0; list<h->list_count; list++){
4623             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4624             for(i=0; i<4; i++){
4625                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4626                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4627                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4628                     if(tmp>=ref_count){
4629                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4630                         return -1;
4631                     }
4632                     ref[list][i]= tmp;
4633                 }else{
4634                  //FIXME
4635                     ref[list][i] = -1;
4636                 }
4637             }
4638         }
4639
4640         if(dct8x8_allowed)
4641             dct8x8_allowed = get_dct8x8_allowed(h);
4642
             // motion vectors: prediction plus the signed difference read from the bitstream
4643         for(list=0; list<h->list_count; list++){
4644             for(i=0; i<4; i++){
4645                 if(IS_DIRECT(h->sub_mb_type[i])) {
4646                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4647                     continue;
4648                 }
4649                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4650                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4651
4652                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4653                     const int sub_mb_type= h->sub_mb_type[i];
4654                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4655                     for(j=0; j<sub_partition_count[i]; j++){
4656                         int mx, my;
4657                         const int index= 4*i + block_width*j;
4658                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4659                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4660                         mx += get_se_golomb(&s->gb);
4661                         my += get_se_golomb(&s->gb);
4662                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4663
                             // replicate the vector over the cache cells of this sub
                             // partition (offset 1: right neighbour, offset 8: next cache row)
4664                         if(IS_SUB_8X8(sub_mb_type)){
4665                             mv_cache[ 1 ][0]=
4666                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4667                             mv_cache[ 1 ][1]=
4668                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4669                         }else if(IS_SUB_8X4(sub_mb_type)){
4670                             mv_cache[ 1 ][0]= mx;
4671                             mv_cache[ 1 ][1]= my;
4672                         }else if(IS_SUB_4X8(sub_mb_type)){
4673                             mv_cache[ 8 ][0]= mx;
4674                             mv_cache[ 8 ][1]= my;
4675                         }
4676                         mv_cache[ 0 ][0]= mx;
4677                         mv_cache[ 0 ][1]= my;
4678                     }
4679                 }else{
                         // list not used by this partition: zero its four cached vectors
4680                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4681                     p[0] = p[1]=
4682                     p[8] = p[9]= 0;
4683                 }
4684             }
4685         }
4686     }else if(IS_DIRECT(mb_type)){
4687         pred_direct_motion(h, &mb_type);
4688         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4689     }else{
             // 16x16, 16x8 or 8x16 partitioning: for each shape all reference
             // indices are read before any motion vector difference
4690         int list, mx, my, i;
4691          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4692         if(IS_16X16(mb_type)){
4693             for(list=0; list<h->list_count; list++){
4694                     unsigned int val;
4695                     if(IS_DIR(mb_type, 0, list)){
4696                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4697                         if(val >= h->ref_count[list]){
4698                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4699                             return -1;
4700                         }
4701                     }else
4702                         val= LIST_NOT_USED&0xFF;
4703                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4704             }
4705             for(list=0; list<h->list_count; list++){
4706                 unsigned int val;
4707                 if(IS_DIR(mb_type, 0, list)){
4708                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4709                     mx += get_se_golomb(&s->gb);
4710                     my += get_se_golomb(&s->gb);
4711                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4712
4713                     val= pack16to32(mx,my);
4714                 }else
4715                     val=0;
4716                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4717             }
4718         }
4719         else if(IS_16X8(mb_type)){
                 // two partitions stacked vertically: partition i starts 16*i cache cells further on
4720             for(list=0; list<h->list_count; list++){
4721                     for(i=0; i<2; i++){
4722                         unsigned int val;
4723                         if(IS_DIR(mb_type, i, list)){
4724                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4725                             if(val >= h->ref_count[list]){
4726                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4727                                 return -1;
4728                             }
4729                         }else
4730                             val= LIST_NOT_USED&0xFF;
4731                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4732                     }
4733             }
4734             for(list=0; list<h->list_count; list++){
4735                 for(i=0; i<2; i++){
4736                     unsigned int val;
4737                     if(IS_DIR(mb_type, i, list)){
4738                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4739                         mx += get_se_golomb(&s->gb);
4740                         my += get_se_golomb(&s->gb);
4741                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4742
4743                         val= pack16to32(mx,my);
4744                     }else
4745                         val=0;
4746                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4747                 }
4748             }
4749         }else{
                 // two partitions side by side: partition i starts 2*i cache cells further on
4750             assert(IS_8X16(mb_type));
4751             for(list=0; list<h->list_count; list++){
4752                     for(i=0; i<2; i++){
4753                         unsigned int val;
4754                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4755                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4756                             if(val >= h->ref_count[list]){
4757                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4758                                 return -1;
4759                             }
4760                         }else
4761                             val= LIST_NOT_USED&0xFF;
4762                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4763                     }
4764             }
4765             for(list=0; list<h->list_count; list++){
4766                 for(i=0; i<2; i++){
4767                     unsigned int val;
4768                     if(IS_DIR(mb_type, i, list)){
4769                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4770                         mx += get_se_golomb(&s->gb);
4771                         my += get_se_golomb(&s->gb);
4772                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4773
4774                         val= pack16to32(mx,my);
4775                     }else
4776                         val=0;
4777                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4778                 }
4779             }
4780         }
4781     }
4782
4783     if(IS_INTER(mb_type))
4784         write_back_motion(h, mb_type);
4785
         // intra 16x16 already took its cbp from i_mb_type_info; all other types
         // read a code number that is mapped through a lookup table
4786     if(!IS_INTRA16x16(mb_type)){
4787         cbp= get_ue_golomb(&s->gb);
4788         if(cbp > 47){
4789             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4790             return -1;
4791         }
4792
4793         if(IS_INTRA4x4(mb_type))
4794             cbp= golomb_to_intra4x4_cbp[cbp];
4795         else
4796             cbp= golomb_to_inter_cbp[cbp];
4797     }
4798     h->cbp = cbp;
4799
         // the 8x8 transform flag is only read here for non intra macroblocks
         // that have at least one of the low four cbp bits set
4800     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4801         if(get_bits1(&s->gb))
4802             mb_type |= MB_TYPE_8x8DCT;
4803     }
4804     s->current_picture.mb_type[mb_xy]= mb_type;
4805
4806     if(cbp || IS_INTRA16x16(mb_type)){
4807         int i8x8, i4x4, chroma_idx;
4808         int dquant;
4809         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4810         const uint8_t *scan, *scan8x8, *dc_scan;
4811
4812 //        fill_non_zero_count_cache(h);
4813
             // scan tables depend on field/frame coding; the *_q0 variants are used while qscale is 0
4814         if(IS_INTERLACED(mb_type)){
4815             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4816             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4817             dc_scan= luma_dc_field_scan;
4818         }else{
4819             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4820             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4821             dc_scan= luma_dc_zigzag_scan;
4822         }
4823
4824         dquant= get_se_golomb(&s->gb);
4825
4826         if( dquant > 25 || dquant < -26 ){
4827             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4828             return -1;
4829         }
4830
             // apply the delta and wrap the result back into 0..51
4831         s->qscale += dquant;
4832         if(((unsigned)s->qscale) > 51){
4833             if(s->qscale<0) s->qscale+= 52;
4834             else            s->qscale-= 52;
4835         }
4836
4837         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4838         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4839         if(IS_INTRA16x16(mb_type)){
                 // one luma DC block of up to 16 coefficients, then (if coded)
                 // sixteen AC blocks of up to 15 coefficients each
4840             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4841                 return -1; //FIXME continue if partitioned and other return -1 too
4842             }
4843
4844             assert((cbp&15) == 0 || (cbp&15) == 15);
4845
4846             if(cbp&15){
4847                 for(i8x8=0; i8x8<4; i8x8++){
4848                     for(i4x4=0; i4x4<4; i4x4++){
4849                         const int index= i4x4 + 4*i8x8;
4850                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4851                             return -1;
4852                         }
4853                     }
4854                 }
4855             }else{
4856                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4857             }
4858         }else{
                 // bit i8x8 of cbp tells whether the i8x8-th 8x8 luma block is coded
4859             for(i8x8=0; i8x8<4; i8x8++){
4860                 if(cbp & (1<<i8x8)){
4861                     if(IS_8x8DCT(mb_type)){
                             // the 8x8 block is read as four passes of 16 coefficients
                             // into one shared 64 entry buffer
4862                         DCTELEM *buf = &h->mb[64*i8x8];
4863                         uint8_t *nnz;
4864                         for(i4x4=0; i4x4<4; i4x4++){
4865                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4866                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4867                                 return -1;
4868                         }
                             // sum the four per-pass counts into the first cache entry of the 8x8 block
4869                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4870                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4871                     }else{
4872                         for(i4x4=0; i4x4<4; i4x4++){
4873                             const int index= i4x4 + 4*i8x8;
4874
4875                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4876                                 return -1;
4877                             }
4878                         }
4879                     }
4880                 }else{
4881                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4882                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4883                 }
4884             }
4885         }
4886
             // cbp bits 0x30: chroma DC blocks are present (no dequant table is passed)
4887         if(cbp&0x30){
4888             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4889                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4890                     return -1;
4891                 }
4892         }
4893
             // cbp bit 0x20: chroma AC blocks are present as well
4894         if(cbp&0x20){
4895             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4896                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4897                 for(i4x4=0; i4x4<4; i4x4++){
4898                     const int index= 16 + 4*chroma_idx + i4x4;
4899                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4900                         return -1;
4901                     }
4902                 }
4903             }
4904         }else{
4905             uint8_t * const nnz= &h->non_zero_count_cache[0];
4906             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4907             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4908         }
4909     }else{
             // nothing coded: clear all luma and chroma non-zero counts in the cache
4910         uint8_t * const nnz= &h->non_zero_count_cache[0];
4911         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4912         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4913         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4914     }
4915     s->current_picture.qscale_table[mb_xy]= s->qscale;
4916     write_back_non_zero_count(h);
4917
         // undo the doubling applied before fill_caches()
4918     if(MB_MBAFF){
4919         h->ref_count[0] >>= 1;
4920         h->ref_count[1] >>= 1;
4921     }
4922
4923     return 0;
4924 }
4925
/**
 * Decodes the field decoding flag with CABAC.
 *
 * The context is cabac_state[70 + ctx], where ctx counts how many of two
 * neighbouring positions (left of and two rows above the even-aligned row
 * of the current macroblock) belong to the current slice and are interlaced.
 * @returns the bin decoded by get_cabac_noinline()
 */
4926 static int decode_cabac_field_decoding_flag(H264Context *h) {
4927     MpegEncContext * const s = &h->s;
4928     const int mb_x = s->mb_x;
         // clear the lowest bit: both rows of a pair map to the even row
4929     const int mb_y = s->mb_y & ~1;
4930     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4931     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4932
4933     unsigned int ctx = 0;
4934
         // a neighbour only counts when it lies in the current slice
4935     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4936         ctx += 1;
4937     }
4938     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4939         ctx += 1;
4940     }
4941
4942     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4943 }
4944
/**
 * Decodes an intra macroblock type index with CABAC.
 *
 * @param ctx_base    index of the first context in h->cabac_state used here
 * @param intra_slice non-zero selects the neighbour dependent first context
 *                    and shifts the later contexts (state+2, offsets scaled
 *                    by intra_slice); zero uses state[0] for the first bin
 * @returns 0 for I4x4, 25 for PCM, otherwise 1..24 for I16x16 built as
 *          1 + 12*(luma cbp bin) + 4 or 8 if chroma is coded + a 2 bit value
 */
4945 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4946     uint8_t *state= &h->cabac_state[ctx_base];
4947     int mb_type;
4948
4949     if(intra_slice){
4950         MpegEncContext * const s = &h->s;
4951         const int mba_xy = h->left_mb_xy[0];
4952         const int mbb_xy = h->top_mb_xy;
4953         int ctx=0;
             // each neighbour in the current slice that is not I4x4 raises the context by one
4954         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4955             ctx++;
4956         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4957             ctx++;
4958         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4959             return 0;   /* I4x4 */
             // skip past the neighbour dependent contexts for the remaining bins
4960         state += 2;
4961     }else{
4962         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4963             return 0;   /* I4x4 */
4964     }
4965
4966     if( get_cabac_terminate( &h->cabac ) )
4967         return 25;  /* PCM */
4968
         // the bins below must be read in exactly this order
4969     mb_type = 1; /* I16x16 */
4970     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4971     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4972         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4973     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4974     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4975     return mb_type;
4976 }
4977
/**
 * Decodes the macroblock type index of the current macroblock with CABAC.
 *
 * I slices: the value of decode_cabac_intra_mb_type() is returned as is.
 * P slices: 0..3 for inter types, intra types are offset by 5.
 * B slices: 0..22 for inter types, intra types are offset by 23.
 * @returns the type index, or -1 for any other slice type
 */
4978 static int decode_cabac_mb_type( H264Context *h ) {
4979     MpegEncContext * const s = &h->s;
4980
4981     if( h->slice_type == I_TYPE ) {
4982         return decode_cabac_intra_mb_type(h, 3, 1);
4983     } else if( h->slice_type == P_TYPE ) {
4984         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4985             /* P-type */
4986             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4987                 /* P_L0_D16x16, P_8x8 */
4988                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4989             } else {
4990                 /* P_L0_D8x16, P_L0_D16x8 */
4991                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4992             }
4993         } else {
4994             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4995         }
4996     } else if( h->slice_type == B_TYPE ) {
4997         const int mba_xy = h->left_mb_xy[0];
4998         const int mbb_xy = h->top_mb_xy;
4999         int ctx = 0;
5000         int bits;
5001
             // each neighbour in the current slice that is not direct raises the context by one
5002         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5003             ctx++;
5004         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5005             ctx++;
5006
5007         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5008             return 0; /* B_Direct_16x16 */
5009
5010         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5011             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5012         }
5013
             // four further bins, most significant first
5014         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5015         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5016         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5017         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5018         if( bits < 8 )
5019             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5020         else if( bits == 13 ) {
5021             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5022         } else if( bits == 14 )
5023             return 11; /* B_L1_L0_8x16 */
5024         else if( bits == 15 )
5025             return 22; /* B_8x8 */
5026
             // bits is 8..12 here: one more bin gives 16..25, returned as 12..21
5027         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5028         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5029     } else {
5030         /* TODO SI/SP frames? */
5031         return -1;
5032     }
5033 }
5034
5035 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5036     MpegEncContext * const s = &h->s;
5037     int mba_xy, mbb_xy;
5038     int ctx = 0;
5039
5040     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5041         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5042         mba_xy = mb_xy - 1;
5043         if( (mb_y&1)
5044             && h->slice_table[mba_xy] == h->slice_num
5045             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5046             mba_xy += s->mb_stride;
5047         if( MB_FIELD ){
5048             mbb_xy = mb_xy - s->mb_stride;
5049             if( !(mb_y&1)
5050                 && h->slice_table[mbb_xy] == h->slice_num
5051                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5052                 mbb_xy -= s->mb_stride;
5053         }else
5054             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5055     }else{
5056         int mb_xy = mb_x + mb_y*s->mb_stride;
5057         mba_xy = mb_xy - 1;
5058         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5059     }
5060
5061     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5062         ctx++;
5063     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5064         ctx++;
5065
5066     if( h->slice_type == B_TYPE )
5067         ctx += 13;
5068     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5069 }
5070
5071 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5072     int mode = 0;
5073
5074     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5075         return pred_mode;
5076
5077     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5078     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5079     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5080
5081     if( mode >= pred_mode )
5082         return mode + 1;
5083     else
5084         return mode;
5085 }
5086
5087 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5088     const int mba_xy = h->left_mb_xy[0];
5089     const int mbb_xy = h->top_mb_xy;
5090
5091     int ctx = 0;
5092
5093     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5094     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5095         ctx++;
5096
5097     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5098         ctx++;
5099
5100     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5101         return 0;
5102
5103     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5104         return 1;
5105     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5106         return 2;
5107     else
5108         return 3;
5109 }
5110
5111 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5112     int cbp_b, cbp_a, ctx, cbp = 0;
5113
5114     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5115     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5116
5117     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5118     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5119     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5120     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5121     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5122     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5123     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5124     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5125     return cbp;
5126 }
5127 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5128     int ctx;
5129     int cbp_a, cbp_b;
5130
5131     cbp_a = (h->left_cbp>>4)&0x03;
5132     cbp_b = (h-> top_cbp>>4)&0x03;
5133
5134     ctx = 0;
5135     if( cbp_a > 0 ) ctx++;
5136     if( cbp_b > 0 ) ctx += 2;
5137     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5138         return 0;
5139
5140     ctx = 4;
5141     if( cbp_a == 2 ) ctx++;
5142     if( cbp_b == 2 ) ctx += 2;
5143     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5144 }
5145 static int decode_cabac_mb_dqp( H264Context *h) {
5146     int   ctx = 0;
5147     int   val = 0;
5148
5149     if( h->last_qscale_diff != 0 )
5150         ctx++;
5151
5152     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5153         if( ctx < 2 )
5154             ctx = 2;
5155         else
5156             ctx = 3;
5157         val++;
5158         if(val > 102) //prevent infinite loop
5159             return INT_MIN;
5160     }
5161
5162     if( val&0x01 )
5163         return (val + 1)/2;
5164     else
5165         return -(val + 1)/2;
5166 }
5167 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5168     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5169         return 0;   /* 8x8 */
5170     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5171         return 1;   /* 8x4 */
5172     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5173         return 2;   /* 4x8 */
5174     return 3;       /* 4x4 */
5175 }
5176 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5177     int type;
5178     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5179         return 0;   /* B_Direct_8x8 */
5180     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5181         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5182     type = 3;
5183     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5184         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5185             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5186         type += 4;
5187     }
5188     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5189     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5190     return type;
5191 }
5192
5193 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5194     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5195 }
5196
5197 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5198     int refa = h->ref_cache[list][scan8[n] - 1];
5199     int refb = h->ref_cache[list][scan8[n] - 8];
5200     int ref  = 0;
5201     int ctx  = 0;
5202
5203     if( h->slice_type == B_TYPE) {
5204         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5205             ctx++;
5206         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5207             ctx += 2;
5208     } else {
5209         if( refa > 0 )
5210             ctx++;
5211         if( refb > 0 )
5212             ctx += 2;
5213     }
5214
5215     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5216         ref++;
5217         if( ctx < 4 )
5218             ctx = 4;
5219         else
5220             ctx = 5;
5221         if(ref >= 32 /*h->ref_list[list]*/){
5222             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5223             return 0; //FIXME we should return -1 and check the return everywhere
5224         }
5225     }
5226     return ref;
5227 }
5228
5229 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5230     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5231                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5232     int ctxbase = (l == 0) ? 40 : 47;
5233     int ctx, mvd;
5234
5235     if( amvd < 3 )
5236         ctx = 0;
5237     else if( amvd > 32 )
5238         ctx = 2;
5239     else
5240         ctx = 1;
5241
5242     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5243         return 0;
5244
5245     mvd= 1;
5246     ctx= 3;
5247     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5248         mvd++;
5249         if( ctx < 6 )
5250             ctx++;
5251     }
5252
5253     if( mvd >= 9 ) {
5254         int k = 3;
5255         while( get_cabac_bypass( &h->cabac ) ) {
5256             mvd += 1 << k;
5257             k++;
5258             if(k>24){
5259                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5260                 return INT_MIN;
5261             }
5262         }
5263         while( k-- ) {
5264             if( get_cabac_bypass( &h->cabac ) )
5265                 mvd += 1 << k;
5266         }
5267     }
5268     return get_cabac_bypass_sign( &h->cabac, -mvd );
5269 }
5270
5271 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5272     int nza, nzb;
5273     int ctx = 0;
5274
5275     if( cat == 0 ) {
5276         nza = h->left_cbp&0x100;
5277         nzb = h-> top_cbp&0x100;
5278     } else if( cat == 1 || cat == 2 ) {
5279         nza = h->non_zero_count_cache[scan8[idx] - 1];
5280         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5281     } else if( cat == 3 ) {
5282         nza = (h->left_cbp>>(6+idx))&0x01;
5283         nzb = (h-> top_cbp>>(6+idx))&0x01;
5284     } else {
5285         assert(cat == 4);
5286         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5287         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5288     }
5289
5290     if( nza > 0 )
5291         ctx++;
5292
5293     if( nzb > 0 )
5294         ctx += 2;
5295
5296     return ctx + 4 * cat;
5297 }
5298
5299 DECLARE_ASM_CONST(1, const uint8_t, last_coeff_flag_offset_8x8[63]) = {
5300     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5301     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5302     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5303     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5304 };
5305
5306 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5307     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5308     static const int significant_coeff_flag_offset[2][6] = {
5309       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5310       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5311     };
5312     static const int last_coeff_flag_offset[2][6] = {
5313       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5314       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5315     };
5316     static const int coeff_abs_level_m1_offset[6] = {
5317         227+0, 227+10, 227+20, 227+30, 227+39, 426
5318     };
5319     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5320       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5321         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5322         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5323        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5324       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5325         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5326         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5327         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5328     };
5329
5330     int index[64];
5331
5332     int av_unused last;
5333     int coeff_count = 0;
5334
5335     int abslevel1 = 1;
5336     int abslevelgt1 = 0;
5337
5338     uint8_t *significant_coeff_ctx_base;
5339     uint8_t *last_coeff_ctx_base;
5340     uint8_t *abs_level_m1_ctx_base;
5341
5342 #ifndef ARCH_X86
5343 #define CABAC_ON_STACK
5344 #endif
5345 #ifdef CABAC_ON_STACK
5346 #define CC &cc
5347     CABACContext cc;
5348     cc.range     = h->cabac.range;
5349     cc.low       = h->cabac.low;
5350     cc.bytestream= h->cabac.bytestream;
5351 #else
5352 #define CC &h->cabac
5353 #endif
5354
5355
5356     /* cat: 0-> DC 16x16  n = 0
5357      *      1-> AC 16x16  n = luma4x4idx
5358      *      2-> Luma4x4   n = luma4x4idx
5359      *      3-> DC Chroma n = iCbCr
5360      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5361      *      5-> Luma8x8   n = 4 * luma8x8idx
5362      */
5363
5364     /* read coded block flag */
5365     if( cat != 5 ) {
5366         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5367             if( cat == 1 || cat == 2 )
5368                 h->non_zero_count_cache[scan8[n]] = 0;
5369             else if( cat == 4 )
5370                 h->non_zero_count_cache[scan8[16+n]] = 0;
5371 #ifdef CABAC_ON_STACK
5372             h->cabac.range     = cc.range     ;
5373             h->cabac.low       = cc.low       ;
5374             h->cabac.bytestream= cc.bytestream;
5375 #endif
5376             return;
5377         }
5378     }
5379
5380     significant_coeff_ctx_base = h->cabac_state
5381         + significant_coeff_flag_offset[MB_FIELD][cat];
5382     last_coeff_ctx_base = h->cabac_state
5383         + last_coeff_flag_offset[MB_FIELD][cat];
5384     abs_level_m1_ctx_base = h->cabac_state
5385         + coeff_abs_level_m1_offset[cat];
5386
5387     if( cat == 5 ) {
5388 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5389         for(last= 0; last < coefs; last++) { \
5390             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5391             if( get_cabac( CC, sig_ctx )) { \
5392                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5393                 index[coeff_count++] = last; \
5394                 if( get_cabac( CC, last_ctx ) ) { \
5395                     last= max_coeff; \
5396                     break; \
5397                 } \
5398             } \
5399         }\
5400         if( last == max_coeff -1 ) {\
5401             index[coeff_count++] = last;\
5402         }
5403         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5404 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5405         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5406     } else {
5407         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5408 #else
5409         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5410     } else {
5411         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5412 #endif
5413     }
5414     assert(coeff_count > 0);
5415
5416     if( cat == 0 )
5417         h->cbp_table[mb_xy] |= 0x100;
5418     else if( cat == 1 || cat == 2 )
5419         h->non_zero_count_cache[scan8[n]] = coeff_count;
5420     else if( cat == 3 )
5421         h->cbp_table[mb_xy] |= 0x40 << n;
5422     else if( cat == 4 )
5423         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5424     else {
5425         assert( cat == 5 );
5426         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5427     }
5428
5429     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5430         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5431         int j= scantable[index[coeff_count]];
5432
5433         if( get_cabac( CC, ctx ) == 0 ) {
5434             if( !qmul ) {
5435                 block[j] = get_cabac_bypass_sign( CC, -1);
5436             }else{
5437                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5438             }
5439
5440             abslevel1++;
5441         } else {
5442             int coeff_abs = 2;
5443             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5444             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5445                 coeff_abs++;
5446             }
5447
5448             if( coeff_abs >= 15 ) {
5449                 int j = 0;
5450                 while( get_cabac_bypass( CC ) ) {
5451                     j++;
5452                 }
5453
5454                 coeff_abs=1;
5455                 while( j-- ) {
5456                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5457                 }
5458                 coeff_abs+= 14;
5459             }
5460
5461             if( !qmul ) {
5462                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5463                 else                                block[j] =  coeff_abs;
5464             }else{
5465                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5466                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5467             }
5468
5469             abslevelgt1++;
5470         }
5471     }
5472 #ifdef CABAC_ON_STACK
5473             h->cabac.range     = cc.range     ;
5474             h->cabac.low       = cc.low       ;
5475             h->cabac.bytestream= cc.bytestream;
5476 #endif
5477
5478 }
5479
5480 static inline void compute_mb_neighbors(H264Context *h)
5481 {
5482     MpegEncContext * const s = &h->s;
5483     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5484     h->top_mb_xy     = mb_xy - s->mb_stride;
5485     h->left_mb_xy[0] = mb_xy - 1;
5486     if(FRAME_MBAFF){
5487         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5488         const int top_pair_xy      = pair_xy     - s->mb_stride;
5489         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5490         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5491         const int curr_mb_frame_flag = !MB_FIELD;
5492         const int bottom = (s->mb_y & 1);
5493         if (bottom
5494                 ? !curr_mb_frame_flag // bottom macroblock
5495                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5496                 ) {
5497             h->top_mb_xy -= s->mb_stride;
5498         }
5499         if (left_mb_frame_flag != curr_mb_frame_flag) {
5500             h->left_mb_xy[0] = pair_xy - 1;
5501         }
5502     } else if (FIELD_PICTURE) {
5503         h->top_mb_xy -= s->mb_stride;
5504     }
5505     return;
5506 }
5507
5508 /**
5509  * decodes a macroblock
5510  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5511  */
5512 static int decode_mb_cabac(H264Context *h) {
5513     MpegEncContext * const s = &h->s;
5514     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5515     int mb_type, partition_count, cbp = 0;
5516     int dct8x8_allowed= h->pps.transform_8x8_mode;
5517
5518     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5519
5520     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5521     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5522         int skip;
5523         /* a skipped mb needs the aff flag from the following mb */
5524         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5525             predict_field_decoding_flag(h);
5526         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5527             skip = h->next_mb_skipped;
5528         else
5529             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5530         /* read skip flags */
5531         if( skip ) {
5532             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5533                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5534                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5535                 if(h->next_mb_skipped)
5536                     predict_field_decoding_flag(h);
5537                 else
5538                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5539             }
5540
5541             decode_mb_skip(h);
5542
5543             h->cbp_table[mb_xy] = 0;
5544             h->chroma_pred_mode_table[mb_xy] = 0;
5545             h->last_qscale_diff = 0;
5546
5547             return 0;
5548
5549         }
5550     }
5551     if(FRAME_MBAFF){
5552         if( (s->mb_y&1) == 0 )
5553             h->mb_mbaff =
5554             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5555     }else
5556         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5557
5558     h->prev_mb_skipped = 0;
5559
5560     compute_mb_neighbors(h);
5561     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5562         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5563         return -1;
5564     }
5565
5566     if( h->slice_type == B_TYPE ) {
5567         if( mb_type < 23 ){
5568             partition_count= b_mb_type_info[mb_type].partition_count;
5569             mb_type=         b_mb_type_info[mb_type].type;
5570         }else{
5571             mb_type -= 23;
5572             goto decode_intra_mb;
5573         }
5574     } else if( h->slice_type == P_TYPE ) {
5575         if( mb_type < 5) {
5576             partition_count= p_mb_type_info[mb_type].partition_count;
5577             mb_type=         p_mb_type_info[mb_type].type;
5578         } else {
5579             mb_type -= 5;
5580             goto decode_intra_mb;
5581         }
5582     } else {
5583        assert(h->slice_type == I_TYPE);
5584 decode_intra_mb:
5585         partition_count = 0;
5586         cbp= i_mb_type_info[mb_type].cbp;
5587         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5588         mb_type= i_mb_type_info[mb_type].type;
5589     }
5590     if(MB_FIELD)
5591         mb_type |= MB_TYPE_INTERLACED;
5592
5593     h->slice_table[ mb_xy ]= h->slice_num;
5594
5595     if(IS_INTRA_PCM(mb_type)) {
5596         const uint8_t *ptr;
5597         unsigned int x, y;
5598
5599         // We assume these blocks are very rare so we do not optimize it.
5600         // FIXME The two following lines get the bitstream position in the cabac
5601         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5602         ptr= h->cabac.bytestream;
5603         if(h->cabac.low&0x1) ptr--;
5604         if(CABAC_BITS==16){
5605             if(h->cabac.low&0x1FF) ptr--;
5606         }
5607
5608         // The pixels are stored in the same order as levels in h->mb array.
5609         for(y=0; y<16; y++){
5610             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5611             for(x=0; x<16; x++){
5612                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5613                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5614             }
5615         }
5616         for(y=0; y<8; y++){
5617             const int index= 256 + 4*(y&3) + 32*(y>>2);
5618             for(x=0; x<8; x++){
5619                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5620                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5621             }
5622         }
5623         for(y=0; y<8; y++){
5624             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5625             for(x=0; x<8; x++){
5626                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5627                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5628             }
5629         }
5630
5631         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5632
5633         // All blocks are present
5634         h->cbp_table[mb_xy] = 0x1ef;
5635         h->chroma_pred_mode_table[mb_xy] = 0;
5636         // In deblocking, the quantizer is 0
5637         s->current_picture.qscale_table[mb_xy]= 0;
5638         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5639         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5640         // All coeffs are present
5641         memset(h->non_zero_count[mb_xy], 16, 16);
5642         s->current_picture.mb_type[mb_xy]= mb_type;
5643         return 0;
5644     }
5645
5646     if(MB_MBAFF){
5647         h->ref_count[0] <<= 1;
5648         h->ref_count[1] <<= 1;
5649     }
5650
5651     fill_caches(h, mb_type, 0);
5652
5653     if( IS_INTRA( mb_type ) ) {
5654         int i, pred_mode;
5655         if( IS_INTRA4x4( mb_type ) ) {
5656             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5657                 mb_type |= MB_TYPE_8x8DCT;
5658                 for( i = 0; i < 16; i+=4 ) {
5659                     int pred = pred_intra_mode( h, i );
5660                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5661                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5662                 }
5663             } else {
5664                 for( i = 0; i < 16; i++ ) {
5665                     int pred = pred_intra_mode( h, i );
5666                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5667
5668                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5669                 }
5670             }
5671             write_back_intra_pred_mode(h);
5672             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5673         } else {
5674             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5675             if( h->intra16x16_pred_mode < 0 ) return -1;
5676         }
5677         h->chroma_pred_mode_table[mb_xy] =
5678         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5679
5680         pred_mode= check_intra_pred_mode( h, pred_mode );
5681         if( pred_mode < 0 ) return -1;
5682         h->chroma_pred_mode= pred_mode;
5683     } else if( partition_count == 4 ) {
5684         int i, j, sub_partition_count[4], list, ref[2][4];
5685
5686         if( h->slice_type == B_TYPE ) {
5687             for( i = 0; i < 4; i++ ) {
5688                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5689                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5690                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5691             }
5692             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5693                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5694                 pred_direct_motion(h, &mb_type);
5695                 h->ref_cache[0][scan8[4]] =
5696                 h->ref_cache[1][scan8[4]] =
5697                 h->ref_cache[0][scan8[12]] =
5698                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5699                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5700                     for( i = 0; i < 4; i++ )
5701                         if( IS_DIRECT(h->sub_mb_type[i]) )
5702                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5703                 }
5704             }
5705         } else {
5706             for( i = 0; i < 4; i++ ) {
5707                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5708                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5709                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5710             }
5711         }
5712
5713         for( list = 0; list < h->list_count; list++ ) {
5714                 for( i = 0; i < 4; i++ ) {
5715                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5716                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5717                         if( h->ref_count[list] > 1 )
5718                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5719                         else
5720                             ref[list][i] = 0;
5721                     } else {
5722                         ref[list][i] = -1;
5723                     }
5724                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5725                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5726                 }
5727         }
5728
5729         if(dct8x8_allowed)
5730             dct8x8_allowed = get_dct8x8_allowed(h);
5731
5732         for(list=0; list<h->list_count; list++){
5733             for(i=0; i<4; i++){
5734                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5735                 if(IS_DIRECT(h->sub_mb_type[i])){
5736                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5737                     continue;
5738                 }
5739
5740                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5741                     const int sub_mb_type= h->sub_mb_type[i];
5742                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5743                     for(j=0; j<sub_partition_count[i]; j++){
5744                         int mpx, mpy;
5745                         int mx, my;
5746                         const int index= 4*i + block_width*j;
5747                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5748                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5749                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5750
5751                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5752                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5753                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5754
5755                         if(IS_SUB_8X8(sub_mb_type)){
5756                             mv_cache[ 1 ][0]=
5757                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5758                             mv_cache[ 1 ][1]=
5759                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5760
5761                             mvd_cache[ 1 ][0]=
5762                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5763                             mvd_cache[ 1 ][1]=
5764                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5765                         }else if(IS_SUB_8X4(sub_mb_type)){
5766                             mv_cache[ 1 ][0]= mx;
5767                             mv_cache[ 1 ][1]= my;
5768
5769                             mvd_cache[ 1 ][0]= mx - mpx;
5770                             mvd_cache[ 1 ][1]= my - mpy;
5771                         }else if(IS_SUB_4X8(sub_mb_type)){
5772                             mv_cache[ 8 ][0]= mx;
5773                             mv_cache[ 8 ][1]= my;
5774
5775                             mvd_cache[ 8 ][0]= mx - mpx;
5776                             mvd_cache[ 8 ][1]= my - mpy;
5777                         }
5778                         mv_cache[ 0 ][0]= mx;
5779                         mv_cache[ 0 ][1]= my;
5780
5781                         mvd_cache[ 0 ][0]= mx - mpx;
5782                         mvd_cache[ 0 ][1]= my - mpy;
5783                     }
5784                 }else{
5785                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5786                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5787                     p[0] = p[1] = p[8] = p[9] = 0;
5788                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5789                 }
5790             }
5791         }
5792     } else if( IS_DIRECT(mb_type) ) {
5793         pred_direct_motion(h, &mb_type);
5794         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5795         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5796         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5797     } else {
5798         int list, mx, my, i, mpx, mpy;
5799         if(IS_16X16(mb_type)){
5800             for(list=0; list<h->list_count; list++){
5801                 if(IS_DIR(mb_type, 0, list)){
5802                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5803                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5804                 }else
5805                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5806             }
5807             for(list=0; list<h->list_count; list++){
5808                 if(IS_DIR(mb_type, 0, list)){
5809                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5810
5811                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5812                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5813                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5814
5815                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5816                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5817                 }else
5818                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5819             }
5820         }
5821         else if(IS_16X8(mb_type)){
5822             for(list=0; list<h->list_count; list++){
5823                     for(i=0; i<2; i++){
5824                         if(IS_DIR(mb_type, i, list)){
5825                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5826                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5827                         }else
5828                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5829                     }
5830             }
5831             for(list=0; list<h->list_count; list++){
5832                 for(i=0; i<2; i++){
5833                     if(IS_DIR(mb_type, i, list)){
5834                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5835                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5836                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5837                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5838
5839                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5840                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5841                     }else{
5842                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5843                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5844                     }
5845                 }
5846             }
5847         }else{
5848             assert(IS_8X16(mb_type));
5849             for(list=0; list<h->list_count; list++){
5850                     for(i=0; i<2; i++){
5851                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5852                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5853                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5854                         }else
5855                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5856                     }
5857             }
5858             for(list=0; list<h->list_count; list++){
5859                 for(i=0; i<2; i++){
5860                     if(IS_DIR(mb_type, i, list)){
5861                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5862                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5863                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5864
5865                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5866                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5867                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5868                     }else{
5869                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5870                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5871                     }
5872                 }
5873             }
5874         }
5875     }
5876
5877    if( IS_INTER( mb_type ) ) {
5878         h->chroma_pred_mode_table[mb_xy] = 0;
5879         write_back_motion( h, mb_type );
5880    }
5881
5882     if( !IS_INTRA16x16( mb_type ) ) {
5883         cbp  = decode_cabac_mb_cbp_luma( h );
5884         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5885     }
5886
5887     h->cbp_table[mb_xy] = h->cbp = cbp;
5888
5889     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5890         if( decode_cabac_mb_transform_size( h ) )
5891             mb_type |= MB_TYPE_8x8DCT;
5892     }
5893     s->current_picture.mb_type[mb_xy]= mb_type;
5894
5895     if( cbp || IS_INTRA16x16( mb_type ) ) {
5896         const uint8_t *scan, *scan8x8, *dc_scan;
5897         const uint32_t *qmul;
5898         int dqp;
5899
5900         if(IS_INTERLACED(mb_type)){
5901             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5902             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5903             dc_scan= luma_dc_field_scan;
5904         }else{
5905             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5906             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5907             dc_scan= luma_dc_zigzag_scan;
5908         }
5909
5910         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5911         if( dqp == INT_MIN ){
5912             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5913             return -1;
5914         }
5915         s->qscale += dqp;
5916         if(((unsigned)s->qscale) > 51){
5917             if(s->qscale<0) s->qscale+= 52;
5918             else            s->qscale-= 52;
5919         }
5920         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5921         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5922
5923         if( IS_INTRA16x16( mb_type ) ) {
5924             int i;
5925             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5926             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5927
5928             if( cbp&15 ) {
5929                 qmul = h->dequant4_coeff[0][s->qscale];
5930                 for( i = 0; i < 16; i++ ) {
5931                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5932                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5933                 }
5934             } else {
5935                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5936             }
5937         } else {
5938             int i8x8, i4x4;
5939             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5940                 if( cbp & (1<<i8x8) ) {
5941                     if( IS_8x8DCT(mb_type) ) {
5942                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5943                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5944                     } else {
5945                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5946                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5947                             const int index = 4*i8x8 + i4x4;
5948                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5949 //START_TIMER
5950                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5951 //STOP_TIMER("decode_residual")
5952                         }
5953                     }
5954                 } else {
5955                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5956                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5957                 }
5958             }
5959         }
5960
5961         if( cbp&0x30 ){
5962             int c;
5963             for( c = 0; c < 2; c++ ) {
5964                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5965                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5966             }
5967         }
5968
5969         if( cbp&0x20 ) {
5970             int c, i;
5971             for( c = 0; c < 2; c++ ) {
5972                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5973                 for( i = 0; i < 4; i++ ) {
5974                     const int index = 16 + 4 * c + i;
5975                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5976                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5977                 }
5978             }
5979         } else {
5980             uint8_t * const nnz= &h->non_zero_count_cache[0];
5981             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5982             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5983         }
5984     } else {
5985         uint8_t * const nnz= &h->non_zero_count_cache[0];
5986         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5987         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5988         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5989         h->last_qscale_diff = 0;
5990     }
5991
5992     s->current_picture.qscale_table[mb_xy]= s->qscale;
5993     write_back_non_zero_count(h);
5994
5995     if(MB_MBAFF){
5996         h->ref_count[0] >>= 1;
5997         h->ref_count[1] >>= 1;
5998     }
5999
6000     return 0;
6001 }
6002
6003
6004 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6005     int i, d;
6006     const int index_a = qp + h->slice_alpha_c0_offset;
6007     const int alpha = (alpha_table+52)[index_a];
6008     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6009
6010     if( bS[0] < 4 ) {
6011         int8_t tc[4];
6012         for(i=0; i<4; i++)
6013             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6014         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6015     } else {
6016         /* 16px edge length, because bS=4 is triggered by being at
6017          * the edge of an intra MB, so all 4 bS are the same */
6018             for( d = 0; d < 16; d++ ) {
6019                 const int p0 = pix[-1];
6020                 const int p1 = pix[-2];
6021                 const int p2 = pix[-3];
6022
6023                 const int q0 = pix[0];
6024                 const int q1 = pix[1];
6025                 const int q2 = pix[2];
6026
6027                 if( FFABS( p0 - q0 ) < alpha &&
6028                     FFABS( p1 - p0 ) < beta &&
6029                     FFABS( q1 - q0 ) < beta ) {
6030
6031                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6032                         if( FFABS( p2 - p0 ) < beta)
6033                         {
6034                             const int p3 = pix[-4];
6035                             /* p0', p1', p2' */
6036                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6037                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6038                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6039                         } else {
6040                             /* p0' */
6041                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6042                         }
6043                         if( FFABS( q2 - q0 ) < beta)
6044                         {
6045                             const int q3 = pix[3];
6046                             /* q0', q1', q2' */
6047                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6048                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6049                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6050                         } else {
6051                             /* q0' */
6052                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6053                         }
6054                     }else{
6055                         /* p0', q0' */
6056                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6057                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6058                     }
6059                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6060                 }
6061                 pix += stride;
6062             }
6063     }
6064 }
6065 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6066     int i;
6067     const int index_a = qp + h->slice_alpha_c0_offset;
6068     const int alpha = (alpha_table+52)[index_a];
6069     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6070
6071     if( bS[0] < 4 ) {
6072         int8_t tc[4];
6073         for(i=0; i<4; i++)
6074             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6075         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6076     } else {
6077         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6078     }
6079 }
6080
6081 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6082     int i;
6083     for( i = 0; i < 16; i++, pix += stride) {
6084         int index_a;
6085         int alpha;
6086         int beta;
6087
6088         int qp_index;
6089         int bS_index = (i >> 1);
6090         if (!MB_FIELD) {
6091             bS_index &= ~1;
6092             bS_index |= (i & 1);
6093         }
6094
6095         if( bS[bS_index] == 0 ) {
6096             continue;
6097         }
6098
6099         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6100         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6101         alpha = (alpha_table+52)[index_a];
6102         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6103
6104         if( bS[bS_index] < 4 ) {
6105             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6106             const int p0 = pix[-1];
6107             const int p1 = pix[-2];
6108             const int p2 = pix[-3];
6109             const int q0 = pix[0];
6110             const int q1 = pix[1];
6111             const int q2 = pix[2];
6112
6113             if( FFABS( p0 - q0 ) < alpha &&
6114                 FFABS( p1 - p0 ) < beta &&
6115                 FFABS( q1 - q0 ) < beta ) {
6116                 int tc = tc0;
6117                 int i_delta;
6118
6119                 if( FFABS( p2 - p0 ) < beta ) {
6120                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6121                     tc++;
6122                 }
6123                 if( FFABS( q2 - q0 ) < beta ) {
6124                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6125                     tc++;
6126                 }
6127
6128                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6129                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6130                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6131                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6132             }
6133         }else{
6134             const int p0 = pix[-1];
6135             const int p1 = pix[-2];
6136             const int p2 = pix[-3];
6137
6138             const int q0 = pix[0];
6139             const int q1 = pix[1];
6140             const int q2 = pix[2];
6141
6142             if( FFABS( p0 - q0 ) < alpha &&
6143                 FFABS( p1 - p0 ) < beta &&
6144                 FFABS( q1 - q0 ) < beta ) {
6145
6146                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6147                     if( FFABS( p2 - p0 ) < beta)
6148                     {
6149                         const int p3 = pix[-4];
6150                         /* p0', p1', p2' */
6151                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6152                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6153                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6154                     } else {
6155                         /* p0' */
6156                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6157                     }
6158                     if( FFABS( q2 - q0 ) < beta)
6159                     {
6160                         const int q3 = pix[3];
6161                         /* q0', q1', q2' */
6162                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6163                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6164                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6165                     } else {
6166                         /* q0' */
6167                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6168                     }
6169                 }else{
6170                     /* p0', q0' */
6171                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6172                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6173                 }
6174                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6175             }
6176         }
6177     }
6178 }
6179 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6180     int i;
6181     for( i = 0; i < 8; i++, pix += stride) {
6182         int index_a;
6183         int alpha;
6184         int beta;
6185
6186         int qp_index;
6187         int bS_index = i;
6188
6189         if( bS[bS_index] == 0 ) {
6190             continue;
6191         }
6192
6193         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6194         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6195         alpha = (alpha_table+52)[index_a];
6196         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6197
6198         if( bS[bS_index] < 4 ) {
6199             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6200             const int p0 = pix[-1];
6201             const int p1 = pix[-2];
6202             const int q0 = pix[0];
6203             const int q1 = pix[1];
6204
6205             if( FFABS( p0 - q0 ) < alpha &&
6206                 FFABS( p1 - p0 ) < beta &&
6207                 FFABS( q1 - q0 ) < beta ) {
6208                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6209
6210                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6211                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6212                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6213             }
6214         }else{
6215             const int p0 = pix[-1];
6216             const int p1 = pix[-2];
6217             const int q0 = pix[0];
6218             const int q1 = pix[1];
6219
6220             if( FFABS( p0 - q0 ) < alpha &&
6221                 FFABS( p1 - p0 ) < beta &&
6222                 FFABS( q1 - q0 ) < beta ) {
6223
6224                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6225                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6226                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6227             }
6228         }
6229     }
6230 }
6231
6232 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6233     int i, d;
6234     const int index_a = qp + h->slice_alpha_c0_offset;
6235     const int alpha = (alpha_table+52)[index_a];
6236     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6237     const int pix_next  = stride;
6238
6239     if( bS[0] < 4 ) {
6240         int8_t tc[4];
6241         for(i=0; i<4; i++)
6242             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6243         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6244     } else {
6245         /* 16px edge length, see filter_mb_edgev */
6246             for( d = 0; d < 16; d++ ) {
6247                 const int p0 = pix[-1*pix_next];
6248                 const int p1 = pix[-2*pix_next];
6249                 const int p2 = pix[-3*pix_next];
6250                 const int q0 = pix[0];
6251                 const int q1 = pix[1*pix_next];
6252                 const int q2 = pix[2*pix_next];
6253
6254                 if( FFABS( p0 - q0 ) < alpha &&
6255                     FFABS( p1 - p0 ) < beta &&
6256                     FFABS( q1 - q0 ) < beta ) {
6257
6258                     const int p3 = pix[-4*pix_next];
6259                     const int q3 = pix[ 3*pix_next];
6260
6261                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6262                         if( FFABS( p2 - p0 ) < beta) {
6263                             /* p0', p1', p2' */
6264                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6265                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6266                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6267                         } else {
6268                             /* p0' */
6269                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6270                         }
6271                         if( FFABS( q2 - q0 ) < beta) {
6272                             /* q0', q1', q2' */
6273                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6274                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6275                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6276                         } else {
6277                             /* q0' */
6278                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6279                         }
6280                     }else{
6281                         /* p0', q0' */
6282                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6283                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6284                     }
6285                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6286                 }
6287                 pix++;
6288             }
6289     }
6290 }
6291
6292 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6293     int i;
6294     const int index_a = qp + h->slice_alpha_c0_offset;
6295     const int alpha = (alpha_table+52)[index_a];
6296     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6297
6298     if( bS[0] < 4 ) {
6299         int8_t tc[4];
6300         for(i=0; i<4; i++)
6301             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6302         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6303     } else {
6304         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6305     }
6306 }
6307
/**
 * Deblocks one macroblock through a shortcut path and hands every case the
 * shortcut does not cover to filter_mb().
 *
 * filter_mb() is called instead when mb_x is 0, when mb_y equals
 * mb_y_firstrow (1 if picture_structure is PICT_BOTTOM_FIELD, else 0), when
 * the DSP table has no h264_loop_filter_strength function, when
 * pps.chroma_qp_diff is non-zero, or when deblocking_filter is 2 and the
 * slice_table entry of the top or left macroblock differs from this one's.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      top-left luma sample of the macroblock
 * @param img_cb     top-left Cb sample of the macroblock
 * @param img_cr     top-left Cr sample of the macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6308 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6309     MpegEncContext * const s = &h->s;
6310     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6311     int mb_xy, mb_type;
6312     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6313
6314     mb_xy = mb_x + mb_y*s->mb_stride;
6315
6316     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6317        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6318                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6319         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6320         return;
6321     }
6322     assert(!FRAME_MBAFF);
6323
         /* qp/qpc: this macroblock; qp0/qpc0: rounded average with the left
          * neighbour; qp1/qpc1: rounded average with the top neighbour.
          * All chroma values are derived with index 0 of get_chroma_qp(). */
6324     mb_type = s->current_picture.mb_type[mb_xy];
6325     qp = s->current_picture.qscale_table[mb_xy];
6326     qp0 = s->current_picture.qscale_table[mb_xy-1];
6327     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6328     qpc = get_chroma_qp( h, 0, qp );
6329     qpc0 = get_chroma_qp( h, 0, qp0 );
6330     qpc1 = get_chroma_qp( h, 0, qp1 );
6331     qp0 = (qp + qp0 + 1) >> 1;
6332     qp1 = (qp + qp1 + 1) >> 1;
6333     qpc0 = (qpc + qpc0 + 1) >> 1;
6334     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* nothing is filtered when every quantiser involved is at or below
          * the threshold (presumably because the filter would have no effect
          * at such low qp -- TODO confirm against alpha_table) */
6335     qp_thresh = 15 - h->slice_alpha_c0_offset;
6336     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6337        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6338         return;
6339
6340     if( IS_INTRA(mb_type) ) {
             /* intra macroblock: fixed strengths. The left edge uses 4, inner
              * edges use 3, and the top edge uses 3 in field pictures, else 4. */
6341         int16_t bS4[4] = {4,4,4,4};
6342         int16_t bS3[4] = {3,3,3,3};
6343         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6344         if( IS_8x8DCT(mb_type) ) {
                 /* with the 8x8 transform only luma edges 0 and 2 are filtered */
6345             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6346             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6347             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6348             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6349         } else {
6350             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6351             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6352             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6353             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6354             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6355             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6356             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6357             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6358         }
             /* chroma: two edges per direction and plane, at offsets 0 and 4 */
6359         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6360         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6361         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6362         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6363         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6364         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6365         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6366         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6367         return;
6368     } else {
             /* bS[dir][edge][4]: dir 0 is passed to the vertical-edge filters,
              * dir 1 to the horizontal-edge filters. bSv views the four
              * int16_t of one edge as a single 64-bit word so that they can
              * be set or tested for "all zero" at once. */
6369         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6370         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6371         int edges;
6372         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* strength 2 is written to edges 0 and 2 of both directions
                  * without calling the strength function */
6373             edges = 4;
6374             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6375         } else {
6376             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6377                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6378             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6379                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6380                              ? 3 : 0;
6381             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* a 16x16 macroblock with no coded luma needs only edge 0 */
6382             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6383             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6384                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6385         }
             /* an intra neighbour overrides the strength of the shared edge */
6386         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6387             bSv[0][0] = 0x0004000400040004ULL;
6388         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6389             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6390
             /* FILTER(hv,dir,edge): if any strength of the edge is non-zero,
              * filter the luma edge and, for even edges only, the Cb and Cr
              * edges. Edge 0 uses the neighbour-averaged quantiser
              * (qp0/qp1, qpc0/qpc1 via token pasting), inner edges use qp/qpc. */
6391 #define FILTER(hv,dir,edge)\
6392         if(bSv[dir][edge]) {\
6393             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6394             if(!(edge&1)) {\
6395                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6396                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6397             }\
6398         }
6399         if( edges == 1 ) {
6400             FILTER(v,0,0);
6401             FILTER(h,1,0);
6402         } else if( IS_8x8DCT(mb_type) ) {
6403             FILTER(v,0,0);
6404             FILTER(v,0,2);
6405             FILTER(h,1,0);
6406             FILTER(h,1,2);
6407         } else {
6408             FILTER(v,0,0);
6409             FILTER(v,0,1);
6410             FILTER(v,0,2);
6411             FILTER(v,0,3);
6412             FILTER(h,1,0);
6413             FILTER(h,1,1);
6414             FILTER(h,1,2);
6415             FILTER(h,1,3);
6416         }
6417 #undef FILTER
6418     }
6419 }
6420
6421 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6422     MpegEncContext * const s = &h->s;
6423     const int mb_xy= mb_x + mb_y*s->mb_stride;
6424     const int mb_type = s->current_picture.mb_type[mb_xy];
6425     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6426     int first_vertical_edge_done = 0;
6427     int dir;
6428     /* FIXME: A given frame may occupy more than one position in
6429      * the reference list. So ref2frm should be populated with
6430      * frame numbers, not indices. */
6431     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6432                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6433
6434     //for sufficiently low qp, filtering wouldn't do anything
6435     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6436     if(!FRAME_MBAFF){
6437         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6438         int qp = s->current_picture.qscale_table[mb_xy];
6439         if(qp <= qp_thresh
6440            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6441            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6442             return;
6443         }
6444     }
6445
6446     if (FRAME_MBAFF
6447             // left mb is in picture
6448             && h->slice_table[mb_xy-1] != 255
6449             // and current and left pair do not have the same interlaced type
6450             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6451             // and left mb is in the same slice if deblocking_filter == 2
6452             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6453         /* First vertical edge is different in MBAFF frames
6454          * There are 8 different bS to compute and 2 different Qp
6455          */
6456         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6457         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6458         int16_t bS[8];
6459         int qp[2];
6460         int bqp[2];
6461         int rqp[2];
6462         int mb_qp, mbn0_qp, mbn1_qp;
6463         int i;
6464         first_vertical_edge_done = 1;
6465
6466         if( IS_INTRA(mb_type) )
6467             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6468         else {
6469             for( i = 0; i < 8; i++ ) {
6470                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6471
6472                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6473                     bS[i] = 4;
6474                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6475                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6476                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6477                     bS[i] = 2;
6478                 else
6479                     bS[i] = 1;
6480             }
6481         }
6482
6483         mb_qp = s->current_picture.qscale_table[mb_xy];
6484         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6485         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6486         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6487         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6488                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6489         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6490                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6491         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6492         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6493                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6494         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6495                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6496
6497         /* Filter edge */
6498         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6499         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6500         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6501         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6502         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6503     }
6504     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6505     for( dir = 0; dir < 2; dir++ )
6506     {
6507         int edge;
6508         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6509         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6510         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6511
6512         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6513                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6514         // how often to recheck mv-based bS when iterating between edges
6515         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6516                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6517         // how often to recheck mv-based bS when iterating along each edge
6518         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6519
6520         if (first_vertical_edge_done) {
6521             start = 1;
6522             first_vertical_edge_done = 0;
6523         }
6524
6525         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6526             start = 1;
6527
6528         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6529             && !IS_INTERLACED(mb_type)
6530             && IS_INTERLACED(mbm_type)
6531             ) {
6532             // This is a special case in the norm where the filtering must
6533             // be done twice (one each of the field) even if we are in a
6534             // frame macroblock.
6535             //
6536             static const int nnz_idx[4] = {4,5,6,3};
6537             unsigned int tmp_linesize   = 2 *   linesize;
6538             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6539             int mbn_xy = mb_xy - 2 * s->mb_stride;
6540             int qp;
6541             int i, j;
6542             int16_t bS[4];
6543
6544             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6545                 if( IS_INTRA(mb_type) ||
6546                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6547                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6548                 } else {
6549                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6550                     for( i = 0; i < 4; i++ ) {
6551                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6552                             mbn_nnz[nnz_idx[i]] != 0 )
6553                             bS[i] = 2;
6554                         else
6555                             bS[i] = 1;
6556                     }
6557                 }
6558                 // Do not use s->qscale as luma quantizer because it has not the same
6559                 // value in IPCM macroblocks.
6560                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6561                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6562                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6563                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6564                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6565                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6566                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6567                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6568             }
6569
6570             start = 1;
6571         }
6572
6573         /* Calculate bS */
6574         for( edge = start; edge < edges; edge++ ) {
6575             /* mbn_xy: neighbor macroblock */
6576             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6577             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6578             int16_t bS[4];
6579             int qp;
6580
6581             if( (edge&1) && IS_8x8DCT(mb_type) )
6582                 continue;
6583
6584             if( IS_INTRA(mb_type) ||
6585                 IS_INTRA(mbn_type) ) {
6586                 int value;
6587                 if (edge == 0) {
6588                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6589                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6590                     ) {
6591                         value = 4;
6592                     } else {
6593                         value = 3;
6594                     }
6595                 } else {
6596                     value = 3;
6597                 }
6598                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6599             } else {
6600                 int i, l;
6601                 int mv_done;
6602
6603                 if( edge & mask_edge ) {
6604                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6605                     mv_done = 1;
6606                 }
6607                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6608                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6609                     mv_done = 1;
6610                 }
6611                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6612                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6613                     int bn_idx= b_idx - (dir ? 8:1);
6614                     int v = 0;
6615                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6616                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6617                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6618                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6619                     }
6620                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6621                     mv_done = 1;
6622                 }
6623                 else
6624                     mv_done = 0;
6625
6626                 for( i = 0; i < 4; i++ ) {
6627                     int x = dir == 0 ? edge : i;
6628                     int y = dir == 0 ? i    : edge;
6629                     int b_idx= 8 + 4 + x + 8*y;
6630                     int bn_idx= b_idx - (dir ? 8:1);
6631
6632                     if( h->non_zero_count_cache[b_idx] != 0 ||
6633                         h->non_zero_count_cache[bn_idx] != 0 ) {
6634                         bS[i] = 2;
6635                     }
6636                     else if(!mv_done)
6637                     {
6638                         bS[i] = 0;
6639                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6640                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6641                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6642                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6643                                 bS[i] = 1;
6644                                 break;
6645                             }
6646                         }
6647                     }
6648                 }
6649
6650                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6651                     continue;
6652             }
6653
6654             /* Filter edge */
6655             // Do not use s->qscale as luma quantizer because it has not the same
6656             // value in IPCM macroblocks.
6657             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6658             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6659             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6660             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6661             if( dir == 0 ) {
6662                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6663                 if( (edge&1) == 0 ) {
6664                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6665                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6666                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6667                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6668                 }
6669             } else {
6670                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6671                 if( (edge&1) == 0 ) {
6672                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6673                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6674                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6675                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6676                 }
6677             }
6678         }
6679     }
6680 }
6681
6682 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6683     MpegEncContext * const s = &h->s;
6684     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6685
6686     s->mb_skip_run= -1;
6687
6688     if( h->pps.cabac ) {
6689         int i;
6690
6691         /* realign */
6692         align_get_bits( &s->gb );
6693
6694         /* init cabac */
6695         ff_init_cabac_states( &h->cabac);
6696         ff_init_cabac_decoder( &h->cabac,
6697                                s->gb.buffer + get_bits_count(&s->gb)/8,
6698                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6699         /* calculate pre-state */
6700         for( i= 0; i < 460; i++ ) {
6701             int pre;
6702             if( h->slice_type == I_TYPE )
6703                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6704             else
6705                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6706
6707             if( pre <= 63 )
6708                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6709             else
6710                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6711         }
6712
6713         for(;;){
6714 //START_TIMER
6715             int ret = decode_mb_cabac(h);
6716             int eos;
6717 //STOP_TIMER("decode_mb_cabac")
6718
6719             if(ret>=0) hl_decode_mb(h);
6720
6721             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6722                 s->mb_y++;
6723
6724                 if(ret>=0) ret = decode_mb_cabac(h);
6725
6726                 if(ret>=0) hl_decode_mb(h);
6727                 s->mb_y--;
6728             }
6729             eos = get_cabac_terminate( &h->cabac );
6730
6731             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6732                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6733                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6734                 return -1;
6735             }
6736
6737             if( ++s->mb_x >= s->mb_width ) {
6738                 s->mb_x = 0;
6739                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6740                 ++s->mb_y;
6741                 if(FIELD_OR_MBAFF_PICTURE) {
6742                     ++s->mb_y;
6743                 }
6744             }
6745
6746             if( eos || s->mb_y >= s->mb_height ) {
6747                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6748                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6749                 return 0;
6750             }
6751         }
6752
6753     } else {
6754         for(;;){
6755             int ret = decode_mb_cavlc(h);
6756
6757             if(ret>=0) hl_decode_mb(h);
6758
6759             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6760                 s->mb_y++;
6761                 ret = decode_mb_cavlc(h);
6762
6763                 if(ret>=0) hl_decode_mb(h);
6764                 s->mb_y--;
6765             }
6766
6767             if(ret<0){
6768                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6769                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6770
6771                 return -1;
6772             }
6773
6774             if(++s->mb_x >= s->mb_width){
6775                 s->mb_x=0;
6776                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6777                 ++s->mb_y;
6778                 if(FIELD_OR_MBAFF_PICTURE) {
6779                     ++s->mb_y;
6780                 }
6781                 if(s->mb_y >= s->mb_height){
6782                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6783
6784                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6785                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6786
6787                         return 0;
6788                     }else{
6789                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6790
6791                         return -1;
6792                     }
6793                 }
6794             }
6795
6796             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6797                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6798                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6799                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6800
6801                     return 0;
6802                 }else{
6803                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6804
6805                     return -1;
6806                 }
6807             }
6808         }
6809     }
6810
6811 #if 0
6812     for(;s->mb_y < s->mb_height; s->mb_y++){
6813         for(;s->mb_x < s->mb_width; s->mb_x++){
6814             int ret= decode_mb(h);
6815
6816             hl_decode_mb(h);
6817
6818             if(ret<0){
6819                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6820                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6821
6822                 return -1;
6823             }
6824
6825             if(++s->mb_x >= s->mb_width){
6826                 s->mb_x=0;
6827                 if(++s->mb_y >= s->mb_height){
6828                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6829                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6830
6831                         return 0;
6832                     }else{
6833                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6834
6835                         return -1;
6836                     }
6837                 }
6838             }
6839
6840             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6841                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6842                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6843
6844                     return 0;
6845                 }else{
6846                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6847
6848                     return -1;
6849                 }
6850             }
6851         }
6852         s->mb_x=0;
6853         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6854     }
6855 #endif
6856     return -1; //not reached
6857 }
6858
6859 static int decode_unregistered_user_data(H264Context *h, int size){
6860     MpegEncContext * const s = &h->s;
6861     uint8_t user_data[16+256];
6862     int e, build, i;
6863
6864     if(size<16)
6865         return -1;
6866
6867     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6868         user_data[i]= get_bits(&s->gb, 8);
6869     }
6870
6871     user_data[i]= 0;
6872     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6873     if(e==1 && build>=0)
6874         h->x264_build= build;
6875
6876     if(s->avctx->debug & FF_DEBUG_BUGS)
6877         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6878
6879     for(; i<size; i++)
6880         skip_bits(&s->gb, 8);
6881
6882     return 0;
6883 }
6884
6885 static int decode_sei(H264Context *h){
6886     MpegEncContext * const s = &h->s;
6887
6888     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6889         int size, type;
6890
6891         type=0;
6892         do{
6893             type+= show_bits(&s->gb, 8);
6894         }while(get_bits(&s->gb, 8) == 255);
6895
6896         size=0;
6897         do{
6898             size+= show_bits(&s->gb, 8);
6899         }while(get_bits(&s->gb, 8) == 255);
6900
6901         switch(type){
6902         case 5:
6903             if(decode_unregistered_user_data(h, size) < 0)
6904                 return -1;
6905             break;
6906         default:
6907             skip_bits(&s->gb, 8*size);
6908         }
6909
6910         //FIXME check bits here
6911         align_get_bits(&s->gb);
6912     }
6913
6914     return 0;
6915 }
6916
6917 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6918     MpegEncContext * const s = &h->s;
6919     int cpb_count, i;
6920     cpb_count = get_ue_golomb(&s->gb) + 1;
6921     get_bits(&s->gb, 4); /* bit_rate_scale */
6922     get_bits(&s->gb, 4); /* cpb_size_scale */
6923     for(i=0; i<cpb_count; i++){
6924         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6925         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6926         get_bits1(&s->gb);     /* cbr_flag */
6927     }
6928     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6929     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6930     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6931     get_bits(&s->gb, 5); /* time_offset_length */
6932 }
6933
6934 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6935     MpegEncContext * const s = &h->s;
6936     int aspect_ratio_info_present_flag;
6937     unsigned int aspect_ratio_idc;
6938     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6939
6940     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6941
6942     if( aspect_ratio_info_present_flag ) {
6943         aspect_ratio_idc= get_bits(&s->gb, 8);
6944         if( aspect_ratio_idc == EXTENDED_SAR ) {
6945             sps->sar.num= get_bits(&s->gb, 16);
6946             sps->sar.den= get_bits(&s->gb, 16);
6947         }else if(aspect_ratio_idc < 14){
6948             sps->sar=  pixel_aspect[aspect_ratio_idc];
6949         }else{
6950             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6951             return -1;
6952         }
6953     }else{
6954         sps->sar.num=
6955         sps->sar.den= 0;
6956     }
6957 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6958
6959     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6960         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6961     }
6962
6963     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6964         get_bits(&s->gb, 3);    /* video_format */
6965         get_bits1(&s->gb);      /* video_full_range_flag */
6966         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6967             get_bits(&s->gb, 8); /* colour_primaries */
6968             get_bits(&s->gb, 8); /* transfer_characteristics */
6969             get_bits(&s->gb, 8); /* matrix_coefficients */
6970         }
6971     }
6972
6973     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6974         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6975         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6976     }
6977
6978     sps->timing_info_present_flag = get_bits1(&s->gb);
6979     if(sps->timing_info_present_flag){
6980         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6981         sps->time_scale = get_bits_long(&s->gb, 32);
6982         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6983     }
6984
6985     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6986     if(nal_hrd_parameters_present_flag)
6987         decode_hrd_parameters(h, sps);
6988     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6989     if(vcl_hrd_parameters_present_flag)
6990         decode_hrd_parameters(h, sps);
6991     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6992         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6993     get_bits1(&s->gb);         /* pic_struct_present_flag */
6994
6995     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6996     if(sps->bitstream_restriction_flag){
6997         unsigned int num_reorder_frames;
6998         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6999         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7000         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7001         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7002         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7003         num_reorder_frames= get_ue_golomb(&s->gb);
7004         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7005
7006         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7007             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7008             return -1;
7009         }
7010
7011         sps->num_reorder_frames= num_reorder_frames;
7012     }
7013
7014     return 0;
7015 }
7016
7017 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7018                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7019     MpegEncContext * const s = &h->s;
7020     int i, last = 8, next = 8;
7021     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7022     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7023         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7024     else
7025     for(i=0;i<size;i++){
7026         if(next)
7027             next = (last + get_se_golomb(&s->gb)) & 0xff;
7028         if(!i && !next){ /* matrix not written, we use the preset one */
7029             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7030             break;
7031         }
7032         last = factors[scan[i]] = next ? next : last;
7033     }
7034 }
7035
7036 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7037                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7038     MpegEncContext * const s = &h->s;
7039     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7040     const uint8_t *fallback[4] = {
7041         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7042         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7043         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7044         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7045     };
7046     if(get_bits1(&s->gb)){
7047         sps->scaling_matrix_present |= is_sps;
7048         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7049         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7050         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7051         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7052         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7053         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7054         if(is_sps || pps->transform_8x8_mode){
7055             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7056             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7057         }
7058     } else if(fallback_sps) {
7059         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7060         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7061     }
7062 }
7063
7064 /**
7065  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7066  */
7067 static void *
7068 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7069                     const size_t size, const char *name)
7070 {
7071     if(id>=max) {
7072         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7073         return NULL;
7074     }
7075
7076     if(!vec[id]) {
7077         vec[id] = av_mallocz(size);
7078         if(vec[id] == NULL)
7079             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7080     }
7081     return vec[id];
7082 }
7083
7084 static inline int decode_seq_parameter_set(H264Context *h){
7085     MpegEncContext * const s = &h->s;
7086     int profile_idc, level_idc;
7087     unsigned int sps_id, tmp, mb_width, mb_height;
7088     int i;
7089     SPS *sps;
7090
7091     profile_idc= get_bits(&s->gb, 8);
7092     get_bits1(&s->gb);   //constraint_set0_flag
7093     get_bits1(&s->gb);   //constraint_set1_flag
7094     get_bits1(&s->gb);   //constraint_set2_flag
7095     get_bits1(&s->gb);   //constraint_set3_flag
7096     get_bits(&s->gb, 4); // reserved
7097     level_idc= get_bits(&s->gb, 8);
7098     sps_id= get_ue_golomb(&s->gb);
7099
7100     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7101     if(sps == NULL)
7102         return -1;
7103
7104     sps->profile_idc= profile_idc;
7105     sps->level_idc= level_idc;
7106
7107     if(sps->profile_idc >= 100){ //high profile
7108         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7109             get_bits1(&s->gb);  //residual_color_transform_flag
7110         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7111         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7112         sps->transform_bypass = get_bits1(&s->gb);
7113         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7114     }else
7115         sps->scaling_matrix_present = 0;
7116
7117     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7118     sps->poc_type= get_ue_golomb(&s->gb);
7119
7120     if(sps->poc_type == 0){ //FIXME #define
7121         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7122     } else if(sps->poc_type == 1){//FIXME #define
7123         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7124         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7125         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7126         tmp= get_ue_golomb(&s->gb);
7127
7128         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7129             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7130             return -1;
7131         }
7132         sps->poc_cycle_length= tmp;
7133
7134         for(i=0; i<sps->poc_cycle_length; i++)
7135             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7136     }else if(sps->poc_type != 2){
7137         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7138         return -1;
7139     }
7140
7141     tmp= get_ue_golomb(&s->gb);
7142     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7143         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7144         return -1;
7145     }
7146     sps->ref_frame_count= tmp;
7147     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7148     mb_width= get_ue_golomb(&s->gb) + 1;
7149     mb_height= get_ue_golomb(&s->gb) + 1;
7150     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7151        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7152         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7153         return -1;
7154     }
7155     sps->mb_width = mb_width;
7156     sps->mb_height= mb_height;
7157
7158     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7159     if(!sps->frame_mbs_only_flag)
7160         sps->mb_aff= get_bits1(&s->gb);
7161     else
7162         sps->mb_aff= 0;
7163
7164     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7165
7166 #ifndef ALLOW_INTERLACE
7167     if(sps->mb_aff)
7168         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7169 #endif
7170     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7171         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7172
7173     sps->crop= get_bits1(&s->gb);
7174     if(sps->crop){
7175         sps->crop_left  = get_ue_golomb(&s->gb);
7176         sps->crop_right = get_ue_golomb(&s->gb);
7177         sps->crop_top   = get_ue_golomb(&s->gb);
7178         sps->crop_bottom= get_ue_golomb(&s->gb);
7179         if(sps->crop_left || sps->crop_top){
7180             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7181         }
7182     }else{
7183         sps->crop_left  =
7184         sps->crop_right =
7185         sps->crop_top   =
7186         sps->crop_bottom= 0;
7187     }
7188
7189     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7190     if( sps->vui_parameters_present_flag )
7191         decode_vui_parameters(h, sps);
7192
7193     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7194         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7195                sps_id, sps->profile_idc, sps->level_idc,
7196                sps->poc_type,
7197                sps->ref_frame_count,
7198                sps->mb_width, sps->mb_height,
7199                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7200                sps->direct_8x8_inference_flag ? "8B8" : "",
7201                sps->crop_left, sps->crop_right,
7202                sps->crop_top, sps->crop_bottom,
7203                sps->vui_parameters_present_flag ? "VUI" : ""
7204                );
7205     }
7206     return 0;
7207 }
7208
7209 static void
7210 build_qp_table(PPS *pps, int t, int index)
7211 {
7212     int i;
7213     for(i = 0; i < 255; i++)
7214         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7215 }
7216
7217 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7218     MpegEncContext * const s = &h->s;
7219     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7220     PPS *pps;
7221
7222     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7223     if(pps == NULL)
7224         return -1;
7225
7226     tmp= get_ue_golomb(&s->gb);
7227     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7228         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7229         return -1;
7230     }
7231     pps->sps_id= tmp;
7232
7233     pps->cabac= get_bits1(&s->gb);
7234     pps->pic_order_present= get_bits1(&s->gb);
7235     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7236     if(pps->slice_group_count > 1 ){
7237         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7238         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7239         switch(pps->mb_slice_group_map_type){
7240         case 0:
7241 #if 0
7242 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7243 |    run_length[ i ]                                |1  |ue(v)   |
7244 #endif
7245             break;
7246         case 2:
7247 #if 0
7248 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7249 |{                                                  |   |        |
7250 |    top_left_mb[ i ]                               |1  |ue(v)   |
7251 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7252 |   }                                               |   |        |
7253 #endif
7254             break;
7255         case 3:
7256         case 4:
7257         case 5:
7258 #if 0
7259 |   slice_group_change_direction_flag               |1  |u(1)    |
7260 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7261 #endif
7262             break;
7263         case 6:
7264 #if 0
7265 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7266 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7267 |)                                                  |   |        |
7268 |    slice_group_id[ i ]                            |1  |u(v)    |
7269 #endif
7270             break;
7271         }
7272     }
7273     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7274     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7275     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7276         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7277         pps->ref_count[0]= pps->ref_count[1]= 1;
7278         return -1;
7279     }
7280
7281     pps->weighted_pred= get_bits1(&s->gb);
7282     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7283     pps->init_qp= get_se_golomb(&s->gb) + 26;
7284     pps->init_qs= get_se_golomb(&s->gb) + 26;
7285     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7286     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7287     pps->constrained_intra_pred= get_bits1(&s->gb);
7288     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7289
7290     pps->transform_8x8_mode= 0;
7291     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7292     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7293     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7294
7295     if(get_bits_count(&s->gb) < bit_length){
7296         pps->transform_8x8_mode= get_bits1(&s->gb);
7297         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7298         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7299     } else {
7300         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7301     }
7302
7303     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7304     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7305         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7306         h->pps.chroma_qp_diff= 1;
7307     } else
7308         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7309
7310     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7311         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7312                pps_id, pps->sps_id,
7313                pps->cabac ? "CABAC" : "CAVLC",
7314                pps->slice_group_count,
7315                pps->ref_count[0], pps->ref_count[1],
7316                pps->weighted_pred ? "weighted" : "",
7317                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7318                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7319                pps->constrained_intra_pred ? "CONSTR" : "",
7320                pps->redundant_pic_cnt_present ? "REDU" : "",
7321                pps->transform_8x8_mode ? "8x8DCT" : ""
7322                );
7323     }
7324
7325     return 0;
7326 }
7327
7328 /**
7329  * Call decode_slice() for each context.
7330  *
7331  * @param h h264 master context
7332  * @param context_count number of contexts to execute
7333  */
7334 static void execute_decode_slices(H264Context *h, int context_count){
7335     MpegEncContext * const s = &h->s;
7336     AVCodecContext * const avctx= s->avctx;
7337     H264Context *hx;
7338     int i;
7339
7340     if(context_count == 1) {
7341         decode_slice(avctx, h);
7342     } else {
7343         for(i = 1; i < context_count; i++) {
7344             hx = h->thread_context[i];
7345             hx->s.error_resilience = avctx->error_resilience;
7346             hx->s.error_count = 0;
7347         }
7348
7349         avctx->execute(avctx, (void *)decode_slice,
7350                        (void **)h->thread_context, NULL, context_count);
7351
7352         /* pull back stuff from slices to master context */
7353         hx = h->thread_context[context_count - 1];
7354         s->mb_x = hx->s.mb_x;
7355         s->mb_y = hx->s.mb_y;
7356         s->dropable = hx->s.dropable;
7357         s->picture_structure = hx->s.picture_structure;
7358         for(i = 1; i < context_count; i++)
7359             h->s.error_count += h->thread_context[i]->s.error_count;
7360     }
7361 }
7362
7363
7364 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7365     MpegEncContext * const s = &h->s;
7366     AVCodecContext * const avctx= s->avctx;
7367     int buf_index=0;
7368     H264Context *hx; ///< thread context
7369     int context_count = 0;
7370
7371     h->max_contexts = avctx->thread_count;
7372 #if 0
7373     int i;
7374     for(i=0; i<50; i++){
7375         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7376     }
7377 #endif
7378     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7379         h->current_slice = 0;
7380         if (!s->first_field)
7381             s->current_picture_ptr= NULL;
7382     }
7383
7384     for(;;){
7385         int consumed;
7386         int dst_length;
7387         int bit_length;
7388         const uint8_t *ptr;
7389         int i, nalsize = 0;
7390         int err;
7391
7392         if(h->is_avc) {
7393             if(buf_index >= buf_size) break;
7394             nalsize = 0;
7395             for(i = 0; i < h->nal_length_size; i++)
7396                 nalsize = (nalsize << 8) | buf[buf_index++];
7397             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7398                 if(nalsize == 1){
7399                     buf_index++;
7400                     continue;
7401                 }else{
7402                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7403                     break;
7404                 }
7405             }
7406         } else {
7407             // start code prefix search
7408             for(; buf_index + 3 < buf_size; buf_index++){
7409                 // This should always succeed in the first iteration.
7410                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7411                     break;
7412             }
7413
7414             if(buf_index+3 >= buf_size) break;
7415
7416             buf_index+=3;
7417         }
7418
7419         hx = h->thread_context[context_count];
7420
7421         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7422         if (ptr==NULL || dst_length < 0){
7423             return -1;
7424         }
7425         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7426             dst_length--;
7427         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7428
7429         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7430             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7431         }
7432
7433         if (h->is_avc && (nalsize != consumed))
7434             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7435
7436         buf_index += consumed;
7437
7438         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7439            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7440             continue;
7441
7442       again:
7443         err = 0;
7444         switch(hx->nal_unit_type){
7445         case NAL_IDR_SLICE:
7446             if (h->nal_unit_type != NAL_IDR_SLICE) {
7447                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7448                 return -1;
7449             }
7450             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7451         case NAL_SLICE:
7452             init_get_bits(&hx->s.gb, ptr, bit_length);
7453             hx->intra_gb_ptr=
7454             hx->inter_gb_ptr= &hx->s.gb;
7455             hx->s.data_partitioning = 0;
7456
7457             if((err = decode_slice_header(hx, h)))
7458                break;
7459
7460             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7461             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7462                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7463                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7464                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7465                && avctx->skip_frame < AVDISCARD_ALL)
7466                 context_count++;
7467             break;
7468         case NAL_DPA:
7469             init_get_bits(&hx->s.gb, ptr, bit_length);
7470             hx->intra_gb_ptr=
7471             hx->inter_gb_ptr= NULL;
7472             hx->s.data_partitioning = 1;
7473
7474             err = decode_slice_header(hx, h);
7475             break;
7476         case NAL_DPB:
7477             init_get_bits(&hx->intra_gb, ptr, bit_length);
7478             hx->intra_gb_ptr= &hx->intra_gb;
7479             break;
7480         case NAL_DPC:
7481             init_get_bits(&hx->inter_gb, ptr, bit_length);
7482             hx->inter_gb_ptr= &hx->inter_gb;
7483
7484             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7485                && s->context_initialized
7486                && s->hurry_up < 5
7487                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7488                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7489                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7490                && avctx->skip_frame < AVDISCARD_ALL)
7491                 context_count++;
7492             break;
7493         case NAL_SEI:
7494             init_get_bits(&s->gb, ptr, bit_length);
7495             decode_sei(h);
7496             break;
7497         case NAL_SPS:
7498             init_get_bits(&s->gb, ptr, bit_length);
7499             decode_seq_parameter_set(h);
7500
7501             if(s->flags& CODEC_FLAG_LOW_DELAY)
7502                 s->low_delay=1;
7503
7504             if(avctx->has_b_frames < 2)
7505                 avctx->has_b_frames= !s->low_delay;
7506             break;
7507         case NAL_PPS:
7508             init_get_bits(&s->gb, ptr, bit_length);
7509
7510             decode_picture_parameter_set(h, bit_length);
7511
7512             break;
7513         case NAL_AUD:
7514         case NAL_END_SEQUENCE:
7515         case NAL_END_STREAM:
7516         case NAL_FILLER_DATA:
7517         case NAL_SPS_EXT:
7518         case NAL_AUXILIARY_SLICE:
7519             break;
7520         default:
7521             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7522         }
7523
7524         if(context_count == h->max_contexts) {
7525             execute_decode_slices(h, context_count);
7526             context_count = 0;
7527         }
7528
7529         if (err < 0)
7530             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7531         else if(err == 1) {
7532             /* Slice could not be decoded in parallel mode, copy down
7533              * NAL unit stuff to context 0 and restart. Note that
7534              * rbsp_buffer is not transfered, but since we no longer
7535              * run in parallel mode this should not be an issue. */
7536             h->nal_unit_type = hx->nal_unit_type;
7537             h->nal_ref_idc   = hx->nal_ref_idc;
7538             hx = h;
7539             goto again;
7540         }
7541     }
7542     if(context_count)
7543         execute_decode_slices(h, context_count);
7544     return buf_index;
7545 }
7546
7547 /**
7548  * returns the number of bytes consumed for building the current frame
7549  */
7550 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7551     if(s->flags&CODEC_FLAG_TRUNCATED){
7552         pos -= s->parse_context.last_index;
7553         if(pos<0) pos=0; // FIXME remove (unneeded?)
7554
7555         return pos;
7556     }else{
7557         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7558         if(pos+10>buf_size) pos=buf_size; // oops ;)
7559
7560         return pos;
7561     }
7562 }
7563
/**
 * Decodes the NAL units of one input buffer and, if available, hands out a
 * picture.
 *
 * @param avctx     codec context; avctx->priv_data is the H264Context
 * @param data      destination, written as an AVFrame when a picture is
 *                  output
 * @param data_size set to sizeof(AVFrame) when a picture was written to
 *                  'data' and to 0 on the paths that explicitly output
 *                  nothing; some paths leave it untouched
 * @param buf       input buffer
 * @param buf_size  size of the input buffer in bytes; 0 requests the output
 *                  of an already decoded, still delayed picture
 * @return the result of get_consumed_bytes() on the normal path, 0 after
 *         the buf_size == 0 path or when no frame was produced because of
 *         frame skipping, buf_size when ff_combine_frame() reports a
 *         negative value, and -1 on error
 */
static int decode_frame(AVCodecContext *avctx,
                             void *data, int *data_size,
                             const uint8_t *buf, int buf_size)
{
    H264Context *h = avctx->priv_data;
    MpegEncContext *s = &h->s;
    AVFrame *pict = data;
    int buf_index;

    s->flags= avctx->flags;
    s->flags2= avctx->flags2;

    /* no supplementary picture: output one of the delayed pictures */
    if (buf_size == 0) {
        Picture *out;
        int i, out_idx;

//FIXME factorize this with the output code below
        /* pick the delayed picture with the lowest poc, looking no further
         * than the next key frame in the list */
        out = h->delayed_pic[0];
        out_idx = 0;
        for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
            if(h->delayed_pic[i]->poc < out->poc){
                out = h->delayed_pic[i];
                out_idx = i;
            }

        /* remove the chosen entry by shifting the rest of the list down */
        for(i=out_idx; h->delayed_pic[i]; i++)
            h->delayed_pic[i] = h->delayed_pic[i+1];

        if(out){
            *data_size = sizeof(AVFrame);
            *pict= *(AVFrame*)out;
        }

        return 0;
    }

    if(s->flags&CODEC_FLAG_TRUNCATED){
        int next= ff_h264_find_frame_end(h, buf, buf_size);

        /* buf and buf_size may be replaced by ff_combine_frame() */
        if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
            return buf_size;
//printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
    }

    /* In AVC mode the parameter sets are stored in the avcC extradata, which
     * is parsed once (guarded by got_avcC). */
    if(h->is_avc && !h->got_avcC) {
        int i, cnt, nalsize;
        unsigned char *p = avctx->extradata;
        if(avctx->extradata_size < 7) {
            av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
            return -1;
        }
        if(*p != 1) {
            av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
            return -1;
        }
        /* sps and pps in the avcC always have length coded with 2 bytes,
           so put a fake nal_length_size = 2 while parsing them */
        h->nal_length_size = 2;
        // Decode sps from avcC
        cnt = *(p+5) & 0x1f; // Number of sps
        p += 6;
        for (i = 0; i < cnt; i++) {
            /* + 2 accounts for the length field itself, which
             * decode_nal_units() reads again */
            nalsize = AV_RB16(p) + 2;
            if(decode_nal_units(h, p, nalsize) < 0) {
                av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
                return -1;
            }
            p += nalsize;
        }
        // Decode pps from avcC
        cnt = *(p++); // Number of pps
        for (i = 0; i < cnt; i++) {
            nalsize = AV_RB16(p) + 2;
            /* NOTE(review): unlike the sps loop above, this requires the
             * whole entry to be consumed instead of only testing for < 0 */
            if(decode_nal_units(h, p, nalsize)  != nalsize) {
                av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
                return -1;
            }
            p += nalsize;
        }
        // Now store the right nal length size, which will be used to parse all other nals
        h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
        // Do not reparse avcC
        h->got_avcC = 1;
    }

    /* outside AVC mode, extradata is parsed as NAL units before the first frame */
    if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
        if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
            return -1;
    }

    buf_index=decode_nal_units(h, buf, buf_size);
    if(buf_index < 0)
        return -1;

    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
        /* having no picture is not an error when frames are being skipped */
        if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
        av_log(avctx, AV_LOG_ERROR, "no frame!\n");
        return -1;
    }

    /* Finish the picture. In chunk mode this only happens once mb_y has
     * reached the (non-zero) picture height. */
    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
        Picture *out = s->current_picture_ptr;
        Picture *cur = s->current_picture_ptr;
        Picture *prev = h->delayed_output_pic;
        int i, pics, cross_idr, out_of_order, out_idx;

        s->mb_y= 0;

        s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
        s->current_picture_ptr->pict_type= s->pict_type;

        /* remember the frame_num / poc state for the next picture */
        h->prev_frame_num_offset= h->frame_num_offset;
        h->prev_frame_num= h->frame_num;
        if(!s->dropable) {
            h->prev_poc_msb= h->poc_msb;
            h->prev_poc_lsb= h->poc_lsb;
            execute_ref_pic_marking(h, h->mmco, h->mmco_index);
        }

        /*
         * FIXME: Error handling code does not seem to support interlaced
         * when slices span multiple rows
         * The ff_er_add_slice calls don't work right for bottom
         * fields; they cause massive erroneous error concealing
         * Error marking covers both fields (top and bottom).
         * This causes a mismatched s->error_count
         * and a bad error table. Further, the error count goes to
         * INT_MAX when called for bottom field, because mb_y is
         * past end by one (callers fault) and resync_mb_y != 0
         * causes problems for the first MB line, too.
         */
        if (!FIELD_PICTURE)
            ff_er_frame_end(s);

        MPV_frame_end(s);

        if (s->first_field) {
            /* Wait for second field. */
            *data_size = 0;

        } else {
            cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
            /* Derive top_field_first from field pocs. */
            cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];

        //FIXME do something with unavailable reference frames

#if 0 //decode order
            *data_size = sizeof(AVFrame);
#else
            /* Sort B-frames into display order */

            /* raise has_b_frames to the reorder depth signalled in the SPS */
            if(h->sps.bitstream_restriction_flag
               && s->avctx->has_b_frames < h->sps.num_reorder_frames){
                s->avctx->has_b_frames = h->sps.num_reorder_frames;
                s->low_delay = 0;
            }

            /* count the delayed pictures; the list is NULL terminated */
            pics = 0;
            while(h->delayed_pic[pics]) pics++;

            assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));

            /* append the current picture and keep it alive until output by
             * marking it DELAYED_PIC_REF if it is not a reference */
            h->delayed_pic[pics++] = cur;
            if(cur->reference == 0)
                cur->reference = DELAYED_PIC_REF;

            /* cross_idr: some delayed picture is a key frame or has poc 0 */
            cross_idr = 0;
            for(i=0; h->delayed_pic[i]; i++)
                if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
                    cross_idr = 1;

            /* candidate: lowest poc, looking no further than the next key frame */
            out = h->delayed_pic[0];
            out_idx = 0;
            for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
                if(h->delayed_pic[i]->poc < out->poc){
                    out = h->delayed_pic[i];
                    out_idx = i;
                }

            /* out_of_order: the candidate precedes the picture output last.
             * Selecting 'prev' again below means "output nothing this time"
             * (see the data_size handling further down). */
            out_of_order = !cross_idr && prev && out->poc < prev->poc;
            if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
                { }
            else if(prev && pics <= s->avctx->has_b_frames)
                out = prev;
            else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
               || (s->low_delay &&
                ((!cross_idr && prev && out->poc > prev->poc + 2)
                 || cur->pict_type == B_TYPE)))
            {
                /* the assumed delay was too small: increase it */
                s->low_delay = 0;
                s->avctx->has_b_frames++;
                out = prev;
            }
            else if(out_of_order)
                out = prev;

            /* drop the candidate from the delayed list */
            if(out_of_order || pics > s->avctx->has_b_frames){
                for(i=out_idx; h->delayed_pic[i]; i++)
                    h->delayed_pic[i] = h->delayed_pic[i+1];
            }

            if(prev == out)
                *data_size = 0;
            else
                *data_size = sizeof(AVFrame);
            /* the previously output picture no longer has to be kept */
            if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
                prev->reference = 0;
            h->delayed_output_pic = out;
#endif

            if(out)
                *pict= *(AVFrame*)out;
            else
                av_log(avctx, AV_LOG_DEBUG, "no picture\n");
        }
    }

    assert(pict->data[0] || !*data_size);
    ff_print_debug_info(s, pict);
//printf("out %d\n", (int)pict->data[0]);
#if 0 //?

    /* Return the Picture timestamp as the frame number */
    /* we subtract 1 because it is added on utils.c     */
    avctx->frame_number = s->picture_number - 1;
#endif
    return get_consumed_bytes(s, buf_index, buf_size);
}
#if 0
/**
 * Disabled code (compiled out by the surrounding #if 0).
 *
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y): a neighbour
 * is flagged when its slice_table entry equals the current h->slice_num.
 * Entries 0..2 refer to the row above (x-1, x, x+1), entry 3 to the
 * macroblock to the left; entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(s->mb_y){
        h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
    }else{
        /* first row: there is no row above */
        h->mb_avail[0]=
        h->mb_avail[1]=
        h->mb_avail[2]= 0;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7813
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, only built when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp golomb codes into a buffer,
 * reads them back and reports every value that does not round-trip. The
 * (I)DCT, quantizer and NAL layer tests are currently disabled with #if 0.
 *
 * @return 0
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the mismatch report */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value that was written is i - COUNT/2, so that is the
             * expected value to report */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7988
7989
7990 static int decode_end(AVCodecContext *avctx)
7991 {
7992     H264Context *h = avctx->priv_data;
7993     MpegEncContext *s = &h->s;
7994
7995     av_freep(&h->rbsp_buffer[0]);
7996     av_freep(&h->rbsp_buffer[1]);
7997     free_tables(h); //FIXME cleanup init stuff perhaps
7998     MPV_common_end(s);
7999
8000 //    memset(h, 0, sizeof(H264Context));
8001
8002     return 0;
8003 }
8004
8005
/**
 * Registration entry describing the H.264 decoder.
 *
 * All entries except the last are positional, so their order has to match
 * the member order of AVCodec (declared outside this file). Only the flush
 * callback is assigned by member name.
 */
AVCodec h264_decoder = {
    "h264",
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context), // NOTE(review): presumably the size of avctx->priv_data, which decode_end() uses as an H264Context -- confirm against AVCodec
    decode_init,
    NULL, // NOTE(review): presumably the unused encode callback slot -- confirm against AVCodec
    decode_end,
    decode_frame,
    // CODEC_CAP_DRAW_HORIZ_BAND is deliberately left commented out of the flag set below
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
    .flush= flush_dpb, // set by name rather than by position
};
8018
8019 #include "svq3.c"