git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  65
  66 static av_always_inline uint32_t pack16to32(int a, int b){
  67 #ifdef WORDS_BIGENDIAN
  68    return (b&0xFFFF) + (a<<16);
  69 #else
  70    return (a&0xFFFF) + (b<<16);
  71 #endif
  72 }
  73
  74 const uint8_t ff_rem6[52]={
  75 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  76 };
  77
  78 const uint8_t ff_div6[52]={
  79 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  80 };
  81
  82
  83 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  84     MpegEncContext * const s = &h->s;
  85     const int mb_xy= h->mb_xy;
  86     int topleft_xy, top_xy, topright_xy, left_xy[2];
  87     int topleft_type, top_type, topright_type, left_type[2];
  88     int left_block[8];
  89     int topleft_partition= -1;
  90     int i;
  91
  92     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  93
  94     //FIXME deblocking could skip the intra and nnz parts.
  95     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  96         return;
  97
  98     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  99      * stuff, I can't imagine that these complex rules are worth it. */
 100
 101     topleft_xy = top_xy - 1;
 102     topright_xy= top_xy + 1;
 103     left_xy[1] = left_xy[0] = mb_xy-1;
 104     left_block[0]= 0;
 105     left_block[1]= 1;
 106     left_block[2]= 2;
 107     left_block[3]= 3;
 108     left_block[4]= 7;
 109     left_block[5]= 10;
 110     left_block[6]= 8;
 111     left_block[7]= 11;
 112     if(FRAME_MBAFF){
 113         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 114         const int top_pair_xy      = pair_xy     - s->mb_stride;
 115         const int topleft_pair_xy  = top_pair_xy - 1;
 116         const int topright_pair_xy = top_pair_xy + 1;
 117         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 118         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 119         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 120         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 121         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 122         const int bottom = (s->mb_y & 1);
 123         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 124         if (bottom
 125                 ? !curr_mb_frame_flag // bottom macroblock
 126                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 127                 ) {
 128             top_xy -= s->mb_stride;
 129         }
 130         if (bottom
 131                 ? !curr_mb_frame_flag // bottom macroblock
 132                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 133                 ) {
 134             topleft_xy -= s->mb_stride;
 135         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 136             topleft_xy += s->mb_stride;
 137             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 138             topleft_partition = 0;
 139         }
 140         if (bottom
 141                 ? !curr_mb_frame_flag // bottom macroblock
 142                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 143                 ) {
 144             topright_xy -= s->mb_stride;
 145         }
 146         if (left_mb_frame_flag != curr_mb_frame_flag) {
 147             left_xy[1] = left_xy[0] = pair_xy - 1;
 148             if (curr_mb_frame_flag) {
 149                 if (bottom) {
 150                     left_block[0]= 2;
 151                     left_block[1]= 2;
 152                     left_block[2]= 3;
 153                     left_block[3]= 3;
 154                     left_block[4]= 8;
 155                     left_block[5]= 11;
 156                     left_block[6]= 8;
 157                     left_block[7]= 11;
 158                 } else {
 159                     left_block[0]= 0;
 160                     left_block[1]= 0;
 161                     left_block[2]= 1;
 162                     left_block[3]= 1;
 163                     left_block[4]= 7;
 164                     left_block[5]= 10;
 165                     left_block[6]= 7;
 166                     left_block[7]= 10;
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 //left_block[0]= 0;
 171                 left_block[1]= 2;
 172                 left_block[2]= 0;
 173                 left_block[3]= 2;
 174                 //left_block[4]= 7;
 175                 left_block[5]= 10;
 176                 left_block[6]= 7;
 177                 left_block[7]= 10;
 178             }
 179         }
 180     }
 181
 182     h->top_mb_xy = top_xy;
 183     h->left_mb_xy[0] = left_xy[0];
 184     h->left_mb_xy[1] = left_xy[1];
 185     if(for_deblock){
 186         topleft_type = 0;
 187         topright_type = 0;
 188         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 189         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 190         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 191
 192         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 193             int list;
 194             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 195             for(i=0; i<16; i++)
 196                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 197             for(list=0; list<h->list_count; list++){
 198                 if(USES_LIST(mb_type,list)){
 199                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 200                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 201                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 202                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 203                         dst[0] = src[0];
 204                         dst[1] = src[1];
 205                         dst[2] = src[2];
 206                         dst[3] = src[3];
 207                     }
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 209                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 210                     ref += h->b8_stride;
 211                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 212                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 213                 }else{
 214                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 215                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 216                 }
 217             }
 218         }
 219     }else{
 220         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 221         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 222         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 223         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 224         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 225     }
 226
 227     if(IS_INTRA(mb_type)){
 228         h->topleft_samples_available=
 229         h->top_samples_available=
 230         h->left_samples_available= 0xFFFF;
 231         h->topright_samples_available= 0xEEEA;
 232
 233         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 234             h->topleft_samples_available= 0xB3FF;
 235             h->top_samples_available= 0x33FF;
 236             h->topright_samples_available= 0x26EA;
 237         }
 238         for(i=0; i<2; i++){
 239             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 240                 h->topleft_samples_available&= 0xDF5F;
 241                 h->left_samples_available&= 0x5F5F;
 242             }
 243         }
 244
 245         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 246             h->topleft_samples_available&= 0x7FFF;
 247
 248         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 249             h->topright_samples_available&= 0xFBFF;
 250
 251         if(IS_INTRA4x4(mb_type)){
 252             if(IS_INTRA4x4(top_type)){
 253                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 254                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 255                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 256                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 257             }else{
 258                 int pred;
 259                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 260                     pred= -1;
 261                 else{
 262                     pred= 2;
 263                 }
 264                 h->intra4x4_pred_mode_cache[4+8*0]=
 265                 h->intra4x4_pred_mode_cache[5+8*0]=
 266                 h->intra4x4_pred_mode_cache[6+8*0]=
 267                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 268             }
 269             for(i=0; i<2; i++){
 270                 if(IS_INTRA4x4(left_type[i])){
 271                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 272                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 273                 }else{
 274                     int pred;
 275                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 276                         pred= -1;
 277                     else{
 278                         pred= 2;
 279                     }
 280                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 281                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 282                 }
 283             }
 284         }
 285     }
 286
 287
 288 /*
 289 0 . T T. T T T T
 290 1 L . .L . . . .
 291 2 L . .L . . . .
 292 3 . T TL . . . .
 293 4 L . .L . . . .
 294 5 L . .. . . . .
 295 */
 296 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 297     if(top_type){
 298         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 299         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 300         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 301         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 302
 303         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 304         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 305
 306         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 307         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 308
 309     }else{
 310         h->non_zero_count_cache[4+8*0]=
 311         h->non_zero_count_cache[5+8*0]=
 312         h->non_zero_count_cache[6+8*0]=
 313         h->non_zero_count_cache[7+8*0]=
 314
 315         h->non_zero_count_cache[1+8*0]=
 316         h->non_zero_count_cache[2+8*0]=
 317
 318         h->non_zero_count_cache[1+8*3]=
 319         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 320
 321     }
 322
 323     for (i=0; i<2; i++) {
 324         if(left_type[i]){
 325             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 326             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 327             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 328             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 329         }else{
 330             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 331             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 332             h->non_zero_count_cache[0+8*1 +   8*i]=
 333             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 334         }
 335     }
 336
 337     if( h->pps.cabac ) {
 338         // top_cbp
 339         if(top_type) {
 340             h->top_cbp = h->cbp_table[top_xy];
 341         } else if(IS_INTRA(mb_type)) {
 342             h->top_cbp = 0x1C0;
 343         } else {
 344             h->top_cbp = 0;
 345         }
 346         // left_cbp
 347         if (left_type[0]) {
 348             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 349         } else if(IS_INTRA(mb_type)) {
 350             h->left_cbp = 0x1C0;
 351         } else {
 352             h->left_cbp = 0;
 353         }
 354         if (left_type[0]) {
 355             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 356         }
 357         if (left_type[1]) {
 358             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 359         }
 360     }
 361
 362 #if 1
 363     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 364         int list;
 365         for(list=0; list<h->list_count; list++){
 366             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 367                 /*if(!h->mv_cache_clean[list]){
 368                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 369                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 370                     h->mv_cache_clean[list]= 1;
 371                 }*/
 372                 continue;
 373             }
 374             h->mv_cache_clean[list]= 0;
 375
 376             if(USES_LIST(top_type, list)){
 377                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 378                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 383                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 384                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 385                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 387             }else{
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 392                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 393             }
 394
 395             for(i=0; i<2; i++){
 396                 int cache_idx = scan8[0] - 1 + i*2*8;
 397                 if(USES_LIST(left_type[i], list)){
 398                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 399                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 400                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 401                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 402                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 403                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 404                 }else{
 405                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 406                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 407                     h->ref_cache[list][cache_idx  ]=
 408                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 409                 }
 410             }
 411
 412             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 413                 continue;
 414
 415             if(USES_LIST(topleft_type, list)){
 416                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 417                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 418                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 419                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 420             }else{
 421                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 422                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 423             }
 424
 425             if(USES_LIST(topright_type, list)){
 426                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 427                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 428                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 429                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 430             }else{
 431                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 432                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 433             }
 434
 435             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 436                 continue;
 437
 438             h->ref_cache[list][scan8[5 ]+1] =
 439             h->ref_cache[list][scan8[7 ]+1] =
 440             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 441             h->ref_cache[list][scan8[4 ]] =
 442             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 443             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 445             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 446             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 447             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 448
 449             if( h->pps.cabac ) {
 450                 /* XXX beurk, Load mvd */
 451                 if(USES_LIST(top_type, list)){
 452                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 457                 }else{
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 462                 }
 463                 if(USES_LIST(left_type[0], list)){
 464                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 466                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 467                 }else{
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 469                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 470                 }
 471                 if(USES_LIST(left_type[1], list)){
 472                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 474                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 475                 }else{
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 477                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 478                 }
 479                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 481                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 482                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 484
 485                 if(h->slice_type_nos == FF_B_TYPE){
 486                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 487
 488                     if(IS_DIRECT(top_type)){
 489                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 490                     }else if(IS_8X8(top_type)){
 491                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 492                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 493                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 494                     }else{
 495                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 496                     }
 497
 498                     if(IS_DIRECT(left_type[0]))
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 500                     else if(IS_8X8(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 502                     else
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 504
 505                     if(IS_DIRECT(left_type[1]))
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 507                     else if(IS_8X8(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 509                     else
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 511                 }
 512             }
 513
 514             if(FRAME_MBAFF){
 515 #define MAP_MVS\
 516                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 517                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 522                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 524                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 525                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 526                 if(MB_FIELD){
 527 #define MAP_F2F(idx, mb_type)\
 528                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 529                         h->ref_cache[list][idx] <<= 1;\
 530                         h->mv_cache[list][idx][1] /= 2;\
 531                         h->mvd_cache[list][idx][1] /= 2;\
 532                     }
 533                     MAP_MVS
 534 #undef MAP_F2F
 535                 }else{
 536 #define MAP_F2F(idx, mb_type)\
 537                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 538                         h->ref_cache[list][idx] >>= 1;\
 539                         h->mv_cache[list][idx][1] <<= 1;\
 540                         h->mvd_cache[list][idx][1] <<= 1;\
 541                     }
 542                     MAP_MVS
 543 #undef MAP_F2F
 544                 }
 545             }
 546         }
 547     }
 548 #endif
 549
 550     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 551 }
 552
 553 static inline void write_back_intra_pred_mode(H264Context *h){
 554     const int mb_xy= h->mb_xy;
 555
 556     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 557     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 558     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 559     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 560     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 561     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 562     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 563 }
 564
 565 /**
 566  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 567  */
 568 static inline int check_intra4x4_pred_mode(H264Context *h){
 569     MpegEncContext * const s = &h->s;
 570     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 571     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 572     int i;
 573
 574     if(!(h->top_samples_available&0x8000)){
 575         for(i=0; i<4; i++){
 576             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 577             if(status<0){
 578                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 579                 return -1;
 580             } else if(status){
 581                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 582             }
 583         }
 584     }
 585
 586     if(!(h->left_samples_available&0x8000)){
 587         for(i=0; i<4; i++){
 588             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 589             if(status<0){
 590                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 591                 return -1;
 592             } else if(status){
 593                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 594             }
 595         }
 596     }
 597
 598     return 0;
 599 } //FIXME cleanup like next
 600
 601 /**
 602  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 603  */
 604 static inline int check_intra_pred_mode(H264Context *h, int mode){
 605     MpegEncContext * const s = &h->s;
 606     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 607     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 608
 609     if(mode > 6U) {
 610         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 611         return -1;
 612     }
 613
 614     if(!(h->top_samples_available&0x8000)){
 615         mode= top[ mode ];
 616         if(mode<0){
 617             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 618             return -1;
 619         }
 620     }
 621
 622     if(!(h->left_samples_available&0x8000)){
 623         mode= left[ mode ];
 624         if(mode<0){
 625             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 626             return -1;
 627         }
 628     }
 629
 630     return mode;
 631 }
 632
 633 /**
 634  * gets the predicted intra4x4 prediction mode.
 635  */
 636 static inline int pred_intra_mode(H264Context *h, int n){
 637     const int index8= scan8[n];
 638     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 639     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 640     const int min= FFMIN(left, top);
 641
 642     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 643
 644     if(min<0) return DC_PRED;
 645     else      return min;
 646 }
 647
 648 static inline void write_back_non_zero_count(H264Context *h){
 649     const int mb_xy= h->mb_xy;
 650
 651     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 652     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 653     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 654     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 655     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 656     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 657     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 658
 659     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 660     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 661     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 662
 663     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 664     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 665     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 666
 667     if(FRAME_MBAFF){
 668         // store all luma nnzs, for deblocking
 669         int v = 0, i;
 670         for(i=0; i<16; i++)
 671             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 672         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 673     }
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
/**
 * Fetches the diagonal neighbour motion vector (candidate "C") of a block:
 * the top-right neighbour if available, otherwise the top-left one.
 * In MBAFF frames some frame/field neighbour combinations are handled
 * specially by reading the motion vector directly from the current picture
 * and storing an adjusted copy in the scratch cache entry at scan8[0]-2.
 * @param C receives a pointer to the selected motion vector
 * @param i cache index (scan8 based) of the current block
 * @param list reference list index
 * @param part_width partition width in units of 4x4 blocks
 * @return the reference index that belongs to the motion vector stored in *C
 *         (LIST_NOT_USED when the MBAFF neighbour does not use this list)
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* scratch entry: zeroed here, filled by SET_DIAG_MV when a special case applies */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        /* frame macroblock in the bottom row of a pair, block in the top row
         * of the macroblock, top-right neighbour available */
        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
                /* SET_DIAG_MV always returns from the function: it reads the mv
                 * at 4x4 position (X4,Y4) of the current picture, applies MV_OP
                 * to its vertical component and REF_OP to the reference index */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right unavailable, block in the leftmost column, left neighbour
         * available: take the vector from the left macroblock when its
         * frame/field coding differs from the current macroblock */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: top-right neighbour, or top-left as fallback */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static inline void direct_dist_scale_factor(H264Context * const h){
 898     const int poc = h->s.current_picture_ptr->poc;
 899     const int poc1 = h->ref_list[1][0].poc;
 900     int i;
 901     for(i=0; i<h->ref_count[0]; i++){
 902         int poc0 = h->ref_list[0][i].poc;
 903         int td = av_clip(poc1 - poc0, -128, 127);
 904         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 905             h->dist_scale_factor[i] = 256;
 906         }else{
 907             int tb = av_clip(poc - poc0, -128, 127);
 908             int tx = (16384 + (FFABS(td) >> 1)) / td;
 909             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 910         }
 911     }
 912     if(FRAME_MBAFF){
 913         for(i=0; i<h->ref_count[0]; i++){
 914             h->dist_scale_factor_field[2*i] =
 915             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 916         }
 917     }
 918 }
 919 static inline void direct_ref_list_init(H264Context * const h){
 920     MpegEncContext * const s = &h->s;
 921     Picture * const ref1 = &h->ref_list[1][0];
 922     Picture * const cur = s->current_picture_ptr;
 923     int list, i, j;
 924     if(cur->pict_type == FF_I_TYPE)
 925         cur->ref_count[0] = 0;
 926     if(cur->pict_type != FF_B_TYPE)
 927         cur->ref_count[1] = 0;
 928     for(list=0; list<2; list++){
 929         cur->ref_count[list] = h->ref_count[list];
 930         for(j=0; j<h->ref_count[list]; j++)
 931             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 932     }
 933     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 934         return;
 935     for(list=0; list<2; list++){
 936         for(i=0; i<ref1->ref_count[list]; i++){
 937             const int poc = ref1->ref_poc[list][i];
 938             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 939             for(j=0; j<h->ref_count[list]; j++)
 940                 if(h->ref_list[list][j].poc == poc){
 941                     h->map_col_to_list0[list][i] = j;
 942                     break;
 943                 }
 944         }
 945     }
 946     if(FRAME_MBAFF){
 947         for(list=0; list<2; list++){
 948             for(i=0; i<ref1->ref_count[list]; i++){
 949                 j = h->map_col_to_list0[list][i];
 950                 h->map_col_to_list0_field[list][2*i] = 2*j;
 951                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 952             }
 953         }
 954     }
 955 }
 956
/**
 * Derives the motion vectors and reference indices of a direct predicted
 * macroblock (or of its direct 8x8 partitions when the macroblock is 8x8)
 * and stores them in h->mv_cache and h->ref_cache.
 * Spatial prediction from the neighbouring blocks is used when
 * h->direct_spatial_mv_pred is set; otherwise the motion of the co-located
 * block in h->ref_list[1][0] is scaled (temporal prediction).
 * h->sub_mb_type[] is updated for every 8x8 partition that is processed.
 * @param mb_type in/out: type of the current macroblock; its partitioning,
 *                list usage and direct/interlaced flags are rewritten here
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   h->mb_xy;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    /* data of the co-located macroblock in the first list 1 reference */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

    /* choose the partitioning from the type of the co-located macroblock */
#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            /* NOTE(review): -2 presumably is PART_NOT_AVAILABLE (top-right
             * missing, fall back to top-left) -- confirm against h264.h */
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            /* compared as unsigned, negative entries are larger than any
             * valid index, so the minimum is the smallest non-negative
             * reference if there is one */
            ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no neighbour reference in either list: reference 0 with zero motion */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* remove the list without a reference from the type flags */
        if(ref[1] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L1;
            sub_mb_type &= ~MB_TYPE_L1;
        }else if(ref[0] < 0){
            if(!is_b8x8)
                *mb_type &= ~MB_TYPE_L0;
            sub_mb_type &= ~MB_TYPE_L0;
        }

        if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
            /* current and co-located macroblock differ in frame/field
             * coding: adjust the co-located pointers and strides */
            int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
            int mb_types_col[2];
            int b8_stride = h->b8_stride;
            int b4_stride = h->b_stride;

            *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;

            if(IS_INTERLACED(*mb_type)){
                mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                if(s->mb_y&1){
                    l1ref0 -= 2*b8_stride;
                    l1ref1 -= 2*b8_stride;
                    l1mv0 -= 4*b4_stride;
                    l1mv1 -= 4*b4_stride;
                }
                b8_stride *= 3;
                b4_stride *= 6;
            }else{
                int cur_poc = s->current_picture_ptr->poc;
                int *col_poc = h->ref_list[1]->field_poc;
                /* select the field of the co-located picture whose POC is closer to the current one */
                int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
                int dy = 2*col_parity - (s->mb_y&1);
                mb_types_col[0] =
                mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
                l1ref0 += dy*b8_stride;
                l1ref1 += dy*b8_stride;
                l1mv0 += 2*dy*b4_stride;
                l1mv1 += 2*dy*b4_stride;
                b8_stride = 0;
            }

            for(i8=0; i8<4; i8++){
                int x8 = i8&1;
                int y8 = i8>>1;
                int xy8 = x8+y8*b8_stride;
                int xy4 = 3*x8+y8*b4_stride;
                int a=0, b=0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                /* when the co-located block is (nearly) static, a list with
                 * reference 0 keeps a zero vector (a/b stay 0) */
                if(!IS_INTRA(mb_types_col[y8])
                   && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
                       || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
                    if(ref[0] > 0)
                        a= pack16to32(mv[0][0],mv[0][1]);
                    if(ref[1] > 0)
                        b= pack16to32(mv[1][0],mv[1][1]);
                }else{
                    a= pack16to32(mv[0][0],mv[0][1]);
                    b= pack16to32(mv[1][0],mv[1][1]);
                }
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
            }
        }else if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* same static co-located test as above, for the whole macroblock */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                /* start from the predicted vectors, then zero them where the co-located block is static */
                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            /* field macroblocks use the field variants of the mapping and scale tables */
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    if(s->mb_y&1){
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    /* intra co-located block: reference 0 and zero motion in both lists */
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    /* prefer the list 0 motion of the co-located block, fall back to its list 1 motion */
                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        /* vertical component is halved (y_shift 0) or doubled (y_shift 2) */
                        int my_col = (mv_col[1]<<y_shift)/2;
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */
        /* list 0 mv = (scale * mv_col + 128) >> 8, list 1 mv = list 0 mv - mv_col */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                ref=mv0=mv1=0;
            }else{
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    /* one vector for the whole 8x8 partition */
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                /* one vector per 4x4 block */
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1322
1323 static inline void write_back_motion(H264Context *h, int mb_type){
1324     MpegEncContext * const s = &h->s;
1325     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1326     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1327     int list;
1328
1329     if(!USES_LIST(mb_type, 0))
1330         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1331
1332     for(list=0; list<h->list_count; list++){
1333         int y;
1334         if(!USES_LIST(mb_type, list))
1335             continue;
1336
1337         for(y=0; y<4; y++){
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1339             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1340         }
1341         if( h->pps.cabac ) {
1342             if(IS_SKIP(mb_type))
1343                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1344             else
1345             for(y=0; y<4; y++){
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1347                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1348             }
1349         }
1350
1351         {
1352             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1353             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1354             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1355             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1356             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1357         }
1358     }
1359
1360     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1361         if(IS_8X8(mb_type)){
1362             uint8_t *direct_table = &h->direct_table[b8_xy];
1363             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1364             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1365             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1366         }
1367     }
1368 }
1369
1370 /**
1371  * Decodes a network abstraction layer unit.
1372  * @param consumed is the number of bytes used as input
1373  * @param length is the length of the array
1374  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1375  * @returns decoded bytes, might be src+1 if no escapes
1376  */
1377 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1378     int i, si, di;
1379     uint8_t *dst;
1380     int bufidx;
1381
1382 //    src[0]&0x80;                //forbidden bit
1383     h->nal_ref_idc= src[0]>>5;
1384     h->nal_unit_type= src[0]&0x1F;
1385
1386     src++; length--;
1387 #if 0
1388     for(i=0; i<length; i++)
1389         printf("%2X ", src[i]);
1390 #endif
1391     for(i=0; i+1<length; i+=2){
1392         if(src[i]) continue;
1393         if(i>0 && src[i-1]==0) i--;
1394         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1395             if(src[i+2]!=3){
1396                 /* startcode, so we must be past the end */
1397                 length=i;
1398             }
1399             break;
1400         }
1401     }
1402
1403     if(i>=length-1){ //no escaped 0
1404         *dst_length= length;
1405         *consumed= length+1; //+1 for the header
1406         return src;
1407     }
1408
1409     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1410     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1411     dst= h->rbsp_buffer[bufidx];
1412
1413     if (dst == NULL){
1414         return NULL;
1415     }
1416
1417 //printf("decoding esc\n");
1418     si=di=0;
1419     while(si<length){
1420         //remove escapes (very rare 1:2^22)
1421         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1422             if(src[si+2]==3){ //escape
1423                 dst[di++]= 0;
1424                 dst[di++]= 0;
1425                 si+=3;
1426                 continue;
1427             }else //next start code
1428                 break;
1429         }
1430
1431         dst[di++]= src[si++];
1432     }
1433
1434     *dst_length= di;
1435     *consumed= si + 1;//+1 for the header
1436 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1437     return dst;
1438 }
1439
1440 /**
1441  * identifies the exact end of the bitstream
1442  * @return the length of the trailing, or 0 if damaged
1443  */
1444 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1445     int v= *src;
1446     int r;
1447
1448     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1449
1450     for(r=1; r<9; r++){
1451         if(v&1) return r;
1452         v>>=1;
1453     }
1454     return 0;
1455 }
1456
1457 /**
1458  * IDCT transforms the 16 dc values and dequantizes them.
1459  * @param qp quantization parameter
1460  */
1461 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1462 #define stride 16
1463     int i;
1464     int temp[16]; //FIXME check if this is a good idea
1465     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1466     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1467
1468 //memset(block, 64, 2*256);
1469 //return;
1470     for(i=0; i<4; i++){
1471         const int offset= y_offset[i];
1472         const int z0= block[offset+stride*0] + block[offset+stride*4];
1473         const int z1= block[offset+stride*0] - block[offset+stride*4];
1474         const int z2= block[offset+stride*1] - block[offset+stride*5];
1475         const int z3= block[offset+stride*1] + block[offset+stride*5];
1476
1477         temp[4*i+0]= z0+z3;
1478         temp[4*i+1]= z1+z2;
1479         temp[4*i+2]= z1-z2;
1480         temp[4*i+3]= z0-z3;
1481     }
1482
1483     for(i=0; i<4; i++){
1484         const int offset= x_offset[i];
1485         const int z0= temp[4*0+i] + temp[4*2+i];
1486         const int z1= temp[4*0+i] - temp[4*2+i];
1487         const int z2= temp[4*1+i] - temp[4*3+i];
1488         const int z3= temp[4*1+i] + temp[4*3+i];
1489
1490         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1491         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1492         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1493         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1494     }
1495 }
1496
#if 0
/**
 * DCT transforms the 16 dc values.
 * Compiled out (#if 0). The transform is done in place and every result is
 * halved (>>1). The code uses the `stride` macro, which has to be defined
 * where this block is compiled.
 * @param qp quantization parameter ??? FIXME (currently commented out of the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies over the y_offset groups, results go to temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: butterflies over the other direction, written back halved */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1536
1537 #undef xStride
1538 #undef stride
1539
1540 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1541     const int stride= 16*2;
1542     const int xStride= 16;
1543     int a,b,c,d,e;
1544
1545     a= block[stride*0 + xStride*0];
1546     b= block[stride*0 + xStride*1];
1547     c= block[stride*1 + xStride*0];
1548     d= block[stride*1 + xStride*1];
1549
1550     e= a-b;
1551     a= a+b;
1552     b= c-d;
1553     c= c+d;
1554
1555     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1556     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1557     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1558     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1559 }
1560
#if 0
/**
 * 2x2 forward transform of the chroma dc values, done in place.
 * Compiled out (#if 0). Works on block[0], block[16], block[32] and
 * block[48]; no scaling is applied to the results.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* horizontal butterflies; a..e are reused as temporaries */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    /* vertical butterflies */
    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1583
1584 /**
1585  * gets the chroma qp.
1586  */
1587 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1588     return h->pps.chroma_qp_table[t][qscale];
1589 }
1590
/**
 * Quantizes the 16 coefficients of a block in place.
 *
 * @param block coefficients, overwritten with the quantized levels
 * @param scantable maps the scan position (0..15) to the index inside block
 * @param qscale selects the row of quant_coeff that is used
 * @param intra non zero selects the larger rounding bias (1/3 instead of 1/6)
 * @param separate_dc if non zero, block[0] is quantized on its own with a
 *                    different shift and the scan loop starts at position 1
 * @return scan position of the last non zero coefficient found by the scan
 *         loop; if none was found this is 0 with separate_dc and -1 without
 */
//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    /* (unsigned)(level+threshold1) > threshold2 is true exactly when level
     * lies outside [-threshold1, threshold1], one compare for both signs */
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            /* uses the table row of qscale+18 together with a shift of QUANT_SHIFT-2 */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            /* uses the table row of qscale together with a shift of QUANT_SHIFT+1 */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        /* the dc position is reported as 0 even if it quantized to zero */
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    /* remaining coefficients in scan order */
    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                /* quantize the magnitude, then restore the sign */
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
1669
/**
 * Motion compensates one partition from a single reference picture.
 *
 * The motion vector is read from h->mv_cache[list][scan8[n]]. Luma is
 * predicted with qpix_op (a second call at offset delta is made for non
 * square partitions), both chroma planes are predicted with chroma_op.
 * If the referenced area lies too far outside the picture it is first
 * copied to s->edge_emu_buffer through ff_emulated_edge_mc().
 *
 * @param pic reference picture; nothing is done if pic->data[0] is NULL
 * @param n index of the block (in scan8 order) whose motion vector is used
 * @param square if zero, the luma function is called a second time at delta
 * @param chroma_height height handed to chroma_op
 * @param delta offset, applied to both source and destination, of the
 *              second luma call
 * @param src_x_offset horizontal position of the partition; multiplied by 8
 *                     before being added to the motion vector
 * @param src_y_offset vertical position of the partition, same unit
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    /* mx>>2 / my>>2 give the luma sample position, the low 2 bits select the interpolation function */
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
        return;

    /* a fractional position shrinks the margin that may be used without emulation by 3 */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        /* build a (16+5)x(16+5) area starting 2 samples up and left of the block */
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* luma only decoding: skip chroma */
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        /* the adjusted position may now need emulation even if luma did not */
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    /* mx>>3 / my>>3 give the chroma sample position, the low 3 bits go to chroma_op */
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* edge_emu_buffer is shared, so cb is fully processed before cr is set up */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1730
1731 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1732                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1733                            int x_offset, int y_offset,
1734                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1735                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1736                            int list0, int list1){
1737     MpegEncContext * const s = &h->s;
1738     qpel_mc_func *qpix_op=  qpix_put;
1739     h264_chroma_mc_func chroma_op= chroma_put;
1740
1741     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1742     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1744     x_offset += 8*s->mb_x;
1745     y_offset += 8*(s->mb_y >> MB_FIELD);
1746
1747     if(list0){
1748         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1749         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1750                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1751                            qpix_op, chroma_op);
1752
1753         qpix_op=  qpix_avg;
1754         chroma_op= chroma_avg;
1755     }
1756
1757     if(list1){
1758         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1759         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1760                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1761                            qpix_op, chroma_op);
1762     }
1763 }
1764
/**
 * Weighted motion compensation of one partition.
 *
 * With both lists in use, list 0 is predicted into the destination and
 * list 1 into s->obmc_scratchpad, then both are combined with the biweight
 * functions. With a single list the prediction is written to the destination
 * and weighted in place.
 *
 * @param x_offset horizontal offset of the partition inside the macroblock
 * @param y_offset vertical offset of the partition inside the macroblock
 * @param luma_weight_op / chroma_weight_op in place weighting (single list)
 * @param luma_weight_avg / chroma_weight_avg weighted combination (both lists)
 * @param list0 non zero if the partition is predicted from list 0
 * @param list1 non zero if the partition is predicted from list 1
 */
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int x_offset, int y_offset,
                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                           int list0, int list1){
    MpegEncContext * const s = &h->s;

    /* move the destination pointers to the partition, then turn the offsets
     * into picture positions for mc_dir_part() */
    dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
    dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
    dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    x_offset += 8*s->mb_x;
    y_offset += 8*(s->mb_y >> MB_FIELD);

    if(list0 && list1){
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
        /* scratchpad layout: cb at the start, cr 8 bytes further, luma after 8 chroma lines */
        uint8_t *tmp_cb = s->obmc_scratchpad;
        uint8_t *tmp_cr = s->obmc_scratchpad + 8;
        uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
        int refn0 = h->ref_cache[0][ scan8[n] ];
        int refn1 = h->ref_cache[1][ scan8[n] ];

        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put);
        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put);

        if(h->use_weight == 2){
            /* weights from the implicit table: they sum to 64, log2 denom 5, no offset */
            int weight0 = h->implicit_weight[refn0][refn1];
            int weight1 = 64 - weight0;
            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
        }else{
            /* weights and offsets from the per reference tables; the offsets of both lists are added */
            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                            h->luma_weight[0][refn0], h->luma_weight[1][refn1],
                            h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
                            h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                            h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
                            h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
        }
    }else{
        /* single list: list 1 if it is used, list 0 otherwise */
        int list = list1 ? 1 : 0;
        int refn = h->ref_cache[list][ scan8[n] ];
        Picture *ref= &h->ref_list[list][refn];
        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put);

        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                       h->luma_weight[list][refn], h->luma_offset[list][refn]);
        /* chroma is only weighted when h->use_weight_chroma is set */
        if(h->use_weight_chroma){
            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                             h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
        }
    }
}
1831
1832 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1833                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1834                            int x_offset, int y_offset,
1835                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1836                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1837                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1838                            int list0, int list1){
1839     if((h->use_weight==2 && list0 && list1
1840         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1841        || h->use_weight==1)
1842         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1843                          x_offset, y_offset, qpix_put, chroma_put,
1844                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1845     else
1846         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1847                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1848 }
1849
1850 static inline void prefetch_motion(H264Context *h, int list){
1851     /* fetch pixels for estimated mv 4 macroblocks ahead
1852      * optimized for 64byte cache lines */
1853     MpegEncContext * const s = &h->s;
1854     const int refn = h->ref_cache[list][scan8[0]];
1855     if(refn >= 0){
1856         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1857         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1858         uint8_t **src= h->ref_list[list][refn].data;
1859         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1860         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1861         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1862         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1863     }
1864 }
1865
/**
 * Motion compensates the current inter macroblock.
 *
 * The macroblock type (and for 8x8 macroblocks the sub macroblock types)
 * selects how the macroblock is split; mc_part() is called once per
 * partition. Reference data is prefetched for list 0 before and for list 1
 * after the prediction.
 *
 * @param dest_y / dest_cb / dest_cr destination of the macroblock in each plane
 * @param qpix_put / qpix_avg luma function tables, the first index selects
 *                 the table used for a given partition size
 * @param chroma_put / chroma_avg chroma functions, indexed per partition size
 * @param weight_op / weight_avg weighting function tables; mc_part() gets a
 *                 pointer to the entry matching the partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one square partition */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions stacked vertically; each needs a second luma call 8 bytes to the right */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions side by side; each needs a second luma call 8 lines further down */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 blocks, each split according to its own sub macroblock type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                /* upper and lower half */
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                /* left and right half */
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1952
1953 static av_cold void decode_init_vlc(void){
1954     static int done = 0;
1955
1956     if (!done) {
1957         int i;
1958         done = 1;
1959
1960         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1961                  &chroma_dc_coeff_token_len [0], 1, 1,
1962                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1963
1964         for(i=0; i<4; i++){
1965             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1966                      &coeff_token_len [i][0], 1, 1,
1967                      &coeff_token_bits[i][0], 1, 1, 1);
1968         }
1969
1970         for(i=0; i<3; i++){
1971             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1972                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1973                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1974         }
1975         for(i=0; i<15; i++){
1976             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1977                      &total_zeros_len [i][0], 1, 1,
1978                      &total_zeros_bits[i][0], 1, 1, 1);
1979         }
1980
1981         for(i=0; i<6; i++){
1982             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1983                      &run_len [i][0], 1, 1,
1984                      &run_bits[i][0], 1, 1, 1);
1985         }
1986         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1987                  &run_len [6][0], 1, 1,
1988                  &run_bits[6][0], 1, 1, 1);
1989     }
1990 }
1991
1992 static void free_tables(H264Context *h){
1993     int i;
1994     H264Context *hx;
1995     av_freep(&h->intra4x4_pred_mode);
1996     av_freep(&h->chroma_pred_mode_table);
1997     av_freep(&h->cbp_table);
1998     av_freep(&h->mvd_table[0]);
1999     av_freep(&h->mvd_table[1]);
2000     av_freep(&h->direct_table);
2001     av_freep(&h->non_zero_count);
2002     av_freep(&h->slice_table_base);
2003     h->slice_table= NULL;
2004
2005     av_freep(&h->mb2b_xy);
2006     av_freep(&h->mb2b8_xy);
2007
2008     for(i = 0; i < MAX_SPS_COUNT; i++)
2009         av_freep(h->sps_buffers + i);
2010
2011     for(i = 0; i < MAX_PPS_COUNT; i++)
2012         av_freep(h->pps_buffers + i);
2013
2014     for(i = 0; i < h->s.avctx->thread_count; i++) {
2015         hx = h->thread_context[i];
2016         if(!hx) continue;
2017         av_freep(&hx->top_borders[1]);
2018         av_freep(&hx->top_borders[0]);
2019         av_freep(&hx->s.obmc_scratchpad);
2020     }
2021 }
2022
2023 static void init_dequant8_coeff_table(H264Context *h){
2024     int i,q,x;
2025     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2026     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2027     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2028
2029     for(i=0; i<2; i++ ){
2030         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2031             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2032             break;
2033         }
2034
2035         for(q=0; q<52; q++){
2036             int shift = ff_div6[q];
2037             int idx = ff_rem6[q];
2038             for(x=0; x<64; x++)
2039                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2040                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2041                     h->pps.scaling_matrix8[i][x]) << shift;
2042         }
2043     }
2044 }
2045
2046 static void init_dequant4_coeff_table(H264Context *h){
2047     int i,j,q,x;
2048     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2049     for(i=0; i<6; i++ ){
2050         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2051         for(j=0; j<i; j++){
2052             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2053                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2054                 break;
2055             }
2056         }
2057         if(j<i)
2058             continue;
2059
2060         for(q=0; q<52; q++){
2061             int shift = ff_div6[q] + 2;
2062             int idx = ff_rem6[q];
2063             for(x=0; x<16; x++)
2064                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2065                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2066                     h->pps.scaling_matrix4[i][x]) << shift;
2067         }
2068     }
2069 }
2070
2071 static void init_dequant_tables(H264Context *h){
2072     int i,x;
2073     init_dequant4_coeff_table(h);
2074     if(h->pps.transform_8x8_mode)
2075         init_dequant8_coeff_table(h);
2076     if(h->sps.transform_bypass){
2077         for(i=0; i<6; i++)
2078             for(x=0; x<16; x++)
2079                 h->dequant4_coeff[i][0][x] = 1<<6;
2080         if(h->pps.transform_8x8_mode)
2081             for(i=0; i<2; i++)
2082                 for(x=0; x<64; x++)
2083                     h->dequant8_coeff[i][0][x] = 1<<6;
2084     }
2085 }
2086
2087
2088 /**
2089  * allocates tables.
2090  * needs width/height
2091  */
2092 static int alloc_tables(H264Context *h){
2093     MpegEncContext * const s = &h->s;
2094     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2095     int x,y;
2096
2097     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2098
2099     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2102
2103     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2106     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2107
2108     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2109     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2110
2111     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2112     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2113     for(y=0; y<s->mb_height; y++){
2114         for(x=0; x<s->mb_width; x++){
2115             const int mb_xy= x + y*s->mb_stride;
2116             const int b_xy = 4*x + 4*y*h->b_stride;
2117             const int b8_xy= 2*x + 2*y*h->b8_stride;
2118
2119             h->mb2b_xy [mb_xy]= b_xy;
2120             h->mb2b8_xy[mb_xy]= b8_xy;
2121         }
2122     }
2123
2124     s->obmc_scratchpad = NULL;
2125
2126     if(!h->dequant4_coeff[0])
2127         init_dequant_tables(h);
2128
2129     return 0;
2130 fail:
2131     free_tables(h);
2132     return -1;
2133 }
2134
2135 /**
2136  * Mimic alloc_tables(), but for every context thread.
2137  */
2138 static void clone_tables(H264Context *dst, H264Context *src){
2139     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2140     dst->non_zero_count           = src->non_zero_count;
2141     dst->slice_table              = src->slice_table;
2142     dst->cbp_table                = src->cbp_table;
2143     dst->mb2b_xy                  = src->mb2b_xy;
2144     dst->mb2b8_xy                 = src->mb2b8_xy;
2145     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2146     dst->mvd_table[0]             = src->mvd_table[0];
2147     dst->mvd_table[1]             = src->mvd_table[1];
2148     dst->direct_table             = src->direct_table;
2149
2150     dst->s.obmc_scratchpad = NULL;
2151     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2152 }
2153
2154 /**
2155  * Init context
2156  * Allocate buffers which are not shared amongst multiple threads.
2157  */
2158 static int context_init(H264Context *h){
2159     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2161
2162     return 0;
2163 fail:
2164     return -1; // free_tables will clean up for us
2165 }
2166
2167 static av_cold void common_init(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169
2170     s->width = s->avctx->width;
2171     s->height = s->avctx->height;
2172     s->codec_id= s->avctx->codec->id;
2173
2174     ff_h264_pred_init(&h->hpc, s->codec_id);
2175
2176     h->dequant_coeff_pps= -1;
2177     s->unrestricted_mv=1;
2178     s->decode=1; //FIXME
2179
2180     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2181     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2182 }
2183
2184 static av_cold int decode_init(AVCodecContext *avctx){
2185     H264Context *h= avctx->priv_data;
2186     MpegEncContext * const s = &h->s;
2187
2188     MPV_decode_defaults(s);
2189
2190     s->avctx = avctx;
2191     common_init(h);
2192
2193     s->out_format = FMT_H264;
2194     s->workaround_bugs= avctx->workaround_bugs;
2195
2196     // set defaults
2197 //    s->decode_mb= ff_h263_decode_mb;
2198     s->quarter_sample = 1;
2199     s->low_delay= 1;
2200
2201     if(avctx->codec_id == CODEC_ID_SVQ3)
2202         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2203     else
2204         avctx->pix_fmt= PIX_FMT_YUV420P;
2205
2206     decode_init_vlc();
2207
2208     if(avctx->extradata_size > 0 && avctx->extradata &&
2209        *(char *)avctx->extradata == 1){
2210         h->is_avc = 1;
2211         h->got_avcC = 0;
2212     } else {
2213         h->is_avc = 0;
2214     }
2215
2216     h->thread_context[0] = h;
2217     return 0;
2218 }
2219
2220 static int frame_start(H264Context *h){
2221     MpegEncContext * const s = &h->s;
2222     int i;
2223
2224     if(MPV_frame_start(s, s->avctx) < 0)
2225         return -1;
2226     ff_er_frame_start(s);
2227     /*
2228      * MPV_frame_start uses pict_type to derive key_frame.
2229      * This is incorrect for H.264; IDR markings must be used.
2230      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2231      * See decode_nal_units().
2232      */
2233     s->current_picture_ptr->key_frame= 0;
2234
2235     assert(s->linesize && s->uvlinesize);
2236
2237     for(i=0; i<16; i++){
2238         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2239         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2240     }
2241     for(i=0; i<4; i++){
2242         h->block_offset[16+i]=
2243         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2244         h->block_offset[24+16+i]=
2245         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2246     }
2247
2248     /* can't be in alloc_tables because linesize isn't known there.
2249      * FIXME: redo bipred weight to not require extra buffer? */
2250     for(i = 0; i < s->avctx->thread_count; i++)
2251         if(!h->thread_context[i]->s.obmc_scratchpad)
2252             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2253
2254     /* some macroblocks will be accessed before they're available */
2255     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2256         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2257
2258 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2259
2260     // We mark the current picture as non-reference after allocating it, so
2261     // that if we break out due to an error it can be released automatically
2262     // in the next MPV_frame_start().
2263     // SVQ3 as well as most other codecs have only last/next/current and thus
2264     // get released even with set reference, besides SVQ3 and others do not
2265     // mark frames as reference later "naturally".
2266     if(s->codec_id != CODEC_ID_SVQ3)
2267         s->current_picture_ptr->reference= 0;
2268
2269     s->current_picture_ptr->field_poc[0]=
2270     s->current_picture_ptr->field_poc[1]= INT_MAX;
2271     assert(s->current_picture_ptr->long_ref==0);
2272
2273     return 0;
2274 }
2275
2276 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2277     MpegEncContext * const s = &h->s;
2278     int i;
2279
2280     src_y  -=   linesize;
2281     src_cb -= uvlinesize;
2282     src_cr -= uvlinesize;
2283
2284     // There are two lines saved, the line above the the top macroblock of a pair,
2285     // and the line above the bottom macroblock
2286     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2287     for(i=1; i<17; i++){
2288         h->left_border[i]= src_y[15+i*  linesize];
2289     }
2290
2291     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2292     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2293
2294     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2295         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2296         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2297         for(i=1; i<9; i++){
2298             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2299             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2300         }
2301         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2302         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2303     }
2304 }
2305
2306 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2307     MpegEncContext * const s = &h->s;
2308     int temp8, i;
2309     uint64_t temp64;
2310     int deblock_left;
2311     int deblock_top;
2312     int mb_xy;
2313
2314     if(h->deblocking_filter == 2) {
2315         mb_xy = h->mb_xy;
2316         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2317         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2318     } else {
2319         deblock_left = (s->mb_x > 0);
2320         deblock_top =  (s->mb_y > 0);
2321     }
2322
2323     src_y  -=   linesize + 1;
2324     src_cb -= uvlinesize + 1;
2325     src_cr -= uvlinesize + 1;
2326
2327 #define XCHG(a,b,t,xchg)\
2328 t= a;\
2329 if(xchg)\
2330     a= b;\
2331 b= t;
2332
2333     if(deblock_left){
2334         for(i = !deblock_top; i<17; i++){
2335             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2336         }
2337     }
2338
2339     if(deblock_top){
2340         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2341         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2342         if(s->mb_x+1 < s->mb_width){
2343             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2344         }
2345     }
2346
2347     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2348         if(deblock_left){
2349             for(i = !deblock_top; i<9; i++){
2350                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2351                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2352             }
2353         }
2354         if(deblock_top){
2355             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2356             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2357         }
2358     }
2359 }
2360
2361 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2362     MpegEncContext * const s = &h->s;
2363     int i;
2364
2365     src_y  -= 2 *   linesize;
2366     src_cb -= 2 * uvlinesize;
2367     src_cr -= 2 * uvlinesize;
2368
2369     // There are two lines saved, the line above the the top macroblock of a pair,
2370     // and the line above the bottom macroblock
2371     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2372     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2373     for(i=2; i<34; i++){
2374         h->left_border[i]= src_y[15+i*  linesize];
2375     }
2376
2377     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2378     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2379     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2380     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2381
2382     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2383         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2384         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2385         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2386         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2387         for(i=2; i<18; i++){
2388             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2389             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2390         }
2391         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2392         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2393         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2394         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2395     }
2396 }
2397
2398 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2399     MpegEncContext * const s = &h->s;
2400     int temp8, i;
2401     uint64_t temp64;
2402     int deblock_left = (s->mb_x > 0);
2403     int deblock_top  = (s->mb_y > 1);
2404
2405     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2406
2407     src_y  -= 2 *   linesize + 1;
2408     src_cb -= 2 * uvlinesize + 1;
2409     src_cr -= 2 * uvlinesize + 1;
2410
2411 #define XCHG(a,b,t,xchg)\
2412 t= a;\
2413 if(xchg)\
2414     a= b;\
2415 b= t;
2416
2417     if(deblock_left){
2418         for(i = (!deblock_top)<<1; i<34; i++){
2419             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2420         }
2421     }
2422
2423     if(deblock_top){
2424         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2425         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2426         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2427         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2428         if(s->mb_x+1 < s->mb_width){
2429             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2430             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2431         }
2432     }
2433
2434     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2435         if(deblock_left){
2436             for(i = (!deblock_top) << 1; i<18; i++){
2437                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2438                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2439             }
2440         }
2441         if(deblock_top){
2442             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2443             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2444             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2445             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2446         }
2447     }
2448 }
2449
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture and, when h->deblocking_filter is set, runs the loop
 * filter on it.
 *
 * Order of work: compute the destination pointers, set up field/MBAFF
 * addressing, pick the residual-add functions, then either copy raw PCM
 * samples or do intra prediction / hl_motion() followed by the luma and
 * chroma residual add, and finally the border backup and deblocking.
 *
 * @param h      decoder context; h->mb holds the coefficients (or the PCM
 *               samples) of this macroblock
 * @param simple when nonzero, the branches guarded by !simple (MB_FIELD and
 *               FRAME_MBAFF handling, intra PCM) are skipped, the
 *               CODEC_FLAG_GRAY and s->encoding tests are short-circuited and
 *               the codec is treated as H.264. The function is
 *               av_always_inline and is called with a constant from
 *               hl_decode_mb_simple() and hl_decode_mb_complex().
 */
2450 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2451     MpegEncContext * const s = &h->s;
2452     const int mb_x= s->mb_x;
2453     const int mb_y= s->mb_y;
2454     const int mb_xy= h->mb_xy;
2455     const int mb_type= s->current_picture.mb_type[mb_xy];
2456     uint8_t  *dest_y, *dest_cb, *dest_cr;
2457     int linesize, uvlinesize /*dct_offset*/;
2458     int i;
2459     int *block_offset = &h->block_offset[0];
2460     const unsigned int bottom = mb_y & 1;
2461     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2462     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2463     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2464
         /* top-left sample of this macroblock in each plane: 16x16 luma, 8x8 chroma */
2465     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2466     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2467     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2468
2469     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2470     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2471
         /* Field macroblock: use twice the picture line size and the second
          * half of block_offset (entries 24..47). For odd mb_y the destination
          * is moved up by 15 luma / 7 chroma lines. */
2472     if (!simple && MB_FIELD) {
2473         linesize   = h->mb_linesize   = s->linesize * 2;
2474         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2475         block_offset = &h->block_offset[24];
2476         if(mb_y&1){ //FIXME move out of this function?
2477             dest_y -= s->linesize*15;
2478             dest_cb-= s->uvlinesize*7;
2479             dest_cr-= s->uvlinesize*7;
2480         }
2481         if(FRAME_MBAFF) {
2482             int list;
                 /* rewrite the cached reference indices of every used list as
                  * (16+ref)^(mb_y&1); presumably this selects the field of the
                  * reference frame -- confirm against the ref list layout */
2483             for(list=0; list<h->list_count; list++){
2484                 if(!USES_LIST(mb_type, list))
2485                     continue;
2486                 if(IS_16X16(mb_type)){
2487                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2488                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2489                 }else{
2490                     for(i=0; i<16; i+=4){
2491                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2492                         int ref = h->ref_cache[list][scan8[i]];
2493                         if(ref >= 0)
2494                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2495                     }
2496                 }
2497             }
2498         }
2499     } else {
2500         linesize   = h->mb_linesize   = s->linesize;
2501         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2502 //        dct_offset = s->linesize * 16;
2503     }
2504
         /* Choose how the luma residual is added: plain pixel add for
          * transform bypass, otherwise the 8x8 or 4x4 IDCT (with a separate
          * DC-only variant). */
2505     if(transform_bypass){
2506         idct_dc_add =
2507         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2508     }else if(IS_8x8DCT(mb_type)){
2509         idct_dc_add = s->dsp.h264_idct8_dc_add;
2510         idct_add = s->dsp.h264_idct8_add;
2511     }else{
2512         idct_dc_add = s->dsp.h264_idct_dc_add;
2513         idct_add = s->dsp.h264_idct_add;
2514     }
2515
         /* MBAFF with deblocking: before intra prediction, exchange the border
          * of the whole pair. Done for the top macroblock, or for the bottom
          * one when the macroblock above it is not intra. */
2516     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2517        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2518         int mbt_y = mb_y&~1;
2519         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2520         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2521         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2522         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2523     }
2524
2525     if (!simple && IS_INTRA_PCM(mb_type)) {
2526         unsigned int x, y;
2527
2528         // The pixels are stored in h->mb array in the same order as levels,
2529         // copy them in output in the correct order.
             /* blocks 0..15 are luma, 16..19 go to Cb, 20..23 go to Cr */
2530         for(i=0; i<16; i++) {
2531             for (y=0; y<4; y++) {
2532                 for (x=0; x<4; x++) {
2533                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2534                 }
2535             }
2536         }
2537         for(i=16; i<16+4; i++) {
2538             for (y=0; y<4; y++) {
2539                 for (x=0; x<4; x++) {
2540                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2541                 }
2542             }
2543         }
2544         for(i=20; i<20+4; i++) {
2545             for (y=0; y<4; y++) {
2546                 for (x=0; x<4; x++) {
2547                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2548                 }
2549             }
2550         }
2551     } else {
2552         if(IS_INTRA(mb_type)){
                 /* non-MBAFF deblocking: exchange the border (xchg=1) before
                  * predicting; the matching call with xchg=0 follows the
                  * prediction below */
2553             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2554                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2555
                 /* chroma intra prediction, same mode for both planes */
2556             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2557                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2558                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2559             }
2560
2561             if(IS_INTRA4x4(mb_type)){
2562                 if(simple || !s->encoding){
                         /* each block is predicted and gets its residual added
                          * before the next block is predicted */
2563                     if(IS_8x8DCT(mb_type)){
2564                         for(i=0; i<16; i+=4){
2565                             uint8_t * const ptr= dest_y + block_offset[i];
2566                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2567                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2568                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2569                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2570                             if(nnz){
                                     /* a single nonzero coefficient that is the DC one: DC-only add */
2571                                 if(nnz == 1 && h->mb[i*16])
2572                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2573                                 else
2574                                     idct_add(ptr, h->mb + i*16, linesize);
2575                             }
2576                         }
2577                     }else
2578                     for(i=0; i<16; i++){
2579                         uint8_t * const ptr= dest_y + block_offset[i];
2580                         uint8_t *topright;
2581                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2582                         int nnz, tr;
2583
                             /* only these two modes are handed a top-right pointer */
2584                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2585                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2586                             assert(mb_y || linesize <= block_offset[i]);
2587                             if(!topright_avail){
                                     /* not available: use the last sample of the row above the block, replicated 4 times */
2588                                 tr= ptr[3 - linesize]*0x01010101;
2589                                 topright= (uint8_t*) &tr;
2590                             }else
2591                                 topright= ptr + 4 - linesize;
2592                         }else
2593                             topright= NULL;
2594
2595                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2596                         nnz = h->non_zero_count_cache[ scan8[i] ];
2597                         if(nnz){
2598                             if(is_h264){
2599                                 if(nnz == 1 && h->mb[i*16])
2600                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2601                                 else
2602                                     idct_add(ptr, h->mb + i*16, linesize);
2603                             }else
2604                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2605                         }
2606                     }
2607                 }
2608             }else{
                     /* intra 16x16: predict the whole luma block, then process the
                      * luma DC coefficients (skipped for H.264 transform bypass) */
2609                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2610                 if(is_h264){
2611                     if(!transform_bypass)
2612                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2613                 }else
2614                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2615             }
2616             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2617                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2618         }else if(is_h264){
                 /* inter macroblock: motion compensated prediction */
2619             hl_motion(h, dest_y, dest_cb, dest_cr,
2620                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2621                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2622                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2623         }
2624
2625
             /* luma residual for everything except intra 4x4, whose residual
              * was already added block by block above */
2626         if(!IS_INTRA4x4(mb_type)){
2627             if(is_h264){
2628                 if(IS_INTRA16x16(mb_type)){
2629                     for(i=0; i<16; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ])
2631                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2632                         else if(h->mb[i*16])
2633                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2634                     }
2635                 }else{
                         /* step over 4 block indices at a time when the 8x8 transform is used */
2636                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2637                     for(i=0; i<16; i+=di){
2638                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2639                         if(nnz){
2640                             if(nnz==1 && h->mb[i*16])
2641                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2642                             else
2643                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2644                         }
2645                     }
2646                 }
2647             }else{
2648                 for(i=0; i<16; i++){
2649                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2650                         uint8_t * const ptr= dest_y + block_offset[i];
2651                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2652                     }
2653                 }
2654             }
2655         }
2656
             /* chroma residual: blocks 16..19 go to Cb, 20..23 to Cr (selected by bit 2 of i) */
2657         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2658             uint8_t *dest[2] = {dest_cb, dest_cr};
2659             if(transform_bypass){
2660                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2661             }else{
2662                 idct_add = s->dsp.h264_idct_add;
2663                 idct_dc_add = s->dsp.h264_idct_dc_add;
                     /* chroma DC: dequant list 1/2 for intra, 4/5 otherwise, one chroma qp per plane */
2664                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2665                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2666             }
2667             if(is_h264){
2668                 for(i=16; i<16+8; i++){
2669                     if(h->non_zero_count_cache[ scan8[i] ])
2670                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2671                     else if(h->mb[i*16])
2672                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2673                 }
2674             }else{
2675                 for(i=16; i<16+8; i++){
2676                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2677                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2678                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2679                     }
2680                 }
2681             }
2682         }
2683     }
         /* Loop filter. In MBAFF frames nothing happens for the top macroblock
          * of a pair; on the bottom one both macroblocks are filtered. */
2684     if(h->deblocking_filter) {
2685         if (!simple && FRAME_MBAFF) {
2686             //FIXME try deblocking one mb at a time?
2687             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 /* NOTE: mb_y and mb_xy shadow the outer variables from here on
                  * and refer to the macroblock one row above (the top of the pair) */
2688             const int mb_y = s->mb_y - 1;
2689             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2690             const int mb_xy= mb_x + mb_y*s->mb_stride;
2691             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2692             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2693             if (!bottom) return;
2694             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2695             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2696             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2697
                 /* counterpart (xchg=0) of the xchg_pair_border(..., 1) call made before prediction */
2698             if(IS_INTRA(mb_type_top | mb_type_bottom))
2699                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2700
2701             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2702             // deblock a pair
2703             // top
                 /* s->mb_y and h->mb_xy are temporarily moved to the top macroblock and restored below */
2704             s->mb_y--; h->mb_xy -= s->mb_stride;
2705             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2706             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2707             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2708             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2709             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2710             // bottom
2711             s->mb_y++; h->mb_xy += s->mb_stride;
2712             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2713             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2714             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2715             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2716             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2717         } else {
                 /* save this macroblock's right column and bottom row before
                  * filter_mb_fast() runs on it */
2718             tprintf(h->s.avctx, "call filter_mb\n");
2719             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2720             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2721             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2722             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2723             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2724         }
2725     }
2726 }
2727
2728 /**
2729  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2730  */
2731 static void hl_decode_mb_simple(H264Context *h){
2732     hl_decode_mb_internal(h, 1);
2733 }
2734
2735 /**
2736  * Process a macroblock; this handles edge cases, such as interlacing.
2737  */
2738 static void av_noinline hl_decode_mb_complex(H264Context *h){
2739     hl_decode_mb_internal(h, 0);
2740 }
2741
2742 static void hl_decode_mb(H264Context *h){
2743     MpegEncContext * const s = &h->s;
2744     const int mb_xy= h->mb_xy;
2745     const int mb_type= s->current_picture.mb_type[mb_xy];
2746     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2747                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2748
2749     if(ENABLE_H264_ENCODER && !s->decode)
2750         return;
2751
2752     if (is_complex)
2753         hl_decode_mb_complex(h);
2754     else hl_decode_mb_simple(h);
2755 }
2756
2757 static void pic_as_field(Picture *pic, const int parity){
2758     int i;
2759     for (i = 0; i < 4; ++i) {
2760         if (parity == PICT_BOTTOM_FIELD)
2761             pic->data[i] += pic->linesize[i];
2762         pic->reference = parity;
2763         pic->linesize[i] *= 2;
2764     }
2765 }
2766
2767 static int split_field_copy(Picture *dest, Picture *src,
2768                             int parity, int id_add){
2769     int match = !!(src->reference & parity);
2770
2771     if (match) {
2772         *dest = *src;
2773         if(parity != PICT_FRAME){
2774             pic_as_field(dest, parity);
2775             dest->pic_id *= 2;
2776             dest->pic_id += id_add;
2777         }
2778     }
2779
2780     return match;
2781 }
2782
2783 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2784     int i[2]={0};
2785     int index=0;
2786
2787     while(i[0]<len || i[1]<len){
2788         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2789             i[0]++;
2790         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2791             i[1]++;
2792         if(i[0] < len){
2793             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2794             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2795         }
2796         if(i[1] < len){
2797             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2798             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2799         }
2800     }
2801
2802     return index;
2803 }
2804
2805 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2806     int i, best_poc;
2807     int out_i= 0;
2808
2809     for(;;){
2810         best_poc= dir ? INT_MIN : INT_MAX;
2811
2812         for(i=0; i<len; i++){
2813             const int poc= src[i]->poc;
2814             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2815                 best_poc= poc;
2816                 sorted[out_i]= src[i];
2817             }
2818         }
2819         if(best_poc == (dir ? INT_MIN : INT_MAX))
2820             break;
2821         limit= sorted[out_i++]->poc - dir;
2822     }
2823     return out_i;
2824 }
2825
2826 /**
2827  * fills the default_ref_list.
2828  */
2829 static int fill_default_ref_list(H264Context *h){
2830     MpegEncContext * const s = &h->s;
2831     int i, len;
2832
2833     if(h->slice_type_nos==FF_B_TYPE){
2834         Picture *sorted[32];
2835         int cur_poc, list;
2836         int lens[2];
2837
2838         if(FIELD_PICTURE)
2839             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2840         else
2841             cur_poc= s->current_picture_ptr->poc;
2842
2843         for(list= 0; list<2; list++){
2844             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2845             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2846             assert(len<=32);
2847             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2848             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2849             assert(len<=32);
2850
2851             if(len < h->ref_count[list])
2852                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2853             lens[list]= len;
2854         }
2855
2856         if(lens[0] == lens[1] && lens[1] > 1){
2857             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2858             if(i == lens[0])
2859                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2860         }
2861     }else{
2862         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2863         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2864         assert(len <= 32);
2865         if(len < h->ref_count[0])
2866             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2867     }
2868 #ifdef TRACE
2869     for (i=0; i<h->ref_count[0]; i++) {
2870         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2871     }
2872     if(h->slice_type_nos==FF_B_TYPE){
2873         for (i=0; i<h->ref_count[1]; i++) {
2874             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2875         }
2876     }
2877 #endif
2878     return 0;
2879 }
2880
2881 static void print_short_term(H264Context *h);
2882 static void print_long_term(H264Context *h);
2883
2884 /**
2885  * Extract structure information about the picture described by pic_num in
2886  * the current decoding context (frame or field). Note that pic_num is
2887  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2888  * @param pic_num picture number for which to extract structure information
2889  * @param structure one of PICT_XXX describing structure of picture
2890  *                      with pic_num
2891  * @return frame number (short term) or long term index of picture
2892  *         described by pic_num
2893  */
2894 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2895     MpegEncContext * const s = &h->s;
2896
2897     *structure = s->picture_structure;
2898     if(FIELD_PICTURE){
2899         if (!(pic_num & 1))
2900             /* opposite field */
2901             *structure ^= PICT_FRAME;
2902         pic_num >>= 1;
2903     }
2904
2905     return pic_num;
2906 }
2907
2908 static int decode_ref_pic_list_reordering(H264Context *h){
2909     MpegEncContext * const s = &h->s;
2910     int list, index, pic_structure;
2911
2912     print_short_term(h);
2913     print_long_term(h);
2914     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before function
2915
2916     for(list=0; list<h->list_count; list++){
2917         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2918
2919         if(get_bits1(&s->gb)){
2920             int pred= h->curr_pic_num;
2921
2922             for(index=0; ; index++){
2923                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2924                 unsigned int pic_id;
2925                 int i;
2926                 Picture *ref = NULL;
2927
2928                 if(reordering_of_pic_nums_idc==3)
2929                     break;
2930
2931                 if(index >= h->ref_count[list]){
2932                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2933                     return -1;
2934                 }
2935
2936                 if(reordering_of_pic_nums_idc<3){
2937                     if(reordering_of_pic_nums_idc<2){
2938                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2939                         int frame_num;
2940
2941                         if(abs_diff_pic_num > h->max_pic_num){
2942                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2943                             return -1;
2944                         }
2945
2946                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2947                         else                                pred+= abs_diff_pic_num;
2948                         pred &= h->max_pic_num - 1;
2949
2950                         frame_num = pic_num_extract(h, pred, &pic_structure);
2951
2952                         for(i= h->short_ref_count-1; i>=0; i--){
2953                             ref = h->short_ref[i];
2954                             assert(ref->reference);
2955                             assert(!ref->long_ref);
2956                             if(
2957                                    ref->frame_num == frame_num &&
2958                                    (ref->reference & pic_structure)
2959                               )
2960                                 break;
2961                         }
2962                         if(i>=0)
2963                             ref->pic_id= pred;
2964                     }else{
2965                         int long_idx;
2966                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2967
2968                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2969
2970                         if(long_idx>31){
2971                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2972                             return -1;
2973                         }
2974                         ref = h->long_ref[long_idx];
2975                         assert(!(ref && !ref->reference));
2976                         if(ref && (ref->reference & pic_structure)){
2977                             ref->pic_id= pic_id;
2978                             assert(ref->long_ref);
2979                             i=0;
2980                         }else{
2981                             i=-1;
2982                         }
2983                     }
2984
2985                     if (i < 0) {
2986                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2987                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2988                     } else {
2989                         for(i=index; i+1<h->ref_count[list]; i++){
2990                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2991                                 break;
2992                         }
2993                         for(; i > index; i--){
2994                             h->ref_list[list][i]= h->ref_list[list][i-1];
2995                         }
2996                         h->ref_list[list][index]= *ref;
2997                         if (FIELD_PICTURE){
2998                             pic_as_field(&h->ref_list[list][index], pic_structure);
2999                         }
3000                     }
3001                 }else{
3002                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3003                     return -1;
3004                 }
3005             }
3006         }
3007     }
3008     for(list=0; list<h->list_count; list++){
3009         for(index= 0; index < h->ref_count[list]; index++){
3010             if(!h->ref_list[list][index].data[0]){
3011                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
3012                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
3013             }
3014         }
3015     }
3016
3017     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3018         direct_dist_scale_factor(h);
3019     direct_ref_list_init(h);
3020     return 0;
3021 }
3022
3023 static void fill_mbaff_ref_list(H264Context *h){
3024     int list, i, j;
3025     for(list=0; list<2; list++){ //FIXME try list_count
3026         for(i=0; i<h->ref_count[list]; i++){
3027             Picture *frame = &h->ref_list[list][i];
3028             Picture *field = &h->ref_list[list][16+2*i];
3029             field[0] = *frame;
3030             for(j=0; j<3; j++)
3031                 field[0].linesize[j] <<= 1;
3032             field[0].reference = PICT_TOP_FIELD;
3033             field[1] = field[0];
3034             for(j=0; j<3; j++)
3035                 field[1].data[j] += frame->linesize[j];
3036             field[1].reference = PICT_BOTTOM_FIELD;
3037
3038             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3039             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3040             for(j=0; j<2; j++){
3041                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3042                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3043             }
3044         }
3045     }
3046     for(j=0; j<h->ref_count[1]; j++){
3047         for(i=0; i<h->ref_count[0]; i++)
3048             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3049         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3050         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3051     }
3052 }
3053
3054 static int pred_weight_table(H264Context *h){
3055     MpegEncContext * const s = &h->s;
3056     int list, i;
3057     int luma_def, chroma_def;
3058
3059     h->use_weight= 0;
3060     h->use_weight_chroma= 0;
3061     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3062     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3063     luma_def = 1<<h->luma_log2_weight_denom;
3064     chroma_def = 1<<h->chroma_log2_weight_denom;
3065
3066     for(list=0; list<2; list++){
3067         for(i=0; i<h->ref_count[list]; i++){
3068             int luma_weight_flag, chroma_weight_flag;
3069
3070             luma_weight_flag= get_bits1(&s->gb);
3071             if(luma_weight_flag){
3072                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3073                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3074                 if(   h->luma_weight[list][i] != luma_def
3075                    || h->luma_offset[list][i] != 0)
3076                     h->use_weight= 1;
3077             }else{
3078                 h->luma_weight[list][i]= luma_def;
3079                 h->luma_offset[list][i]= 0;
3080             }
3081
3082             chroma_weight_flag= get_bits1(&s->gb);
3083             if(chroma_weight_flag){
3084                 int j;
3085                 for(j=0; j<2; j++){
3086                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3087                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3088                     if(   h->chroma_weight[list][i][j] != chroma_def
3089                        || h->chroma_offset[list][i][j] != 0)
3090                         h->use_weight_chroma= 1;
3091                 }
3092             }else{
3093                 int j;
3094                 for(j=0; j<2; j++){
3095                     h->chroma_weight[list][i][j]= chroma_def;
3096                     h->chroma_offset[list][i][j]= 0;
3097                 }
3098             }
3099         }
3100         if(h->slice_type_nos != FF_B_TYPE) break;
3101     }
3102     h->use_weight= h->use_weight || h->use_weight_chroma;
3103     return 0;
3104 }
3105
3106 static void implicit_weight_table(H264Context *h){
3107     MpegEncContext * const s = &h->s;
3108     int ref0, ref1;
3109     int cur_poc = s->current_picture_ptr->poc;
3110
3111     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3112        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3113         h->use_weight= 0;
3114         h->use_weight_chroma= 0;
3115         return;
3116     }
3117
3118     h->use_weight= 2;
3119     h->use_weight_chroma= 2;
3120     h->luma_log2_weight_denom= 5;
3121     h->chroma_log2_weight_denom= 5;
3122
3123     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3124         int poc0 = h->ref_list[0][ref0].poc;
3125         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3126             int poc1 = h->ref_list[1][ref1].poc;
3127             int td = av_clip(poc1 - poc0, -128, 127);
3128             if(td){
3129                 int tb = av_clip(cur_poc - poc0, -128, 127);
3130                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3131                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3132                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3133                     h->implicit_weight[ref0][ref1] = 32;
3134                 else
3135                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3136             }else
3137                 h->implicit_weight[ref0][ref1] = 32;
3138         }
3139     }
3140 }
3141
3142 /**
3143  * Mark a picture as no longer needed for reference. The refmask
3144  * argument allows unreferencing of individual fields or the whole frame.
3145  * If the picture becomes entirely unreferenced, but is being held for
3146  * display purposes, it is marked as such.
3147  * @param refmask mask of fields to unreference; the mask is bitwise
3148  *                anded with the reference marking of pic
3149  * @return non-zero if pic becomes entirely unreferenced (except possibly
3150  *         for display purposes) zero if one of the fields remains in
3151  *         reference
3152  */
3153 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3154     int i;
3155     if (pic->reference &= refmask) {
3156         return 0;
3157     } else {
3158         for(i = 0; h->delayed_pic[i]; i++)
3159             if(pic == h->delayed_pic[i]){
3160                 pic->reference=DELAYED_PIC_REF;
3161                 break;
3162             }
3163         return 1;
3164     }
3165 }
3166
3167 /**
3168  * instantaneous decoder refresh.
3169  */
3170 static void idr(H264Context *h){
3171     int i;
3172
3173     for(i=0; i<16; i++){
3174         remove_long(h, i, 0);
3175     }
3176     assert(h->long_ref_count==0);
3177
3178     for(i=0; i<h->short_ref_count; i++){
3179         unreference_pic(h, h->short_ref[i], 0);
3180         h->short_ref[i]= NULL;
3181     }
3182     h->short_ref_count=0;
3183     h->prev_frame_num= 0;
3184     h->prev_frame_num_offset= 0;
3185     h->prev_poc_msb=
3186     h->prev_poc_lsb= 0;
3187 }
3188
3189 /* forget old pics after a seek */
3190 static void flush_dpb(AVCodecContext *avctx){
3191     H264Context *h= avctx->priv_data;
3192     int i;
3193     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3194         if(h->delayed_pic[i])
3195             h->delayed_pic[i]->reference= 0;
3196         h->delayed_pic[i]= NULL;
3197     }
3198     h->outputed_poc= INT_MIN;
3199     idr(h);
3200     if(h->s.current_picture_ptr)
3201         h->s.current_picture_ptr->reference= 0;
3202     h->s.first_field= 0;
3203     ff_mpeg_flush(avctx);
3204 }
3205
3206 /**
3207  * Find a Picture in the short term reference list by frame number.
3208  * @param frame_num frame number to search for
3209  * @param idx the index into h->short_ref where returned picture is found
3210  *            undefined if no picture found.
3211  * @return pointer to the found picture, or NULL if no pic with the provided
3212  *                 frame number is found
3213  */
3214 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3215     MpegEncContext * const s = &h->s;
3216     int i;
3217
3218     for(i=0; i<h->short_ref_count; i++){
3219         Picture *pic= h->short_ref[i];
3220         if(s->avctx->debug&FF_DEBUG_MMCO)
3221             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3222         if(pic->frame_num == frame_num) {
3223             *idx = i;
3224             return pic;
3225         }
3226     }
3227     return NULL;
3228 }
3229
3230 /**
3231  * Remove a picture from the short term reference list by its index in
3232  * that list.  This does no checking on the provided index; it is assumed
3233  * to be valid. Other list entries are shifted down.
3234  * @param i index into h->short_ref of picture to remove.
3235  */
3236 static void remove_short_at_index(H264Context *h, int i){
3237     assert(i >= 0 && i < h->short_ref_count);
3238     h->short_ref[i]= NULL;
3239     if (--h->short_ref_count)
3240         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3241 }
3242
3243 /**
3244  *
3245  * @return the removed picture or NULL if an error occurs
3246  */
3247 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3248     MpegEncContext * const s = &h->s;
3249     Picture *pic;
3250     int i;
3251
3252     if(s->avctx->debug&FF_DEBUG_MMCO)
3253         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3254
3255     pic = find_short(h, frame_num, &i);
3256     if (pic){
3257         if(unreference_pic(h, pic, ref_mask))
3258         remove_short_at_index(h, i);
3259     }
3260
3261     return pic;
3262 }
3263
3264 /**
3265  * Remove a picture from the long term reference list by its index in
3266  * that list.
3267  * @return the removed picture or NULL if an error occurs
3268  */
3269 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3270     Picture *pic;
3271
3272     pic= h->long_ref[i];
3273     if (pic){
3274         if(unreference_pic(h, pic, ref_mask)){
3275             assert(h->long_ref[i]->long_ref == 1);
3276             h->long_ref[i]->long_ref= 0;
3277             h->long_ref[i]= NULL;
3278             h->long_ref_count--;
3279         }
3280     }
3281
3282     return pic;
3283 }
3284
3285 /**
3286  * print short term list
3287  */
3288 static void print_short_term(H264Context *h) {
3289     uint32_t i;
3290     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3291         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3292         for(i=0; i<h->short_ref_count; i++){
3293             Picture *pic= h->short_ref[i];
3294             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3295         }
3296     }
3297 }
3298
3299 /**
3300  * print long term list
3301  */
3302 static void print_long_term(H264Context *h) {
3303     uint32_t i;
3304     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3305         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3306         for(i = 0; i < 16; i++){
3307             Picture *pic= h->long_ref[i];
3308             if (pic) {
3309                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3310             }
3311         }
3312     }
3313 }
3314
3315 /**
3316  * Executes the reference picture marking (memory management control operations).
3317  */
3318 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3319     MpegEncContext * const s = &h->s;
3320     int i, j;
3321     int current_ref_assigned=0;
3322     Picture *pic;
3323
3324     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3325         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3326
3327     for(i=0; i<mmco_count; i++){
3328         int structure, frame_num;
3329         if(s->avctx->debug&FF_DEBUG_MMCO)
3330             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3331
3332         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3333            || mmco[i].opcode == MMCO_SHORT2LONG){
3334             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3335             pic = find_short(h, frame_num, &j);
3336             if(!pic){
3337                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3338                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3339                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3340                 continue;
3341             }
3342         }
3343
3344         switch(mmco[i].opcode){
3345         case MMCO_SHORT2UNUSED:
3346             if(s->avctx->debug&FF_DEBUG_MMCO)
3347                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3348             remove_short(h, frame_num, structure ^ PICT_FRAME);
3349             break;
3350         case MMCO_SHORT2LONG:
3351                 if (h->long_ref[mmco[i].long_arg] != pic)
3352                     remove_long(h, mmco[i].long_arg, 0);
3353
3354                 remove_short_at_index(h, j);
3355                 h->long_ref[ mmco[i].long_arg ]= pic;
3356                 if (h->long_ref[ mmco[i].long_arg ]){
3357                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3358                     h->long_ref_count++;
3359                 }
3360             break;
3361         case MMCO_LONG2UNUSED:
3362             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3363             pic = h->long_ref[j];
3364             if (pic) {
3365                 remove_long(h, j, structure ^ PICT_FRAME);
3366             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3367                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3368             break;
3369         case MMCO_LONG:
3370                     // Comment below left from previous code as it is an interresting note.
3371                     /* First field in pair is in short term list or
3372                      * at a different long term index.
3373                      * This is not allowed; see 7.4.3, notes 2 and 3.
3374                      * Report the problem and keep the pair where it is,
3375                      * and mark this field valid.
3376                      */
3377
3378             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3379                 remove_long(h, mmco[i].long_arg, 0);
3380
3381                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3382                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3383                 h->long_ref_count++;
3384             }
3385
3386             s->current_picture_ptr->reference |= s->picture_structure;
3387             current_ref_assigned=1;
3388             break;
3389         case MMCO_SET_MAX_LONG:
3390             assert(mmco[i].long_arg <= 16);
3391             // just remove the long term which index is greater than new max
3392             for(j = mmco[i].long_arg; j<16; j++){
3393                 remove_long(h, j, 0);
3394             }
3395             break;
3396         case MMCO_RESET:
3397             while(h->short_ref_count){
3398                 remove_short(h, h->short_ref[0]->frame_num, 0);
3399             }
3400             for(j = 0; j < 16; j++) {
3401                 remove_long(h, j, 0);
3402             }
3403             s->current_picture_ptr->poc=
3404             s->current_picture_ptr->field_poc[0]=
3405             s->current_picture_ptr->field_poc[1]=
3406             h->poc_lsb=
3407             h->poc_msb=
3408             h->frame_num=
3409             s->current_picture_ptr->frame_num= 0;
3410             break;
3411         default: assert(0);
3412         }
3413     }
3414
3415     if (!current_ref_assigned) {
3416         /* Second field of complementary field pair; the first field of
3417          * which is already referenced. If short referenced, it
3418          * should be first entry in short_ref. If not, it must exist
3419          * in long_ref; trying to put it on the short list here is an
3420          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3421          */
3422         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3423             /* Just mark the second field valid */
3424             s->current_picture_ptr->reference = PICT_FRAME;
3425         } else if (s->current_picture_ptr->long_ref) {
3426             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3427                                              "assignment for second field "
3428                                              "in complementary field pair "
3429                                              "(first field is long term)\n");
3430         } else {
3431             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3432             if(pic){
3433                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3434             }
3435
3436             if(h->short_ref_count)
3437                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3438
3439             h->short_ref[0]= s->current_picture_ptr;
3440             h->short_ref_count++;
3441             s->current_picture_ptr->reference |= s->picture_structure;
3442         }
3443     }
3444
3445     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3446
3447         /* We have too many reference frames, probably due to corrupted
3448          * stream. Need to discard one frame. Prevents overrun of the
3449          * short_ref and long_ref buffers.
3450          */
3451         av_log(h->s.avctx, AV_LOG_ERROR,
3452                "number of reference frames exceeds max (probably "
3453                "corrupt input), discarding one\n");
3454
3455         if (h->long_ref_count && !h->short_ref_count) {
3456             for (i = 0; i < 16; ++i)
3457                 if (h->long_ref[i])
3458                     break;
3459
3460             assert(i < 16);
3461             remove_long(h, i, 0);
3462         } else {
3463             pic = h->short_ref[h->short_ref_count - 1];
3464             remove_short(h, pic->frame_num, 0);
3465         }
3466     }
3467
3468     print_short_term(h);
3469     print_long_term(h);
3470     return 0;
3471 }
3472
3473 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3474     MpegEncContext * const s = &h->s;
3475     int i;
3476
3477     h->mmco_index= 0;
3478     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3479         s->broken_link= get_bits1(gb) -1;
3480         if(get_bits1(gb)){
3481             h->mmco[0].opcode= MMCO_LONG;
3482             h->mmco[0].long_arg= 0;
3483             h->mmco_index= 1;
3484         }
3485     }else{
3486         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3487             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3488                 MMCOOpcode opcode= get_ue_golomb(gb);
3489
3490                 h->mmco[i].opcode= opcode;
3491                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3492                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3493 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3494                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3495                         return -1;
3496                     }*/
3497                 }
3498                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3499                     unsigned int long_arg= get_ue_golomb(gb);
3500                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3501                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3502                         return -1;
3503                     }
3504                     h->mmco[i].long_arg= long_arg;
3505                 }
3506
3507                 if(opcode > (unsigned)MMCO_LONG){
3508                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3509                     return -1;
3510                 }
3511                 if(opcode == MMCO_END)
3512                     break;
3513             }
3514             h->mmco_index= i;
3515         }else{
3516             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3517
3518             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3519                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3520                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3521                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3522                 h->mmco_index= 1;
3523                 if (FIELD_PICTURE) {
3524                     h->mmco[0].short_pic_num *= 2;
3525                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3526                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3527                     h->mmco_index= 2;
3528                 }
3529             }
3530         }
3531     }
3532
3533     return 0;
3534 }
3535
3536 static int init_poc(H264Context *h){
3537     MpegEncContext * const s = &h->s;
3538     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3539     int field_poc[2];
3540     Picture *cur = s->current_picture_ptr;
3541
3542     h->frame_num_offset= h->prev_frame_num_offset;
3543     if(h->frame_num < h->prev_frame_num)
3544         h->frame_num_offset += max_frame_num;
3545
3546     if(h->sps.poc_type==0){
3547         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3548
3549         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3550             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3551         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3552             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3553         else
3554             h->poc_msb = h->prev_poc_msb;
3555 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3556         field_poc[0] =
3557         field_poc[1] = h->poc_msb + h->poc_lsb;
3558         if(s->picture_structure == PICT_FRAME)
3559             field_poc[1] += h->delta_poc_bottom;
3560     }else if(h->sps.poc_type==1){
3561         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3562         int i;
3563
3564         if(h->sps.poc_cycle_length != 0)
3565             abs_frame_num = h->frame_num_offset + h->frame_num;
3566         else
3567             abs_frame_num = 0;
3568
3569         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3570             abs_frame_num--;
3571
3572         expected_delta_per_poc_cycle = 0;
3573         for(i=0; i < h->sps.poc_cycle_length; i++)
3574             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3575
3576         if(abs_frame_num > 0){
3577             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3578             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3579
3580             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3581             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3582                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3583         } else
3584             expectedpoc = 0;
3585
3586         if(h->nal_ref_idc == 0)
3587             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3588
3589         field_poc[0] = expectedpoc + h->delta_poc[0];
3590         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3591
3592         if(s->picture_structure == PICT_FRAME)
3593             field_poc[1] += h->delta_poc[1];
3594     }else{
3595         int poc= 2*(h->frame_num_offset + h->frame_num);
3596
3597         if(!h->nal_ref_idc)
3598             poc--;
3599
3600         field_poc[0]= poc;
3601         field_poc[1]= poc;
3602     }
3603
3604     if(s->picture_structure != PICT_BOTTOM_FIELD)
3605         s->current_picture_ptr->field_poc[0]= field_poc[0];
3606     if(s->picture_structure != PICT_TOP_FIELD)
3607         s->current_picture_ptr->field_poc[1]= field_poc[1];
3608     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3609
3610     return 0;
3611 }
3612
3613
3614 /**
3615  * initialize scan tables
3616  */
3617 static void init_scan_tables(H264Context *h){
3618     MpegEncContext * const s = &h->s;
3619     int i;
3620     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3621         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3622         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3623     }else{
3624         for(i=0; i<16; i++){
3625 #define T(x) (x>>2) | ((x<<2) & 0xF)
3626             h->zigzag_scan[i] = T(zigzag_scan[i]);
3627             h-> field_scan[i] = T( field_scan[i]);
3628 #undef T
3629         }
3630     }
3631     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3632         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3633         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3634         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3635         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3636     }else{
3637         for(i=0; i<64; i++){
3638 #define T(x) (x>>3) | ((x&7)<<3)
3639             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3640             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3641             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3642             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3643 #undef T
3644         }
3645     }
3646     if(h->sps.transform_bypass){ //FIXME same ugly
3647         h->zigzag_scan_q0          = zigzag_scan;
3648         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3649         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3650         h->field_scan_q0           = field_scan;
3651         h->field_scan8x8_q0        = field_scan8x8;
3652         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3653     }else{
3654         h->zigzag_scan_q0          = h->zigzag_scan;
3655         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3656         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3657         h->field_scan_q0           = h->field_scan;
3658         h->field_scan8x8_q0        = h->field_scan8x8;
3659         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3660     }
3661 }
3662
3663 /**
3664  * Replicates H264 "master" context to thread contexts.
3665  */
3666 static void clone_slice(H264Context *dst, H264Context *src)
3667 {
3668     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3669     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3670     dst->s.current_picture      = src->s.current_picture;
3671     dst->s.linesize             = src->s.linesize;
3672     dst->s.uvlinesize           = src->s.uvlinesize;
3673     dst->s.first_field          = src->s.first_field;
3674
3675     dst->prev_poc_msb           = src->prev_poc_msb;
3676     dst->prev_poc_lsb           = src->prev_poc_lsb;
3677     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3678     dst->prev_frame_num         = src->prev_frame_num;
3679     dst->short_ref_count        = src->short_ref_count;
3680
3681     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3682     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3683     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3684     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3685
3686     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3687     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3688 }
3689
3690 /**
3691  * decodes a slice header.
3692  * This will also call MPV_common_init() and frame_start() as needed.
3693  *
3694  * @param h h264context
3695  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3696  *
3697  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3698  */
3699 static int decode_slice_header(H264Context *h, H264Context *h0){
3700     MpegEncContext * const s = &h->s;
3701     MpegEncContext * const s0 = &h0->s;
3702     unsigned int first_mb_in_slice;
3703     unsigned int pps_id;
3704     int num_ref_idx_active_override_flag;
3705     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3706     unsigned int slice_type, tmp, i, j;
3707     int default_ref_list_done = 0;
3708     int last_pic_structure;
3709
3710     s->dropable= h->nal_ref_idc == 0;
3711
3712     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3713         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3714         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3715     }else{
3716         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3717         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3718     }
3719
3720     first_mb_in_slice= get_ue_golomb(&s->gb);
3721
3722     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3723         h0->current_slice = 0;
3724         if (!s0->first_field)
3725             s->current_picture_ptr= NULL;
3726     }
3727
3728     slice_type= get_ue_golomb(&s->gb);
3729     if(slice_type > 9){
3730         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3731         return -1;
3732     }
3733     if(slice_type > 4){
3734         slice_type -= 5;
3735         h->slice_type_fixed=1;
3736     }else
3737         h->slice_type_fixed=0;
3738
3739     slice_type= slice_type_map[ slice_type ];
3740     if (slice_type == FF_I_TYPE
3741         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3742         default_ref_list_done = 1;
3743     }
3744     h->slice_type= slice_type;
3745     h->slice_type_nos= slice_type & 3;
3746
3747     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3748     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3749         av_log(h->s.avctx, AV_LOG_ERROR,
3750                "B picture before any references, skipping\n");
3751         return -1;
3752     }
3753
3754     pps_id= get_ue_golomb(&s->gb);
3755     if(pps_id>=MAX_PPS_COUNT){
3756         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3757         return -1;
3758     }
3759     if(!h0->pps_buffers[pps_id]) {
3760         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3761         return -1;
3762     }
3763     h->pps= *h0->pps_buffers[pps_id];
3764
3765     if(!h0->sps_buffers[h->pps.sps_id]) {
3766         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3767         return -1;
3768     }
3769     h->sps = *h0->sps_buffers[h->pps.sps_id];
3770
3771     if(h == h0 && h->dequant_coeff_pps != pps_id){
3772         h->dequant_coeff_pps = pps_id;
3773         init_dequant_tables(h);
3774     }
3775
3776     s->mb_width= h->sps.mb_width;
3777     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3778
3779     h->b_stride=  s->mb_width*4;
3780     h->b8_stride= s->mb_width*2;
3781
3782     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3783     if(h->sps.frame_mbs_only_flag)
3784         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3785     else
3786         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3787
3788     if (s->context_initialized
3789         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3790         if(h != h0)
3791             return -1;   // width / height changed during parallelized decoding
3792         free_tables(h);
3793         MPV_common_end(s);
3794     }
3795     if (!s->context_initialized) {
3796         if(h != h0)
3797             return -1;  // we cant (re-)initialize context during parallel decoding
3798         if (MPV_common_init(s) < 0)
3799             return -1;
3800         s->first_field = 0;
3801
3802         init_scan_tables(h);
3803         alloc_tables(h);
3804
3805         for(i = 1; i < s->avctx->thread_count; i++) {
3806             H264Context *c;
3807             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3808             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3809             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3810             c->sps = h->sps;
3811             c->pps = h->pps;
3812             init_scan_tables(c);
3813             clone_tables(c, h);
3814         }
3815
3816         for(i = 0; i < s->avctx->thread_count; i++)
3817             if(context_init(h->thread_context[i]) < 0)
3818                 return -1;
3819
3820         s->avctx->width = s->width;
3821         s->avctx->height = s->height;
3822         s->avctx->sample_aspect_ratio= h->sps.sar;
3823         if(!s->avctx->sample_aspect_ratio.den)
3824             s->avctx->sample_aspect_ratio.den = 1;
3825
3826         if(h->sps.timing_info_present_flag){
3827             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3828             if(h->x264_build > 0 && h->x264_build < 44)
3829                 s->avctx->time_base.den *= 2;
3830             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3831                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3832         }
3833     }
3834
3835     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3836
3837     h->mb_mbaff = 0;
3838     h->mb_aff_frame = 0;
3839     last_pic_structure = s0->picture_structure;
3840     if(h->sps.frame_mbs_only_flag){
3841         s->picture_structure= PICT_FRAME;
3842     }else{
3843         if(get_bits1(&s->gb)) { //field_pic_flag
3844             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3845         } else {
3846             s->picture_structure= PICT_FRAME;
3847             h->mb_aff_frame = h->sps.mb_aff;
3848         }
3849     }
3850
3851     if(h0->current_slice == 0){
3852         while(h->frame_num !=  h->prev_frame_num &&
3853               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3854             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3855             frame_start(h);
3856             h->prev_frame_num++;
3857             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3858             s->current_picture_ptr->frame_num= h->prev_frame_num;
3859             execute_ref_pic_marking(h, NULL, 0);
3860         }
3861
3862         /* See if we have a decoded first field looking for a pair... */
3863         if (s0->first_field) {
3864             assert(s0->current_picture_ptr);
3865             assert(s0->current_picture_ptr->data[0]);
3866             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3867
3868             /* figure out if we have a complementary field pair */
3869             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3870                 /*
3871                  * Previous field is unmatched. Don't display it, but let it
3872                  * remain for reference if marked as such.
3873                  */
3874                 s0->current_picture_ptr = NULL;
3875                 s0->first_field = FIELD_PICTURE;
3876
3877             } else {
3878                 if (h->nal_ref_idc &&
3879                         s0->current_picture_ptr->reference &&
3880                         s0->current_picture_ptr->frame_num != h->frame_num) {
3881                     /*
3882                      * This and previous field were reference, but had
3883                      * different frame_nums. Consider this field first in
3884                      * pair. Throw away previous field except for reference
3885                      * purposes.
3886                      */
3887                     s0->first_field = 1;
3888                     s0->current_picture_ptr = NULL;
3889
3890                 } else {
3891                     /* Second field in complementary pair */
3892                     s0->first_field = 0;
3893                 }
3894             }
3895
3896         } else {
3897             /* Frame or first field in a potentially complementary pair */
3898             assert(!s0->current_picture_ptr);
3899             s0->first_field = FIELD_PICTURE;
3900         }
3901
3902         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3903             s0->first_field = 0;
3904             return -1;
3905         }
3906     }
3907     if(h != h0)
3908         clone_slice(h, h0);
3909
3910     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3911
3912     assert(s->mb_num == s->mb_width * s->mb_height);
3913     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3914        first_mb_in_slice                    >= s->mb_num){
3915         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3916         return -1;
3917     }
3918     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3919     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3920     if (s->picture_structure == PICT_BOTTOM_FIELD)
3921         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3922     assert(s->mb_y < s->mb_height);
3923
3924     if(s->picture_structure==PICT_FRAME){
3925         h->curr_pic_num=   h->frame_num;
3926         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3927     }else{
3928         h->curr_pic_num= 2*h->frame_num + 1;
3929         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3930     }
3931
3932     if(h->nal_unit_type == NAL_IDR_SLICE){
3933         get_ue_golomb(&s->gb); /* idr_pic_id */
3934     }
3935
3936     if(h->sps.poc_type==0){
3937         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3938
3939         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3940             h->delta_poc_bottom= get_se_golomb(&s->gb);
3941         }
3942     }
3943
3944     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3945         h->delta_poc[0]= get_se_golomb(&s->gb);
3946
3947         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3948             h->delta_poc[1]= get_se_golomb(&s->gb);
3949     }
3950
3951     init_poc(h);
3952
3953     if(h->pps.redundant_pic_cnt_present){
3954         h->redundant_pic_count= get_ue_golomb(&s->gb);
3955     }
3956
3957     //set defaults, might be overridden a few lines later
3958     h->ref_count[0]= h->pps.ref_count[0];
3959     h->ref_count[1]= h->pps.ref_count[1];
3960
3961     if(h->slice_type_nos != FF_I_TYPE){
3962         if(h->slice_type_nos == FF_B_TYPE){
3963             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3964         }
3965         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3966
3967         if(num_ref_idx_active_override_flag){
3968             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3969             if(h->slice_type_nos==FF_B_TYPE)
3970                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3971
3972             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3973                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3974                 h->ref_count[0]= h->ref_count[1]= 1;
3975                 return -1;
3976             }
3977         }
3978         if(h->slice_type_nos == FF_B_TYPE)
3979             h->list_count= 2;
3980         else
3981             h->list_count= 1;
3982     }else
3983         h->list_count= 0;
3984
3985     if(!default_ref_list_done){
3986         fill_default_ref_list(h);
3987     }
3988
3989     if(decode_ref_pic_list_reordering(h) < 0)
3990         return -1;
3991
3992     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3993        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3994         pred_weight_table(h);
3995     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3996         implicit_weight_table(h);
3997     else
3998         h->use_weight = 0;
3999
4000     if(h->nal_ref_idc)
4001         decode_ref_pic_marking(h0, &s->gb);
4002
4003     if(FRAME_MBAFF)
4004         fill_mbaff_ref_list(h);
4005
4006     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4007         tmp = get_ue_golomb(&s->gb);
4008         if(tmp > 2){
4009             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4010             return -1;
4011         }
4012         h->cabac_init_idc= tmp;
4013     }
4014
4015     h->last_qscale_diff = 0;
4016     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4017     if(tmp>51){
4018         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4019         return -1;
4020     }
4021     s->qscale= tmp;
4022     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4023     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4024     //FIXME qscale / qp ... stuff
4025     if(h->slice_type == FF_SP_TYPE){
4026         get_bits1(&s->gb); /* sp_for_switch_flag */
4027     }
4028     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4029         get_se_golomb(&s->gb); /* slice_qs_delta */
4030     }
4031
4032     h->deblocking_filter = 1;
4033     h->slice_alpha_c0_offset = 0;
4034     h->slice_beta_offset = 0;
4035     if( h->pps.deblocking_filter_parameters_present ) {
4036         tmp= get_ue_golomb(&s->gb);
4037         if(tmp > 2){
4038             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4039             return -1;
4040         }
4041         h->deblocking_filter= tmp;
4042         if(h->deblocking_filter < 2)
4043             h->deblocking_filter^= 1; // 1<->0
4044
4045         if( h->deblocking_filter ) {
4046             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4047             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4048         }
4049     }
4050
4051     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4052        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4053        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4054        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4055         h->deblocking_filter= 0;
4056
4057     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4058         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4059             /* Cheat slightly for speed:
4060                Do not bother to deblock across slices. */
4061             h->deblocking_filter = 2;
4062         } else {
4063             h0->max_contexts = 1;
4064             if(!h0->single_decode_warning) {
4065                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4066                 h0->single_decode_warning = 1;
4067             }
4068             if(h != h0)
4069                 return 1; // deblocking switched inside frame
4070         }
4071     }
4072
4073 #if 0 //FMO
4074     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4075         slice_group_change_cycle= get_bits(&s->gb, ?);
4076 #endif
4077
4078     h0->last_slice_type = slice_type;
4079     h->slice_num = ++h0->current_slice;
4080
4081     for(j=0; j<2; j++){
4082         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4083         ref2frm[0]=
4084         ref2frm[1]= -1;
4085         for(i=0; i<48; i++)
4086             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4087                           +(h->ref_list[j][i].reference&3);
4088     }
4089
4090     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4091     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4092
4093     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4094         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4095                h->slice_num,
4096                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4097                first_mb_in_slice,
4098                av_get_pict_type_char(h->slice_type),
4099                pps_id, h->frame_num,
4100                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4101                h->ref_count[0], h->ref_count[1],
4102                s->qscale,
4103                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4104                h->use_weight,
4105                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4106                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4107                );
4108     }
4109
4110     return 0;
4111 }
4112
4113 /**
4114  *
4115  */
4116 static inline int get_level_prefix(GetBitContext *gb){
4117     unsigned int buf;
4118     int log;
4119
4120     OPEN_READER(re, gb);
4121     UPDATE_CACHE(re, gb);
4122     buf=GET_CACHE(re, gb);
4123
4124     log= 32 - av_log2(buf);
4125 #ifdef TRACE
4126     print_bin(buf>>(32-log), log);
4127     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4128 #endif
4129
4130     LAST_SKIP_BITS(re, gb, log);
4131     CLOSE_READER(re, gb);
4132
4133     return log-1;
4134 }
4135
4136 static inline int get_dct8x8_allowed(H264Context *h){
4137     int i;
4138     for(i=0; i<4; i++){
4139         if(!IS_SUB_8X8(h->sub_mb_type[i])
4140            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4141             return 0;
4142     }
4143     return 1;
4144 }
4145
4146 /**
4147  * decodes a residual block.
4148  * @param n block index
4149  * @param scantable scantable
4150  * @param max_coeff number of coefficients in the block
4151  * @return <0 if an error occurred
4152  */
4153 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4154     MpegEncContext * const s = &h->s;
4155     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4156     int level[16];
4157     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4158
4159     //FIXME put trailing_onex into the context
4160
4161     if(n == CHROMA_DC_BLOCK_INDEX){
4162         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4163         total_coeff= coeff_token>>2;
4164     }else{
4165         if(n == LUMA_DC_BLOCK_INDEX){
4166             total_coeff= pred_non_zero_count(h, 0);
4167             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4168             total_coeff= coeff_token>>2;
4169         }else{
4170             total_coeff= pred_non_zero_count(h, n);
4171             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4172             total_coeff= coeff_token>>2;
4173             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4174         }
4175     }
4176
4177     //FIXME set last_non_zero?
4178
4179     if(total_coeff==0)
4180         return 0;
4181     if(total_coeff > (unsigned)max_coeff) {
4182         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4183         return -1;
4184     }
4185
4186     trailing_ones= coeff_token&3;
4187     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4188     assert(total_coeff<=16);
4189
4190     for(i=0; i<trailing_ones; i++){
4191         level[i]= 1 - 2*get_bits1(gb);
4192     }
4193
4194     if(i<total_coeff) {
4195         int level_code, mask;
4196         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4197         int prefix= get_level_prefix(gb);
4198
4199         //first coefficient has suffix_length equal to 0 or 1
4200         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4201             if(suffix_length)
4202                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4203             else
4204                 level_code= (prefix<<suffix_length); //part
4205         }else if(prefix==14){
4206             if(suffix_length)
4207                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4208             else
4209                 level_code= prefix + get_bits(gb, 4); //part
4210         }else{
4211             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4212             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4213             if(prefix>=16)
4214                 level_code += (1<<(prefix-3))-4096;
4215         }
4216
4217         if(trailing_ones < 3) level_code += 2;
4218
4219         suffix_length = 1;
4220         if(level_code > 5)
4221             suffix_length++;
4222         mask= -(level_code&1);
4223         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4224         i++;
4225
4226         //remaining coefficients have suffix_length > 0
4227         for(;i<total_coeff;i++) {
4228             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4229             prefix = get_level_prefix(gb);
4230             if(prefix<15){
4231                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4232             }else{
4233                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4234                 if(prefix>=16)
4235                     level_code += (1<<(prefix-3))-4096;
4236             }
4237             mask= -(level_code&1);
4238             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4239             if(level_code > suffix_limit[suffix_length])
4240                 suffix_length++;
4241         }
4242     }
4243
4244     if(total_coeff == max_coeff)
4245         zeros_left=0;
4246     else{
4247         if(n == CHROMA_DC_BLOCK_INDEX)
4248             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4249         else
4250             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4251     }
4252
4253     coeff_num = zeros_left + total_coeff - 1;
4254     j = scantable[coeff_num];
4255     if(n > 24){
4256         block[j] = level[0];
4257         for(i=1;i<total_coeff;i++) {
4258             if(zeros_left <= 0)
4259                 run_before = 0;
4260             else if(zeros_left < 7){
4261                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4262             }else{
4263                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4264             }
4265             zeros_left -= run_before;
4266             coeff_num -= 1 + run_before;
4267             j= scantable[ coeff_num ];
4268
4269             block[j]= level[i];
4270         }
4271     }else{
4272         block[j] = (level[0] * qmul[j] + 32)>>6;
4273         for(i=1;i<total_coeff;i++) {
4274             if(zeros_left <= 0)
4275                 run_before = 0;
4276             else if(zeros_left < 7){
4277                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4278             }else{
4279                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4280             }
4281             zeros_left -= run_before;
4282             coeff_num -= 1 + run_before;
4283             j= scantable[ coeff_num ];
4284
4285             block[j]= (level[i] * qmul[j] + 32)>>6;
4286         }
4287     }
4288
4289     if(zeros_left<0){
4290         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4291         return -1;
4292     }
4293
4294     return 0;
4295 }
4296
4297 static void predict_field_decoding_flag(H264Context *h){
4298     MpegEncContext * const s = &h->s;
4299     const int mb_xy= h->mb_xy;
4300     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4301                 ? s->current_picture.mb_type[mb_xy-1]
4302                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4303                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4304                 : 0;
4305     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4306 }
4307
4308 /**
4309  * decodes a P_SKIP or B_SKIP macroblock
4310  */
4311 static void decode_mb_skip(H264Context *h){
4312     MpegEncContext * const s = &h->s;
4313     const int mb_xy= h->mb_xy;
4314     int mb_type=0;
4315
4316     memset(h->non_zero_count[mb_xy], 0, 16);
4317     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4318
4319     if(MB_FIELD)
4320         mb_type|= MB_TYPE_INTERLACED;
4321
4322     if( h->slice_type_nos == FF_B_TYPE )
4323     {
4324         // just for fill_caches. pred_direct_motion will set the real mb_type
4325         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4326
4327         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4328         pred_direct_motion(h, &mb_type);
4329         mb_type|= MB_TYPE_SKIP;
4330     }
4331     else
4332     {
4333         int mx, my;
4334         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4335
4336         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4337         pred_pskip_motion(h, &mx, &my);
4338         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4339         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4340     }
4341
4342     write_back_motion(h, mb_type);
4343     s->current_picture.mb_type[mb_xy]= mb_type;
4344     s->current_picture.qscale_table[mb_xy]= s->qscale;
4345     h->slice_table[ mb_xy ]= h->slice_num;
4346     h->prev_mb_skipped= 1;
4347 }
4348
4349 /**
4350  * decodes a macroblock
4351  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4352  */
4353 static int decode_mb_cavlc(H264Context *h){
4354     MpegEncContext * const s = &h->s;
4355     int mb_xy;
4356     int partition_count;
4357     unsigned int mb_type, cbp;
4358     int dct8x8_allowed= h->pps.transform_8x8_mode;
4359
4360     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4361
4362     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4363
4364     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4365     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4366                 down the code */
4367     if(h->slice_type_nos != FF_I_TYPE){
4368         if(s->mb_skip_run==-1)
4369             s->mb_skip_run= get_ue_golomb(&s->gb);
4370
4371         if (s->mb_skip_run--) {
4372             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4373                 if(s->mb_skip_run==0)
4374                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4375                 else
4376                     predict_field_decoding_flag(h);
4377             }
4378             decode_mb_skip(h);
4379             return 0;
4380         }
4381     }
4382     if(FRAME_MBAFF){
4383         if( (s->mb_y&1) == 0 )
4384             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4385     }else
4386         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4387
4388     h->prev_mb_skipped= 0;
4389
4390     mb_type= get_ue_golomb(&s->gb);
4391     if(h->slice_type_nos == FF_B_TYPE){
4392         if(mb_type < 23){
4393             partition_count= b_mb_type_info[mb_type].partition_count;
4394             mb_type=         b_mb_type_info[mb_type].type;
4395         }else{
4396             mb_type -= 23;
4397             goto decode_intra_mb;
4398         }
4399     }else if(h->slice_type_nos == FF_P_TYPE){
4400         if(mb_type < 5){
4401             partition_count= p_mb_type_info[mb_type].partition_count;
4402             mb_type=         p_mb_type_info[mb_type].type;
4403         }else{
4404             mb_type -= 5;
4405             goto decode_intra_mb;
4406         }
4407     }else{
4408        assert(h->slice_type_nos == FF_I_TYPE);
4409         if(h->slice_type == FF_SI_TYPE && mb_type)
4410             mb_type--;
4411 decode_intra_mb:
4412         if(mb_type > 25){
4413             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4414             return -1;
4415         }
4416         partition_count=0;
4417         cbp= i_mb_type_info[mb_type].cbp;
4418         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4419         mb_type= i_mb_type_info[mb_type].type;
4420     }
4421
4422     if(MB_FIELD)
4423         mb_type |= MB_TYPE_INTERLACED;
4424
4425     h->slice_table[ mb_xy ]= h->slice_num;
4426
4427     if(IS_INTRA_PCM(mb_type)){
4428         unsigned int x, y;
4429
4430         // We assume these blocks are very rare so we do not optimize it.
4431         align_get_bits(&s->gb);
4432
4433         // The pixels are stored in the same order as levels in h->mb array.
4434         for(y=0; y<16; y++){
4435             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4436             for(x=0; x<16; x++){
4437                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4438                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4439             }
4440         }
4441         for(y=0; y<8; y++){
4442             const int index= 256 + 4*(y&3) + 32*(y>>2);
4443             for(x=0; x<8; x++){
4444                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4445                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4446             }
4447         }
4448         for(y=0; y<8; y++){
4449             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4450             for(x=0; x<8; x++){
4451                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4452                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4453             }
4454         }
4455
4456         // In deblocking, the quantizer is 0
4457         s->current_picture.qscale_table[mb_xy]= 0;
4458         // All coeffs are present
4459         memset(h->non_zero_count[mb_xy], 16, 16);
4460
4461         s->current_picture.mb_type[mb_xy]= mb_type;
4462         return 0;
4463     }
4464
4465     if(MB_MBAFF){
4466         h->ref_count[0] <<= 1;
4467         h->ref_count[1] <<= 1;
4468     }
4469
4470     fill_caches(h, mb_type, 0);
4471
4472     //mb_pred
4473     if(IS_INTRA(mb_type)){
4474             int pred_mode;
4475 //            init_top_left_availability(h);
4476             if(IS_INTRA4x4(mb_type)){
4477                 int i;
4478                 int di = 1;
4479                 if(dct8x8_allowed && get_bits1(&s->gb)){
4480                     mb_type |= MB_TYPE_8x8DCT;
4481                     di = 4;
4482                 }
4483
4484 //                fill_intra4x4_pred_table(h);
4485                 for(i=0; i<16; i+=di){
4486                     int mode= pred_intra_mode(h, i);
4487
4488                     if(!get_bits1(&s->gb)){
4489                         const int rem_mode= get_bits(&s->gb, 3);
4490                         mode = rem_mode + (rem_mode >= mode);
4491                     }
4492
4493                     if(di==4)
4494                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4495                     else
4496                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4497                 }
4498                 write_back_intra_pred_mode(h);
4499                 if( check_intra4x4_pred_mode(h) < 0)
4500                     return -1;
4501             }else{
4502                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4503                 if(h->intra16x16_pred_mode < 0)
4504                     return -1;
4505             }
4506
4507             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4508             if(pred_mode < 0)
4509                 return -1;
4510             h->chroma_pred_mode= pred_mode;
4511     }else if(partition_count==4){
4512         int i, j, sub_partition_count[4], list, ref[2][4];
4513
4514         if(h->slice_type_nos == FF_B_TYPE){
4515             for(i=0; i<4; i++){
4516                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4517                 if(h->sub_mb_type[i] >=13){
4518                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4519                     return -1;
4520                 }
4521                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4522                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4523             }
4524             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4525                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4526                 pred_direct_motion(h, &mb_type);
4527                 h->ref_cache[0][scan8[4]] =
4528                 h->ref_cache[1][scan8[4]] =
4529                 h->ref_cache[0][scan8[12]] =
4530                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4531             }
4532         }else{
4533             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4534             for(i=0; i<4; i++){
4535                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4536                 if(h->sub_mb_type[i] >=4){
4537                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4538                     return -1;
4539                 }
4540                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4541                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4542             }
4543         }
4544
4545         for(list=0; list<h->list_count; list++){
4546             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4547             for(i=0; i<4; i++){
4548                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4549                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4550                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4551                     if(tmp>=ref_count){
4552                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4553                         return -1;
4554                     }
4555                     ref[list][i]= tmp;
4556                 }else{
4557                  //FIXME
4558                     ref[list][i] = -1;
4559                 }
4560             }
4561         }
4562
4563         if(dct8x8_allowed)
4564             dct8x8_allowed = get_dct8x8_allowed(h);
4565
4566         for(list=0; list<h->list_count; list++){
4567             for(i=0; i<4; i++){
4568                 if(IS_DIRECT(h->sub_mb_type[i])) {
4569                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4570                     continue;
4571                 }
4572                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4573                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4574
4575                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4576                     const int sub_mb_type= h->sub_mb_type[i];
4577                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4578                     for(j=0; j<sub_partition_count[i]; j++){
4579                         int mx, my;
4580                         const int index= 4*i + block_width*j;
4581                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4582                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4583                         mx += get_se_golomb(&s->gb);
4584                         my += get_se_golomb(&s->gb);
4585                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4586
4587                         if(IS_SUB_8X8(sub_mb_type)){
4588                             mv_cache[ 1 ][0]=
4589                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4590                             mv_cache[ 1 ][1]=
4591                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4592                         }else if(IS_SUB_8X4(sub_mb_type)){
4593                             mv_cache[ 1 ][0]= mx;
4594                             mv_cache[ 1 ][1]= my;
4595                         }else if(IS_SUB_4X8(sub_mb_type)){
4596                             mv_cache[ 8 ][0]= mx;
4597                             mv_cache[ 8 ][1]= my;
4598                         }
4599                         mv_cache[ 0 ][0]= mx;
4600                         mv_cache[ 0 ][1]= my;
4601                     }
4602                 }else{
4603                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4604                     p[0] = p[1]=
4605                     p[8] = p[9]= 0;
4606                 }
4607             }
4608         }
4609     }else if(IS_DIRECT(mb_type)){
4610         pred_direct_motion(h, &mb_type);
4611         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4612     }else{
4613         int list, mx, my, i;
4614          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4615         if(IS_16X16(mb_type)){
4616             for(list=0; list<h->list_count; list++){
4617                     unsigned int val;
4618                     if(IS_DIR(mb_type, 0, list)){
4619                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4620                         if(val >= h->ref_count[list]){
4621                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4622                             return -1;
4623                         }
4624                     }else
4625                         val= LIST_NOT_USED&0xFF;
4626                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4627             }
4628             for(list=0; list<h->list_count; list++){
4629                 unsigned int val;
4630                 if(IS_DIR(mb_type, 0, list)){
4631                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4632                     mx += get_se_golomb(&s->gb);
4633                     my += get_se_golomb(&s->gb);
4634                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4635
4636                     val= pack16to32(mx,my);
4637                 }else
4638                     val=0;
4639                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4640             }
4641         }
4642         else if(IS_16X8(mb_type)){
4643             for(list=0; list<h->list_count; list++){
4644                     for(i=0; i<2; i++){
4645                         unsigned int val;
4646                         if(IS_DIR(mb_type, i, list)){
4647                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4648                             if(val >= h->ref_count[list]){
4649                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4650                                 return -1;
4651                             }
4652                         }else
4653                             val= LIST_NOT_USED&0xFF;
4654                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4655                     }
4656             }
4657             for(list=0; list<h->list_count; list++){
4658                 for(i=0; i<2; i++){
4659                     unsigned int val;
4660                     if(IS_DIR(mb_type, i, list)){
4661                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4662                         mx += get_se_golomb(&s->gb);
4663                         my += get_se_golomb(&s->gb);
4664                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4665
4666                         val= pack16to32(mx,my);
4667                     }else
4668                         val=0;
4669                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4670                 }
4671             }
4672         }else{
4673             assert(IS_8X16(mb_type));
4674             for(list=0; list<h->list_count; list++){
4675                     for(i=0; i<2; i++){
4676                         unsigned int val;
4677                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4678                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4679                             if(val >= h->ref_count[list]){
4680                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4681                                 return -1;
4682                             }
4683                         }else
4684                             val= LIST_NOT_USED&0xFF;
4685                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4686                     }
4687             }
4688             for(list=0; list<h->list_count; list++){
4689                 for(i=0; i<2; i++){
4690                     unsigned int val;
4691                     if(IS_DIR(mb_type, i, list)){
4692                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4693                         mx += get_se_golomb(&s->gb);
4694                         my += get_se_golomb(&s->gb);
4695                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4696
4697                         val= pack16to32(mx,my);
4698                     }else
4699                         val=0;
4700                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4701                 }
4702             }
4703         }
4704     }
4705
4706     if(IS_INTER(mb_type))
4707         write_back_motion(h, mb_type);
4708
4709     if(!IS_INTRA16x16(mb_type)){
4710         cbp= get_ue_golomb(&s->gb);
4711         if(cbp > 47){
4712             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4713             return -1;
4714         }
4715
4716         if(IS_INTRA4x4(mb_type))
4717             cbp= golomb_to_intra4x4_cbp[cbp];
4718         else
4719             cbp= golomb_to_inter_cbp[cbp];
4720     }
4721     h->cbp = cbp;
4722
4723     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4724         if(get_bits1(&s->gb)){
4725             mb_type |= MB_TYPE_8x8DCT;
4726             h->cbp_table[mb_xy]= cbp;
4727         }
4728     }
4729     s->current_picture.mb_type[mb_xy]= mb_type;
4730
4731     if(cbp || IS_INTRA16x16(mb_type)){
4732         int i8x8, i4x4, chroma_idx;
4733         int dquant;
4734         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4735         const uint8_t *scan, *scan8x8, *dc_scan;
4736
4737 //        fill_non_zero_count_cache(h);
4738
4739         if(IS_INTERLACED(mb_type)){
4740             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4741             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4742             dc_scan= luma_dc_field_scan;
4743         }else{
4744             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4745             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4746             dc_scan= luma_dc_zigzag_scan;
4747         }
4748
4749         dquant= get_se_golomb(&s->gb);
4750
4751         if( dquant > 25 || dquant < -26 ){
4752             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4753             return -1;
4754         }
4755
4756         s->qscale += dquant;
4757         if(((unsigned)s->qscale) > 51){
4758             if(s->qscale<0) s->qscale+= 52;
4759             else            s->qscale-= 52;
4760         }
4761
4762         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4763         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4764         if(IS_INTRA16x16(mb_type)){
4765             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4766                 return -1; //FIXME continue if partitioned and other return -1 too
4767             }
4768
4769             assert((cbp&15) == 0 || (cbp&15) == 15);
4770
4771             if(cbp&15){
4772                 for(i8x8=0; i8x8<4; i8x8++){
4773                     for(i4x4=0; i4x4<4; i4x4++){
4774                         const int index= i4x4 + 4*i8x8;
4775                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4776                             return -1;
4777                         }
4778                     }
4779                 }
4780             }else{
4781                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4782             }
4783         }else{
4784             for(i8x8=0; i8x8<4; i8x8++){
4785                 if(cbp & (1<<i8x8)){
4786                     if(IS_8x8DCT(mb_type)){
4787                         DCTELEM *buf = &h->mb[64*i8x8];
4788                         uint8_t *nnz;
4789                         for(i4x4=0; i4x4<4; i4x4++){
4790                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4791                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4792                                 return -1;
4793                         }
4794                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4795                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4796                     }else{
4797                         for(i4x4=0; i4x4<4; i4x4++){
4798                             const int index= i4x4 + 4*i8x8;
4799
4800                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4801                                 return -1;
4802                             }
4803                         }
4804                     }
4805                 }else{
4806                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4807                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4808                 }
4809             }
4810         }
4811
4812         if(cbp&0x30){
4813             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4814                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4815                     return -1;
4816                 }
4817         }
4818
4819         if(cbp&0x20){
4820             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4821                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4822                 for(i4x4=0; i4x4<4; i4x4++){
4823                     const int index= 16 + 4*chroma_idx + i4x4;
4824                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4825                         return -1;
4826                     }
4827                 }
4828             }
4829         }else{
4830             uint8_t * const nnz= &h->non_zero_count_cache[0];
4831             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4832             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4833         }
4834     }else{
4835         uint8_t * const nnz= &h->non_zero_count_cache[0];
4836         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4837         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4838         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4839     }
4840     s->current_picture.qscale_table[mb_xy]= s->qscale;
4841     write_back_non_zero_count(h);
4842
4843     if(MB_MBAFF){
4844         h->ref_count[0] >>= 1;
4845         h->ref_count[1] >>= 1;
4846     }
4847
4848     return 0;
4849 }
4850
4851 static int decode_cabac_field_decoding_flag(H264Context *h) {
4852     MpegEncContext * const s = &h->s;
4853     const int mb_x = s->mb_x;
4854     const int mb_y = s->mb_y & ~1;
4855     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4856     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4857
4858     unsigned int ctx = 0;
4859
4860     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4861         ctx += 1;
4862     }
4863     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4864         ctx += 1;
4865     }
4866
4867     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4868 }
4869
4870 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4871     uint8_t *state= &h->cabac_state[ctx_base];
4872     int mb_type;
4873
4874     if(intra_slice){
4875         MpegEncContext * const s = &h->s;
4876         const int mba_xy = h->left_mb_xy[0];
4877         const int mbb_xy = h->top_mb_xy;
4878         int ctx=0;
4879         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4880             ctx++;
4881         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4882             ctx++;
4883         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4884             return 0;   /* I4x4 */
4885         state += 2;
4886     }else{
4887         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4888             return 0;   /* I4x4 */
4889     }
4890
4891     if( get_cabac_terminate( &h->cabac ) )
4892         return 25;  /* PCM */
4893
4894     mb_type = 1; /* I16x16 */
4895     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4896     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4897         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4898     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4899     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4900     return mb_type;
4901 }
4902
4903 static int decode_cabac_mb_type( H264Context *h ) {
4904     MpegEncContext * const s = &h->s;
4905
4906     if( h->slice_type_nos == FF_I_TYPE ) {
4907         return decode_cabac_intra_mb_type(h, 3, 1);
4908     } else if( h->slice_type_nos == FF_P_TYPE ) {
4909         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4910             /* P-type */
4911             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4912                 /* P_L0_D16x16, P_8x8 */
4913                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4914             } else {
4915                 /* P_L0_D8x16, P_L0_D16x8 */
4916                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4917             }
4918         } else {
4919             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4920         }
4921     } else if( h->slice_type_nos == FF_B_TYPE ) {
4922         const int mba_xy = h->left_mb_xy[0];
4923         const int mbb_xy = h->top_mb_xy;
4924         int ctx = 0;
4925         int bits;
4926
4927         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4928             ctx++;
4929         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4930             ctx++;
4931
4932         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4933             return 0; /* B_Direct_16x16 */
4934
4935         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4936             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4937         }
4938
4939         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4940         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4941         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4942         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4943         if( bits < 8 )
4944             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4945         else if( bits == 13 ) {
4946             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4947         } else if( bits == 14 )
4948             return 11; /* B_L1_L0_8x16 */
4949         else if( bits == 15 )
4950             return 22; /* B_8x8 */
4951
4952         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4953         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4954     } else {
4955         /* TODO SI/SP frames? */
4956         return -1;
4957     }
4958 }
4959
4960 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4961     MpegEncContext * const s = &h->s;
4962     int mba_xy, mbb_xy;
4963     int ctx = 0;
4964
4965     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4966         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4967         mba_xy = mb_xy - 1;
4968         if( (mb_y&1)
4969             && h->slice_table[mba_xy] == h->slice_num
4970             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4971             mba_xy += s->mb_stride;
4972         if( MB_FIELD ){
4973             mbb_xy = mb_xy - s->mb_stride;
4974             if( !(mb_y&1)
4975                 && h->slice_table[mbb_xy] == h->slice_num
4976                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4977                 mbb_xy -= s->mb_stride;
4978         }else
4979             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4980     }else{
4981         int mb_xy = h->mb_xy;
4982         mba_xy = mb_xy - 1;
4983         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4984     }
4985
4986     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4987         ctx++;
4988     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4989         ctx++;
4990
4991     if( h->slice_type_nos == FF_B_TYPE )
4992         ctx += 13;
4993     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4994 }
4995
4996 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4997     int mode = 0;
4998
4999     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5000         return pred_mode;
5001
5002     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5003     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5004     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5005
5006     if( mode >= pred_mode )
5007         return mode + 1;
5008     else
5009         return mode;
5010 }
5011
5012 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5013     const int mba_xy = h->left_mb_xy[0];
5014     const int mbb_xy = h->top_mb_xy;
5015
5016     int ctx = 0;
5017
5018     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5019     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5020         ctx++;
5021
5022     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5023         ctx++;
5024
5025     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5026         return 0;
5027
5028     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5029         return 1;
5030     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5031         return 2;
5032     else
5033         return 3;
5034 }
5035
5036 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5037     int cbp_b, cbp_a, ctx, cbp = 0;
5038
5039     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5040     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5041
5042     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5043     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5044     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5045     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5046     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5047     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5048     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5049     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5050     return cbp;
5051 }
5052 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5053     int ctx;
5054     int cbp_a, cbp_b;
5055
5056     cbp_a = (h->left_cbp>>4)&0x03;
5057     cbp_b = (h-> top_cbp>>4)&0x03;
5058
5059     ctx = 0;
5060     if( cbp_a > 0 ) ctx++;
5061     if( cbp_b > 0 ) ctx += 2;
5062     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5063         return 0;
5064
5065     ctx = 4;
5066     if( cbp_a == 2 ) ctx++;
5067     if( cbp_b == 2 ) ctx += 2;
5068     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5069 }
5070 static int decode_cabac_mb_dqp( H264Context *h) {
5071     int   ctx = 0;
5072     int   val = 0;
5073
5074     if( h->last_qscale_diff != 0 )
5075         ctx++;
5076
5077     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5078         if( ctx < 2 )
5079             ctx = 2;
5080         else
5081             ctx = 3;
5082         val++;
5083         if(val > 102) //prevent infinite loop
5084             return INT_MIN;
5085     }
5086
5087     if( val&0x01 )
5088         return (val + 1)/2;
5089     else
5090         return -(val + 1)/2;
5091 }
5092 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5093     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5094         return 0;   /* 8x8 */
5095     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5096         return 1;   /* 8x4 */
5097     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5098         return 2;   /* 4x8 */
5099     return 3;       /* 4x4 */
5100 }
5101 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5102     int type;
5103     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5104         return 0;   /* B_Direct_8x8 */
5105     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5106         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5107     type = 3;
5108     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5109         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5110             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5111         type += 4;
5112     }
5113     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5114     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5115     return type;
5116 }
5117
5118 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5119     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5120 }
5121
5122 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5123     int refa = h->ref_cache[list][scan8[n] - 1];
5124     int refb = h->ref_cache[list][scan8[n] - 8];
5125     int ref  = 0;
5126     int ctx  = 0;
5127
5128     if( h->slice_type_nos == FF_B_TYPE) {
5129         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5130             ctx++;
5131         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5132             ctx += 2;
5133     } else {
5134         if( refa > 0 )
5135             ctx++;
5136         if( refb > 0 )
5137             ctx += 2;
5138     }
5139
5140     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5141         ref++;
5142         if( ctx < 4 )
5143             ctx = 4;
5144         else
5145             ctx = 5;
5146         if(ref >= 32 /*h->ref_list[list]*/){
5147             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5148             return 0; //FIXME we should return -1 and check the return everywhere
5149         }
5150     }
5151     return ref;
5152 }
5153
5154 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5155     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5156                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5157     int ctxbase = (l == 0) ? 40 : 47;
5158     int ctx, mvd;
5159
5160     if( amvd < 3 )
5161         ctx = 0;
5162     else if( amvd > 32 )
5163         ctx = 2;
5164     else
5165         ctx = 1;
5166
5167     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5168         return 0;
5169
5170     mvd= 1;
5171     ctx= 3;
5172     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5173         mvd++;
5174         if( ctx < 6 )
5175             ctx++;
5176     }
5177
5178     if( mvd >= 9 ) {
5179         int k = 3;
5180         while( get_cabac_bypass( &h->cabac ) ) {
5181             mvd += 1 << k;
5182             k++;
5183             if(k>24){
5184                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5185                 return INT_MIN;
5186             }
5187         }
5188         while( k-- ) {
5189             if( get_cabac_bypass( &h->cabac ) )
5190                 mvd += 1 << k;
5191         }
5192     }
5193     return get_cabac_bypass_sign( &h->cabac, -mvd );
5194 }
5195
5196 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5197     int nza, nzb;
5198     int ctx = 0;
5199
5200     if( is_dc ) {
5201         if( cat == 0 ) {
5202             nza = h->left_cbp&0x100;
5203             nzb = h-> top_cbp&0x100;
5204         } else {
5205             nza = (h->left_cbp>>(6+idx))&0x01;
5206             nzb = (h-> top_cbp>>(6+idx))&0x01;
5207         }
5208     } else {
5209         if( cat == 4 ) {
5210             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5211             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5212         } else {
5213             assert(cat == 1 || cat == 2);
5214             nza = h->non_zero_count_cache[scan8[idx] - 1];
5215             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5216         }
5217     }
5218
5219     if( nza > 0 )
5220         ctx++;
5221
5222     if( nzb > 0 )
5223         ctx += 2;
5224
5225     return ctx + 4 * cat;
5226 }
5227
/**
 * Offset added to the last-coefficient-flag context base for each of the
 * first 63 scan positions of an 8x8 block (used by
 * decode_cabac_residual_internal()).
 * NOTE(review): declared with DECLARE_ASM_CONST, presumably so that the x86
 * assembly significance decoder can reference it as well - confirm.
 */
5228 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5229     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5230     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5231     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5232     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5233 };
5234
5235 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5236     static const int significant_coeff_flag_offset[2][6] = {
5237       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5238       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5239     };
5240     static const int last_coeff_flag_offset[2][6] = {
5241       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5242       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5243     };
5244     static const int coeff_abs_level_m1_offset[6] = {
5245         227+0, 227+10, 227+20, 227+30, 227+39, 426
5246     };
5247     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5248       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5249         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5250         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5251        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5252       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5253         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5254         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5255         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5256     };
5257     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5258      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5259      * map node ctx => cabac ctx for level=1 */
5260     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5261     /* map node ctx => cabac ctx for level>1 */
5262     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5263     static const uint8_t coeff_abs_level_transition[2][8] = {
5264     /* update node ctx after decoding a level=1 */
5265         { 1, 2, 3, 3, 4, 5, 6, 7 },
5266     /* update node ctx after decoding a level>1 */
5267         { 4, 4, 4, 4, 5, 6, 7, 7 }
5268     };
5269
5270     int index[64];
5271
5272     int av_unused last;
5273     int coeff_count = 0;
5274     int node_ctx = 0;
5275
5276     uint8_t *significant_coeff_ctx_base;
5277     uint8_t *last_coeff_ctx_base;
5278     uint8_t *abs_level_m1_ctx_base;
5279
5280 #ifndef ARCH_X86
5281 #define CABAC_ON_STACK
5282 #endif
5283 #ifdef CABAC_ON_STACK
5284 #define CC &cc
5285     CABACContext cc;
5286     cc.range     = h->cabac.range;
5287     cc.low       = h->cabac.low;
5288     cc.bytestream= h->cabac.bytestream;
5289 #else
5290 #define CC &h->cabac
5291 #endif
5292
5293
5294     /* cat: 0-> DC 16x16  n = 0
5295      *      1-> AC 16x16  n = luma4x4idx
5296      *      2-> Luma4x4   n = luma4x4idx
5297      *      3-> DC Chroma n = iCbCr
5298      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5299      *      5-> Luma8x8   n = 4 * luma8x8idx
5300      */
5301
5302     /* read coded block flag */
5303     if( is_dc || cat != 5 ) {
5304         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5305             if( !is_dc ) {
5306                 if( cat == 4 )
5307                     h->non_zero_count_cache[scan8[16+n]] = 0;
5308                 else
5309                     h->non_zero_count_cache[scan8[n]] = 0;
5310             }
5311
5312 #ifdef CABAC_ON_STACK
5313             h->cabac.range     = cc.range     ;
5314             h->cabac.low       = cc.low       ;
5315             h->cabac.bytestream= cc.bytestream;
5316 #endif
5317             return;
5318         }
5319     }
5320
5321     significant_coeff_ctx_base = h->cabac_state
5322         + significant_coeff_flag_offset[MB_FIELD][cat];
5323     last_coeff_ctx_base = h->cabac_state
5324         + last_coeff_flag_offset[MB_FIELD][cat];
5325     abs_level_m1_ctx_base = h->cabac_state
5326         + coeff_abs_level_m1_offset[cat];
5327
5328     if( !is_dc && cat == 5 ) {
5329 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5330         for(last= 0; last < coefs; last++) { \
5331             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5332             if( get_cabac( CC, sig_ctx )) { \
5333                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5334                 index[coeff_count++] = last; \
5335                 if( get_cabac( CC, last_ctx ) ) { \
5336                     last= max_coeff; \
5337                     break; \
5338                 } \
5339             } \
5340         }\
5341         if( last == max_coeff -1 ) {\
5342             index[coeff_count++] = last;\
5343         }
5344         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5345 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5346         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5347     } else {
5348         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5349 #else
5350         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5351     } else {
5352         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5353 #endif
5354     }
5355     assert(coeff_count > 0);
5356
5357     if( is_dc ) {
5358         if( cat == 0 )
5359             h->cbp_table[h->mb_xy] |= 0x100;
5360         else
5361             h->cbp_table[h->mb_xy] |= 0x40 << n;
5362     } else {
5363         if( cat == 5 )
5364             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5365         else if( cat == 4 )
5366             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5367         else {
5368             assert( cat == 1 || cat == 2 );
5369             h->non_zero_count_cache[scan8[n]] = coeff_count;
5370         }
5371     }
5372
5373     while( coeff_count-- ) {
5374         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5375
5376         int j= scantable[index[coeff_count]];
5377
5378         if( get_cabac( CC, ctx ) == 0 ) {
5379             node_ctx = coeff_abs_level_transition[0][node_ctx];
5380             if( is_dc ) {
5381                 block[j] = get_cabac_bypass_sign( CC, -1);
5382             }else{
5383                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5384             }
5385         } else {
5386             int coeff_abs = 2;
5387             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5388             node_ctx = coeff_abs_level_transition[1][node_ctx];
5389
5390             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5391                 coeff_abs++;
5392             }
5393
5394             if( coeff_abs >= 15 ) {
5395                 int j = 0;
5396                 while( get_cabac_bypass( CC ) ) {
5397                     j++;
5398                 }
5399
5400                 coeff_abs=1;
5401                 while( j-- ) {
5402                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5403                 }
5404                 coeff_abs+= 14;
5405             }
5406
5407             if( is_dc ) {
5408                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5409             }else{
5410                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5411             }
5412         }
5413     }
5414 #ifdef CABAC_ON_STACK
5415             h->cabac.range     = cc.range     ;
5416             h->cabac.low       = cc.low       ;
5417             h->cabac.bytestream= cc.bytestream;
5418 #endif
5419
5420 }
5421
/* Non-inlined specializations of decode_cabac_residual_internal() with a
 * constant is_dc argument; only built when CONFIG_SMALL is not defined. */
5422 #ifndef CONFIG_SMALL
/** Decodes a DC residual block (calls the internal decoder with is_dc = 1). */
5423 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5424     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5425 }
5426
/** Decodes a non-DC residual block (calls the internal decoder with is_dc = 0). */
5427 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5428     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5429 }
5430 #endif
5431
5432 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5433 #ifdef CONFIG_SMALL
5434     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5435 #else
5436     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5437     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5438 #endif
5439 }
5440
5441 static inline void compute_mb_neighbors(H264Context *h)
5442 {
5443     MpegEncContext * const s = &h->s;
5444     const int mb_xy  = h->mb_xy;
5445     h->top_mb_xy     = mb_xy - s->mb_stride;
5446     h->left_mb_xy[0] = mb_xy - 1;
5447     if(FRAME_MBAFF){
5448         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5449         const int top_pair_xy      = pair_xy     - s->mb_stride;
5450         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5451         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5452         const int curr_mb_frame_flag = !MB_FIELD;
5453         const int bottom = (s->mb_y & 1);
5454         if (bottom
5455                 ? !curr_mb_frame_flag // bottom macroblock
5456                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5457                 ) {
5458             h->top_mb_xy -= s->mb_stride;
5459         }
5460         if (left_mb_frame_flag != curr_mb_frame_flag) {
5461             h->left_mb_xy[0] = pair_xy - 1;
5462         }
5463     } else if (FIELD_PICTURE) {
5464         h->top_mb_xy -= s->mb_stride;
5465     }
5466     return;
5467 }
5468
5469 /**
5470  * decodes a macroblock
5471  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5472  */
5473 static int decode_mb_cabac(H264Context *h) {
5474     MpegEncContext * const s = &h->s;
5475     int mb_xy;
5476     int mb_type, partition_count, cbp = 0;
5477     int dct8x8_allowed= h->pps.transform_8x8_mode;
5478
5479     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5480
5481     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5482
5483     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5484     if( h->slice_type_nos != FF_I_TYPE ) {
5485         int skip;
5486         /* a skipped mb needs the aff flag from the following mb */
5487         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5488             predict_field_decoding_flag(h);
5489         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5490             skip = h->next_mb_skipped;
5491         else
5492             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5493         /* read skip flags */
5494         if( skip ) {
5495             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5496                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5497                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5498                 if(h->next_mb_skipped)
5499                     predict_field_decoding_flag(h);
5500                 else
5501                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5502             }
5503
5504             decode_mb_skip(h);
5505
5506             h->cbp_table[mb_xy] = 0;
5507             h->chroma_pred_mode_table[mb_xy] = 0;
5508             h->last_qscale_diff = 0;
5509
5510             return 0;
5511
5512         }
5513     }
5514     if(FRAME_MBAFF){
5515         if( (s->mb_y&1) == 0 )
5516             h->mb_mbaff =
5517             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5518     }else
5519         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5520
5521     h->prev_mb_skipped = 0;
5522
5523     compute_mb_neighbors(h);
5524     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5525         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5526         return -1;
5527     }
5528
5529     if( h->slice_type_nos == FF_B_TYPE ) {
5530         if( mb_type < 23 ){
5531             partition_count= b_mb_type_info[mb_type].partition_count;
5532             mb_type=         b_mb_type_info[mb_type].type;
5533         }else{
5534             mb_type -= 23;
5535             goto decode_intra_mb;
5536         }
5537     } else if( h->slice_type_nos == FF_P_TYPE ) {
5538         if( mb_type < 5) {
5539             partition_count= p_mb_type_info[mb_type].partition_count;
5540             mb_type=         p_mb_type_info[mb_type].type;
5541         } else {
5542             mb_type -= 5;
5543             goto decode_intra_mb;
5544         }
5545     } else {
5546         if(h->slice_type == FF_SI_TYPE && mb_type)
5547             mb_type--;
5548         assert(h->slice_type_nos == FF_I_TYPE);
5549 decode_intra_mb:
5550         partition_count = 0;
5551         cbp= i_mb_type_info[mb_type].cbp;
5552         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5553         mb_type= i_mb_type_info[mb_type].type;
5554     }
5555     if(MB_FIELD)
5556         mb_type |= MB_TYPE_INTERLACED;
5557
5558     h->slice_table[ mb_xy ]= h->slice_num;
5559
5560     if(IS_INTRA_PCM(mb_type)) {
5561         const uint8_t *ptr;
5562         unsigned int x, y;
5563
5564         // We assume these blocks are very rare so we do not optimize it.
5565         // FIXME The two following lines get the bitstream position in the cabac
5566         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5567         ptr= h->cabac.bytestream;
5568         if(h->cabac.low&0x1) ptr--;
5569         if(CABAC_BITS==16){
5570             if(h->cabac.low&0x1FF) ptr--;
5571         }
5572
5573         // The pixels are stored in the same order as levels in h->mb array.
5574         for(y=0; y<16; y++){
5575             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5576             for(x=0; x<16; x++){
5577                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5578                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5579             }
5580         }
5581         for(y=0; y<8; y++){
5582             const int index= 256 + 4*(y&3) + 32*(y>>2);
5583             for(x=0; x<8; x++){
5584                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5585                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5586             }
5587         }
5588         for(y=0; y<8; y++){
5589             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5590             for(x=0; x<8; x++){
5591                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5592                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5593             }
5594         }
5595
5596         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5597
5598         // All blocks are present
5599         h->cbp_table[mb_xy] = 0x1ef;
5600         h->chroma_pred_mode_table[mb_xy] = 0;
5601         // In deblocking, the quantizer is 0
5602         s->current_picture.qscale_table[mb_xy]= 0;
5603         // All coeffs are present
5604         memset(h->non_zero_count[mb_xy], 16, 16);
5605         s->current_picture.mb_type[mb_xy]= mb_type;
5606         h->last_qscale_diff = 0;
5607         return 0;
5608     }
5609
5610     if(MB_MBAFF){
5611         h->ref_count[0] <<= 1;
5612         h->ref_count[1] <<= 1;
5613     }
5614
5615     fill_caches(h, mb_type, 0);
5616
5617     if( IS_INTRA( mb_type ) ) {
5618         int i, pred_mode;
5619         if( IS_INTRA4x4( mb_type ) ) {
5620             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5621                 mb_type |= MB_TYPE_8x8DCT;
5622                 for( i = 0; i < 16; i+=4 ) {
5623                     int pred = pred_intra_mode( h, i );
5624                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5625                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5626                 }
5627             } else {
5628                 for( i = 0; i < 16; i++ ) {
5629                     int pred = pred_intra_mode( h, i );
5630                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5631
5632                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5633                 }
5634             }
5635             write_back_intra_pred_mode(h);
5636             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5637         } else {
5638             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5639             if( h->intra16x16_pred_mode < 0 ) return -1;
5640         }
5641         h->chroma_pred_mode_table[mb_xy] =
5642         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5643
5644         pred_mode= check_intra_pred_mode( h, pred_mode );
5645         if( pred_mode < 0 ) return -1;
5646         h->chroma_pred_mode= pred_mode;
5647     } else if( partition_count == 4 ) {
5648         int i, j, sub_partition_count[4], list, ref[2][4];
5649
5650         if( h->slice_type_nos == FF_B_TYPE ) {
5651             for( i = 0; i < 4; i++ ) {
5652                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5653                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5654                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5655             }
5656             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5657                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5658                 pred_direct_motion(h, &mb_type);
5659                 h->ref_cache[0][scan8[4]] =
5660                 h->ref_cache[1][scan8[4]] =
5661                 h->ref_cache[0][scan8[12]] =
5662                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5663                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5664                     for( i = 0; i < 4; i++ )
5665                         if( IS_DIRECT(h->sub_mb_type[i]) )
5666                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5667                 }
5668             }
5669         } else {
5670             for( i = 0; i < 4; i++ ) {
5671                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5672                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5673                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5674             }
5675         }
5676
5677         for( list = 0; list < h->list_count; list++ ) {
5678                 for( i = 0; i < 4; i++ ) {
5679                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5680                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5681                         if( h->ref_count[list] > 1 )
5682                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5683                         else
5684                             ref[list][i] = 0;
5685                     } else {
5686                         ref[list][i] = -1;
5687                     }
5688                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5689                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5690                 }
5691         }
5692
5693         if(dct8x8_allowed)
5694             dct8x8_allowed = get_dct8x8_allowed(h);
5695
5696         for(list=0; list<h->list_count; list++){
5697             for(i=0; i<4; i++){
5698                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5699                 if(IS_DIRECT(h->sub_mb_type[i])){
5700                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5701                     continue;
5702                 }
5703
5704                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5705                     const int sub_mb_type= h->sub_mb_type[i];
5706                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5707                     for(j=0; j<sub_partition_count[i]; j++){
5708                         int mpx, mpy;
5709                         int mx, my;
5710                         const int index= 4*i + block_width*j;
5711                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5712                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5713                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5714
5715                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5716                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5717                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5718
5719                         if(IS_SUB_8X8(sub_mb_type)){
5720                             mv_cache[ 1 ][0]=
5721                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5722                             mv_cache[ 1 ][1]=
5723                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5724
5725                             mvd_cache[ 1 ][0]=
5726                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5727                             mvd_cache[ 1 ][1]=
5728                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5729                         }else if(IS_SUB_8X4(sub_mb_type)){
5730                             mv_cache[ 1 ][0]= mx;
5731                             mv_cache[ 1 ][1]= my;
5732
5733                             mvd_cache[ 1 ][0]= mx - mpx;
5734                             mvd_cache[ 1 ][1]= my - mpy;
5735                         }else if(IS_SUB_4X8(sub_mb_type)){
5736                             mv_cache[ 8 ][0]= mx;
5737                             mv_cache[ 8 ][1]= my;
5738
5739                             mvd_cache[ 8 ][0]= mx - mpx;
5740                             mvd_cache[ 8 ][1]= my - mpy;
5741                         }
5742                         mv_cache[ 0 ][0]= mx;
5743                         mv_cache[ 0 ][1]= my;
5744
5745                         mvd_cache[ 0 ][0]= mx - mpx;
5746                         mvd_cache[ 0 ][1]= my - mpy;
5747                     }
5748                 }else{
5749                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5750                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5751                     p[0] = p[1] = p[8] = p[9] = 0;
5752                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5753                 }
5754             }
5755         }
5756     } else if( IS_DIRECT(mb_type) ) {
5757         pred_direct_motion(h, &mb_type);
5758         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5759         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5760         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5761     } else {
5762         int list, mx, my, i, mpx, mpy;
5763         if(IS_16X16(mb_type)){
5764             for(list=0; list<h->list_count; list++){
5765                 if(IS_DIR(mb_type, 0, list)){
5766                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5767                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5768                 }else
5769                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5770             }
5771             for(list=0; list<h->list_count; list++){
5772                 if(IS_DIR(mb_type, 0, list)){
5773                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5774
5775                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5776                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5777                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5778
5779                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5780                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5781                 }else
5782                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5783             }
5784         }
5785         else if(IS_16X8(mb_type)){
5786             for(list=0; list<h->list_count; list++){
5787                     for(i=0; i<2; i++){
5788                         if(IS_DIR(mb_type, i, list)){
5789                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5790                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5791                         }else
5792                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5793                     }
5794             }
5795             for(list=0; list<h->list_count; list++){
5796                 for(i=0; i<2; i++){
5797                     if(IS_DIR(mb_type, i, list)){
5798                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5799                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5800                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5801                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5802
5803                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5804                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5805                     }else{
5806                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5807                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5808                     }
5809                 }
5810             }
5811         }else{
5812             assert(IS_8X16(mb_type));
5813             for(list=0; list<h->list_count; list++){
5814                     for(i=0; i<2; i++){
5815                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5816                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5817                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5818                         }else
5819                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5820                     }
5821             }
5822             for(list=0; list<h->list_count; list++){
5823                 for(i=0; i<2; i++){
5824                     if(IS_DIR(mb_type, i, list)){
5825                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5826                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5827                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5828
5829                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5830                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5831                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5832                     }else{
5833                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5834                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5835                     }
5836                 }
5837             }
5838         }
5839     }
5840
5841    if( IS_INTER( mb_type ) ) {
5842         h->chroma_pred_mode_table[mb_xy] = 0;
5843         write_back_motion( h, mb_type );
5844    }
5845
5846     if( !IS_INTRA16x16( mb_type ) ) {
5847         cbp  = decode_cabac_mb_cbp_luma( h );
5848         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5849     }
5850
5851     h->cbp_table[mb_xy] = h->cbp = cbp;
5852
5853     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5854         if( decode_cabac_mb_transform_size( h ) )
5855             mb_type |= MB_TYPE_8x8DCT;
5856     }
5857     s->current_picture.mb_type[mb_xy]= mb_type;
5858
5859     if( cbp || IS_INTRA16x16( mb_type ) ) {
5860         const uint8_t *scan, *scan8x8, *dc_scan;
5861         const uint32_t *qmul;
5862         int dqp;
5863
5864         if(IS_INTERLACED(mb_type)){
5865             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5866             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5867             dc_scan= luma_dc_field_scan;
5868         }else{
5869             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5870             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5871             dc_scan= luma_dc_zigzag_scan;
5872         }
5873
5874         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5875         if( dqp == INT_MIN ){
5876             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5877             return -1;
5878         }
5879         s->qscale += dqp;
5880         if(((unsigned)s->qscale) > 51){
5881             if(s->qscale<0) s->qscale+= 52;
5882             else            s->qscale-= 52;
5883         }
5884         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5885         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5886
5887         if( IS_INTRA16x16( mb_type ) ) {
5888             int i;
5889             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5890             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5891
5892             if( cbp&15 ) {
5893                 qmul = h->dequant4_coeff[0][s->qscale];
5894                 for( i = 0; i < 16; i++ ) {
5895                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5896                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5897                 }
5898             } else {
5899                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5900             }
5901         } else {
5902             int i8x8, i4x4;
5903             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5904                 if( cbp & (1<<i8x8) ) {
5905                     if( IS_8x8DCT(mb_type) ) {
5906                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5907                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5908                     } else {
5909                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5910                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5911                             const int index = 4*i8x8 + i4x4;
5912                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5913 //START_TIMER
5914                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5915 //STOP_TIMER("decode_residual")
5916                         }
5917                     }
5918                 } else {
5919                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5920                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5921                 }
5922             }
5923         }
5924
5925         if( cbp&0x30 ){
5926             int c;
5927             for( c = 0; c < 2; c++ ) {
5928                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5929                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5930             }
5931         }
5932
5933         if( cbp&0x20 ) {
5934             int c, i;
5935             for( c = 0; c < 2; c++ ) {
5936                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5937                 for( i = 0; i < 4; i++ ) {
5938                     const int index = 16 + 4 * c + i;
5939                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5940                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5941                 }
5942             }
5943         } else {
5944             uint8_t * const nnz= &h->non_zero_count_cache[0];
5945             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5946             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5947         }
5948     } else {
5949         uint8_t * const nnz= &h->non_zero_count_cache[0];
5950         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5951         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5952         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5953         h->last_qscale_diff = 0;
5954     }
5955
5956     s->current_picture.qscale_table[mb_xy]= s->qscale;
5957     write_back_non_zero_count(h);
5958
5959     if(MB_MBAFF){
5960         h->ref_count[0] >>= 1;
5961         h->ref_count[1] >>= 1;
5962     }
5963
5964     return 0;
5965 }
5966
5967
/**
 * Deblocks one vertical luma edge over 16 rows.
 *
 * The alpha and beta thresholds are looked up in alpha_table / beta_table
 * from qp plus the slice offsets stored in the context.
 *
 * @param h      decoder context; supplies slice_alpha_c0_offset,
 *               slice_beta_offset and the DSP function pointers
 * @param pix    first sample right of the edge in row 0 (q0); in each row
 *               pix[-4]..pix[3] may be read and pix[-3]..pix[2] written
 * @param stride byte distance between consecutive rows
 * @param bS     four boundary strengths; bS[0] selects the code path
 * @param qp     quantiser used to index the threshold tables
 */
5968 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5969     int i, d;
5970     const int index_a = qp + h->slice_alpha_c0_offset;
5971     const int alpha = (alpha_table+52)[index_a];
5972     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5973
5974     if( bS[0] < 4 ) {
             /* Normal filtering: hand one tc0 value per bS entry to the DSP
              * routine. A zero bS is passed as -1; presumably the DSP code
              * treats a negative tc as "skip this segment" (confirm in dsputil). */
5975         int8_t tc[4];
5976         for(i=0; i<4; i++)
5977             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5978         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5979     } else {
5980         /* 16px edge length, because bS=4 is triggered by being at
5981          * the edge of an intra MB, so all 4 bS are the same */
5982             for( d = 0; d < 16; d++ ) {
5983                 const int p0 = pix[-1];
5984                 const int p1 = pix[-2];
5985                 const int p2 = pix[-3];
5986
5987                 const int q0 = pix[0];
5988                 const int q1 = pix[1];
5989                 const int q2 = pix[2];
5990
                     /* Only filter rows whose step across the edge is below the thresholds. */
5991                 if( FFABS( p0 - q0 ) < alpha &&
5992                     FFABS( p1 - p0 ) < beta &&
5993                     FFABS( q1 - q0 ) < beta ) {
5994
                         /* Small step: each side may get the three-sample filter;
                          * otherwise only p0 and q0 are replaced. */
5995                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5996                         if( FFABS( p2 - p0 ) < beta)
5997                         {
5998                             const int p3 = pix[-4];
5999                             /* p0', p1', p2' */
6000                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6001                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6002                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6003                         } else {
6004                             /* p0' */
6005                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6006                         }
6007                         if( FFABS( q2 - q0 ) < beta)
6008                         {
6009                             const int q3 = pix[3];
6010                             /* q0', q1', q2' */
6011                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6012                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6013                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6014                         } else {
6015                             /* q0' */
6016                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6017                         }
6018                     }else{
6019                         /* p0', q0' */
6020                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6021                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6022                     }
                         /* i is only assigned on the bS < 4 path, so it must not be
                          * read here; trace the 4-row segment index d >> 2 instead. */
6023                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", d >> 2, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6024                 }
6025                 pix += stride;
6026             }
6027     }
6028 }
/**
 * Deblocks one vertical chroma edge by dispatching to the DSP routines.
 *
 * alpha and beta come from alpha_table / beta_table, indexed by qp plus the
 * slice offsets. When bS[0] is 4 or more the intra chroma routine is used;
 * otherwise a tc value per bS entry is built (tc0 + 1, or 0 for a zero bS)
 * and passed to the regular chroma routine.
 *
 * @param h      decoder context (slice offsets, DSP function pointers)
 * @param pix    sample pointer handed unchanged to the DSP routine
 * @param stride line stride handed unchanged to the DSP routine
 * @param bS     four boundary strengths
 * @param qp     quantiser used to index the threshold tables
 */
6029 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6030     const int idx_alpha = qp + h->slice_alpha_c0_offset;
6031     const int alpha = (alpha_table+52)[idx_alpha];
6032     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6033     int8_t tc[4];
6034     int seg;
6035
6036     if( bS[0] >= 4 ) {
6037         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6038         return;
6039     }
6040     for( seg = 0; seg < 4; seg++ )
6041         tc[seg] = bS[seg] ? (tc0_table+52)[idx_alpha][bS[seg] - 1] + 1 : 0;
6042     h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6043 }
6044
/**
 * Deblocks one vertical luma edge row by row (16 rows), choosing the
 * boundary strength and quantiser separately for each row.
 *
 * Row i uses bS[i >> 1] when MB_FIELD is set, otherwise
 * bS[((i >> 1) & ~1) | (i & 1)]; it uses qp[i >> 3] when MB_FIELD is set,
 * otherwise qp[i & 1]. MB_FIELD is defined outside this function
 * (presumably "current macroblock is field coded" -- confirm).
 *
 * @param h      decoder context (slice offsets, logging context)
 * @param pix    first sample right of the edge in row 0; in each row
 *               pix[-4]..pix[3] may be read and pix[-3]..pix[2] written
 * @param stride byte distance between consecutive rows
 * @param bS     eight boundary strengths
 * @param qp     two quantisers used to index the threshold tables
 */
6045 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6046     int i;
6047     for( i = 0; i < 16; i++, pix += stride) {
6048         int index_a;
6049         int alpha;
6050         int beta;
6051
6052         int qp_index;
6053         int bS_index = (i >> 1);
             /* Without MB_FIELD, replace the low bit with the row parity so that
              * even and odd rows pick alternating bS entries. */
6054         if (!MB_FIELD) {
6055             bS_index &= ~1;
6056             bS_index |= (i & 1);
6057         }
6058
             /* Strength 0: this row is left untouched. */
6059         if( bS[bS_index] == 0 ) {
6060             continue;
6061         }
6062
6063         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6064         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6065         alpha = (alpha_table+52)[index_a];
6066         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6067
6068         if( bS[bS_index] < 4 ) {
                 /* Clipped filter: corrections are limited by tc0 from the table. */
6069             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6070             const int p0 = pix[-1];
6071             const int p1 = pix[-2];
6072             const int p2 = pix[-3];
6073             const int q0 = pix[0];
6074             const int q1 = pix[1];
6075             const int q2 = pix[2];
6076
6077             if( FFABS( p0 - q0 ) < alpha &&
6078                 FFABS( p1 - p0 ) < beta &&
6079                 FFABS( q1 - q0 ) < beta ) {
6080                 int tc = tc0;
6081                 int i_delta;
6082
                     /* Each side that also gets p1/q1 corrected widens the clip
                      * range used for p0/q0 by one. */
6083                 if( FFABS( p2 - p0 ) < beta ) {
6084                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6085                     tc++;
6086                 }
6087                 if( FFABS( q2 - q0 ) < beta ) {
6088                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6089                     tc++;
6090                 }
6091
6092                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6093                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6094                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* Note: the "after" columns print the saved p1/q1, not the
                      * possibly updated pix[-2]/pix[1]. */
6095                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6096             }
6097         }else{
                 /* bS >= 4: unclipped filter, same arithmetic as the bS >= 4
                  * branch of filter_mb_edgev. */
6098             const int p0 = pix[-1];
6099             const int p1 = pix[-2];
6100             const int p2 = pix[-3];
6101
6102             const int q0 = pix[0];
6103             const int q1 = pix[1];
6104             const int q2 = pix[2];
6105
6106             if( FFABS( p0 - q0 ) < alpha &&
6107                 FFABS( p1 - p0 ) < beta &&
6108                 FFABS( q1 - q0 ) < beta ) {
6109
6110                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6111                     if( FFABS( p2 - p0 ) < beta)
6112                     {
6113                         const int p3 = pix[-4];
6114                         /* p0', p1', p2' */
6115                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6116                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6117                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6118                     } else {
6119                         /* p0' */
6120                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6121                     }
6122                     if( FFABS( q2 - q0 ) < beta)
6123                     {
6124                         const int q3 = pix[3];
6125                         /* q0', q1', q2' */
6126                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6127                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6128                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6129                     } else {
6130                         /* q0' */
6131                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6132                     }
6133                 }else{
6134                     /* p0', q0' */
6135                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6136                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6137                 }
6138                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6139             }
6140         }
6141     }
6142 }
/**
 * Deblocks one vertical chroma edge row by row (8 rows), choosing the
 * boundary strength and quantiser separately for each row.
 *
 * Row i uses bS[i]; it uses qp[i >> 2] when MB_FIELD is set, otherwise
 * qp[i & 1]. Only the two samples next to the edge (pix[-1], pix[0]) are
 * modified. MB_FIELD is defined outside this function.
 *
 * @param h      decoder context (slice offsets, logging context)
 * @param pix    first sample right of the edge in row 0
 * @param stride byte distance between consecutive rows
 * @param bS     eight boundary strengths, one per row
 * @param qp     two quantisers used to index the threshold tables
 */
6143 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6144     int i;
6145     for( i = 0; i < 8; i++, pix += stride) {
6146         int index_a;
6147         int alpha;
6148         int beta;
6149
6150         int qp_index;
6151         int bS_index = i;
6152
             /* Strength 0: this row is left untouched. */
6153         if( bS[bS_index] == 0 ) {
6154             continue;
6155         }
6156
6157         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6158         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6159         alpha = (alpha_table+52)[index_a];
6160         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6161
6162         if( bS[bS_index] < 4 ) {
                 /* Clipped filter; the chroma clip range is the table tc0 plus one. */
6163             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6164             const int p0 = pix[-1];
6165             const int p1 = pix[-2];
6166             const int q0 = pix[0];
6167             const int q1 = pix[1];
6168
6169             if( FFABS( p0 - q0 ) < alpha &&
6170                 FFABS( p1 - p0 ) < beta &&
6171                 FFABS( q1 - q0 ) < beta ) {
6172                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6173
6174                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6175                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* The trace also reads pix[-3] and pix[2], which the filter
                      * itself does not use. */
6176                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6177             }
6178         }else{
                 /* bS >= 4: unclipped replacement of p0 and q0 only. */
6179             const int p0 = pix[-1];
6180             const int p1 = pix[-2];
6181             const int q0 = pix[0];
6182             const int q1 = pix[1];
6183
6184             if( FFABS( p0 - q0 ) < alpha &&
6185                 FFABS( p1 - p0 ) < beta &&
6186                 FFABS( q1 - q0 ) < beta ) {
6187
6188                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6189                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6190                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6191             }
6192         }
6193     }
6194 }
6195
/**
 * Deblocks one horizontal luma edge over 16 columns.
 *
 * The alpha and beta thresholds are looked up in alpha_table / beta_table
 * from qp plus the slice offsets stored in the context.
 *
 * @param h      decoder context; supplies slice_alpha_c0_offset,
 *               slice_beta_offset and the DSP function pointers
 * @param pix    first sample below the edge in column 0 (q0); in each column
 *               the samples from pix[-4*stride] to pix[3*stride] may be read
 *               and those from pix[-3*stride] to pix[2*stride] written
 * @param stride byte distance between consecutive rows
 * @param bS     four boundary strengths; bS[0] selects the code path
 * @param qp     quantiser used to index the threshold tables
 */
6196 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6197     int i, d;
6198     const int index_a = qp + h->slice_alpha_c0_offset;
6199     const int alpha = (alpha_table+52)[index_a];
6200     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6201     const int pix_next  = stride;
6202
6203     if( bS[0] < 4 ) {
             /* Normal filtering: hand one tc0 value per bS entry to the DSP
              * routine. A zero bS is passed as -1; presumably the DSP code
              * treats a negative tc as "skip this segment" (confirm in dsputil). */
6204         int8_t tc[4];
6205         for(i=0; i<4; i++)
6206             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6207         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6208     } else {
6209         /* 16px edge length, see filter_mb_edgev */
6210             for( d = 0; d < 16; d++ ) {
6211                 const int p0 = pix[-1*pix_next];
6212                 const int p1 = pix[-2*pix_next];
6213                 const int p2 = pix[-3*pix_next];
6214                 const int q0 = pix[0];
6215                 const int q1 = pix[1*pix_next];
6216                 const int q2 = pix[2*pix_next];
6217
                     /* Only filter columns whose step across the edge is below the thresholds. */
6218                 if( FFABS( p0 - q0 ) < alpha &&
6219                     FFABS( p1 - p0 ) < beta &&
6220                     FFABS( q1 - q0 ) < beta ) {
6221
6222                     const int p3 = pix[-4*pix_next];
6223                     const int q3 = pix[ 3*pix_next];
6224
                         /* Small step: each side may get the three-sample filter;
                          * otherwise only p0 and q0 are replaced. */
6225                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6226                         if( FFABS( p2 - p0 ) < beta) {
6227                             /* p0', p1', p2' */
6228                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6229                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6230                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6231                         } else {
6232                             /* p0' */
6233                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6234                         }
6235                         if( FFABS( q2 - q0 ) < beta) {
6236                             /* q0', q1', q2' */
6237                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6238                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6239                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6240                         } else {
6241                             /* q0' */
6242                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6243                         }
6244                     }else{
6245                         /* p0', q0' */
6246                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6247                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6248                     }
                         /* i is only assigned on the bS < 4 path, so neither i nor
                          * bS[i] may be read here; use the 4-column segment index
                          * d >> 2 (always 0..3) instead. */
6249                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", d >> 2, d, qp, index_a, alpha, beta, bS[d >> 2], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6250                 }
6251                 pix++;
6252             }
6253     }
6254 }
6255
/**
 * Deblocks one horizontal chroma edge by dispatching to the DSP routines.
 *
 * alpha and beta come from alpha_table / beta_table, indexed by qp plus the
 * slice offsets. When bS[0] is 4 or more the intra chroma routine is used;
 * otherwise a tc value per bS entry is built (tc0 + 1, or 0 for a zero bS)
 * and passed to the regular chroma routine.
 *
 * @param h      decoder context (slice offsets, DSP function pointers)
 * @param pix    sample pointer handed unchanged to the DSP routine
 * @param stride line stride handed unchanged to the DSP routine
 * @param bS     four boundary strengths
 * @param qp     quantiser used to index the threshold tables
 */
6256 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6257     const int idx_alpha = qp + h->slice_alpha_c0_offset;
6258     const int alpha = (alpha_table+52)[idx_alpha];
6259     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6260     int8_t tc[4];
6261     int seg;
6262
6263     if( bS[0] >= 4 ) {
6264         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6265         return;
6266     }
6267     for( seg = 0; seg < 4; seg++ )
6268         tc[seg] = bS[seg] ? (tc0_table+52)[idx_alpha][bS[seg] - 1] + 1 : 0;
6269     h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6270 }
6271
/**
 * Fast-path deblocking of one macroblock.
 *
 * As written, the guard at the top contains a literal `1 ||` term, so it is
 * always true: every call is forwarded to filter_mb() and the code after the
 * guard is never reached.
 * NOTE(review): this looks like a deliberate switch-off of the fast path;
 * find out why before removing the term.
 *
 * The unreachable remainder would filter the four vertical and four
 * horizontal luma edges (two each with the 8x8 transform) plus the chroma
 * edges at even edge indices, using the left/top neighbour-averaged qp on the
 * macroblock border and the macroblock's own qp inside.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6272 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6273     MpegEncContext * const s = &h->s;
6274     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6275     int mb_xy, mb_type;
6276     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6277
6278     mb_xy = h->mb_xy;
6279
         /* Fall back to the generic filter. The bare `1 ||` below forces this
          * branch unconditionally; the other terms are the cases the fast path
          * does not handle (left column, first row, no DSP strength routine,
          * chroma_qp_diff set, slice boundary with deblocking_filter == 2). */
6280     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6281 1 ||
6282        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6283                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6284         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6285         return;
6286     }
6287     assert(!FRAME_MBAFF);
6288
         /* qp0/qpc0: rounded average with the left neighbour; qp1/qpc1: with the
          * top neighbour. These are used for the macroblock's outer edges. */
6289     mb_type = s->current_picture.mb_type[mb_xy];
6290     qp = s->current_picture.qscale_table[mb_xy];
6291     qp0 = s->current_picture.qscale_table[mb_xy-1];
6292     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6293     qpc = get_chroma_qp( h, 0, qp );
6294     qpc0 = get_chroma_qp( h, 0, qp0 );
6295     qpc1 = get_chroma_qp( h, 0, qp1 );
6296     qp0 = (qp + qp0 + 1) >> 1;
6297     qp1 = (qp + qp1 + 1) >> 1;
6298     qpc0 = (qpc + qpc0 + 1) >> 1;
6299     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Early out when every quantiser involved is at or below the threshold. */
6300     qp_thresh = 15 - h->slice_alpha_c0_offset;
6301     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6302        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6303         return;
6304
6305     if( IS_INTRA(mb_type) ) {
             /* Intra macroblock: fixed strengths. Left edge 4, inner edges 3,
              * top edge 3 in field pictures and 4 otherwise. */
6306         int16_t bS4[4] = {4,4,4,4};
6307         int16_t bS3[4] = {3,3,3,3};
6308         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6309         if( IS_8x8DCT(mb_type) ) {
6310             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6311             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6312             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6313             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6314         } else {
6315             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6316             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6317             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6318             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6319             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6320             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6321             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6322             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6323         }
6324         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6325         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6326         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6327         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6328         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6329         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6330         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6331         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6332         return;
6333     } else {
             /* bS[dir][edge][4]: dir 0 = vertical edges, dir 1 = horizontal edges.
              * bSv views each group of four int16 strengths as one uint64_t so a
              * whole edge can be set or tested for non-zero in one access. */
6334         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6335         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6336         int edges;
6337         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* Strength 2 on edges 0 and 2 in both directions; edges 1 and 3
                  * are not written here and are not used by the 8x8 path below. */
6338             edges = 4;
6339             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6340         } else {
6341             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6342                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6343             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6344                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6345                              ? 3 : 0;
6346             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6347             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6348             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6349                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6350         }
             /* An intra neighbour overrides the strength of the shared outer edge. */
6351         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6352             bSv[0][0] = 0x0004000400040004ULL;
6353         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6354             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6355
             /* FILTER(hv,dir,edge): if any strength of bS[dir][edge] is non-zero,
              * filter that luma edge and, for even edge indices, the matching
              * edge of both chroma planes. Edge 0 uses the neighbour-averaged
              * quantiser (qp##dir, qpc##dir), inner edges the macroblock's own. */
6356 #define FILTER(hv,dir,edge)\
6357         if(bSv[dir][edge]) {\
6358             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6359             if(!(edge&1)) {\
6360                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6361                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6362             }\
6363         }
6364         if( edges == 1 ) {
6365             FILTER(v,0,0);
6366             FILTER(h,1,0);
6367         } else if( IS_8x8DCT(mb_type) ) {
6368             FILTER(v,0,0);
6369             FILTER(v,0,2);
6370             FILTER(h,1,0);
6371             FILTER(h,1,2);
6372         } else {
6373             FILTER(v,0,0);
6374             FILTER(v,0,1);
6375             FILTER(v,0,2);
6376             FILTER(v,0,3);
6377             FILTER(h,1,0);
6378             FILTER(h,1,1);
6379             FILTER(h,1,2);
6380             FILTER(h,1,3);
6381         }
6382 #undef FILTER
6383     }
6384 }
6385
6386 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6387     MpegEncContext * const s = &h->s;
6388     const int mb_xy= mb_x + mb_y*s->mb_stride;
6389     const int mb_type = s->current_picture.mb_type[mb_xy];
6390     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6391     int first_vertical_edge_done = 0;
6392     int dir;
6393
6394     //for sufficiently low qp, filtering wouldn't do anything
6395     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6396     if(!FRAME_MBAFF){
6397         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6398         int qp = s->current_picture.qscale_table[mb_xy];
6399         if(qp <= qp_thresh
6400            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6401            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6402             return;
6403         }
6404     }
6405
6406     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6407     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6408         int top_type, left_type[2];
6409         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6410         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6411         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6412
6413         if(IS_8x8DCT(top_type)){
6414             h->non_zero_count_cache[4+8*0]=
6415             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6416             h->non_zero_count_cache[6+8*0]=
6417             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6418         }
6419         if(IS_8x8DCT(left_type[0])){
6420             h->non_zero_count_cache[3+8*1]=
6421             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6422         }
6423         if(IS_8x8DCT(left_type[1])){
6424             h->non_zero_count_cache[3+8*3]=
6425             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6426         }
6427
6428         if(IS_8x8DCT(mb_type)){
6429             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6430             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6431
6432             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6433             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6434
6435             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6436             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6437
6438             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6439             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6440         }
6441     }
6442
6443     if (FRAME_MBAFF
6444             // left mb is in picture
6445             && h->slice_table[mb_xy-1] != 255
6446             // and current and left pair do not have the same interlaced type
6447             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6448             // and left mb is in the same slice if deblocking_filter == 2
6449             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6450         /* First vertical edge is different in MBAFF frames
6451          * There are 8 different bS to compute and 2 different Qp
6452          */
6453         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6454         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6455         int16_t bS[8];
6456         int qp[2];
6457         int bqp[2];
6458         int rqp[2];
6459         int mb_qp, mbn0_qp, mbn1_qp;
6460         int i;
6461         first_vertical_edge_done = 1;
6462
6463         if( IS_INTRA(mb_type) )
6464             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6465         else {
6466             for( i = 0; i < 8; i++ ) {
6467                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6468
6469                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6470                     bS[i] = 4;
6471                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6472                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6473                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6474                     bS[i] = 2;
6475                 else
6476                     bS[i] = 1;
6477             }
6478         }
6479
6480         mb_qp = s->current_picture.qscale_table[mb_xy];
6481         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6482         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6483         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6484         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6485                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6486         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6487                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6488         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6489         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6490                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6491         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6492                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6493
6494         /* Filter edge */
6495         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6496         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6497         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6498         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6499         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6500     }
6501     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6502     for( dir = 0; dir < 2; dir++ )
6503     {
6504         int edge;
6505         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6506         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6507         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6508         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6509         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6510
6511         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6512                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6513         // how often to recheck mv-based bS when iterating between edges
6514         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6515                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6516         // how often to recheck mv-based bS when iterating along each edge
6517         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6518
6519         if (first_vertical_edge_done) {
6520             start = 1;
6521             first_vertical_edge_done = 0;
6522         }
6523
6524         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6525             start = 1;
6526
6527         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6528             && !IS_INTERLACED(mb_type)
6529             && IS_INTERLACED(mbm_type)
6530             ) {
6531             // This is a special case in the norm where the filtering must
6532             // be done twice (one each of the field) even if we are in a
6533             // frame macroblock.
6534             //
6535             static const int nnz_idx[4] = {4,5,6,3};
6536             unsigned int tmp_linesize   = 2 *   linesize;
6537             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6538             int mbn_xy = mb_xy - 2 * s->mb_stride;
6539             int qp;
6540             int i, j;
6541             int16_t bS[4];
6542
6543             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6544                 if( IS_INTRA(mb_type) ||
6545                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6546                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6547                 } else {
6548                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6549                     for( i = 0; i < 4; i++ ) {
6550                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6551                             mbn_nnz[nnz_idx[i]] != 0 )
6552                             bS[i] = 2;
6553                         else
6554                             bS[i] = 1;
6555                     }
6556                 }
6557                 // Do not use s->qscale as luma quantizer because it has not the same
6558                 // value in IPCM macroblocks.
6559                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6560                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6561                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6562                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6563                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6564                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6565                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6566                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6567             }
6568
6569             start = 1;
6570         }
6571
6572         /* Calculate bS */
6573         for( edge = start; edge < edges; edge++ ) {
6574             /* mbn_xy: neighbor macroblock */
6575             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6576             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6577             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6578             int16_t bS[4];
6579             int qp;
6580
6581             if( (edge&1) && IS_8x8DCT(mb_type) )
6582                 continue;
6583
6584             if( IS_INTRA(mb_type) ||
6585                 IS_INTRA(mbn_type) ) {
6586                 int value;
6587                 if (edge == 0) {
6588                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6589                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6590                     ) {
6591                         value = 4;
6592                     } else {
6593                         value = 3;
6594                     }
6595                 } else {
6596                     value = 3;
6597                 }
6598                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6599             } else {
6600                 int i, l;
6601                 int mv_done;
6602
6603                 if( edge & mask_edge ) {
6604                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6605                     mv_done = 1;
6606                 }
6607                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6608                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6609                     mv_done = 1;
6610                 }
6611                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6612                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6613                     int bn_idx= b_idx - (dir ? 8:1);
6614                     int v = 0;
6615
6616                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6617                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6618                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6619                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6620                     }
6621
6622                     if(h->slice_type_nos == FF_B_TYPE && v){
6623                         v=0;
6624                         for( l = 0; !v && l < 2; l++ ) {
6625                             int ln= 1-l;
6626                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6627                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6628                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6629                         }
6630                     }
6631
6632                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6633                     mv_done = 1;
6634                 }
6635                 else
6636                     mv_done = 0;
6637
6638                 for( i = 0; i < 4; i++ ) {
6639                     int x = dir == 0 ? edge : i;
6640                     int y = dir == 0 ? i    : edge;
6641                     int b_idx= 8 + 4 + x + 8*y;
6642                     int bn_idx= b_idx - (dir ? 8:1);
6643
6644                     if( h->non_zero_count_cache[b_idx] != 0 ||
6645                         h->non_zero_count_cache[bn_idx] != 0 ) {
6646                         bS[i] = 2;
6647                     }
6648                     else if(!mv_done)
6649                     {
6650                         bS[i] = 0;
6651                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6652                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6653                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6654                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6655                                 bS[i] = 1;
6656                                 break;
6657                             }
6658                         }
6659
6660                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6661                             bS[i] = 0;
6662                             for( l = 0; l < 2; l++ ) {
6663                                 int ln= 1-l;
6664                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6665                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6666                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6667                                     bS[i] = 1;
6668                                     break;
6669                                 }
6670                             }
6671                         }
6672                     }
6673                 }
6674
6675                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6676                     continue;
6677             }
6678
6679             /* Filter edge */
6680             // Do not use s->qscale as luma quantizer because it has not the same
6681             // value in IPCM macroblocks.
6682             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6683             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6684             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6685             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6686             if( dir == 0 ) {
6687                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6688                 if( (edge&1) == 0 ) {
6689                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6690                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6691                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6692                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6693                 }
6694             } else {
6695                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6696                 if( (edge&1) == 0 ) {
6697                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6698                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6699                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6700                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6701                 }
6702             }
6703         }
6704     }
6705 }
6706
6707 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6708     MpegEncContext * const s = &h->s;
6709     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6710
6711     s->mb_skip_run= -1;
6712
6713     if( h->pps.cabac ) {
6714         int i;
6715
6716         /* realign */
6717         align_get_bits( &s->gb );
6718
6719         /* init cabac */
6720         ff_init_cabac_states( &h->cabac);
6721         ff_init_cabac_decoder( &h->cabac,
6722                                s->gb.buffer + get_bits_count(&s->gb)/8,
6723                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6724         /* calculate pre-state */
6725         for( i= 0; i < 460; i++ ) {
6726             int pre;
6727             if( h->slice_type_nos == FF_I_TYPE )
6728                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6729             else
6730                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6731
6732             if( pre <= 63 )
6733                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6734             else
6735                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6736         }
6737
6738         for(;;){
6739 //START_TIMER
6740             int ret = decode_mb_cabac(h);
6741             int eos;
6742 //STOP_TIMER("decode_mb_cabac")
6743
6744             if(ret>=0) hl_decode_mb(h);
6745
6746             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6747                 s->mb_y++;
6748
6749                 if(ret>=0) ret = decode_mb_cabac(h);
6750
6751                 if(ret>=0) hl_decode_mb(h);
6752                 s->mb_y--;
6753             }
6754             eos = get_cabac_terminate( &h->cabac );
6755
6756             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6757                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6758                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6759                 return -1;
6760             }
6761
6762             if( ++s->mb_x >= s->mb_width ) {
6763                 s->mb_x = 0;
6764                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6765                 ++s->mb_y;
6766                 if(FIELD_OR_MBAFF_PICTURE) {
6767                     ++s->mb_y;
6768                 }
6769             }
6770
6771             if( eos || s->mb_y >= s->mb_height ) {
6772                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6773                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6774                 return 0;
6775             }
6776         }
6777
6778     } else {
6779         for(;;){
6780             int ret = decode_mb_cavlc(h);
6781
6782             if(ret>=0) hl_decode_mb(h);
6783
6784             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6785                 s->mb_y++;
6786                 ret = decode_mb_cavlc(h);
6787
6788                 if(ret>=0) hl_decode_mb(h);
6789                 s->mb_y--;
6790             }
6791
6792             if(ret<0){
6793                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6794                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6795
6796                 return -1;
6797             }
6798
6799             if(++s->mb_x >= s->mb_width){
6800                 s->mb_x=0;
6801                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6802                 ++s->mb_y;
6803                 if(FIELD_OR_MBAFF_PICTURE) {
6804                     ++s->mb_y;
6805                 }
6806                 if(s->mb_y >= s->mb_height){
6807                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6808
6809                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6810                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6811
6812                         return 0;
6813                     }else{
6814                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6815
6816                         return -1;
6817                     }
6818                 }
6819             }
6820
6821             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6822                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6823                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6824                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6825
6826                     return 0;
6827                 }else{
6828                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6829
6830                     return -1;
6831                 }
6832             }
6833         }
6834     }
6835
6836 #if 0
6837     for(;s->mb_y < s->mb_height; s->mb_y++){
6838         for(;s->mb_x < s->mb_width; s->mb_x++){
6839             int ret= decode_mb(h);
6840
6841             hl_decode_mb(h);
6842
6843             if(ret<0){
6844                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6845                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6846
6847                 return -1;
6848             }
6849
6850             if(++s->mb_x >= s->mb_width){
6851                 s->mb_x=0;
6852                 if(++s->mb_y >= s->mb_height){
6853                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6854                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6855
6856                         return 0;
6857                     }else{
6858                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6859
6860                         return -1;
6861                     }
6862                 }
6863             }
6864
6865             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6866                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6867                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6868
6869                     return 0;
6870                 }else{
6871                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6872
6873                     return -1;
6874                 }
6875             }
6876         }
6877         s->mb_x=0;
6878         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6879     }
6880 #endif
6881     return -1; //not reached
6882 }
6883
6884 static int decode_unregistered_user_data(H264Context *h, int size){
6885     MpegEncContext * const s = &h->s;
6886     uint8_t user_data[16+256];
6887     int e, build, i;
6888
6889     if(size<16)
6890         return -1;
6891
6892     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6893         user_data[i]= get_bits(&s->gb, 8);
6894     }
6895
6896     user_data[i]= 0;
6897     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6898     if(e==1 && build>=0)
6899         h->x264_build= build;
6900
6901     if(s->avctx->debug & FF_DEBUG_BUGS)
6902         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6903
6904     for(; i<size; i++)
6905         skip_bits(&s->gb, 8);
6906
6907     return 0;
6908 }
6909
6910 static int decode_sei(H264Context *h){
6911     MpegEncContext * const s = &h->s;
6912
6913     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6914         int size, type;
6915
6916         type=0;
6917         do{
6918             type+= show_bits(&s->gb, 8);
6919         }while(get_bits(&s->gb, 8) == 255);
6920
6921         size=0;
6922         do{
6923             size+= show_bits(&s->gb, 8);
6924         }while(get_bits(&s->gb, 8) == 255);
6925
6926         switch(type){
6927         case 5:
6928             if(decode_unregistered_user_data(h, size) < 0)
6929                 return -1;
6930             break;
6931         default:
6932             skip_bits(&s->gb, 8*size);
6933         }
6934
6935         //FIXME check bits here
6936         align_get_bits(&s->gb);
6937     }
6938
6939     return 0;
6940 }
6941
6942 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6943     MpegEncContext * const s = &h->s;
6944     int cpb_count, i;
6945     cpb_count = get_ue_golomb(&s->gb) + 1;
6946     get_bits(&s->gb, 4); /* bit_rate_scale */
6947     get_bits(&s->gb, 4); /* cpb_size_scale */
6948     for(i=0; i<cpb_count; i++){
6949         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6950         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6951         get_bits1(&s->gb);     /* cbr_flag */
6952     }
6953     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6954     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6955     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6956     get_bits(&s->gb, 5); /* time_offset_length */
6957 }
6958
6959 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6960     MpegEncContext * const s = &h->s;
6961     int aspect_ratio_info_present_flag;
6962     unsigned int aspect_ratio_idc;
6963     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6964
6965     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6966
6967     if( aspect_ratio_info_present_flag ) {
6968         aspect_ratio_idc= get_bits(&s->gb, 8);
6969         if( aspect_ratio_idc == EXTENDED_SAR ) {
6970             sps->sar.num= get_bits(&s->gb, 16);
6971             sps->sar.den= get_bits(&s->gb, 16);
6972         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
6973             sps->sar=  pixel_aspect[aspect_ratio_idc];
6974         }else{
6975             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6976             return -1;
6977         }
6978     }else{
6979         sps->sar.num=
6980         sps->sar.den= 0;
6981     }
6982 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6983
6984     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6985         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6986     }
6987
6988     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6989         get_bits(&s->gb, 3);    /* video_format */
6990         get_bits1(&s->gb);      /* video_full_range_flag */
6991         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6992             get_bits(&s->gb, 8); /* colour_primaries */
6993             get_bits(&s->gb, 8); /* transfer_characteristics */
6994             get_bits(&s->gb, 8); /* matrix_coefficients */
6995         }
6996     }
6997
6998     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6999         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7000         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7001     }
7002
7003     sps->timing_info_present_flag = get_bits1(&s->gb);
7004     if(sps->timing_info_present_flag){
7005         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7006         sps->time_scale = get_bits_long(&s->gb, 32);
7007         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7008     }
7009
7010     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7011     if(nal_hrd_parameters_present_flag)
7012         decode_hrd_parameters(h, sps);
7013     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7014     if(vcl_hrd_parameters_present_flag)
7015         decode_hrd_parameters(h, sps);
7016     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7017         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7018     get_bits1(&s->gb);         /* pic_struct_present_flag */
7019
7020     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7021     if(sps->bitstream_restriction_flag){
7022         unsigned int num_reorder_frames;
7023         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7024         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7025         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7026         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7027         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7028         num_reorder_frames= get_ue_golomb(&s->gb);
7029         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7030
7031         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7032             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7033             return -1;
7034         }
7035
7036         sps->num_reorder_frames= num_reorder_frames;
7037     }
7038
7039     return 0;
7040 }
7041
7042 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7043                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7044     MpegEncContext * const s = &h->s;
7045     int i, last = 8, next = 8;
7046     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7047     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7048         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7049     else
7050     for(i=0;i<size;i++){
7051         if(next)
7052             next = (last + get_se_golomb(&s->gb)) & 0xff;
7053         if(!i && !next){ /* matrix not written, we use the preset one */
7054             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7055             break;
7056         }
7057         last = factors[scan[i]] = next ? next : last;
7058     }
7059 }
7060
7061 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7062                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7063     MpegEncContext * const s = &h->s;
7064     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7065     const uint8_t *fallback[4] = {
7066         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7067         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7068         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7069         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7070     };
7071     if(get_bits1(&s->gb)){
7072         sps->scaling_matrix_present |= is_sps;
7073         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7074         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7075         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7076         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7077         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7078         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7079         if(is_sps || pps->transform_8x8_mode){
7080             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7081             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7082         }
7083     } else if(fallback_sps) {
7084         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7085         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7086     }
7087 }
7088
7089 /**
7090  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7091  */
7092 static void *
7093 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7094                     const size_t size, const char *name)
7095 {
7096     if(id>=max) {
7097         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7098         return NULL;
7099     }
7100
7101     if(!vec[id]) {
7102         vec[id] = av_mallocz(size);
7103         if(vec[id] == NULL)
7104             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7105     }
7106     return vec[id];
7107 }
7108
7109 static inline int decode_seq_parameter_set(H264Context *h){
7110     MpegEncContext * const s = &h->s;
7111     int profile_idc, level_idc;
7112     unsigned int sps_id, tmp, mb_width, mb_height;
7113     int i;
7114     SPS *sps;
7115
7116     profile_idc= get_bits(&s->gb, 8);
7117     get_bits1(&s->gb);   //constraint_set0_flag
7118     get_bits1(&s->gb);   //constraint_set1_flag
7119     get_bits1(&s->gb);   //constraint_set2_flag
7120     get_bits1(&s->gb);   //constraint_set3_flag
7121     get_bits(&s->gb, 4); // reserved
7122     level_idc= get_bits(&s->gb, 8);
7123     sps_id= get_ue_golomb(&s->gb);
7124
7125     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7126     if(sps == NULL)
7127         return -1;
7128
7129     sps->profile_idc= profile_idc;
7130     sps->level_idc= level_idc;
7131
7132     if(sps->profile_idc >= 100){ //high profile
7133         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7134             get_bits1(&s->gb);  //residual_color_transform_flag
7135         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7136         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7137         sps->transform_bypass = get_bits1(&s->gb);
7138         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7139     }else
7140         sps->scaling_matrix_present = 0;
7141
7142     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7143     sps->poc_type= get_ue_golomb(&s->gb);
7144
7145     if(sps->poc_type == 0){ //FIXME #define
7146         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7147     } else if(sps->poc_type == 1){//FIXME #define
7148         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7149         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7150         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7151         tmp= get_ue_golomb(&s->gb);
7152
7153         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7154             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7155             return -1;
7156         }
7157         sps->poc_cycle_length= tmp;
7158
7159         for(i=0; i<sps->poc_cycle_length; i++)
7160             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7161     }else if(sps->poc_type != 2){
7162         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7163         return -1;
7164     }
7165
7166     tmp= get_ue_golomb(&s->gb);
7167     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7168         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7169         return -1;
7170     }
7171     sps->ref_frame_count= tmp;
7172     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7173     mb_width= get_ue_golomb(&s->gb) + 1;
7174     mb_height= get_ue_golomb(&s->gb) + 1;
7175     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7176        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7177         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7178         return -1;
7179     }
7180     sps->mb_width = mb_width;
7181     sps->mb_height= mb_height;
7182
7183     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7184     if(!sps->frame_mbs_only_flag)
7185         sps->mb_aff= get_bits1(&s->gb);
7186     else
7187         sps->mb_aff= 0;
7188
7189     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7190
7191 #ifndef ALLOW_INTERLACE
7192     if(sps->mb_aff)
7193         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7194 #endif
7195     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7196         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7197
7198     sps->crop= get_bits1(&s->gb);
7199     if(sps->crop){
7200         sps->crop_left  = get_ue_golomb(&s->gb);
7201         sps->crop_right = get_ue_golomb(&s->gb);
7202         sps->crop_top   = get_ue_golomb(&s->gb);
7203         sps->crop_bottom= get_ue_golomb(&s->gb);
7204         if(sps->crop_left || sps->crop_top){
7205             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7206         }
7207         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7208             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7209         }
7210     }else{
7211         sps->crop_left  =
7212         sps->crop_right =
7213         sps->crop_top   =
7214         sps->crop_bottom= 0;
7215     }
7216
7217     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7218     if( sps->vui_parameters_present_flag )
7219         decode_vui_parameters(h, sps);
7220
7221     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7222         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7223                sps_id, sps->profile_idc, sps->level_idc,
7224                sps->poc_type,
7225                sps->ref_frame_count,
7226                sps->mb_width, sps->mb_height,
7227                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7228                sps->direct_8x8_inference_flag ? "8B8" : "",
7229                sps->crop_left, sps->crop_right,
7230                sps->crop_top, sps->crop_bottom,
7231                sps->vui_parameters_present_flag ? "VUI" : ""
7232                );
7233     }
7234     return 0;
7235 }
7236
7237 static void
7238 build_qp_table(PPS *pps, int t, int index)
7239 {
7240     int i;
7241     for(i = 0; i < 52; i++)
7242         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7243 }
7244
7245 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7246     MpegEncContext * const s = &h->s;
7247     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7248     PPS *pps;
7249
7250     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7251     if(pps == NULL)
7252         return -1;
7253
7254     tmp= get_ue_golomb(&s->gb);
7255     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7256         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7257         return -1;
7258     }
7259     pps->sps_id= tmp;
7260
7261     pps->cabac= get_bits1(&s->gb);
7262     pps->pic_order_present= get_bits1(&s->gb);
7263     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7264     if(pps->slice_group_count > 1 ){
7265         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7266         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7267         switch(pps->mb_slice_group_map_type){
7268         case 0:
7269 #if 0
7270 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7271 |    run_length[ i ]                                |1  |ue(v)   |
7272 #endif
7273             break;
7274         case 2:
7275 #if 0
7276 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7277 |{                                                  |   |        |
7278 |    top_left_mb[ i ]                               |1  |ue(v)   |
7279 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7280 |   }                                               |   |        |
7281 #endif
7282             break;
7283         case 3:
7284         case 4:
7285         case 5:
7286 #if 0
7287 |   slice_group_change_direction_flag               |1  |u(1)    |
7288 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7289 #endif
7290             break;
7291         case 6:
7292 #if 0
7293 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7294 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7295 |)                                                  |   |        |
7296 |    slice_group_id[ i ]                            |1  |u(v)    |
7297 #endif
7298             break;
7299         }
7300     }
7301     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7302     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7303     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7304         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7305         pps->ref_count[0]= pps->ref_count[1]= 1;
7306         return -1;
7307     }
7308
7309     pps->weighted_pred= get_bits1(&s->gb);
7310     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7311     pps->init_qp= get_se_golomb(&s->gb) + 26;
7312     pps->init_qs= get_se_golomb(&s->gb) + 26;
7313     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7314     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7315     pps->constrained_intra_pred= get_bits1(&s->gb);
7316     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7317
7318     pps->transform_8x8_mode= 0;
7319     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7320     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7321     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7322
7323     if(get_bits_count(&s->gb) < bit_length){
7324         pps->transform_8x8_mode= get_bits1(&s->gb);
7325         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7326         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7327     } else {
7328         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7329     }
7330
7331     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7332     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7333     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7334         h->pps.chroma_qp_diff= 1;
7335
7336     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7337         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7338                pps_id, pps->sps_id,
7339                pps->cabac ? "CABAC" : "CAVLC",
7340                pps->slice_group_count,
7341                pps->ref_count[0], pps->ref_count[1],
7342                pps->weighted_pred ? "weighted" : "",
7343                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7344                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7345                pps->constrained_intra_pred ? "CONSTR" : "",
7346                pps->redundant_pic_cnt_present ? "REDU" : "",
7347                pps->transform_8x8_mode ? "8x8DCT" : ""
7348                );
7349     }
7350
7351     return 0;
7352 }
7353
7354 /**
7355  * Call decode_slice() for each context.
7356  *
7357  * @param h h264 master context
7358  * @param context_count number of contexts to execute
7359  */
7360 static void execute_decode_slices(H264Context *h, int context_count){
7361     MpegEncContext * const s = &h->s;
7362     AVCodecContext * const avctx= s->avctx;
7363     H264Context *hx;
7364     int i;
7365
7366     if(context_count == 1) {
7367         decode_slice(avctx, h);
7368     } else {
7369         for(i = 1; i < context_count; i++) {
7370             hx = h->thread_context[i];
7371             hx->s.error_resilience = avctx->error_resilience;
7372             hx->s.error_count = 0;
7373         }
7374
7375         avctx->execute(avctx, (void *)decode_slice,
7376                        (void **)h->thread_context, NULL, context_count);
7377
7378         /* pull back stuff from slices to master context */
7379         hx = h->thread_context[context_count - 1];
7380         s->mb_x = hx->s.mb_x;
7381         s->mb_y = hx->s.mb_y;
7382         s->dropable = hx->s.dropable;
7383         s->picture_structure = hx->s.picture_structure;
7384         for(i = 1; i < context_count; i++)
7385             h->s.error_count += h->thread_context[i]->s.error_count;
7386     }
7387 }
7388
7389
7390 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7391     MpegEncContext * const s = &h->s;
7392     AVCodecContext * const avctx= s->avctx;
7393     int buf_index=0;
7394     H264Context *hx; ///< thread context
7395     int context_count = 0;
7396
7397     h->max_contexts = avctx->thread_count;
7398 #if 0
7399     int i;
7400     for(i=0; i<50; i++){
7401         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7402     }
7403 #endif
7404     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7405         h->current_slice = 0;
7406         if (!s->first_field)
7407             s->current_picture_ptr= NULL;
7408     }
7409
7410     for(;;){
7411         int consumed;
7412         int dst_length;
7413         int bit_length;
7414         const uint8_t *ptr;
7415         int i, nalsize = 0;
7416         int err;
7417
7418         if(h->is_avc) {
7419             if(buf_index >= buf_size) break;
7420             nalsize = 0;
7421             for(i = 0; i < h->nal_length_size; i++)
7422                 nalsize = (nalsize << 8) | buf[buf_index++];
7423             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7424                 if(nalsize == 1){
7425                     buf_index++;
7426                     continue;
7427                 }else{
7428                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7429                     break;
7430                 }
7431             }
7432         } else {
7433             // start code prefix search
7434             for(; buf_index + 3 < buf_size; buf_index++){
7435                 // This should always succeed in the first iteration.
7436                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7437                     break;
7438             }
7439
7440             if(buf_index+3 >= buf_size) break;
7441
7442             buf_index+=3;
7443         }
7444
7445         hx = h->thread_context[context_count];
7446
7447         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7448         if (ptr==NULL || dst_length < 0){
7449             return -1;
7450         }
7451         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7452             dst_length--;
7453         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7454
7455         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7456             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7457         }
7458
7459         if (h->is_avc && (nalsize != consumed)){
7460             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7461             consumed= nalsize;
7462         }
7463
7464         buf_index += consumed;
7465
7466         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7467            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7468             continue;
7469
7470       again:
7471         err = 0;
7472         switch(hx->nal_unit_type){
7473         case NAL_IDR_SLICE:
7474             if (h->nal_unit_type != NAL_IDR_SLICE) {
7475                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7476                 return -1;
7477             }
7478             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7479         case NAL_SLICE:
7480             init_get_bits(&hx->s.gb, ptr, bit_length);
7481             hx->intra_gb_ptr=
7482             hx->inter_gb_ptr= &hx->s.gb;
7483             hx->s.data_partitioning = 0;
7484
7485             if((err = decode_slice_header(hx, h)))
7486                break;
7487
7488             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7489             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7490                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7491                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7492                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7493                && avctx->skip_frame < AVDISCARD_ALL)
7494                 context_count++;
7495             break;
7496         case NAL_DPA:
7497             init_get_bits(&hx->s.gb, ptr, bit_length);
7498             hx->intra_gb_ptr=
7499             hx->inter_gb_ptr= NULL;
7500             hx->s.data_partitioning = 1;
7501
7502             err = decode_slice_header(hx, h);
7503             break;
7504         case NAL_DPB:
7505             init_get_bits(&hx->intra_gb, ptr, bit_length);
7506             hx->intra_gb_ptr= &hx->intra_gb;
7507             break;
7508         case NAL_DPC:
7509             init_get_bits(&hx->inter_gb, ptr, bit_length);
7510             hx->inter_gb_ptr= &hx->inter_gb;
7511
7512             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7513                && s->context_initialized
7514                && s->hurry_up < 5
7515                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7516                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7517                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7518                && avctx->skip_frame < AVDISCARD_ALL)
7519                 context_count++;
7520             break;
7521         case NAL_SEI:
7522             init_get_bits(&s->gb, ptr, bit_length);
7523             decode_sei(h);
7524             break;
7525         case NAL_SPS:
7526             init_get_bits(&s->gb, ptr, bit_length);
7527             decode_seq_parameter_set(h);
7528
7529             if(s->flags& CODEC_FLAG_LOW_DELAY)
7530                 s->low_delay=1;
7531
7532             if(avctx->has_b_frames < 2)
7533                 avctx->has_b_frames= !s->low_delay;
7534             break;
7535         case NAL_PPS:
7536             init_get_bits(&s->gb, ptr, bit_length);
7537
7538             decode_picture_parameter_set(h, bit_length);
7539
7540             break;
7541         case NAL_AUD:
7542         case NAL_END_SEQUENCE:
7543         case NAL_END_STREAM:
7544         case NAL_FILLER_DATA:
7545         case NAL_SPS_EXT:
7546         case NAL_AUXILIARY_SLICE:
7547             break;
7548         default:
7549             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7550         }
7551
7552         if(context_count == h->max_contexts) {
7553             execute_decode_slices(h, context_count);
7554             context_count = 0;
7555         }
7556
7557         if (err < 0)
7558             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7559         else if(err == 1) {
7560             /* Slice could not be decoded in parallel mode, copy down
7561              * NAL unit stuff to context 0 and restart. Note that
7562              * rbsp_buffer is not transferred, but since we no longer
7563              * run in parallel mode this should not be an issue. */
7564             h->nal_unit_type = hx->nal_unit_type;
7565             h->nal_ref_idc   = hx->nal_ref_idc;
7566             hx = h;
7567             goto again;
7568         }
7569     }
7570     if(context_count)
7571         execute_decode_slices(h, context_count);
7572     return buf_index;
7573 }
7574
7575 /**
7576  * returns the number of bytes consumed for building the current frame
7577  */
7578 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7579         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7580         if(pos+10>buf_size) pos=buf_size; // oops ;)
7581
7582         return pos;
7583 }
7584
7585 static int decode_frame(AVCodecContext *avctx,
7586                              void *data, int *data_size,
7587                              const uint8_t *buf, int buf_size)
7588 {
7589     H264Context *h = avctx->priv_data;
7590     MpegEncContext *s = &h->s;
7591     AVFrame *pict = data;
7592     int buf_index;
7593
7594     s->flags= avctx->flags;
7595     s->flags2= avctx->flags2;
7596
7597    /* end of stream, output what is still in the buffers */
7598     if (buf_size == 0) {
7599         Picture *out;
7600         int i, out_idx;
7601
7602 //FIXME factorize this with the output code below
7603         out = h->delayed_pic[0];
7604         out_idx = 0;
7605         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7606             if(h->delayed_pic[i]->poc < out->poc){
7607                 out = h->delayed_pic[i];
7608                 out_idx = i;
7609             }
7610
7611         for(i=out_idx; h->delayed_pic[i]; i++)
7612             h->delayed_pic[i] = h->delayed_pic[i+1];
7613
7614         if(out){
7615             *data_size = sizeof(AVFrame);
7616             *pict= *(AVFrame*)out;
7617         }
7618
7619         return 0;
7620     }
7621
7622     if(h->is_avc && !h->got_avcC) {
7623         int i, cnt, nalsize;
7624         unsigned char *p = avctx->extradata;
7625         if(avctx->extradata_size < 7) {
7626             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7627             return -1;
7628         }
7629         if(*p != 1) {
7630             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7631             return -1;
7632         }
7633         /* sps and pps in the avcC always have length coded with 2 bytes,
7634            so put a fake nal_length_size = 2 while parsing them */
7635         h->nal_length_size = 2;
7636         // Decode sps from avcC
7637         cnt = *(p+5) & 0x1f; // Number of sps
7638         p += 6;
7639         for (i = 0; i < cnt; i++) {
7640             nalsize = AV_RB16(p) + 2;
7641             if(decode_nal_units(h, p, nalsize) < 0) {
7642                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7643                 return -1;
7644             }
7645             p += nalsize;
7646         }
7647         // Decode pps from avcC
7648         cnt = *(p++); // Number of pps
7649         for (i = 0; i < cnt; i++) {
7650             nalsize = AV_RB16(p) + 2;
7651             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7652                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7653                 return -1;
7654             }
7655             p += nalsize;
7656         }
7657         // Now store right nal length size, that will be use to parse all other nals
7658         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7659         // Do not reparse avcC
7660         h->got_avcC = 1;
7661     }
7662
7663     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7664         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7665             return -1;
7666     }
7667
7668     buf_index=decode_nal_units(h, buf, buf_size);
7669     if(buf_index < 0)
7670         return -1;
7671
7672     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7673         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7674         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7675         return -1;
7676     }
7677
7678     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7679         Picture *out = s->current_picture_ptr;
7680         Picture *cur = s->current_picture_ptr;
7681         int i, pics, cross_idr, out_of_order, out_idx;
7682
7683         s->mb_y= 0;
7684
7685         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7686         s->current_picture_ptr->pict_type= s->pict_type;
7687
7688         if(!s->dropable) {
7689             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7690             h->prev_poc_msb= h->poc_msb;
7691             h->prev_poc_lsb= h->poc_lsb;
7692         }
7693         h->prev_frame_num_offset= h->frame_num_offset;
7694         h->prev_frame_num= h->frame_num;
7695
7696         /*
7697          * FIXME: Error handling code does not seem to support interlaced
7698          * when slices span multiple rows
7699          * The ff_er_add_slice calls don't work right for bottom
7700          * fields; they cause massive erroneous error concealing
7701          * Error marking covers both fields (top and bottom).
7702          * This causes a mismatched s->error_count
7703          * and a bad error table. Further, the error count goes to
7704          * INT_MAX when called for bottom field, because mb_y is
7705          * past end by one (callers fault) and resync_mb_y != 0
7706          * causes problems for the first MB line, too.
7707          */
7708         if (!FIELD_PICTURE)
7709             ff_er_frame_end(s);
7710
7711         MPV_frame_end(s);
7712
7713         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7714             /* Wait for second field. */
7715             *data_size = 0;
7716
7717         } else {
7718             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7719             /* Derive top_field_first from field pocs. */
7720             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7721
7722         //FIXME do something with unavailable reference frames
7723
7724             /* Sort B-frames into display order */
7725
7726             if(h->sps.bitstream_restriction_flag
7727                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7728                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7729                 s->low_delay = 0;
7730             }
7731
7732             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7733                && !h->sps.bitstream_restriction_flag){
7734                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7735                 s->low_delay= 0;
7736             }
7737
7738             pics = 0;
7739             while(h->delayed_pic[pics]) pics++;
7740
7741             assert(pics <= MAX_DELAYED_PIC_COUNT);
7742
7743             h->delayed_pic[pics++] = cur;
7744             if(cur->reference == 0)
7745                 cur->reference = DELAYED_PIC_REF;
7746
7747             out = h->delayed_pic[0];
7748             out_idx = 0;
7749             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7750                 if(h->delayed_pic[i]->poc < out->poc){
7751                     out = h->delayed_pic[i];
7752                     out_idx = i;
7753                 }
7754             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7755
7756             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7757
7758             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7759                 { }
7760             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7761                || (s->low_delay &&
7762                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7763                  || cur->pict_type == FF_B_TYPE)))
7764             {
7765                 s->low_delay = 0;
7766                 s->avctx->has_b_frames++;
7767             }
7768
7769             if(out_of_order || pics > s->avctx->has_b_frames){
7770                 out->reference &= ~DELAYED_PIC_REF;
7771                 for(i=out_idx; h->delayed_pic[i]; i++)
7772                     h->delayed_pic[i] = h->delayed_pic[i+1];
7773             }
7774             if(!out_of_order && pics > s->avctx->has_b_frames){
7775                 *data_size = sizeof(AVFrame);
7776
7777                 h->outputed_poc = out->poc;
7778                 *pict= *(AVFrame*)out;
7779             }else{
7780                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7781             }
7782         }
7783     }
7784
7785     assert(pict->data[0] || !*data_size);
7786     ff_print_debug_info(s, pict);
7787 //printf("out %d\n", (int)pict->data[0]);
7788 #if 0 //?
7789
7790     /* Return the Picture timestamp as the frame number */
7791     /* we subtract 1 because it is added on utils.c     */
7792     avctx->frame_number = s->picture_number - 1;
7793 #endif
7794     return get_consumed_bytes(s, buf_index, buf_size);
7795 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (compiled out).
 *
 * Entries 0..2 describe the top-left, top and top-right neighbours and
 * entry 3 the left neighbour; a neighbour counts as available when it lies
 * inside the picture and its slice_table entry equals h->slice_num.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;
    const int has_top_row= s->mb_y != 0;

    /* the slice_table reads are only reached when the row above exists */
    h->mb_avail[0]= has_top_row && s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
    h->mb_avail[1]= has_top_row &&                            h->slice_table[top_xy    ] == h->slice_num;
    h->mb_avail[2]= has_top_row && s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;

    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7815
7816 #ifdef TEST
7817 #undef printf
7818 #undef random
7819 #define COUNT 8000
7820 #define SIZE (COUNT*40)
7821 int main(void){
7822     int i;
7823     uint8_t temp[SIZE];
7824     PutBitContext pb;
7825     GetBitContext gb;
7826 //    int int_temp[10000];
7827     DSPContext dsp;
7828     AVCodecContext avctx;
7829
7830     dsputil_init(&dsp, &avctx);
7831
7832     init_put_bits(&pb, temp, SIZE);
7833     printf("testing unsigned exp golomb\n");
7834     for(i=0; i<COUNT; i++){
7835         START_TIMER
7836         set_ue_golomb(&pb, i);
7837         STOP_TIMER("set_ue_golomb");
7838     }
7839     flush_put_bits(&pb);
7840
7841     init_get_bits(&gb, temp, 8*SIZE);
7842     for(i=0; i<COUNT; i++){
7843         int j, s;
7844
7845         s= show_bits(&gb, 24);
7846
7847         START_TIMER
7848         j= get_ue_golomb(&gb);
7849         if(j != i){
7850             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7851 //            return -1;
7852         }
7853         STOP_TIMER("get_ue_golomb");
7854     }
7855
7856
7857     init_put_bits(&pb, temp, SIZE);
7858     printf("testing signed exp golomb\n");
7859     for(i=0; i<COUNT; i++){
7860         START_TIMER
7861         set_se_golomb(&pb, i - COUNT/2);
7862         STOP_TIMER("set_se_golomb");
7863     }
7864     flush_put_bits(&pb);
7865
7866     init_get_bits(&gb, temp, 8*SIZE);
7867     for(i=0; i<COUNT; i++){
7868         int j, s;
7869
7870         s= show_bits(&gb, 24);
7871
7872         START_TIMER
7873         j= get_se_golomb(&gb);
7874         if(j != i - COUNT/2){
7875             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7876 //            return -1;
7877         }
7878         STOP_TIMER("get_se_golomb");
7879     }
7880
7881 #if 0
7882     printf("testing 4x4 (I)DCT\n");
7883
7884     DCTELEM block[16];
7885     uint8_t src[16], ref[16];
7886     uint64_t error= 0, max_error=0;
7887
7888     for(i=0; i<COUNT; i++){
7889         int j;
7890 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7891         for(j=0; j<16; j++){
7892             ref[j]= random()%255;
7893             src[j]= random()%255;
7894         }
7895
7896         h264_diff_dct_c(block, src, ref, 4);
7897
7898         //normalize
7899         for(j=0; j<16; j++){
7900 //            printf("%d ", block[j]);
7901             block[j]= block[j]*4;
7902             if(j&1) block[j]= (block[j]*4 + 2)/5;
7903             if(j&4) block[j]= (block[j]*4 + 2)/5;
7904         }
7905 //        printf("\n");
7906
7907         s->dsp.h264_idct_add(ref, block, 4);
7908 /*        for(j=0; j<16; j++){
7909             printf("%d ", ref[j]);
7910         }
7911         printf("\n");*/
7912
7913         for(j=0; j<16; j++){
7914             int diff= FFABS(src[j] - ref[j]);
7915
7916             error+= diff*diff;
7917             max_error= FFMAX(max_error, diff);
7918         }
7919     }
7920     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7921     printf("testing quantizer\n");
7922     for(qp=0; qp<52; qp++){
7923         for(i=0; i<16; i++)
7924             src1_block[i]= src2_block[i]= random()%255;
7925
7926     }
7927     printf("Testing NAL layer\n");
7928
7929     uint8_t bitstream[COUNT];
7930     uint8_t nal[COUNT*2];
7931     H264Context h;
7932     memset(&h, 0, sizeof(H264Context));
7933
7934     for(i=0; i<COUNT; i++){
7935         int zeros= i;
7936         int nal_length;
7937         int consumed;
7938         int out_length;
7939         uint8_t *out;
7940         int j;
7941
7942         for(j=0; j<COUNT; j++){
7943             bitstream[j]= (random() % 255) + 1;
7944         }
7945
7946         for(j=0; j<zeros; j++){
7947             int pos= random() % COUNT;
7948             while(bitstream[pos] == 0){
7949                 pos++;
7950                 pos %= COUNT;
7951             }
7952             bitstream[pos]=0;
7953         }
7954
7955         START_TIMER
7956
7957         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7958         if(nal_length<0){
7959             printf("encoding failed\n");
7960             return -1;
7961         }
7962
7963         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7964
7965         STOP_TIMER("NAL")
7966
7967         if(out_length != COUNT){
7968             printf("incorrect length %d %d\n", out_length, COUNT);
7969             return -1;
7970         }
7971
7972         if(consumed != nal_length){
7973             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7974             return -1;
7975         }
7976
7977         if(memcmp(bitstream, out, COUNT)){
7978             printf("mismatch\n");
7979             return -1;
7980         }
7981     }
7982 #endif
7983
7984     printf("Testing RBSP\n");
7985
7986
7987     return 0;
7988 }
7989 #endif /* TEST */
7990
7991
7992 static av_cold int decode_end(AVCodecContext *avctx)
7993 {
7994     H264Context *h = avctx->priv_data;
7995     MpegEncContext *s = &h->s;
7996
7997     av_freep(&h->rbsp_buffer[0]);
7998     av_freep(&h->rbsp_buffer[1]);
7999     free_tables(h); //FIXME cleanup init stuff perhaps
8000     MPV_common_end(s);
8001
8002 //    memset(h, 0, sizeof(H264Context));
8003
8004     return 0;
8005 }
8006
8007
8008 AVCodec h264_decoder = {
8009     "h264",
8010     CODEC_TYPE_VIDEO,
8011     CODEC_ID_H264,
8012     sizeof(H264Context),
8013     decode_init,
8014     NULL,
8015     decode_end,
8016     decode_frame,
8017     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8018     .flush= flush_dpb,
8019     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8020 };
8021
8022 #include "svq3.c"