git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC chroma_dc_coeff_token_vlc;
  53
  54 static VLC total_zeros_vlc[15];
  55 static VLC chroma_dc_total_zeros_vlc[3];
  56
  57 static VLC run_vlc[6];
  58 static VLC run7_vlc;
  59
  60 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  61 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  62 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  63 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  64 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  65
  66 static av_always_inline uint32_t pack16to32(int a, int b){
  67 #ifdef WORDS_BIGENDIAN
  68    return (b&0xFFFF) + (a<<16);
  69 #else
  70    return (a&0xFFFF) + (b<<16);
  71 #endif
  72 }
  73
  74 const uint8_t ff_rem6[52]={
  75 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  76 };
  77
  78 const uint8_t ff_div6[52]={
  79 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  80 };
  81
  82
  83 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  84     MpegEncContext * const s = &h->s;
  85     const int mb_xy= h->mb_xy;
  86     int topleft_xy, top_xy, topright_xy, left_xy[2];
  87     int topleft_type, top_type, topright_type, left_type[2];
  88     int left_block[8];
  89     int topleft_partition= -1;
  90     int i;
  91
  92     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  93
  94     //FIXME deblocking could skip the intra and nnz parts.
  95     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  96         return;
  97
  98     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  99      * stuff, I can't imagine that these complex rules are worth it. */
 100
 101     topleft_xy = top_xy - 1;
 102     topright_xy= top_xy + 1;
 103     left_xy[1] = left_xy[0] = mb_xy-1;
 104     left_block[0]= 0;
 105     left_block[1]= 1;
 106     left_block[2]= 2;
 107     left_block[3]= 3;
 108     left_block[4]= 7;
 109     left_block[5]= 10;
 110     left_block[6]= 8;
 111     left_block[7]= 11;
 112     if(FRAME_MBAFF){
 113         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 114         const int top_pair_xy      = pair_xy     - s->mb_stride;
 115         const int topleft_pair_xy  = top_pair_xy - 1;
 116         const int topright_pair_xy = top_pair_xy + 1;
 117         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 118         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 119         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 120         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 121         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 122         const int bottom = (s->mb_y & 1);
 123         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 124         if (bottom
 125                 ? !curr_mb_frame_flag // bottom macroblock
 126                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 127                 ) {
 128             top_xy -= s->mb_stride;
 129         }
 130         if (bottom
 131                 ? !curr_mb_frame_flag // bottom macroblock
 132                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 133                 ) {
 134             topleft_xy -= s->mb_stride;
 135         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 136             topleft_xy += s->mb_stride;
 137             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 138             topleft_partition = 0;
 139         }
 140         if (bottom
 141                 ? !curr_mb_frame_flag // bottom macroblock
 142                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 143                 ) {
 144             topright_xy -= s->mb_stride;
 145         }
 146         if (left_mb_frame_flag != curr_mb_frame_flag) {
 147             left_xy[1] = left_xy[0] = pair_xy - 1;
 148             if (curr_mb_frame_flag) {
 149                 if (bottom) {
 150                     left_block[0]= 2;
 151                     left_block[1]= 2;
 152                     left_block[2]= 3;
 153                     left_block[3]= 3;
 154                     left_block[4]= 8;
 155                     left_block[5]= 11;
 156                     left_block[6]= 8;
 157                     left_block[7]= 11;
 158                 } else {
 159                     left_block[0]= 0;
 160                     left_block[1]= 0;
 161                     left_block[2]= 1;
 162                     left_block[3]= 1;
 163                     left_block[4]= 7;
 164                     left_block[5]= 10;
 165                     left_block[6]= 7;
 166                     left_block[7]= 10;
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 //left_block[0]= 0;
 171                 left_block[1]= 2;
 172                 left_block[2]= 0;
 173                 left_block[3]= 2;
 174                 //left_block[4]= 7;
 175                 left_block[5]= 10;
 176                 left_block[6]= 7;
 177                 left_block[7]= 10;
 178             }
 179         }
 180     }
 181
 182     h->top_mb_xy = top_xy;
 183     h->left_mb_xy[0] = left_xy[0];
 184     h->left_mb_xy[1] = left_xy[1];
 185     if(for_deblock){
 186         topleft_type = 0;
 187         topright_type = 0;
 188         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 189         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 190         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 191
 192         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 193             int list;
 194             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 195             for(i=0; i<16; i++)
 196                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 197             for(list=0; list<h->list_count; list++){
 198                 if(USES_LIST(mb_type,list)){
 199                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 200                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 201                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 202                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 203                         dst[0] = src[0];
 204                         dst[1] = src[1];
 205                         dst[2] = src[2];
 206                         dst[3] = src[3];
 207                     }
 208                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 209                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 210                     ref += h->b8_stride;
 211                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 212                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 213                 }else{
 214                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 215                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 216                 }
 217             }
 218         }
 219     }else{
 220         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 221         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 222         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 223         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 224         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 225     }
 226
 227     if(IS_INTRA(mb_type)){
 228         h->topleft_samples_available=
 229         h->top_samples_available=
 230         h->left_samples_available= 0xFFFF;
 231         h->topright_samples_available= 0xEEEA;
 232
 233         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 234             h->topleft_samples_available= 0xB3FF;
 235             h->top_samples_available= 0x33FF;
 236             h->topright_samples_available= 0x26EA;
 237         }
 238         for(i=0; i<2; i++){
 239             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 240                 h->topleft_samples_available&= 0xDF5F;
 241                 h->left_samples_available&= 0x5F5F;
 242             }
 243         }
 244
 245         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 246             h->topleft_samples_available&= 0x7FFF;
 247
 248         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 249             h->topright_samples_available&= 0xFBFF;
 250
 251         if(IS_INTRA4x4(mb_type)){
 252             if(IS_INTRA4x4(top_type)){
 253                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 254                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 255                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 256                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 257             }else{
 258                 int pred;
 259                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 260                     pred= -1;
 261                 else{
 262                     pred= 2;
 263                 }
 264                 h->intra4x4_pred_mode_cache[4+8*0]=
 265                 h->intra4x4_pred_mode_cache[5+8*0]=
 266                 h->intra4x4_pred_mode_cache[6+8*0]=
 267                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 268             }
 269             for(i=0; i<2; i++){
 270                 if(IS_INTRA4x4(left_type[i])){
 271                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 272                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 273                 }else{
 274                     int pred;
 275                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 276                         pred= -1;
 277                     else{
 278                         pred= 2;
 279                     }
 280                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 281                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 282                 }
 283             }
 284         }
 285     }
 286
 287
 288 /*
 289 0 . T T. T T T T
 290 1 L . .L . . . .
 291 2 L . .L . . . .
 292 3 . T TL . . . .
 293 4 L . .L . . . .
 294 5 L . .. . . . .
 295 */
 296 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 297     if(top_type){
 298         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 299         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 300         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 301         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 302
 303         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 304         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 305
 306         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 307         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 308
 309     }else{
 310         h->non_zero_count_cache[4+8*0]=
 311         h->non_zero_count_cache[5+8*0]=
 312         h->non_zero_count_cache[6+8*0]=
 313         h->non_zero_count_cache[7+8*0]=
 314
 315         h->non_zero_count_cache[1+8*0]=
 316         h->non_zero_count_cache[2+8*0]=
 317
 318         h->non_zero_count_cache[1+8*3]=
 319         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 320
 321     }
 322
 323     for (i=0; i<2; i++) {
 324         if(left_type[i]){
 325             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 326             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 327             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 328             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 329         }else{
 330             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 331             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 332             h->non_zero_count_cache[0+8*1 +   8*i]=
 333             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 334         }
 335     }
 336
 337     if( h->pps.cabac ) {
 338         // top_cbp
 339         if(top_type) {
 340             h->top_cbp = h->cbp_table[top_xy];
 341         } else if(IS_INTRA(mb_type)) {
 342             h->top_cbp = 0x1C0;
 343         } else {
 344             h->top_cbp = 0;
 345         }
 346         // left_cbp
 347         if (left_type[0]) {
 348             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 349         } else if(IS_INTRA(mb_type)) {
 350             h->left_cbp = 0x1C0;
 351         } else {
 352             h->left_cbp = 0;
 353         }
 354         if (left_type[0]) {
 355             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 356         }
 357         if (left_type[1]) {
 358             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 359         }
 360     }
 361
 362 #if 1
 363     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 364         int list;
 365         for(list=0; list<h->list_count; list++){
 366             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 367                 /*if(!h->mv_cache_clean[list]){
 368                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 369                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 370                     h->mv_cache_clean[list]= 1;
 371                 }*/
 372                 continue;
 373             }
 374             h->mv_cache_clean[list]= 0;
 375
 376             if(USES_LIST(top_type, list)){
 377                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 378                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 379                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 380                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 383                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 384                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 385                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 387             }else{
 388                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 389                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 392                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 393             }
 394
 395             for(i=0; i<2; i++){
 396                 int cache_idx = scan8[0] - 1 + i*2*8;
 397                 if(USES_LIST(left_type[i], list)){
 398                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 399                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 400                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 401                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 402                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 403                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 404                 }else{
 405                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 406                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 407                     h->ref_cache[list][cache_idx  ]=
 408                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 409                 }
 410             }
 411
 412             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 413                 continue;
 414
 415             if(USES_LIST(topleft_type, list)){
 416                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 417                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 418                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 419                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 420             }else{
 421                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 422                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 423             }
 424
 425             if(USES_LIST(topright_type, list)){
 426                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 427                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 428                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 429                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 430             }else{
 431                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 432                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 433             }
 434
 435             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 436                 continue;
 437
 438             h->ref_cache[list][scan8[5 ]+1] =
 439             h->ref_cache[list][scan8[7 ]+1] =
 440             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 441             h->ref_cache[list][scan8[4 ]] =
 442             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 443             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 444             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 445             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 446             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 447             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 448
 449             if( h->pps.cabac ) {
 450                 /* XXX beurk, Load mvd */
 451                 if(USES_LIST(top_type, list)){
 452                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 453                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 454                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 457                 }else{
 458                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 459                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 462                 }
 463                 if(USES_LIST(left_type[0], list)){
 464                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 465                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 466                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 467                 }else{
 468                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 469                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 470                 }
 471                 if(USES_LIST(left_type[1], list)){
 472                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 473                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 474                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 475                 }else{
 476                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 477                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 478                 }
 479                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 480                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 481                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 482                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 484
 485                 if(h->slice_type_nos == FF_B_TYPE){
 486                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 487
 488                     if(IS_DIRECT(top_type)){
 489                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 490                     }else if(IS_8X8(top_type)){
 491                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 492                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 493                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 494                     }else{
 495                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 496                     }
 497
 498                     if(IS_DIRECT(left_type[0]))
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 500                     else if(IS_8X8(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 502                     else
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 504
 505                     if(IS_DIRECT(left_type[1]))
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 507                     else if(IS_8X8(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 509                     else
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 511                 }
 512             }
 513
 514             if(FRAME_MBAFF){
 515 #define MAP_MVS\
 516                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 517                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 518                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 519                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 522                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 523                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 524                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 525                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 526                 if(MB_FIELD){
 527 #define MAP_F2F(idx, mb_type)\
 528                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 529                         h->ref_cache[list][idx] <<= 1;\
 530                         h->mv_cache[list][idx][1] /= 2;\
 531                         h->mvd_cache[list][idx][1] /= 2;\
 532                     }
 533                     MAP_MVS
 534 #undef MAP_F2F
 535                 }else{
 536 #define MAP_F2F(idx, mb_type)\
 537                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 538                         h->ref_cache[list][idx] >>= 1;\
 539                         h->mv_cache[list][idx][1] <<= 1;\
 540                         h->mvd_cache[list][idx][1] <<= 1;\
 541                     }
 542                     MAP_MVS
 543 #undef MAP_F2F
 544                 }
 545             }
 546         }
 547     }
 548 #endif
 549
 550     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 551 }
 552
 553 static inline void write_back_intra_pred_mode(H264Context *h){
 554     const int mb_xy= h->mb_xy;
 555
 556     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 557     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 558     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 559     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 560     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 561     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 562     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 563 }
 564
 565 /**
 566  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 567  */
 568 static inline int check_intra4x4_pred_mode(H264Context *h){
 569     MpegEncContext * const s = &h->s;
 570     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 571     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 572     int i;
 573
 574     if(!(h->top_samples_available&0x8000)){
 575         for(i=0; i<4; i++){
 576             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 577             if(status<0){
 578                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 579                 return -1;
 580             } else if(status){
 581                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 582             }
 583         }
 584     }
 585
 586     if(!(h->left_samples_available&0x8000)){
 587         for(i=0; i<4; i++){
 588             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 589             if(status<0){
 590                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 591                 return -1;
 592             } else if(status){
 593                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 594             }
 595         }
 596     }
 597
 598     return 0;
 599 } //FIXME cleanup like next
 600
 601 /**
 602  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 603  */
 604 static inline int check_intra_pred_mode(H264Context *h, int mode){
 605     MpegEncContext * const s = &h->s;
 606     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 607     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 608
 609     if(mode > 6U) {
 610         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 611         return -1;
 612     }
 613
 614     if(!(h->top_samples_available&0x8000)){
 615         mode= top[ mode ];
 616         if(mode<0){
 617             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 618             return -1;
 619         }
 620     }
 621
 622     if(!(h->left_samples_available&0x8000)){
 623         mode= left[ mode ];
 624         if(mode<0){
 625             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 626             return -1;
 627         }
 628     }
 629
 630     return mode;
 631 }
 632
 633 /**
 634  * gets the predicted intra4x4 prediction mode.
 635  */
 636 static inline int pred_intra_mode(H264Context *h, int n){
 637     const int index8= scan8[n];
 638     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 639     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 640     const int min= FFMIN(left, top);
 641
 642     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 643
 644     if(min<0) return DC_PRED;
 645     else      return min;
 646 }
 647
 648 static inline void write_back_non_zero_count(H264Context *h){
 649     const int mb_xy= h->mb_xy;
 650
 651     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 652     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 653     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 654     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 655     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 656     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 657     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 658
 659     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 660     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 661     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 662
 663     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 664     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 665     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 666
 667     if(FRAME_MBAFF){
 668         // store all luma nnzs, for deblocking
 669         int v = 0, i;
 670         for(i=0; i<16; i++)
 671             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 672         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 673     }
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static inline void direct_dist_scale_factor(H264Context * const h){
 898     const int poc = h->s.current_picture_ptr->poc;
 899     const int poc1 = h->ref_list[1][0].poc;
 900     int i;
 901     for(i=0; i<h->ref_count[0]; i++){
 902         int poc0 = h->ref_list[0][i].poc;
 903         int td = av_clip(poc1 - poc0, -128, 127);
 904         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 905             h->dist_scale_factor[i] = 256;
 906         }else{
 907             int tb = av_clip(poc - poc0, -128, 127);
 908             int tx = (16384 + (FFABS(td) >> 1)) / td;
 909             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 910         }
 911     }
 912     if(FRAME_MBAFF){
 913         for(i=0; i<h->ref_count[0]; i++){
 914             h->dist_scale_factor_field[2*i] =
 915             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 916         }
 917     }
 918 }
 919 static inline void direct_ref_list_init(H264Context * const h){
 920     MpegEncContext * const s = &h->s;
 921     Picture * const ref1 = &h->ref_list[1][0];
 922     Picture * const cur = s->current_picture_ptr;
 923     int list, i, j;
 924     if(cur->pict_type == FF_I_TYPE)
 925         cur->ref_count[0] = 0;
 926     if(cur->pict_type != FF_B_TYPE)
 927         cur->ref_count[1] = 0;
 928     for(list=0; list<2; list++){
 929         cur->ref_count[list] = h->ref_count[list];
 930         for(j=0; j<h->ref_count[list]; j++)
 931             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 932     }
 933     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 934         return;
 935     for(list=0; list<2; list++){
 936         for(i=0; i<ref1->ref_count[list]; i++){
 937             const int poc = ref1->ref_poc[list][i];
 938             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 939             for(j=0; j<h->ref_count[list]; j++)
 940                 if(h->ref_list[list][j].poc == poc){
 941                     h->map_col_to_list0[list][i] = j;
 942                     break;
 943                 }
 944         }
 945     }
 946     if(FRAME_MBAFF){
 947         for(list=0; list<2; list++){
 948             for(i=0; i<ref1->ref_count[list]; i++){
 949                 j = h->map_col_to_list0[list][i];
 950                 h->map_col_to_list0_field[list][2*i] = 2*j;
 951                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 952             }
 953         }
 954     }
 955 }
 956
 957 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 958     MpegEncContext * const s = &h->s;
 959     const int mb_xy =   h->mb_xy;
 960     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 961     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
 962     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 963     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 964     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 965     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 966     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 967     const int is_b8x8 = IS_8X8(*mb_type);
 968     unsigned int sub_mb_type;
 969     int i8, i4;
 970
 971 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 972     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 973         /* FIXME save sub mb types from previous frames (or derive from MVs)
 974          * so we know exactly what block size to use */
 975         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 976         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 977     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 978         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 979         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 980     }else{
 981         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 982         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 983     }
 984     if(!is_b8x8)
 985         *mb_type |= MB_TYPE_DIRECT2;
 986     if(MB_FIELD)
 987         *mb_type |= MB_TYPE_INTERLACED;
 988
 989     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 990
 991     if(h->direct_spatial_mv_pred){
 992         int ref[2];
 993         int mv[2][2];
 994         int list;
 995
 996         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 997
 998         /* ref = min(neighbors) */
 999         for(list=0; list<2; list++){
1000             int refa = h->ref_cache[list][scan8[0] - 1];
1001             int refb = h->ref_cache[list][scan8[0] - 8];
1002             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1003             if(refc == -2)
1004                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1005             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1006             if(ref[list] < 0)
1007                 ref[list] = -1;
1008         }
1009
1010         if(ref[0] < 0 && ref[1] < 0){
1011             ref[0] = ref[1] = 0;
1012             mv[0][0] = mv[0][1] =
1013             mv[1][0] = mv[1][1] = 0;
1014         }else{
1015             for(list=0; list<2; list++){
1016                 if(ref[list] >= 0)
1017                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1018                 else
1019                     mv[list][0] = mv[list][1] = 0;
1020             }
1021         }
1022
1023         if(ref[1] < 0){
1024             if(!is_b8x8)
1025                 *mb_type &= ~MB_TYPE_L1;
1026             sub_mb_type &= ~MB_TYPE_L1;
1027         }else if(ref[0] < 0){
1028             if(!is_b8x8)
1029                 *mb_type &= ~MB_TYPE_L0;
1030             sub_mb_type &= ~MB_TYPE_L0;
1031         }
1032
1033         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1034             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1035             int mb_types_col[2];
1036             int b8_stride = h->b8_stride;
1037             int b4_stride = h->b_stride;
1038
1039             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1040
1041             if(IS_INTERLACED(*mb_type)){
1042                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1043                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1044                 if(s->mb_y&1){
1045                     l1ref0 -= 2*b8_stride;
1046                     l1ref1 -= 2*b8_stride;
1047                     l1mv0 -= 4*b4_stride;
1048                     l1mv1 -= 4*b4_stride;
1049                 }
1050                 b8_stride *= 3;
1051                 b4_stride *= 6;
1052             }else{
1053                 int cur_poc = s->current_picture_ptr->poc;
1054                 int *col_poc = h->ref_list[1]->field_poc;
1055                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1056                 int dy = 2*col_parity - (s->mb_y&1);
1057                 mb_types_col[0] =
1058                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1059                 l1ref0 += dy*b8_stride;
1060                 l1ref1 += dy*b8_stride;
1061                 l1mv0 += 2*dy*b4_stride;
1062                 l1mv1 += 2*dy*b4_stride;
1063                 b8_stride = 0;
1064             }
1065
1066             for(i8=0; i8<4; i8++){
1067                 int x8 = i8&1;
1068                 int y8 = i8>>1;
1069                 int xy8 = x8+y8*b8_stride;
1070                 int xy4 = 3*x8+y8*b4_stride;
1071                 int a=0, b=0;
1072
1073                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1074                     continue;
1075                 h->sub_mb_type[i8] = sub_mb_type;
1076
1077                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1078                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1079                 if(!IS_INTRA(mb_types_col[y8])
1080                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1081                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1082                     if(ref[0] > 0)
1083                         a= pack16to32(mv[0][0],mv[0][1]);
1084                     if(ref[1] > 0)
1085                         b= pack16to32(mv[1][0],mv[1][1]);
1086                 }else{
1087                     a= pack16to32(mv[0][0],mv[0][1]);
1088                     b= pack16to32(mv[1][0],mv[1][1]);
1089                 }
1090                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1091                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1092             }
1093         }else if(IS_16X16(*mb_type)){
1094             int a=0, b=0;
1095
1096             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1097             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1098             if(!IS_INTRA(mb_type_col)
1099                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1100                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1101                        && (h->x264_build>33 || !h->x264_build)))){
1102                 if(ref[0] > 0)
1103                     a= pack16to32(mv[0][0],mv[0][1]);
1104                 if(ref[1] > 0)
1105                     b= pack16to32(mv[1][0],mv[1][1]);
1106             }else{
1107                 a= pack16to32(mv[0][0],mv[0][1]);
1108                 b= pack16to32(mv[1][0],mv[1][1]);
1109             }
1110             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1111             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1112         }else{
1113             for(i8=0; i8<4; i8++){
1114                 const int x8 = i8&1;
1115                 const int y8 = i8>>1;
1116
1117                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1118                     continue;
1119                 h->sub_mb_type[i8] = sub_mb_type;
1120
1121                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1122                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1123                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1124                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1125
1126                 /* col_zero_flag */
1127                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1128                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1129                                                   && (h->x264_build>33 || !h->x264_build)))){
1130                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1131                     if(IS_SUB_8X8(sub_mb_type)){
1132                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1133                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1134                             if(ref[0] == 0)
1135                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1136                             if(ref[1] == 0)
1137                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1138                         }
1139                     }else
1140                     for(i4=0; i4<4; i4++){
1141                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1142                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1143                             if(ref[0] == 0)
1144                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1145                             if(ref[1] == 0)
1146                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1147                         }
1148                     }
1149                 }
1150             }
1151         }
1152     }else{ /* direct temporal mv pred */
1153         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1154         const int *dist_scale_factor = h->dist_scale_factor;
1155
1156         if(FRAME_MBAFF){
1157             if(IS_INTERLACED(*mb_type)){
1158                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1159                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1160                 dist_scale_factor = h->dist_scale_factor_field;
1161             }
1162             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1163                 /* FIXME assumes direct_8x8_inference == 1 */
1164                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1165                 int mb_types_col[2];
1166                 int y_shift;
1167
1168                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1169                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1170                          | (*mb_type & MB_TYPE_INTERLACED);
1171                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1172
1173                 if(IS_INTERLACED(*mb_type)){
1174                     /* frame to field scaling */
1175                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1176                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1177                     if(s->mb_y&1){
1178                         l1ref0 -= 2*h->b8_stride;
1179                         l1ref1 -= 2*h->b8_stride;
1180                         l1mv0 -= 4*h->b_stride;
1181                         l1mv1 -= 4*h->b_stride;
1182                     }
1183                     y_shift = 0;
1184
1185                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1186                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1187                        && !is_b8x8)
1188                         *mb_type |= MB_TYPE_16x8;
1189                     else
1190                         *mb_type |= MB_TYPE_8x8;
1191                 }else{
1192                     /* field to frame scaling */
1193                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1194                      * but in MBAFF, top and bottom POC are equal */
1195                     int dy = (s->mb_y&1) ? 1 : 2;
1196                     mb_types_col[0] =
1197                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1198                     l1ref0 += dy*h->b8_stride;
1199                     l1ref1 += dy*h->b8_stride;
1200                     l1mv0 += 2*dy*h->b_stride;
1201                     l1mv1 += 2*dy*h->b_stride;
1202                     y_shift = 2;
1203
1204                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1205                        && !is_b8x8)
1206                         *mb_type |= MB_TYPE_16x16;
1207                     else
1208                         *mb_type |= MB_TYPE_8x8;
1209                 }
1210
1211                 for(i8=0; i8<4; i8++){
1212                     const int x8 = i8&1;
1213                     const int y8 = i8>>1;
1214                     int ref0, scale;
1215                     const int16_t (*l1mv)[2]= l1mv0;
1216
1217                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                         continue;
1219                     h->sub_mb_type[i8] = sub_mb_type;
1220
1221                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                     if(IS_INTRA(mb_types_col[y8])){
1223                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                         continue;
1227                     }
1228
1229                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1230                     if(ref0 >= 0)
1231                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1232                     else{
1233                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1234                         l1mv= l1mv1;
1235                     }
1236                     scale = dist_scale_factor[ref0];
1237                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                     {
1240                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1241                         int my_col = (mv_col[1]<<y_shift)/2;
1242                         int mx = (scale * mv_col[0] + 128) >> 8;
1243                         int my = (scale * my_col + 128) >> 8;
1244                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                     }
1247                 }
1248                 return;
1249             }
1250         }
1251
1252         /* one-to-one mv scaling */
1253
1254         if(IS_16X16(*mb_type)){
1255             int ref, mv0, mv1;
1256
1257             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1258             if(IS_INTRA(mb_type_col)){
1259                 ref=mv0=mv1=0;
1260             }else{
1261                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1262                                                 : map_col_to_list0[1][l1ref1[0]];
1263                 const int scale = dist_scale_factor[ref0];
1264                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1265                 int mv_l0[2];
1266                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1267                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1268                 ref= ref0;
1269                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1270                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1271             }
1272             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1273             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1274             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1275         }else{
1276             for(i8=0; i8<4; i8++){
1277                 const int x8 = i8&1;
1278                 const int y8 = i8>>1;
1279                 int ref0, scale;
1280                 const int16_t (*l1mv)[2]= l1mv0;
1281
1282                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1283                     continue;
1284                 h->sub_mb_type[i8] = sub_mb_type;
1285                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1286                 if(IS_INTRA(mb_type_col)){
1287                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1288                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1290                     continue;
1291                 }
1292
1293                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1294                 if(ref0 >= 0)
1295                     ref0 = map_col_to_list0[0][ref0];
1296                 else{
1297                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1298                     l1mv= l1mv1;
1299                 }
1300                 scale = dist_scale_factor[ref0];
1301
1302                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1303                 if(IS_SUB_8X8(sub_mb_type)){
1304                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1305                     int mx = (scale * mv_col[0] + 128) >> 8;
1306                     int my = (scale * mv_col[1] + 128) >> 8;
1307                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1308                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1309                 }else
1310                 for(i4=0; i4<4; i4++){
1311                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1312                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1313                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1314                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1315                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1316                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1317                 }
1318             }
1319         }
1320     }
1321 }
1322
1323 static inline void write_back_motion(H264Context *h, int mb_type){
1324     MpegEncContext * const s = &h->s;
1325     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1326     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1327     int list;
1328
1329     if(!USES_LIST(mb_type, 0))
1330         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1331
1332     for(list=0; list<h->list_count; list++){
1333         int y;
1334         if(!USES_LIST(mb_type, list))
1335             continue;
1336
1337         for(y=0; y<4; y++){
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1339             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1340         }
1341         if( h->pps.cabac ) {
1342             if(IS_SKIP(mb_type))
1343                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1344             else
1345             for(y=0; y<4; y++){
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1347                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1348             }
1349         }
1350
1351         {
1352             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1353             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1354             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1355             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1356             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1357         }
1358     }
1359
1360     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1361         if(IS_8X8(mb_type)){
1362             uint8_t *direct_table = &h->direct_table[b8_xy];
1363             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1364             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1365             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1366         }
1367     }
1368 }
1369
1370 /**
1371  * Decodes a network abstraction layer unit.
1372  * @param consumed is the number of bytes used as input
1373  * @param length is the length of the array
1374  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1375  * @returns decoded bytes, might be src+1 if no escapes
1376  */
1377 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1378     int i, si, di;
1379     uint8_t *dst;
1380     int bufidx;
1381
1382 //    src[0]&0x80;                //forbidden bit
1383     h->nal_ref_idc= src[0]>>5;
1384     h->nal_unit_type= src[0]&0x1F;
1385
1386     src++; length--;
1387 #if 0
1388     for(i=0; i<length; i++)
1389         printf("%2X ", src[i]);
1390 #endif
1391     for(i=0; i+1<length; i+=2){
1392         if(src[i]) continue;
1393         if(i>0 && src[i-1]==0) i--;
1394         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1395             if(src[i+2]!=3){
1396                 /* startcode, so we must be past the end */
1397                 length=i;
1398             }
1399             break;
1400         }
1401     }
1402
1403     if(i>=length-1){ //no escaped 0
1404         *dst_length= length;
1405         *consumed= length+1; //+1 for the header
1406         return src;
1407     }
1408
1409     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1410     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1411     dst= h->rbsp_buffer[bufidx];
1412
1413     if (dst == NULL){
1414         return NULL;
1415     }
1416
1417 //printf("decoding esc\n");
1418     si=di=0;
1419     while(si<length){
1420         //remove escapes (very rare 1:2^22)
1421         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1422             if(src[si+2]==3){ //escape
1423                 dst[di++]= 0;
1424                 dst[di++]= 0;
1425                 si+=3;
1426                 continue;
1427             }else //next start code
1428                 break;
1429         }
1430
1431         dst[di++]= src[si++];
1432     }
1433
1434     *dst_length= di;
1435     *consumed= si + 1;//+1 for the header
1436 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1437     return dst;
1438 }
1439
1440 /**
1441  * identifies the exact end of the bitstream
1442  * @return the length of the trailing, or 0 if damaged
1443  */
1444 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1445     int v= *src;
1446     int r;
1447
1448     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1449
1450     for(r=1; r<9; r++){
1451         if(v&1) return r;
1452         v>>=1;
1453     }
1454     return 0;
1455 }
1456
1457 /**
1458  * IDCT transforms the 16 dc values and dequantizes them.
1459  * @param qp quantization parameter
1460  */
1461 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1462 #define stride 16
1463     int i;
1464     int temp[16]; //FIXME check if this is a good idea
1465     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1466     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1467
1468 //memset(block, 64, 2*256);
1469 //return;
1470     for(i=0; i<4; i++){
1471         const int offset= y_offset[i];
1472         const int z0= block[offset+stride*0] + block[offset+stride*4];
1473         const int z1= block[offset+stride*0] - block[offset+stride*4];
1474         const int z2= block[offset+stride*1] - block[offset+stride*5];
1475         const int z3= block[offset+stride*1] + block[offset+stride*5];
1476
1477         temp[4*i+0]= z0+z3;
1478         temp[4*i+1]= z1+z2;
1479         temp[4*i+2]= z1-z2;
1480         temp[4*i+3]= z0-z3;
1481     }
1482
1483     for(i=0; i<4; i++){
1484         const int offset= x_offset[i];
1485         const int z0= temp[4*0+i] + temp[4*2+i];
1486         const int z1= temp[4*0+i] - temp[4*2+i];
1487         const int z2= temp[4*1+i] - temp[4*3+i];
1488         const int z3= temp[4*1+i] + temp[4*3+i];
1489
1490         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1491         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1492         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1493         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1494     }
1495 }
1496
1497 #if 0
1498 /**
1499  * DCT transforms the 16 dc values.
1500  * @param qp quantization parameter ??? FIXME
1501  */
1502 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1503 //    const int qmul= dequant_coeff[qp][0];
1504     int i;
1505     int temp[16]; //FIXME check if this is a good idea
1506     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1507     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1508
1509     for(i=0; i<4; i++){
1510         const int offset= y_offset[i];
1511         const int z0= block[offset+stride*0] + block[offset+stride*4];
1512         const int z1= block[offset+stride*0] - block[offset+stride*4];
1513         const int z2= block[offset+stride*1] - block[offset+stride*5];
1514         const int z3= block[offset+stride*1] + block[offset+stride*5];
1515
1516         temp[4*i+0]= z0+z3;
1517         temp[4*i+1]= z1+z2;
1518         temp[4*i+2]= z1-z2;
1519         temp[4*i+3]= z0-z3;
1520     }
1521
1522     for(i=0; i<4; i++){
1523         const int offset= x_offset[i];
1524         const int z0= temp[4*0+i] + temp[4*2+i];
1525         const int z1= temp[4*0+i] - temp[4*2+i];
1526         const int z2= temp[4*1+i] - temp[4*3+i];
1527         const int z3= temp[4*1+i] + temp[4*3+i];
1528
1529         block[stride*0 +offset]= (z0 + z3)>>1;
1530         block[stride*2 +offset]= (z1 + z2)>>1;
1531         block[stride*8 +offset]= (z1 - z2)>>1;
1532         block[stride*10+offset]= (z0 - z3)>>1;
1533     }
1534 }
1535 #endif
1536
1537 #undef xStride
1538 #undef stride
1539
1540 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1541     const int stride= 16*2;
1542     const int xStride= 16;
1543     int a,b,c,d,e;
1544
1545     a= block[stride*0 + xStride*0];
1546     b= block[stride*0 + xStride*1];
1547     c= block[stride*1 + xStride*0];
1548     d= block[stride*1 + xStride*1];
1549
1550     e= a-b;
1551     a= a+b;
1552     b= c-d;
1553     c= c+d;
1554
1555     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1556     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1557     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1558     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1559 }
1560
1561 #if 0
1562 static void chroma_dc_dct_c(DCTELEM *block){
1563     const int stride= 16*2;
1564     const int xStride= 16;
1565     int a,b,c,d,e;
1566
1567     a= block[stride*0 + xStride*0];
1568     b= block[stride*0 + xStride*1];
1569     c= block[stride*1 + xStride*0];
1570     d= block[stride*1 + xStride*1];
1571
1572     e= a-b;
1573     a= a+b;
1574     b= c-d;
1575     c= c+d;
1576
1577     block[stride*0 + xStride*0]= (a+c);
1578     block[stride*0 + xStride*1]= (e+b);
1579     block[stride*1 + xStride*0]= (a-c);
1580     block[stride*1 + xStride*1]= (e-b);
1581 }
1582 #endif
1583
1584 /**
1585  * gets the chroma qp.
1586  */
1587 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1588     return h->pps.chroma_qp_table[t][qscale];
1589 }
1590
1591 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1592 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1593 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1594     int i;
1595     const int * const quant_table= quant_coeff[qscale];
1596     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1597     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1598     const unsigned int threshold2= (threshold1<<1);
1599     int last_non_zero;
1600
1601     if(separate_dc){
1602         if(qscale<=18){
1603             //avoid overflows
1604             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1605             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1606             const unsigned int dc_threshold2= (dc_threshold1<<1);
1607
1608             int level= block[0]*quant_coeff[qscale+18][0];
1609             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1610                 if(level>0){
1611                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1612                     block[0]= level;
1613                 }else{
1614                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1615                     block[0]= -level;
1616                 }
1617 //                last_non_zero = i;
1618             }else{
1619                 block[0]=0;
1620             }
1621         }else{
1622             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1623             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1624             const unsigned int dc_threshold2= (dc_threshold1<<1);
1625
1626             int level= block[0]*quant_table[0];
1627             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1628                 if(level>0){
1629                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1630                     block[0]= level;
1631                 }else{
1632                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1633                     block[0]= -level;
1634                 }
1635 //                last_non_zero = i;
1636             }else{
1637                 block[0]=0;
1638             }
1639         }
1640         last_non_zero= 0;
1641         i=1;
1642     }else{
1643         last_non_zero= -1;
1644         i=0;
1645     }
1646
1647     for(; i<16; i++){
1648         const int j= scantable[i];
1649         int level= block[j]*quant_table[j];
1650
1651 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1652 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1653         if(((unsigned)(level+threshold1))>threshold2){
1654             if(level>0){
1655                 level= (bias + level)>>QUANT_SHIFT;
1656                 block[j]= level;
1657             }else{
1658                 level= (bias - level)>>QUANT_SHIFT;
1659                 block[j]= -level;
1660             }
1661             last_non_zero = i;
1662         }else{
1663             block[j]=0;
1664         }
1665     }
1666
1667     return last_non_zero;
1668 }
1669
1670 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1671                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1672                            int src_x_offset, int src_y_offset,
1673                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1674     MpegEncContext * const s = &h->s;
1675     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1676     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1677     const int luma_xy= (mx&3) + ((my&3)<<2);
1678     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1679     uint8_t * src_cb, * src_cr;
1680     int extra_width= h->emu_edge_width;
1681     int extra_height= h->emu_edge_height;
1682     int emu=0;
1683     const int full_mx= mx>>2;
1684     const int full_my= my>>2;
1685     const int pic_width  = 16*s->mb_width;
1686     const int pic_height = 16*s->mb_height >> MB_FIELD;
1687
1688     if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
1689         return;
1690
1691     if(mx&7) extra_width -= 3;
1692     if(my&7) extra_height -= 3;
1693
1694     if(   full_mx < 0-extra_width
1695        || full_my < 0-extra_height
1696        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1697        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1698         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1699             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1700         emu=1;
1701     }
1702
1703     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1704     if(!square){
1705         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1706     }
1707
1708     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1709
1710     if(MB_FIELD){
1711         // chroma offset when predicting from a field of opposite parity
1712         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1713         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1714     }
1715     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1716     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1717
1718     if(emu){
1719         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1720             src_cb= s->edge_emu_buffer;
1721     }
1722     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1723
1724     if(emu){
1725         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1726             src_cr= s->edge_emu_buffer;
1727     }
1728     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1729 }
1730
1731 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1732                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1733                            int x_offset, int y_offset,
1734                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1735                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1736                            int list0, int list1){
1737     MpegEncContext * const s = &h->s;
1738     qpel_mc_func *qpix_op=  qpix_put;
1739     h264_chroma_mc_func chroma_op= chroma_put;
1740
1741     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1742     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1743     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1744     x_offset += 8*s->mb_x;
1745     y_offset += 8*(s->mb_y >> MB_FIELD);
1746
1747     if(list0){
1748         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1749         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1750                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1751                            qpix_op, chroma_op);
1752
1753         qpix_op=  qpix_avg;
1754         chroma_op= chroma_avg;
1755     }
1756
1757     if(list1){
1758         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1759         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1760                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1761                            qpix_op, chroma_op);
1762     }
1763 }
1764
1765 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1766                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1767                            int x_offset, int y_offset,
1768                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1769                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1770                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1771                            int list0, int list1){
1772     MpegEncContext * const s = &h->s;
1773
1774     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1775     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1776     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1777     x_offset += 8*s->mb_x;
1778     y_offset += 8*(s->mb_y >> MB_FIELD);
1779
1780     if(list0 && list1){
1781         /* don't optimize for luma-only case, since B-frames usually
1782          * use implicit weights => chroma too. */
1783         uint8_t *tmp_cb = s->obmc_scratchpad;
1784         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1785         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1786         int refn0 = h->ref_cache[0][ scan8[n] ];
1787         int refn1 = h->ref_cache[1][ scan8[n] ];
1788
1789         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1790                     dest_y, dest_cb, dest_cr,
1791                     x_offset, y_offset, qpix_put, chroma_put);
1792         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1793                     tmp_y, tmp_cb, tmp_cr,
1794                     x_offset, y_offset, qpix_put, chroma_put);
1795
1796         if(h->use_weight == 2){
1797             int weight0 = h->implicit_weight[refn0][refn1];
1798             int weight1 = 64 - weight0;
1799             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1800             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1801             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1802         }else{
1803             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1804                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1805                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1806             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1807                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1808                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1809             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1810                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1811                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1812         }
1813     }else{
1814         int list = list1 ? 1 : 0;
1815         int refn = h->ref_cache[list][ scan8[n] ];
1816         Picture *ref= &h->ref_list[list][refn];
1817         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1818                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1819                     qpix_put, chroma_put);
1820
1821         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1822                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1823         if(h->use_weight_chroma){
1824             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1825                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1826             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1827                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1828         }
1829     }
1830 }
1831
1832 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1833                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1834                            int x_offset, int y_offset,
1835                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1836                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1837                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1838                            int list0, int list1){
1839     if((h->use_weight==2 && list0 && list1
1840         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1841        || h->use_weight==1)
1842         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1843                          x_offset, y_offset, qpix_put, chroma_put,
1844                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1845     else
1846         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1847                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1848 }
1849
1850 static inline void prefetch_motion(H264Context *h, int list){
1851     /* fetch pixels for estimated mv 4 macroblocks ahead
1852      * optimized for 64byte cache lines */
1853     MpegEncContext * const s = &h->s;
1854     const int refn = h->ref_cache[list][scan8[0]];
1855     if(refn >= 0){
1856         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1857         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1858         uint8_t **src= h->ref_list[list][refn].data;
1859         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1860         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1861         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1862         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1863     }
1864 }
1865
1866 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1867                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1868                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1869                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1870     MpegEncContext * const s = &h->s;
1871     const int mb_xy= h->mb_xy;
1872     const int mb_type= s->current_picture.mb_type[mb_xy];
1873
1874     assert(IS_INTER(mb_type));
1875
1876     prefetch_motion(h, 0);
1877
1878     if(IS_16X16(mb_type)){
1879         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1880                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1881                 &weight_op[0], &weight_avg[0],
1882                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1883     }else if(IS_16X8(mb_type)){
1884         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1885                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1886                 &weight_op[1], &weight_avg[1],
1887                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1888         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1889                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1890                 &weight_op[1], &weight_avg[1],
1891                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1892     }else if(IS_8X16(mb_type)){
1893         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1894                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1895                 &weight_op[2], &weight_avg[2],
1896                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1897         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1898                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1899                 &weight_op[2], &weight_avg[2],
1900                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1901     }else{
1902         int i;
1903
1904         assert(IS_8X8(mb_type));
1905
1906         for(i=0; i<4; i++){
1907             const int sub_mb_type= h->sub_mb_type[i];
1908             const int n= 4*i;
1909             int x_offset= (i&1)<<2;
1910             int y_offset= (i&2)<<1;
1911
1912             if(IS_SUB_8X8(sub_mb_type)){
1913                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1914                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1915                     &weight_op[3], &weight_avg[3],
1916                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1917             }else if(IS_SUB_8X4(sub_mb_type)){
1918                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1919                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1920                     &weight_op[4], &weight_avg[4],
1921                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1922                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1923                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1924                     &weight_op[4], &weight_avg[4],
1925                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1926             }else if(IS_SUB_4X8(sub_mb_type)){
1927                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1928                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1929                     &weight_op[5], &weight_avg[5],
1930                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1931                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1932                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1933                     &weight_op[5], &weight_avg[5],
1934                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1935             }else{
1936                 int j;
1937                 assert(IS_SUB_4X4(sub_mb_type));
1938                 for(j=0; j<4; j++){
1939                     int sub_x_offset= x_offset + 2*(j&1);
1940                     int sub_y_offset= y_offset +   (j&2);
1941                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1942                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1943                         &weight_op[6], &weight_avg[6],
1944                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1945                 }
1946             }
1947         }
1948     }
1949
1950     prefetch_motion(h, 1);
1951 }
1952
1953 static av_cold void decode_init_vlc(void){
1954     static int done = 0;
1955
1956     if (!done) {
1957         int i;
1958         done = 1;
1959
1960         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1961                  &chroma_dc_coeff_token_len [0], 1, 1,
1962                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1963
1964         for(i=0; i<4; i++){
1965             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1966                      &coeff_token_len [i][0], 1, 1,
1967                      &coeff_token_bits[i][0], 1, 1, 1);
1968         }
1969
1970         for(i=0; i<3; i++){
1971             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1972                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1973                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1974         }
1975         for(i=0; i<15; i++){
1976             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1977                      &total_zeros_len [i][0], 1, 1,
1978                      &total_zeros_bits[i][0], 1, 1, 1);
1979         }
1980
1981         for(i=0; i<6; i++){
1982             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1983                      &run_len [i][0], 1, 1,
1984                      &run_bits[i][0], 1, 1, 1);
1985         }
1986         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1987                  &run_len [6][0], 1, 1,
1988                  &run_bits[6][0], 1, 1, 1);
1989     }
1990 }
1991
1992 static void free_tables(H264Context *h){
1993     int i;
1994     H264Context *hx;
1995     av_freep(&h->intra4x4_pred_mode);
1996     av_freep(&h->chroma_pred_mode_table);
1997     av_freep(&h->cbp_table);
1998     av_freep(&h->mvd_table[0]);
1999     av_freep(&h->mvd_table[1]);
2000     av_freep(&h->direct_table);
2001     av_freep(&h->non_zero_count);
2002     av_freep(&h->slice_table_base);
2003     h->slice_table= NULL;
2004
2005     av_freep(&h->mb2b_xy);
2006     av_freep(&h->mb2b8_xy);
2007
2008     for(i = 0; i < MAX_SPS_COUNT; i++)
2009         av_freep(h->sps_buffers + i);
2010
2011     for(i = 0; i < MAX_PPS_COUNT; i++)
2012         av_freep(h->pps_buffers + i);
2013
2014     for(i = 0; i < h->s.avctx->thread_count; i++) {
2015         hx = h->thread_context[i];
2016         if(!hx) continue;
2017         av_freep(&hx->top_borders[1]);
2018         av_freep(&hx->top_borders[0]);
2019         av_freep(&hx->s.obmc_scratchpad);
2020     }
2021 }
2022
2023 static void init_dequant8_coeff_table(H264Context *h){
2024     int i,q,x;
2025     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2026     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2027     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2028
2029     for(i=0; i<2; i++ ){
2030         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2031             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2032             break;
2033         }
2034
2035         for(q=0; q<52; q++){
2036             int shift = ff_div6[q];
2037             int idx = ff_rem6[q];
2038             for(x=0; x<64; x++)
2039                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2040                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2041                     h->pps.scaling_matrix8[i][x]) << shift;
2042         }
2043     }
2044 }
2045
2046 static void init_dequant4_coeff_table(H264Context *h){
2047     int i,j,q,x;
2048     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2049     for(i=0; i<6; i++ ){
2050         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2051         for(j=0; j<i; j++){
2052             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2053                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2054                 break;
2055             }
2056         }
2057         if(j<i)
2058             continue;
2059
2060         for(q=0; q<52; q++){
2061             int shift = ff_div6[q] + 2;
2062             int idx = ff_rem6[q];
2063             for(x=0; x<16; x++)
2064                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2065                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2066                     h->pps.scaling_matrix4[i][x]) << shift;
2067         }
2068     }
2069 }
2070
2071 static void init_dequant_tables(H264Context *h){
2072     int i,x;
2073     init_dequant4_coeff_table(h);
2074     if(h->pps.transform_8x8_mode)
2075         init_dequant8_coeff_table(h);
2076     if(h->sps.transform_bypass){
2077         for(i=0; i<6; i++)
2078             for(x=0; x<16; x++)
2079                 h->dequant4_coeff[i][0][x] = 1<<6;
2080         if(h->pps.transform_8x8_mode)
2081             for(i=0; i<2; i++)
2082                 for(x=0; x<64; x++)
2083                     h->dequant8_coeff[i][0][x] = 1<<6;
2084     }
2085 }
2086
2087
2088 /**
2089  * allocates tables.
2090  * needs width/height
2091  */
2092 static int alloc_tables(H264Context *h){
2093     MpegEncContext * const s = &h->s;
2094     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2095     int x,y;
2096
2097     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2098
2099     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2100     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2102
2103     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2104     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2105     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2106     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2107
2108     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2109     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2110
2111     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2112     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2113     for(y=0; y<s->mb_height; y++){
2114         for(x=0; x<s->mb_width; x++){
2115             const int mb_xy= x + y*s->mb_stride;
2116             const int b_xy = 4*x + 4*y*h->b_stride;
2117             const int b8_xy= 2*x + 2*y*h->b8_stride;
2118
2119             h->mb2b_xy [mb_xy]= b_xy;
2120             h->mb2b8_xy[mb_xy]= b8_xy;
2121         }
2122     }
2123
2124     s->obmc_scratchpad = NULL;
2125
2126     if(!h->dequant4_coeff[0])
2127         init_dequant_tables(h);
2128
2129     return 0;
2130 fail:
2131     free_tables(h);
2132     return -1;
2133 }
2134
2135 /**
2136  * Mimic alloc_tables(), but for every context thread.
2137  */
2138 static void clone_tables(H264Context *dst, H264Context *src){
2139     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2140     dst->non_zero_count           = src->non_zero_count;
2141     dst->slice_table              = src->slice_table;
2142     dst->cbp_table                = src->cbp_table;
2143     dst->mb2b_xy                  = src->mb2b_xy;
2144     dst->mb2b8_xy                 = src->mb2b8_xy;
2145     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2146     dst->mvd_table[0]             = src->mvd_table[0];
2147     dst->mvd_table[1]             = src->mvd_table[1];
2148     dst->direct_table             = src->direct_table;
2149
2150     dst->s.obmc_scratchpad = NULL;
2151     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2152 }
2153
2154 /**
2155  * Init context
2156  * Allocate buffers which are not shared amongst multiple threads.
2157  */
2158 static int context_init(H264Context *h){
2159     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2160     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2161
2162     return 0;
2163 fail:
2164     return -1; // free_tables will clean up for us
2165 }
2166
2167 static av_cold void common_init(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169
2170     s->width = s->avctx->width;
2171     s->height = s->avctx->height;
2172     s->codec_id= s->avctx->codec->id;
2173
2174     ff_h264_pred_init(&h->hpc, s->codec_id);
2175
2176     h->dequant_coeff_pps= -1;
2177     s->unrestricted_mv=1;
2178     s->decode=1; //FIXME
2179
2180     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2181     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2182 }
2183
2184 static av_cold int decode_init(AVCodecContext *avctx){
2185     H264Context *h= avctx->priv_data;
2186     MpegEncContext * const s = &h->s;
2187
2188     MPV_decode_defaults(s);
2189
2190     s->avctx = avctx;
2191     common_init(h);
2192
2193     s->out_format = FMT_H264;
2194     s->workaround_bugs= avctx->workaround_bugs;
2195
2196     // set defaults
2197 //    s->decode_mb= ff_h263_decode_mb;
2198     s->quarter_sample = 1;
2199     s->low_delay= 1;
2200
2201     if(avctx->codec_id == CODEC_ID_SVQ3)
2202         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2203     else
2204         avctx->pix_fmt= PIX_FMT_YUV420P;
2205
2206     decode_init_vlc();
2207
2208     if(avctx->extradata_size > 0 && avctx->extradata &&
2209        *(char *)avctx->extradata == 1){
2210         h->is_avc = 1;
2211         h->got_avcC = 0;
2212     } else {
2213         h->is_avc = 0;
2214     }
2215
2216     h->thread_context[0] = h;
2217     return 0;
2218 }
2219
2220 static int frame_start(H264Context *h){
2221     MpegEncContext * const s = &h->s;
2222     int i;
2223
2224     if(MPV_frame_start(s, s->avctx) < 0)
2225         return -1;
2226     ff_er_frame_start(s);
2227     /*
2228      * MPV_frame_start uses pict_type to derive key_frame.
2229      * This is incorrect for H.264; IDR markings must be used.
2230      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2231      * See decode_nal_units().
2232      */
2233     s->current_picture_ptr->key_frame= 0;
2234
2235     assert(s->linesize && s->uvlinesize);
2236
2237     for(i=0; i<16; i++){
2238         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2239         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2240     }
2241     for(i=0; i<4; i++){
2242         h->block_offset[16+i]=
2243         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2244         h->block_offset[24+16+i]=
2245         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2246     }
2247
2248     /* can't be in alloc_tables because linesize isn't known there.
2249      * FIXME: redo bipred weight to not require extra buffer? */
2250     for(i = 0; i < s->avctx->thread_count; i++)
2251         if(!h->thread_context[i]->s.obmc_scratchpad)
2252             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2253
2254     /* some macroblocks will be accessed before they're available */
2255     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2256         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2257
2258 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2259
2260     // We mark the current picture as non-reference after allocating it, so
2261     // that if we break out due to an error it can be released automatically
2262     // in the next MPV_frame_start().
2263     // SVQ3 as well as most other codecs have only last/next/current and thus
2264     // get released even with set reference, besides SVQ3 and others do not
2265     // mark frames as reference later "naturally".
2266     if(s->codec_id != CODEC_ID_SVQ3)
2267         s->current_picture_ptr->reference= 0;
2268
2269     s->current_picture_ptr->field_poc[0]=
2270     s->current_picture_ptr->field_poc[1]= INT_MAX;
2271     assert(s->current_picture_ptr->long_ref==0);
2272
2273     return 0;
2274 }
2275
2276 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2277     MpegEncContext * const s = &h->s;
2278     int i;
2279
2280     src_y  -=   linesize;
2281     src_cb -= uvlinesize;
2282     src_cr -= uvlinesize;
2283
2284     // There are two lines saved, the line above the the top macroblock of a pair,
2285     // and the line above the bottom macroblock
2286     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2287     for(i=1; i<17; i++){
2288         h->left_border[i]= src_y[15+i*  linesize];
2289     }
2290
2291     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2292     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2293
2294     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2295         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2296         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2297         for(i=1; i<9; i++){
2298             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2299             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2300         }
2301         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2302         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2303     }
2304 }
2305
2306 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2307     MpegEncContext * const s = &h->s;
2308     int temp8, i;
2309     uint64_t temp64;
2310     int deblock_left;
2311     int deblock_top;
2312     int mb_xy;
2313
2314     if(h->deblocking_filter == 2) {
2315         mb_xy = h->mb_xy;
2316         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2317         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2318     } else {
2319         deblock_left = (s->mb_x > 0);
2320         deblock_top =  (s->mb_y > 0);
2321     }
2322
2323     src_y  -=   linesize + 1;
2324     src_cb -= uvlinesize + 1;
2325     src_cr -= uvlinesize + 1;
2326
2327 #define XCHG(a,b,t,xchg)\
2328 t= a;\
2329 if(xchg)\
2330     a= b;\
2331 b= t;
2332
2333     if(deblock_left){
2334         for(i = !deblock_top; i<17; i++){
2335             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2336         }
2337     }
2338
2339     if(deblock_top){
2340         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2341         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2342         if(s->mb_x+1 < s->mb_width){
2343             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2344         }
2345     }
2346
2347     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2348         if(deblock_left){
2349             for(i = !deblock_top; i<9; i++){
2350                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2351                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2352             }
2353         }
2354         if(deblock_top){
2355             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2356             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2357         }
2358     }
2359 }
2360
2361 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2362     MpegEncContext * const s = &h->s;
2363     int i;
2364
2365     src_y  -= 2 *   linesize;
2366     src_cb -= 2 * uvlinesize;
2367     src_cr -= 2 * uvlinesize;
2368
2369     // There are two lines saved, the line above the the top macroblock of a pair,
2370     // and the line above the bottom macroblock
2371     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2372     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2373     for(i=2; i<34; i++){
2374         h->left_border[i]= src_y[15+i*  linesize];
2375     }
2376
2377     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2378     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2379     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2380     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2381
2382     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2383         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2384         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2385         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2386         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2387         for(i=2; i<18; i++){
2388             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2389             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2390         }
2391         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2392         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2393         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2394         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2395     }
2396 }
2397
2398 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2399     MpegEncContext * const s = &h->s;
2400     int temp8, i;
2401     uint64_t temp64;
2402     int deblock_left = (s->mb_x > 0);
2403     int deblock_top  = (s->mb_y > 1);
2404
2405     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2406
2407     src_y  -= 2 *   linesize + 1;
2408     src_cb -= 2 * uvlinesize + 1;
2409     src_cr -= 2 * uvlinesize + 1;
2410
2411 #define XCHG(a,b,t,xchg)\
2412 t= a;\
2413 if(xchg)\
2414     a= b;\
2415 b= t;
2416
2417     if(deblock_left){
2418         for(i = (!deblock_top)<<1; i<34; i++){
2419             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2420         }
2421     }
2422
2423     if(deblock_top){
2424         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2425         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2426         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2427         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2428         if(s->mb_x+1 < s->mb_width){
2429             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2430             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2431         }
2432     }
2433
2434     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2435         if(deblock_left){
2436             for(i = (!deblock_top) << 1; i<18; i++){
2437                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2438                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2439             }
2440         }
2441         if(deblock_top){
2442             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2443             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2444             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2445             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2446         }
2447     }
2448 }
2449
2450 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2451     MpegEncContext * const s = &h->s;
2452     const int mb_x= s->mb_x;
2453     const int mb_y= s->mb_y;
2454     const int mb_xy= h->mb_xy;
2455     const int mb_type= s->current_picture.mb_type[mb_xy];
2456     uint8_t  *dest_y, *dest_cb, *dest_cr;
2457     int linesize, uvlinesize /*dct_offset*/;
2458     int i;
2459     int *block_offset = &h->block_offset[0];
2460     const unsigned int bottom = mb_y & 1;
2461     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2462     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2463     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2464
2465     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2466     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2467     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2468
2469     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2470     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2471
2472     if (!simple && MB_FIELD) {
2473         linesize   = h->mb_linesize   = s->linesize * 2;
2474         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2475         block_offset = &h->block_offset[24];
2476         if(mb_y&1){ //FIXME move out of this function?
2477             dest_y -= s->linesize*15;
2478             dest_cb-= s->uvlinesize*7;
2479             dest_cr-= s->uvlinesize*7;
2480         }
2481         if(FRAME_MBAFF) {
2482             int list;
2483             for(list=0; list<h->list_count; list++){
2484                 if(!USES_LIST(mb_type, list))
2485                     continue;
2486                 if(IS_16X16(mb_type)){
2487                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2488                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2489                 }else{
2490                     for(i=0; i<16; i+=4){
2491                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2492                         int ref = h->ref_cache[list][scan8[i]];
2493                         if(ref >= 0)
2494                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2495                     }
2496                 }
2497             }
2498         }
2499     } else {
2500         linesize   = h->mb_linesize   = s->linesize;
2501         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2502 //        dct_offset = s->linesize * 16;
2503     }
2504
2505     if(transform_bypass){
2506         idct_dc_add =
2507         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2508     }else if(IS_8x8DCT(mb_type)){
2509         idct_dc_add = s->dsp.h264_idct8_dc_add;
2510         idct_add = s->dsp.h264_idct8_add;
2511     }else{
2512         idct_dc_add = s->dsp.h264_idct_dc_add;
2513         idct_add = s->dsp.h264_idct_add;
2514     }
2515
2516     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2517        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2518         int mbt_y = mb_y&~1;
2519         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2520         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2521         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2522         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2523     }
2524
2525     if (!simple && IS_INTRA_PCM(mb_type)) {
2526         unsigned int x, y;
2527
2528         // The pixels are stored in h->mb array in the same order as levels,
2529         // copy them in output in the correct order.
2530         for(i=0; i<16; i++) {
2531             for (y=0; y<4; y++) {
2532                 for (x=0; x<4; x++) {
2533                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2534                 }
2535             }
2536         }
2537         for(i=16; i<16+4; i++) {
2538             for (y=0; y<4; y++) {
2539                 for (x=0; x<4; x++) {
2540                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2541                 }
2542             }
2543         }
2544         for(i=20; i<20+4; i++) {
2545             for (y=0; y<4; y++) {
2546                 for (x=0; x<4; x++) {
2547                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2548                 }
2549             }
2550         }
2551     } else {
2552         if(IS_INTRA(mb_type)){
2553             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2554                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2555
2556             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2557                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2558                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2559             }
2560
2561             if(IS_INTRA4x4(mb_type)){
2562                 if(simple || !s->encoding){
2563                     if(IS_8x8DCT(mb_type)){
2564                         for(i=0; i<16; i+=4){
2565                             uint8_t * const ptr= dest_y + block_offset[i];
2566                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2567                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2568                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2569                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2570                             if(nnz){
2571                                 if(nnz == 1 && h->mb[i*16])
2572                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2573                                 else
2574                                     idct_add(ptr, h->mb + i*16, linesize);
2575                             }
2576                         }
2577                     }else
2578                     for(i=0; i<16; i++){
2579                         uint8_t * const ptr= dest_y + block_offset[i];
2580                         uint8_t *topright;
2581                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2582                         int nnz, tr;
2583
2584                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2585                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2586                             assert(mb_y || linesize <= block_offset[i]);
2587                             if(!topright_avail){
2588                                 tr= ptr[3 - linesize]*0x01010101;
2589                                 topright= (uint8_t*) &tr;
2590                             }else
2591                                 topright= ptr + 4 - linesize;
2592                         }else
2593                             topright= NULL;
2594
2595                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2596                         nnz = h->non_zero_count_cache[ scan8[i] ];
2597                         if(nnz){
2598                             if(is_h264){
2599                                 if(nnz == 1 && h->mb[i*16])
2600                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2601                                 else
2602                                     idct_add(ptr, h->mb + i*16, linesize);
2603                             }else
2604                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2605                         }
2606                     }
2607                 }
2608             }else{
2609                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2610                 if(is_h264){
2611                     if(!transform_bypass)
2612                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2613                 }else
2614                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2615             }
2616             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2617                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2618         }else if(is_h264){
2619             hl_motion(h, dest_y, dest_cb, dest_cr,
2620                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2621                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2622                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2623         }
2624
2625
2626         if(!IS_INTRA4x4(mb_type)){
2627             if(is_h264){
2628                 if(IS_INTRA16x16(mb_type)){
2629                     for(i=0; i<16; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ])
2631                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2632                         else if(h->mb[i*16])
2633                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2634                     }
2635                 }else{
2636                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2637                     for(i=0; i<16; i+=di){
2638                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2639                         if(nnz){
2640                             if(nnz==1 && h->mb[i*16])
2641                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2642                             else
2643                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2644                         }
2645                     }
2646                 }
2647             }else{
2648                 for(i=0; i<16; i++){
2649                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2650                         uint8_t * const ptr= dest_y + block_offset[i];
2651                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2652                     }
2653                 }
2654             }
2655         }
2656
2657         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2658             uint8_t *dest[2] = {dest_cb, dest_cr};
2659             if(transform_bypass){
2660                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2661             }else{
2662                 idct_add = s->dsp.h264_idct_add;
2663                 idct_dc_add = s->dsp.h264_idct_dc_add;
2664                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2665                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2666             }
2667             if(is_h264){
2668                 for(i=16; i<16+8; i++){
2669                     if(h->non_zero_count_cache[ scan8[i] ])
2670                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2671                     else if(h->mb[i*16])
2672                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2673                 }
2674             }else{
2675                 for(i=16; i<16+8; i++){
2676                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2677                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2678                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2679                     }
2680                 }
2681             }
2682         }
2683     }
2684     if(h->deblocking_filter) {
2685         if (!simple && FRAME_MBAFF) {
2686             //FIXME try deblocking one mb at a time?
2687             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2688             const int mb_y = s->mb_y - 1;
2689             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2690             const int mb_xy= mb_x + mb_y*s->mb_stride;
2691             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2692             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2693             if (!bottom) return;
2694             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2695             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2696             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2697
2698             if(IS_INTRA(mb_type_top | mb_type_bottom))
2699                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2700
2701             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2702             // deblock a pair
2703             // top
2704             s->mb_y--; h->mb_xy -= s->mb_stride;
2705             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2706             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2707             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2708             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2709             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2710             // bottom
2711             s->mb_y++; h->mb_xy += s->mb_stride;
2712             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2713             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2714             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2715             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2716             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2717         } else {
2718             tprintf(h->s.avctx, "call filter_mb\n");
2719             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2720             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2721             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2722             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2723             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2724         }
2725     }
2726 }
2727
2728 /**
2729  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2730  */
2731 static void hl_decode_mb_simple(H264Context *h){
2732     hl_decode_mb_internal(h, 1);
2733 }
2734
2735 /**
2736  * Process a macroblock; this handles edge cases, such as interlacing.
2737  */
2738 static void av_noinline hl_decode_mb_complex(H264Context *h){
2739     hl_decode_mb_internal(h, 0);
2740 }
2741
2742 static void hl_decode_mb(H264Context *h){
2743     MpegEncContext * const s = &h->s;
2744     const int mb_xy= h->mb_xy;
2745     const int mb_type= s->current_picture.mb_type[mb_xy];
2746     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
2747                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
2748
2749     if(ENABLE_H264_ENCODER && !s->decode)
2750         return;
2751
2752     if (is_complex)
2753         hl_decode_mb_complex(h);
2754     else hl_decode_mb_simple(h);
2755 }
2756
2757 static void pic_as_field(Picture *pic, const int parity){
2758     int i;
2759     for (i = 0; i < 4; ++i) {
2760         if (parity == PICT_BOTTOM_FIELD)
2761             pic->data[i] += pic->linesize[i];
2762         pic->reference = parity;
2763         pic->linesize[i] *= 2;
2764     }
2765 }
2766
2767 static int split_field_copy(Picture *dest, Picture *src,
2768                             int parity, int id_add){
2769     int match = !!(src->reference & parity);
2770
2771     if (match) {
2772         *dest = *src;
2773         pic_as_field(dest, parity);
2774         dest->pic_id *= 2;
2775         dest->pic_id += id_add;
2776     }
2777
2778     return match;
2779 }
2780
2781 /**
2782  * Split one reference list into field parts, interleaving by parity
2783  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2784  * set to look at the actual start of data for that field.
2785  *
2786  * @param dest output list
2787  * @param dest_len maximum number of fields to put in dest
2788  * @param src the source reference list containing fields and/or field pairs
2789  *            (aka short_ref/long_ref, or
2790  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2791  * @param src_len number of Picture's in source (pairs and unmatched fields)
2792  * @param parity the parity of the picture being decoded/needing
2793  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2794  * @return number of fields placed in dest
2795  */
2796 static int split_field_half_ref_list(Picture *dest, int dest_len,
2797                                      Picture *src,  int src_len,  int parity){
2798     int same_parity   = 1;
2799     int same_i        = 0;
2800     int opp_i         = 0;
2801     int out_i;
2802     int field_output;
2803
2804     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2805         if (same_parity && same_i < src_len) {
2806             field_output = split_field_copy(dest + out_i, src + same_i,
2807                                             parity, 1);
2808             same_parity = !field_output;
2809             same_i++;
2810
2811         } else if (opp_i < src_len) {
2812             field_output = split_field_copy(dest + out_i, src + opp_i,
2813                                             PICT_FRAME - parity, 0);
2814             same_parity = field_output;
2815             opp_i++;
2816
2817         } else {
2818             break;
2819         }
2820     }
2821
2822     return out_i;
2823 }
2824
2825 /**
2826  * Split the reference frame list into a reference field list.
2827  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2828  * The input list contains both reference field pairs and
2829  * unmatched reference fields; it is ordered as spec describes
2830  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2831  * unmatched field pairs are also present. Conceptually this is equivalent
2832  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2833  *
2834  * @param dest output reference list where ordered fields are to be placed
2835  * @param dest_len max number of fields to place at dest
2836  * @param src source reference list, as described above
2837  * @param src_len number of pictures (pairs and unmatched fields) in src
2838  * @param parity parity of field being currently decoded
2839  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2840  * @param long_i index into src array that holds first long reference picture,
2841  *        or src_len if no long refs present.
2842  */
2843 static int split_field_ref_list(Picture *dest, int dest_len,
2844                                 Picture *src,  int src_len,
2845                                 int parity,    int long_i){
2846
2847     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2848     dest += i;
2849     dest_len -= i;
2850
2851     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2852                                    src_len - long_i, parity);
2853     return i;
2854 }
2855
2856 /**
2857  * fills the default_ref_list.
2858  */
2859 static int fill_default_ref_list(H264Context *h){
2860     MpegEncContext * const s = &h->s;
2861     int i;
2862     int smallest_poc_greater_than_current = -1;
2863     int structure_sel;
2864     Picture sorted_short_ref[32];
2865     Picture field_entry_list[2][32];
2866     Picture *frame_list[2];
2867
2868     if (FIELD_PICTURE) {
2869         structure_sel = PICT_FRAME;
2870         frame_list[0] = field_entry_list[0];
2871         frame_list[1] = field_entry_list[1];
2872     } else {
2873         structure_sel = 0;
2874         frame_list[0] = h->default_ref_list[0];
2875         frame_list[1] = h->default_ref_list[1];
2876     }
2877
2878     if(h->slice_type_nos==FF_B_TYPE){
2879         int list;
2880         int len[2];
2881         int short_len[2];
2882         int out_i;
2883         int limit= INT_MIN;
2884
2885         /* sort frame according to POC in B slice */
2886         for(out_i=0; out_i<h->short_ref_count; out_i++){
2887             int best_i=INT_MIN;
2888             int best_poc=INT_MAX;
2889
2890             for(i=0; i<h->short_ref_count; i++){
2891                 const int poc= h->short_ref[i]->poc;
2892                 if(poc > limit && poc < best_poc){
2893                     best_poc= poc;
2894                     best_i= i;
2895                 }
2896             }
2897
2898             assert(best_i != INT_MIN);
2899
2900             limit= best_poc;
2901             sorted_short_ref[out_i]= *h->short_ref[best_i];
2902             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2903             if (-1 == smallest_poc_greater_than_current) {
2904                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2905                     smallest_poc_greater_than_current = out_i;
2906                 }
2907             }
2908         }
2909
2910         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2911
2912         // find the largest POC
2913         for(list=0; list<2; list++){
2914             int index = 0;
2915             int j= -99;
2916             int step= list ? -1 : 1;
2917
2918             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2919                 int sel;
2920                 while(j<0 || j>= h->short_ref_count){
2921                     if(j != -99 && step == (list ? -1 : 1))
2922                         return -1;
2923                     step = -step;
2924                     j= smallest_poc_greater_than_current + (step>>1);
2925                 }
2926                 sel = sorted_short_ref[j].reference | structure_sel;
2927                 if(sel != PICT_FRAME) continue;
2928                 frame_list[list][index  ]= sorted_short_ref[j];
2929                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2930             }
2931             short_len[list] = index;
2932
2933             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2934                 int sel;
2935                 if(h->long_ref[i] == NULL) continue;
2936                 sel = h->long_ref[i]->reference | structure_sel;
2937                 if(sel != PICT_FRAME) continue;
2938
2939                 frame_list[ list ][index  ]= *h->long_ref[i];
2940                 frame_list[ list ][index++].pic_id= i;
2941             }
2942             len[list] = index;
2943         }
2944
2945         for(list=0; list<2; list++){
2946             if (FIELD_PICTURE)
2947                 len[list] = split_field_ref_list(h->default_ref_list[list],
2948                                                  h->ref_count[list],
2949                                                  frame_list[list],
2950                                                  len[list],
2951                                                  s->picture_structure,
2952                                                  short_len[list]);
2953
2954             // swap the two first elements of L1 when L0 and L1 are identical
2955             if(list && len[0] > 1 && len[0] == len[1])
2956                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2957                     if(i == len[0]){
2958                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2959                         break;
2960                     }
2961
2962             if(len[list] < h->ref_count[ list ])
2963                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2964         }
2965
2966
2967     }else{
2968         int index=0;
2969         int short_len;
2970         for(i=0; i<h->short_ref_count; i++){
2971             int sel;
2972             sel = h->short_ref[i]->reference | structure_sel;
2973             if(sel != PICT_FRAME) continue;
2974             frame_list[0][index  ]= *h->short_ref[i];
2975             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2976         }
2977         short_len = index;
2978         for(i = 0; i < 16; i++){
2979             int sel;
2980             if(h->long_ref[i] == NULL) continue;
2981             sel = h->long_ref[i]->reference | structure_sel;
2982             if(sel != PICT_FRAME) continue;
2983             frame_list[0][index  ]= *h->long_ref[i];
2984             frame_list[0][index++].pic_id= i;
2985         }
2986
2987         if (FIELD_PICTURE)
2988             index = split_field_ref_list(h->default_ref_list[0],
2989                                          h->ref_count[0], frame_list[0],
2990                                          index, s->picture_structure,
2991                                          short_len);
2992
2993         if(index < h->ref_count[0])
2994             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2995     }
2996 #ifdef TRACE
2997     for (i=0; i<h->ref_count[0]; i++) {
2998         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2999     }
3000     if(h->slice_type_nos==FF_B_TYPE){
3001         for (i=0; i<h->ref_count[1]; i++) {
3002             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
3003         }
3004     }
3005 #endif
3006     return 0;
3007 }
3008
3009 static void print_short_term(H264Context *h);
3010 static void print_long_term(H264Context *h);
3011
3012 /**
3013  * Extract structure information about the picture described by pic_num in
3014  * the current decoding context (frame or field). Note that pic_num is
3015  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3016  * @param pic_num picture number for which to extract structure information
3017  * @param structure one of PICT_XXX describing structure of picture
3018  *                      with pic_num
3019  * @return frame number (short term) or long term index of picture
3020  *         described by pic_num
3021  */
3022 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3023     MpegEncContext * const s = &h->s;
3024
3025     *structure = s->picture_structure;
3026     if(FIELD_PICTURE){
3027         if (!(pic_num & 1))
3028             /* opposite field */
3029             *structure ^= PICT_FRAME;
3030         pic_num >>= 1;
3031     }
3032
3033     return pic_num;
3034 }
3035
3036 static int decode_ref_pic_list_reordering(H264Context *h){
3037     MpegEncContext * const s = &h->s;
3038     int list, index, pic_structure;
3039
3040     print_short_term(h);
3041     print_long_term(h);
3042     if(h->slice_type_nos==FF_I_TYPE) return 0; //FIXME move before function
3043
3044     for(list=0; list<h->list_count; list++){
3045         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3046
3047         if(get_bits1(&s->gb)){
3048             int pred= h->curr_pic_num;
3049
3050             for(index=0; ; index++){
3051                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3052                 unsigned int pic_id;
3053                 int i;
3054                 Picture *ref = NULL;
3055
3056                 if(reordering_of_pic_nums_idc==3)
3057                     break;
3058
3059                 if(index >= h->ref_count[list]){
3060                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3061                     return -1;
3062                 }
3063
3064                 if(reordering_of_pic_nums_idc<3){
3065                     if(reordering_of_pic_nums_idc<2){
3066                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3067                         int frame_num;
3068
3069                         if(abs_diff_pic_num > h->max_pic_num){
3070                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3071                             return -1;
3072                         }
3073
3074                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3075                         else                                pred+= abs_diff_pic_num;
3076                         pred &= h->max_pic_num - 1;
3077
3078                         frame_num = pic_num_extract(h, pred, &pic_structure);
3079
3080                         for(i= h->short_ref_count-1; i>=0; i--){
3081                             ref = h->short_ref[i];
3082                             assert(ref->reference);
3083                             assert(!ref->long_ref);
3084                             if(ref->data[0] != NULL &&
3085                                    ref->frame_num == frame_num &&
3086                                    (ref->reference & pic_structure) &&
3087                                    ref->long_ref == 0) // ignore non-existing pictures by testing data[0] pointer
3088                                 break;
3089                         }
3090                         if(i>=0)
3091                             ref->pic_id= pred;
3092                     }else{
3093                         int long_idx;
3094                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3095
3096                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3097
3098                         if(long_idx>31){
3099                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3100                             return -1;
3101                         }
3102                         ref = h->long_ref[long_idx];
3103                         assert(!(ref && !ref->reference));
3104                         if(ref && (ref->reference & pic_structure)){
3105                             ref->pic_id= pic_id;
3106                             assert(ref->long_ref);
3107                             i=0;
3108                         }else{
3109                             i=-1;
3110                         }
3111                     }
3112
3113                     if (i < 0) {
3114                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3115                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3116                     } else {
3117                         for(i=index; i+1<h->ref_count[list]; i++){
3118                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3119                                 break;
3120                         }
3121                         for(; i > index; i--){
3122                             h->ref_list[list][i]= h->ref_list[list][i-1];
3123                         }
3124                         h->ref_list[list][index]= *ref;
3125                         if (FIELD_PICTURE){
3126                             pic_as_field(&h->ref_list[list][index], pic_structure);
3127                         }
3128                     }
3129                 }else{
3130                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3131                     return -1;
3132                 }
3133             }
3134         }
3135     }
3136     for(list=0; list<h->list_count; list++){
3137         for(index= 0; index < h->ref_count[list]; index++){
3138             if(!h->ref_list[list][index].data[0])
3139                 h->ref_list[list][index]= s->current_picture;
3140         }
3141     }
3142
3143     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3144         direct_dist_scale_factor(h);
3145     direct_ref_list_init(h);
3146     return 0;
3147 }
3148
3149 static void fill_mbaff_ref_list(H264Context *h){
3150     int list, i, j;
3151     for(list=0; list<2; list++){ //FIXME try list_count
3152         for(i=0; i<h->ref_count[list]; i++){
3153             Picture *frame = &h->ref_list[list][i];
3154             Picture *field = &h->ref_list[list][16+2*i];
3155             field[0] = *frame;
3156             for(j=0; j<3; j++)
3157                 field[0].linesize[j] <<= 1;
3158             field[0].reference = PICT_TOP_FIELD;
3159             field[1] = field[0];
3160             for(j=0; j<3; j++)
3161                 field[1].data[j] += frame->linesize[j];
3162             field[1].reference = PICT_BOTTOM_FIELD;
3163
3164             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3165             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3166             for(j=0; j<2; j++){
3167                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3168                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3169             }
3170         }
3171     }
3172     for(j=0; j<h->ref_count[1]; j++){
3173         for(i=0; i<h->ref_count[0]; i++)
3174             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3175         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3176         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3177     }
3178 }
3179
3180 static int pred_weight_table(H264Context *h){
3181     MpegEncContext * const s = &h->s;
3182     int list, i;
3183     int luma_def, chroma_def;
3184
3185     h->use_weight= 0;
3186     h->use_weight_chroma= 0;
3187     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3188     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3189     luma_def = 1<<h->luma_log2_weight_denom;
3190     chroma_def = 1<<h->chroma_log2_weight_denom;
3191
3192     for(list=0; list<2; list++){
3193         for(i=0; i<h->ref_count[list]; i++){
3194             int luma_weight_flag, chroma_weight_flag;
3195
3196             luma_weight_flag= get_bits1(&s->gb);
3197             if(luma_weight_flag){
3198                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3199                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3200                 if(   h->luma_weight[list][i] != luma_def
3201                    || h->luma_offset[list][i] != 0)
3202                     h->use_weight= 1;
3203             }else{
3204                 h->luma_weight[list][i]= luma_def;
3205                 h->luma_offset[list][i]= 0;
3206             }
3207
3208             chroma_weight_flag= get_bits1(&s->gb);
3209             if(chroma_weight_flag){
3210                 int j;
3211                 for(j=0; j<2; j++){
3212                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3213                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3214                     if(   h->chroma_weight[list][i][j] != chroma_def
3215                        || h->chroma_offset[list][i][j] != 0)
3216                         h->use_weight_chroma= 1;
3217                 }
3218             }else{
3219                 int j;
3220                 for(j=0; j<2; j++){
3221                     h->chroma_weight[list][i][j]= chroma_def;
3222                     h->chroma_offset[list][i][j]= 0;
3223                 }
3224             }
3225         }
3226         if(h->slice_type_nos != FF_B_TYPE) break;
3227     }
3228     h->use_weight= h->use_weight || h->use_weight_chroma;
3229     return 0;
3230 }
3231
3232 static void implicit_weight_table(H264Context *h){
3233     MpegEncContext * const s = &h->s;
3234     int ref0, ref1;
3235     int cur_poc = s->current_picture_ptr->poc;
3236
3237     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3238        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3239         h->use_weight= 0;
3240         h->use_weight_chroma= 0;
3241         return;
3242     }
3243
3244     h->use_weight= 2;
3245     h->use_weight_chroma= 2;
3246     h->luma_log2_weight_denom= 5;
3247     h->chroma_log2_weight_denom= 5;
3248
3249     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3250         int poc0 = h->ref_list[0][ref0].poc;
3251         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3252             int poc1 = h->ref_list[1][ref1].poc;
3253             int td = av_clip(poc1 - poc0, -128, 127);
3254             if(td){
3255                 int tb = av_clip(cur_poc - poc0, -128, 127);
3256                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3257                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3258                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3259                     h->implicit_weight[ref0][ref1] = 32;
3260                 else
3261                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3262             }else
3263                 h->implicit_weight[ref0][ref1] = 32;
3264         }
3265     }
3266 }
3267
3268 /**
3269  * Mark a picture as no longer needed for reference. The refmask
3270  * argument allows unreferencing of individual fields or the whole frame.
3271  * If the picture becomes entirely unreferenced, but is being held for
3272  * display purposes, it is marked as such.
3273  * @param refmask mask of fields to unreference; the mask is bitwise
3274  *                anded with the reference marking of pic
3275  * @return non-zero if pic becomes entirely unreferenced (except possibly
3276  *         for display purposes) zero if one of the fields remains in
3277  *         reference
3278  */
3279 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3280     int i;
3281     if (pic->reference &= refmask) {
3282         return 0;
3283     } else {
3284         for(i = 0; h->delayed_pic[i]; i++)
3285             if(pic == h->delayed_pic[i]){
3286                 pic->reference=DELAYED_PIC_REF;
3287                 break;
3288             }
3289         return 1;
3290     }
3291 }
3292
3293 /**
3294  * instantaneous decoder refresh.
3295  */
3296 static void idr(H264Context *h){
3297     int i;
3298
3299     for(i=0; i<16; i++){
3300         remove_long(h, i, 0);
3301     }
3302     assert(h->long_ref_count==0);
3303
3304     for(i=0; i<h->short_ref_count; i++){
3305         unreference_pic(h, h->short_ref[i], 0);
3306         h->short_ref[i]= NULL;
3307     }
3308     h->short_ref_count=0;
3309     h->prev_frame_num= 0;
3310     h->prev_frame_num_offset= 0;
3311     h->prev_poc_msb=
3312     h->prev_poc_lsb= 0;
3313 }
3314
3315 /* forget old pics after a seek */
3316 static void flush_dpb(AVCodecContext *avctx){
3317     H264Context *h= avctx->priv_data;
3318     int i;
3319     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3320         if(h->delayed_pic[i])
3321             h->delayed_pic[i]->reference= 0;
3322         h->delayed_pic[i]= NULL;
3323     }
3324     h->outputed_poc= INT_MIN;
3325     idr(h);
3326     if(h->s.current_picture_ptr)
3327         h->s.current_picture_ptr->reference= 0;
3328     h->s.first_field= 0;
3329     ff_mpeg_flush(avctx);
3330 }
3331
3332 /**
3333  * Find a Picture in the short term reference list by frame number.
3334  * @param frame_num frame number to search for
3335  * @param idx the index into h->short_ref where returned picture is found
3336  *            undefined if no picture found.
3337  * @return pointer to the found picture, or NULL if no pic with the provided
3338  *                 frame number is found
3339  */
3340 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3341     MpegEncContext * const s = &h->s;
3342     int i;
3343
3344     for(i=0; i<h->short_ref_count; i++){
3345         Picture *pic= h->short_ref[i];
3346         if(s->avctx->debug&FF_DEBUG_MMCO)
3347             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3348         if(pic->frame_num == frame_num) {
3349             *idx = i;
3350             return pic;
3351         }
3352     }
3353     return NULL;
3354 }
3355
3356 /**
3357  * Remove a picture from the short term reference list by its index in
3358  * that list.  This does no checking on the provided index; it is assumed
3359  * to be valid. Other list entries are shifted down.
3360  * @param i index into h->short_ref of picture to remove.
3361  */
3362 static void remove_short_at_index(H264Context *h, int i){
3363     assert(i >= 0 && i < h->short_ref_count);
3364     h->short_ref[i]= NULL;
3365     if (--h->short_ref_count)
3366         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3367 }
3368
3369 /**
3370  *
3371  * @return the removed picture or NULL if an error occurs
3372  */
3373 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3374     MpegEncContext * const s = &h->s;
3375     Picture *pic;
3376     int i;
3377
3378     if(s->avctx->debug&FF_DEBUG_MMCO)
3379         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3380
3381     pic = find_short(h, frame_num, &i);
3382     if (pic){
3383         if(unreference_pic(h, pic, ref_mask))
3384         remove_short_at_index(h, i);
3385     }
3386
3387     return pic;
3388 }
3389
3390 /**
3391  * Remove a picture from the long term reference list by its index in
3392  * that list.
3393  * @return the removed picture or NULL if an error occurs
3394  */
3395 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3396     Picture *pic;
3397
3398     pic= h->long_ref[i];
3399     if (pic){
3400         if(unreference_pic(h, pic, ref_mask)){
3401             assert(h->long_ref[i]->long_ref == 1);
3402             h->long_ref[i]->long_ref= 0;
3403             h->long_ref[i]= NULL;
3404             h->long_ref_count--;
3405         }
3406     }
3407
3408     return pic;
3409 }
3410
3411 /**
3412  * print short term list
3413  */
3414 static void print_short_term(H264Context *h) {
3415     uint32_t i;
3416     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3417         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3418         for(i=0; i<h->short_ref_count; i++){
3419             Picture *pic= h->short_ref[i];
3420             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3421         }
3422     }
3423 }
3424
3425 /**
3426  * print long term list
3427  */
3428 static void print_long_term(H264Context *h) {
3429     uint32_t i;
3430     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3431         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3432         for(i = 0; i < 16; i++){
3433             Picture *pic= h->long_ref[i];
3434             if (pic) {
3435                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3436             }
3437         }
3438     }
3439 }
3440
3441 /**
3442  * Executes the reference picture marking (memory management control operations).
3443  */
3444 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3445     MpegEncContext * const s = &h->s;
3446     int i, j;
3447     int current_ref_assigned=0;
3448     Picture *pic;
3449
3450     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3451         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3452
3453     for(i=0; i<mmco_count; i++){
3454         int structure, frame_num;
3455         if(s->avctx->debug&FF_DEBUG_MMCO)
3456             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3457
3458         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3459            || mmco[i].opcode == MMCO_SHORT2LONG){
3460             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3461             pic = find_short(h, frame_num, &j);
3462             if(!pic){
3463                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3464                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3465                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3466                 continue;
3467             }
3468         }
3469
3470         switch(mmco[i].opcode){
3471         case MMCO_SHORT2UNUSED:
3472             if(s->avctx->debug&FF_DEBUG_MMCO)
3473                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3474             remove_short(h, frame_num, structure ^ PICT_FRAME);
3475             break;
3476         case MMCO_SHORT2LONG:
3477                 if (h->long_ref[mmco[i].long_arg] != pic)
3478                     remove_long(h, mmco[i].long_arg, 0);
3479
3480                 remove_short_at_index(h, j);
3481                 h->long_ref[ mmco[i].long_arg ]= pic;
3482                 if (h->long_ref[ mmco[i].long_arg ]){
3483                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3484                     h->long_ref_count++;
3485                 }
3486             break;
3487         case MMCO_LONG2UNUSED:
3488             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3489             pic = h->long_ref[j];
3490             if (pic) {
3491                 remove_long(h, j, structure ^ PICT_FRAME);
3492             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3493                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3494             break;
3495         case MMCO_LONG:
3496                     // Comment below left from previous code as it is an interresting note.
3497                     /* First field in pair is in short term list or
3498                      * at a different long term index.
3499                      * This is not allowed; see 7.4.3, notes 2 and 3.
3500                      * Report the problem and keep the pair where it is,
3501                      * and mark this field valid.
3502                      */
3503
3504             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3505                 remove_long(h, mmco[i].long_arg, 0);
3506
3507                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3508                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3509                 h->long_ref_count++;
3510             }
3511
3512             s->current_picture_ptr->reference |= s->picture_structure;
3513             current_ref_assigned=1;
3514             break;
3515         case MMCO_SET_MAX_LONG:
3516             assert(mmco[i].long_arg <= 16);
3517             // just remove the long term which index is greater than new max
3518             for(j = mmco[i].long_arg; j<16; j++){
3519                 remove_long(h, j, 0);
3520             }
3521             break;
3522         case MMCO_RESET:
3523             while(h->short_ref_count){
3524                 remove_short(h, h->short_ref[0]->frame_num, 0);
3525             }
3526             for(j = 0; j < 16; j++) {
3527                 remove_long(h, j, 0);
3528             }
3529             s->current_picture_ptr->poc=
3530             s->current_picture_ptr->field_poc[0]=
3531             s->current_picture_ptr->field_poc[1]=
3532             h->poc_lsb=
3533             h->poc_msb=
3534             h->frame_num=
3535             s->current_picture_ptr->frame_num= 0;
3536             break;
3537         default: assert(0);
3538         }
3539     }
3540
3541     if (!current_ref_assigned) {
3542         /* Second field of complementary field pair; the first field of
3543          * which is already referenced. If short referenced, it
3544          * should be first entry in short_ref. If not, it must exist
3545          * in long_ref; trying to put it on the short list here is an
3546          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3547          */
3548         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3549             /* Just mark the second field valid */
3550             s->current_picture_ptr->reference = PICT_FRAME;
3551         } else if (s->current_picture_ptr->long_ref) {
3552             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3553                                              "assignment for second field "
3554                                              "in complementary field pair "
3555                                              "(first field is long term)\n");
3556         } else {
3557             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3558             if(pic){
3559                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3560             }
3561
3562             if(h->short_ref_count)
3563                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3564
3565             h->short_ref[0]= s->current_picture_ptr;
3566             h->short_ref_count++;
3567             s->current_picture_ptr->reference |= s->picture_structure;
3568         }
3569     }
3570
3571     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3572
3573         /* We have too many reference frames, probably due to corrupted
3574          * stream. Need to discard one frame. Prevents overrun of the
3575          * short_ref and long_ref buffers.
3576          */
3577         av_log(h->s.avctx, AV_LOG_ERROR,
3578                "number of reference frames exceeds max (probably "
3579                "corrupt input), discarding one\n");
3580
3581         if (h->long_ref_count && !h->short_ref_count) {
3582             for (i = 0; i < 16; ++i)
3583                 if (h->long_ref[i])
3584                     break;
3585
3586             assert(i < 16);
3587             remove_long(h, i, 0);
3588         } else {
3589             pic = h->short_ref[h->short_ref_count - 1];
3590             remove_short(h, pic->frame_num, 0);
3591         }
3592     }
3593
3594     print_short_term(h);
3595     print_long_term(h);
3596     return 0;
3597 }
3598
3599 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3600     MpegEncContext * const s = &h->s;
3601     int i;
3602
3603     h->mmco_index= 0;
3604     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3605         s->broken_link= get_bits1(gb) -1;
3606         if(get_bits1(gb)){
3607             h->mmco[0].opcode= MMCO_LONG;
3608             h->mmco[0].long_arg= 0;
3609             h->mmco_index= 1;
3610         }
3611     }else{
3612         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3613             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3614                 MMCOOpcode opcode= get_ue_golomb(gb);
3615
3616                 h->mmco[i].opcode= opcode;
3617                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3618                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3619 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3620                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3621                         return -1;
3622                     }*/
3623                 }
3624                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3625                     unsigned int long_arg= get_ue_golomb(gb);
3626                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3627                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3628                         return -1;
3629                     }
3630                     h->mmco[i].long_arg= long_arg;
3631                 }
3632
3633                 if(opcode > (unsigned)MMCO_LONG){
3634                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3635                     return -1;
3636                 }
3637                 if(opcode == MMCO_END)
3638                     break;
3639             }
3640             h->mmco_index= i;
3641         }else{
3642             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3643
3644             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3645                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3646                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3647                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3648                 h->mmco_index= 1;
3649                 if (FIELD_PICTURE) {
3650                     h->mmco[0].short_pic_num *= 2;
3651                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3652                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3653                     h->mmco_index= 2;
3654                 }
3655             }
3656         }
3657     }
3658
3659     return 0;
3660 }
3661
3662 static int init_poc(H264Context *h){
3663     MpegEncContext * const s = &h->s;
3664     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3665     int field_poc[2];
3666     Picture *cur = s->current_picture_ptr;
3667
3668     h->frame_num_offset= h->prev_frame_num_offset;
3669     if(h->frame_num < h->prev_frame_num)
3670         h->frame_num_offset += max_frame_num;
3671
3672     if(h->sps.poc_type==0){
3673         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3674
3675         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3676             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3677         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3678             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3679         else
3680             h->poc_msb = h->prev_poc_msb;
3681 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3682         field_poc[0] =
3683         field_poc[1] = h->poc_msb + h->poc_lsb;
3684         if(s->picture_structure == PICT_FRAME)
3685             field_poc[1] += h->delta_poc_bottom;
3686     }else if(h->sps.poc_type==1){
3687         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3688         int i;
3689
3690         if(h->sps.poc_cycle_length != 0)
3691             abs_frame_num = h->frame_num_offset + h->frame_num;
3692         else
3693             abs_frame_num = 0;
3694
3695         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3696             abs_frame_num--;
3697
3698         expected_delta_per_poc_cycle = 0;
3699         for(i=0; i < h->sps.poc_cycle_length; i++)
3700             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3701
3702         if(abs_frame_num > 0){
3703             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3704             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3705
3706             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3707             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3708                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3709         } else
3710             expectedpoc = 0;
3711
3712         if(h->nal_ref_idc == 0)
3713             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3714
3715         field_poc[0] = expectedpoc + h->delta_poc[0];
3716         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3717
3718         if(s->picture_structure == PICT_FRAME)
3719             field_poc[1] += h->delta_poc[1];
3720     }else{
3721         int poc= 2*(h->frame_num_offset + h->frame_num);
3722
3723         if(!h->nal_ref_idc)
3724             poc--;
3725
3726         field_poc[0]= poc;
3727         field_poc[1]= poc;
3728     }
3729
3730     if(s->picture_structure != PICT_BOTTOM_FIELD)
3731         s->current_picture_ptr->field_poc[0]= field_poc[0];
3732     if(s->picture_structure != PICT_TOP_FIELD)
3733         s->current_picture_ptr->field_poc[1]= field_poc[1];
3734     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3735
3736     return 0;
3737 }
3738
3739
3740 /**
3741  * initialize scan tables
3742  */
3743 static void init_scan_tables(H264Context *h){
3744     MpegEncContext * const s = &h->s;
3745     int i;
3746     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3747         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3748         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3749     }else{
3750         for(i=0; i<16; i++){
3751 #define T(x) (x>>2) | ((x<<2) & 0xF)
3752             h->zigzag_scan[i] = T(zigzag_scan[i]);
3753             h-> field_scan[i] = T( field_scan[i]);
3754 #undef T
3755         }
3756     }
3757     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3758         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3759         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3760         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3761         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3762     }else{
3763         for(i=0; i<64; i++){
3764 #define T(x) (x>>3) | ((x&7)<<3)
3765             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3766             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3767             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3768             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3769 #undef T
3770         }
3771     }
3772     if(h->sps.transform_bypass){ //FIXME same ugly
3773         h->zigzag_scan_q0          = zigzag_scan;
3774         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3775         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3776         h->field_scan_q0           = field_scan;
3777         h->field_scan8x8_q0        = field_scan8x8;
3778         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3779     }else{
3780         h->zigzag_scan_q0          = h->zigzag_scan;
3781         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3782         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3783         h->field_scan_q0           = h->field_scan;
3784         h->field_scan8x8_q0        = h->field_scan8x8;
3785         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3786     }
3787 }
3788
3789 /**
3790  * Replicates H264 "master" context to thread contexts.
3791  */
3792 static void clone_slice(H264Context *dst, H264Context *src)
3793 {
3794     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3795     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3796     dst->s.current_picture      = src->s.current_picture;
3797     dst->s.linesize             = src->s.linesize;
3798     dst->s.uvlinesize           = src->s.uvlinesize;
3799     dst->s.first_field          = src->s.first_field;
3800
3801     dst->prev_poc_msb           = src->prev_poc_msb;
3802     dst->prev_poc_lsb           = src->prev_poc_lsb;
3803     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3804     dst->prev_frame_num         = src->prev_frame_num;
3805     dst->short_ref_count        = src->short_ref_count;
3806
3807     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3808     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3809     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3810     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3811
3812     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3813     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3814 }
3815
3816 /**
3817  * decodes a slice header.
3818  * This will also call MPV_common_init() and frame_start() as needed.
3819  *
3820  * @param h h264context
3821  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3822  *
3823  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3824  */
3825 static int decode_slice_header(H264Context *h, H264Context *h0){
3826     MpegEncContext * const s = &h->s;
3827     MpegEncContext * const s0 = &h0->s;
3828     unsigned int first_mb_in_slice;
3829     unsigned int pps_id;
3830     int num_ref_idx_active_override_flag;
3831     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3832     unsigned int slice_type, tmp, i, j;
3833     int default_ref_list_done = 0;
3834     int last_pic_structure;
3835
3836     s->dropable= h->nal_ref_idc == 0;
3837
3838     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3839         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3840         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3841     }else{
3842         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3843         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3844     }
3845
3846     first_mb_in_slice= get_ue_golomb(&s->gb);
3847
3848     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3849         h0->current_slice = 0;
3850         if (!s0->first_field)
3851             s->current_picture_ptr= NULL;
3852     }
3853
3854     slice_type= get_ue_golomb(&s->gb);
3855     if(slice_type > 9){
3856         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3857         return -1;
3858     }
3859     if(slice_type > 4){
3860         slice_type -= 5;
3861         h->slice_type_fixed=1;
3862     }else
3863         h->slice_type_fixed=0;
3864
3865     slice_type= slice_type_map[ slice_type ];
3866     if (slice_type == FF_I_TYPE
3867         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3868         default_ref_list_done = 1;
3869     }
3870     h->slice_type= slice_type;
3871     h->slice_type_nos= slice_type & 3;
3872
3873     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3874     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3875         av_log(h->s.avctx, AV_LOG_ERROR,
3876                "B picture before any references, skipping\n");
3877         return -1;
3878     }
3879
3880     pps_id= get_ue_golomb(&s->gb);
3881     if(pps_id>=MAX_PPS_COUNT){
3882         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3883         return -1;
3884     }
3885     if(!h0->pps_buffers[pps_id]) {
3886         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3887         return -1;
3888     }
3889     h->pps= *h0->pps_buffers[pps_id];
3890
3891     if(!h0->sps_buffers[h->pps.sps_id]) {
3892         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3893         return -1;
3894     }
3895     h->sps = *h0->sps_buffers[h->pps.sps_id];
3896
3897     if(h == h0 && h->dequant_coeff_pps != pps_id){
3898         h->dequant_coeff_pps = pps_id;
3899         init_dequant_tables(h);
3900     }
3901
3902     s->mb_width= h->sps.mb_width;
3903     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3904
3905     h->b_stride=  s->mb_width*4;
3906     h->b8_stride= s->mb_width*2;
3907
3908     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3909     if(h->sps.frame_mbs_only_flag)
3910         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3911     else
3912         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3913
3914     if (s->context_initialized
3915         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3916         if(h != h0)
3917             return -1;   // width / height changed during parallelized decoding
3918         free_tables(h);
3919         MPV_common_end(s);
3920     }
3921     if (!s->context_initialized) {
3922         if(h != h0)
3923             return -1;  // we cant (re-)initialize context during parallel decoding
3924         if (MPV_common_init(s) < 0)
3925             return -1;
3926         s->first_field = 0;
3927
3928         init_scan_tables(h);
3929         alloc_tables(h);
3930
3931         for(i = 1; i < s->avctx->thread_count; i++) {
3932             H264Context *c;
3933             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3934             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3935             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3936             c->sps = h->sps;
3937             c->pps = h->pps;
3938             init_scan_tables(c);
3939             clone_tables(c, h);
3940         }
3941
3942         for(i = 0; i < s->avctx->thread_count; i++)
3943             if(context_init(h->thread_context[i]) < 0)
3944                 return -1;
3945
3946         s->avctx->width = s->width;
3947         s->avctx->height = s->height;
3948         s->avctx->sample_aspect_ratio= h->sps.sar;
3949         if(!s->avctx->sample_aspect_ratio.den)
3950             s->avctx->sample_aspect_ratio.den = 1;
3951
3952         if(h->sps.timing_info_present_flag){
3953             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3954             if(h->x264_build > 0 && h->x264_build < 44)
3955                 s->avctx->time_base.den *= 2;
3956             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3957                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3958         }
3959     }
3960
3961     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3962
3963     h->mb_mbaff = 0;
3964     h->mb_aff_frame = 0;
3965     last_pic_structure = s0->picture_structure;
3966     if(h->sps.frame_mbs_only_flag){
3967         s->picture_structure= PICT_FRAME;
3968     }else{
3969         if(get_bits1(&s->gb)) { //field_pic_flag
3970             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3971         } else {
3972             s->picture_structure= PICT_FRAME;
3973             h->mb_aff_frame = h->sps.mb_aff;
3974         }
3975     }
3976
3977     if(h0->current_slice == 0){
3978         while(h->frame_num !=  h->prev_frame_num &&
3979               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3980             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3981             frame_start(h);
3982             h->prev_frame_num++;
3983             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3984             s->current_picture_ptr->frame_num= h->prev_frame_num;
3985             execute_ref_pic_marking(h, NULL, 0);
3986         }
3987
3988         /* See if we have a decoded first field looking for a pair... */
3989         if (s0->first_field) {
3990             assert(s0->current_picture_ptr);
3991             assert(s0->current_picture_ptr->data[0]);
3992             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3993
3994             /* figure out if we have a complementary field pair */
3995             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3996                 /*
3997                  * Previous field is unmatched. Don't display it, but let it
3998                  * remain for reference if marked as such.
3999                  */
4000                 s0->current_picture_ptr = NULL;
4001                 s0->first_field = FIELD_PICTURE;
4002
4003             } else {
4004                 if (h->nal_ref_idc &&
4005                         s0->current_picture_ptr->reference &&
4006                         s0->current_picture_ptr->frame_num != h->frame_num) {
4007                     /*
4008                      * This and previous field were reference, but had
4009                      * different frame_nums. Consider this field first in
4010                      * pair. Throw away previous field except for reference
4011                      * purposes.
4012                      */
4013                     s0->first_field = 1;
4014                     s0->current_picture_ptr = NULL;
4015
4016                 } else {
4017                     /* Second field in complementary pair */
4018                     s0->first_field = 0;
4019                 }
4020             }
4021
4022         } else {
4023             /* Frame or first field in a potentially complementary pair */
4024             assert(!s0->current_picture_ptr);
4025             s0->first_field = FIELD_PICTURE;
4026         }
4027
4028         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4029             s0->first_field = 0;
4030             return -1;
4031         }
4032     }
4033     if(h != h0)
4034         clone_slice(h, h0);
4035
4036     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4037
4038     assert(s->mb_num == s->mb_width * s->mb_height);
4039     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4040        first_mb_in_slice                    >= s->mb_num){
4041         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4042         return -1;
4043     }
4044     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4045     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4046     if (s->picture_structure == PICT_BOTTOM_FIELD)
4047         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4048     assert(s->mb_y < s->mb_height);
4049
4050     if(s->picture_structure==PICT_FRAME){
4051         h->curr_pic_num=   h->frame_num;
4052         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4053     }else{
4054         h->curr_pic_num= 2*h->frame_num + 1;
4055         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4056     }
4057
4058     if(h->nal_unit_type == NAL_IDR_SLICE){
4059         get_ue_golomb(&s->gb); /* idr_pic_id */
4060     }
4061
4062     if(h->sps.poc_type==0){
4063         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4064
4065         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4066             h->delta_poc_bottom= get_se_golomb(&s->gb);
4067         }
4068     }
4069
4070     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4071         h->delta_poc[0]= get_se_golomb(&s->gb);
4072
4073         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4074             h->delta_poc[1]= get_se_golomb(&s->gb);
4075     }
4076
4077     init_poc(h);
4078
4079     if(h->pps.redundant_pic_cnt_present){
4080         h->redundant_pic_count= get_ue_golomb(&s->gb);
4081     }
4082
4083     //set defaults, might be overridden a few lines later
4084     h->ref_count[0]= h->pps.ref_count[0];
4085     h->ref_count[1]= h->pps.ref_count[1];
4086
4087     if(h->slice_type_nos != FF_I_TYPE){
4088         if(h->slice_type_nos == FF_B_TYPE){
4089             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4090         }
4091         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4092
4093         if(num_ref_idx_active_override_flag){
4094             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4095             if(h->slice_type_nos==FF_B_TYPE)
4096                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4097
4098             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4099                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4100                 h->ref_count[0]= h->ref_count[1]= 1;
4101                 return -1;
4102             }
4103         }
4104         if(h->slice_type_nos == FF_B_TYPE)
4105             h->list_count= 2;
4106         else
4107             h->list_count= 1;
4108     }else
4109         h->list_count= 0;
4110
4111     if(!default_ref_list_done){
4112         fill_default_ref_list(h);
4113     }
4114
4115     if(decode_ref_pic_list_reordering(h) < 0)
4116         return -1;
4117
4118     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4119        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4120         pred_weight_table(h);
4121     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4122         implicit_weight_table(h);
4123     else
4124         h->use_weight = 0;
4125
4126     if(h->nal_ref_idc)
4127         decode_ref_pic_marking(h0, &s->gb);
4128
4129     if(FRAME_MBAFF)
4130         fill_mbaff_ref_list(h);
4131
4132     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4133         tmp = get_ue_golomb(&s->gb);
4134         if(tmp > 2){
4135             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4136             return -1;
4137         }
4138         h->cabac_init_idc= tmp;
4139     }
4140
4141     h->last_qscale_diff = 0;
4142     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4143     if(tmp>51){
4144         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4145         return -1;
4146     }
4147     s->qscale= tmp;
4148     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4149     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4150     //FIXME qscale / qp ... stuff
4151     if(h->slice_type == FF_SP_TYPE){
4152         get_bits1(&s->gb); /* sp_for_switch_flag */
4153     }
4154     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4155         get_se_golomb(&s->gb); /* slice_qs_delta */
4156     }
4157
4158     h->deblocking_filter = 1;
4159     h->slice_alpha_c0_offset = 0;
4160     h->slice_beta_offset = 0;
4161     if( h->pps.deblocking_filter_parameters_present ) {
4162         tmp= get_ue_golomb(&s->gb);
4163         if(tmp > 2){
4164             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4165             return -1;
4166         }
4167         h->deblocking_filter= tmp;
4168         if(h->deblocking_filter < 2)
4169             h->deblocking_filter^= 1; // 1<->0
4170
4171         if( h->deblocking_filter ) {
4172             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4173             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4174         }
4175     }
4176
4177     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4178        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4179        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4180        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4181         h->deblocking_filter= 0;
4182
4183     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4184         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4185             /* Cheat slightly for speed:
4186                Do not bother to deblock across slices. */
4187             h->deblocking_filter = 2;
4188         } else {
4189             h0->max_contexts = 1;
4190             if(!h0->single_decode_warning) {
4191                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4192                 h0->single_decode_warning = 1;
4193             }
4194             if(h != h0)
4195                 return 1; // deblocking switched inside frame
4196         }
4197     }
4198
4199 #if 0 //FMO
4200     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4201         slice_group_change_cycle= get_bits(&s->gb, ?);
4202 #endif
4203
4204     h0->last_slice_type = slice_type;
4205     h->slice_num = ++h0->current_slice;
4206
4207     for(j=0; j<2; j++){
4208         int *ref2frm= h->ref2frm[h->slice_num&15][j];
4209         ref2frm[0]=
4210         ref2frm[1]= -1;
4211         for(i=0; i<48; i++)
4212             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4213                           +(h->ref_list[j][i].reference&3);
4214     }
4215
4216     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4217     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4218
4219     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4220         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4221                h->slice_num,
4222                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4223                first_mb_in_slice,
4224                av_get_pict_type_char(h->slice_type),
4225                pps_id, h->frame_num,
4226                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4227                h->ref_count[0], h->ref_count[1],
4228                s->qscale,
4229                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4230                h->use_weight,
4231                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4232                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4233                );
4234     }
4235
4236     return 0;
4237 }
4238
4239 /**
4240  *
4241  */
4242 static inline int get_level_prefix(GetBitContext *gb){
4243     unsigned int buf;
4244     int log;
4245
4246     OPEN_READER(re, gb);
4247     UPDATE_CACHE(re, gb);
4248     buf=GET_CACHE(re, gb);
4249
4250     log= 32 - av_log2(buf);
4251 #ifdef TRACE
4252     print_bin(buf>>(32-log), log);
4253     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4254 #endif
4255
4256     LAST_SKIP_BITS(re, gb, log);
4257     CLOSE_READER(re, gb);
4258
4259     return log-1;
4260 }
4261
4262 static inline int get_dct8x8_allowed(H264Context *h){
4263     int i;
4264     for(i=0; i<4; i++){
4265         if(!IS_SUB_8X8(h->sub_mb_type[i])
4266            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4267             return 0;
4268     }
4269     return 1;
4270 }
4271
4272 /**
4273  * decodes a residual block.
4274  * @param n block index
4275  * @param scantable scantable
4276  * @param max_coeff number of coefficients in the block
4277  * @return <0 if an error occurred
4278  */
4279 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4280     MpegEncContext * const s = &h->s;
4281     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4282     int level[16];
4283     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4284
4285     //FIXME put trailing_onex into the context
4286
4287     if(n == CHROMA_DC_BLOCK_INDEX){
4288         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4289         total_coeff= coeff_token>>2;
4290     }else{
4291         if(n == LUMA_DC_BLOCK_INDEX){
4292             total_coeff= pred_non_zero_count(h, 0);
4293             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4294             total_coeff= coeff_token>>2;
4295         }else{
4296             total_coeff= pred_non_zero_count(h, n);
4297             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4298             total_coeff= coeff_token>>2;
4299             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4300         }
4301     }
4302
4303     //FIXME set last_non_zero?
4304
4305     if(total_coeff==0)
4306         return 0;
4307     if(total_coeff > (unsigned)max_coeff) {
4308         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4309         return -1;
4310     }
4311
4312     trailing_ones= coeff_token&3;
4313     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4314     assert(total_coeff<=16);
4315
4316     for(i=0; i<trailing_ones; i++){
4317         level[i]= 1 - 2*get_bits1(gb);
4318     }
4319
4320     if(i<total_coeff) {
4321         int level_code, mask;
4322         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4323         int prefix= get_level_prefix(gb);
4324
4325         //first coefficient has suffix_length equal to 0 or 1
4326         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4327             if(suffix_length)
4328                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4329             else
4330                 level_code= (prefix<<suffix_length); //part
4331         }else if(prefix==14){
4332             if(suffix_length)
4333                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4334             else
4335                 level_code= prefix + get_bits(gb, 4); //part
4336         }else{
4337             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4338             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4339             if(prefix>=16)
4340                 level_code += (1<<(prefix-3))-4096;
4341         }
4342
4343         if(trailing_ones < 3) level_code += 2;
4344
4345         suffix_length = 1;
4346         if(level_code > 5)
4347             suffix_length++;
4348         mask= -(level_code&1);
4349         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4350         i++;
4351
4352         //remaining coefficients have suffix_length > 0
4353         for(;i<total_coeff;i++) {
4354             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4355             prefix = get_level_prefix(gb);
4356             if(prefix<15){
4357                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4358             }else{
4359                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4360                 if(prefix>=16)
4361                     level_code += (1<<(prefix-3))-4096;
4362             }
4363             mask= -(level_code&1);
4364             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4365             if(level_code > suffix_limit[suffix_length])
4366                 suffix_length++;
4367         }
4368     }
4369
4370     if(total_coeff == max_coeff)
4371         zeros_left=0;
4372     else{
4373         if(n == CHROMA_DC_BLOCK_INDEX)
4374             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4375         else
4376             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4377     }
4378
4379     coeff_num = zeros_left + total_coeff - 1;
4380     j = scantable[coeff_num];
4381     if(n > 24){
4382         block[j] = level[0];
4383         for(i=1;i<total_coeff;i++) {
4384             if(zeros_left <= 0)
4385                 run_before = 0;
4386             else if(zeros_left < 7){
4387                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4388             }else{
4389                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4390             }
4391             zeros_left -= run_before;
4392             coeff_num -= 1 + run_before;
4393             j= scantable[ coeff_num ];
4394
4395             block[j]= level[i];
4396         }
4397     }else{
4398         block[j] = (level[0] * qmul[j] + 32)>>6;
4399         for(i=1;i<total_coeff;i++) {
4400             if(zeros_left <= 0)
4401                 run_before = 0;
4402             else if(zeros_left < 7){
4403                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4404             }else{
4405                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4406             }
4407             zeros_left -= run_before;
4408             coeff_num -= 1 + run_before;
4409             j= scantable[ coeff_num ];
4410
4411             block[j]= (level[i] * qmul[j] + 32)>>6;
4412         }
4413     }
4414
4415     if(zeros_left<0){
4416         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4417         return -1;
4418     }
4419
4420     return 0;
4421 }
4422
4423 static void predict_field_decoding_flag(H264Context *h){
4424     MpegEncContext * const s = &h->s;
4425     const int mb_xy= h->mb_xy;
4426     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4427                 ? s->current_picture.mb_type[mb_xy-1]
4428                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4429                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4430                 : 0;
4431     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4432 }
4433
4434 /**
4435  * decodes a P_SKIP or B_SKIP macroblock
4436  */
4437 static void decode_mb_skip(H264Context *h){
4438     MpegEncContext * const s = &h->s;
4439     const int mb_xy= h->mb_xy;
4440     int mb_type=0;
4441
4442     memset(h->non_zero_count[mb_xy], 0, 16);
4443     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4444
4445     if(MB_FIELD)
4446         mb_type|= MB_TYPE_INTERLACED;
4447
4448     if( h->slice_type_nos == FF_B_TYPE )
4449     {
4450         // just for fill_caches. pred_direct_motion will set the real mb_type
4451         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4452
4453         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4454         pred_direct_motion(h, &mb_type);
4455         mb_type|= MB_TYPE_SKIP;
4456     }
4457     else
4458     {
4459         int mx, my;
4460         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4461
4462         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4463         pred_pskip_motion(h, &mx, &my);
4464         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4465         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4466     }
4467
4468     write_back_motion(h, mb_type);
4469     s->current_picture.mb_type[mb_xy]= mb_type;
4470     s->current_picture.qscale_table[mb_xy]= s->qscale;
4471     h->slice_table[ mb_xy ]= h->slice_num;
4472     h->prev_mb_skipped= 1;
4473 }
4474
4475 /**
4476  * decodes a macroblock
4477  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4478  */
4479 static int decode_mb_cavlc(H264Context *h){
4480     MpegEncContext * const s = &h->s;
4481     int mb_xy;
4482     int partition_count;
4483     unsigned int mb_type, cbp;
4484     int dct8x8_allowed= h->pps.transform_8x8_mode;
4485
4486     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4487
4488     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4489
4490     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4491     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4492                 down the code */
4493     if(h->slice_type_nos != FF_I_TYPE){
4494         if(s->mb_skip_run==-1)
4495             s->mb_skip_run= get_ue_golomb(&s->gb);
4496
4497         if (s->mb_skip_run--) {
4498             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4499                 if(s->mb_skip_run==0)
4500                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4501                 else
4502                     predict_field_decoding_flag(h);
4503             }
4504             decode_mb_skip(h);
4505             return 0;
4506         }
4507     }
4508     if(FRAME_MBAFF){
4509         if( (s->mb_y&1) == 0 )
4510             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4511     }else
4512         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4513
4514     h->prev_mb_skipped= 0;
4515
4516     mb_type= get_ue_golomb(&s->gb);
4517     if(h->slice_type_nos == FF_B_TYPE){
4518         if(mb_type < 23){
4519             partition_count= b_mb_type_info[mb_type].partition_count;
4520             mb_type=         b_mb_type_info[mb_type].type;
4521         }else{
4522             mb_type -= 23;
4523             goto decode_intra_mb;
4524         }
4525     }else if(h->slice_type_nos == FF_P_TYPE){
4526         if(mb_type < 5){
4527             partition_count= p_mb_type_info[mb_type].partition_count;
4528             mb_type=         p_mb_type_info[mb_type].type;
4529         }else{
4530             mb_type -= 5;
4531             goto decode_intra_mb;
4532         }
4533     }else{
4534        assert(h->slice_type_nos == FF_I_TYPE);
4535         if(h->slice_type == FF_SI_TYPE && mb_type)
4536             mb_type--;
4537 decode_intra_mb:
4538         if(mb_type > 25){
4539             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4540             return -1;
4541         }
4542         partition_count=0;
4543         cbp= i_mb_type_info[mb_type].cbp;
4544         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4545         mb_type= i_mb_type_info[mb_type].type;
4546     }
4547
4548     if(MB_FIELD)
4549         mb_type |= MB_TYPE_INTERLACED;
4550
4551     h->slice_table[ mb_xy ]= h->slice_num;
4552
4553     if(IS_INTRA_PCM(mb_type)){
4554         unsigned int x, y;
4555
4556         // We assume these blocks are very rare so we do not optimize it.
4557         align_get_bits(&s->gb);
4558
4559         // The pixels are stored in the same order as levels in h->mb array.
4560         for(y=0; y<16; y++){
4561             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4562             for(x=0; x<16; x++){
4563                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4564                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4565             }
4566         }
4567         for(y=0; y<8; y++){
4568             const int index= 256 + 4*(y&3) + 32*(y>>2);
4569             for(x=0; x<8; x++){
4570                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4571                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4572             }
4573         }
4574         for(y=0; y<8; y++){
4575             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4576             for(x=0; x<8; x++){
4577                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4578                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4579             }
4580         }
4581
4582         // In deblocking, the quantizer is 0
4583         s->current_picture.qscale_table[mb_xy]= 0;
4584         // All coeffs are present
4585         memset(h->non_zero_count[mb_xy], 16, 16);
4586
4587         s->current_picture.mb_type[mb_xy]= mb_type;
4588         return 0;
4589     }
4590
4591     if(MB_MBAFF){
4592         h->ref_count[0] <<= 1;
4593         h->ref_count[1] <<= 1;
4594     }
4595
4596     fill_caches(h, mb_type, 0);
4597
4598     //mb_pred
4599     if(IS_INTRA(mb_type)){
4600             int pred_mode;
4601 //            init_top_left_availability(h);
4602             if(IS_INTRA4x4(mb_type)){
4603                 int i;
4604                 int di = 1;
4605                 if(dct8x8_allowed && get_bits1(&s->gb)){
4606                     mb_type |= MB_TYPE_8x8DCT;
4607                     di = 4;
4608                 }
4609
4610 //                fill_intra4x4_pred_table(h);
4611                 for(i=0; i<16; i+=di){
4612                     int mode= pred_intra_mode(h, i);
4613
4614                     if(!get_bits1(&s->gb)){
4615                         const int rem_mode= get_bits(&s->gb, 3);
4616                         mode = rem_mode + (rem_mode >= mode);
4617                     }
4618
4619                     if(di==4)
4620                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4621                     else
4622                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4623                 }
4624                 write_back_intra_pred_mode(h);
4625                 if( check_intra4x4_pred_mode(h) < 0)
4626                     return -1;
4627             }else{
4628                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4629                 if(h->intra16x16_pred_mode < 0)
4630                     return -1;
4631             }
4632
4633             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4634             if(pred_mode < 0)
4635                 return -1;
4636             h->chroma_pred_mode= pred_mode;
4637     }else if(partition_count==4){
4638         int i, j, sub_partition_count[4], list, ref[2][4];
4639
4640         if(h->slice_type_nos == FF_B_TYPE){
4641             for(i=0; i<4; i++){
4642                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4643                 if(h->sub_mb_type[i] >=13){
4644                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4645                     return -1;
4646                 }
4647                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4648                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4649             }
4650             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4651                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4652                 pred_direct_motion(h, &mb_type);
4653                 h->ref_cache[0][scan8[4]] =
4654                 h->ref_cache[1][scan8[4]] =
4655                 h->ref_cache[0][scan8[12]] =
4656                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4657             }
4658         }else{
4659             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4660             for(i=0; i<4; i++){
4661                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4662                 if(h->sub_mb_type[i] >=4){
4663                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4664                     return -1;
4665                 }
4666                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4667                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4668             }
4669         }
4670
4671         for(list=0; list<h->list_count; list++){
4672             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4673             for(i=0; i<4; i++){
4674                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4675                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4676                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4677                     if(tmp>=ref_count){
4678                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4679                         return -1;
4680                     }
4681                     ref[list][i]= tmp;
4682                 }else{
4683                  //FIXME
4684                     ref[list][i] = -1;
4685                 }
4686             }
4687         }
4688
4689         if(dct8x8_allowed)
4690             dct8x8_allowed = get_dct8x8_allowed(h);
4691
4692         for(list=0; list<h->list_count; list++){
4693             for(i=0; i<4; i++){
4694                 if(IS_DIRECT(h->sub_mb_type[i])) {
4695                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4696                     continue;
4697                 }
4698                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4699                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4700
4701                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4702                     const int sub_mb_type= h->sub_mb_type[i];
4703                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4704                     for(j=0; j<sub_partition_count[i]; j++){
4705                         int mx, my;
4706                         const int index= 4*i + block_width*j;
4707                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4708                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4709                         mx += get_se_golomb(&s->gb);
4710                         my += get_se_golomb(&s->gb);
4711                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4712
4713                         if(IS_SUB_8X8(sub_mb_type)){
4714                             mv_cache[ 1 ][0]=
4715                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4716                             mv_cache[ 1 ][1]=
4717                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4718                         }else if(IS_SUB_8X4(sub_mb_type)){
4719                             mv_cache[ 1 ][0]= mx;
4720                             mv_cache[ 1 ][1]= my;
4721                         }else if(IS_SUB_4X8(sub_mb_type)){
4722                             mv_cache[ 8 ][0]= mx;
4723                             mv_cache[ 8 ][1]= my;
4724                         }
4725                         mv_cache[ 0 ][0]= mx;
4726                         mv_cache[ 0 ][1]= my;
4727                     }
4728                 }else{
4729                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4730                     p[0] = p[1]=
4731                     p[8] = p[9]= 0;
4732                 }
4733             }
4734         }
4735     }else if(IS_DIRECT(mb_type)){
4736         pred_direct_motion(h, &mb_type);
4737         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4738     }else{
4739         int list, mx, my, i;
4740          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4741         if(IS_16X16(mb_type)){
4742             for(list=0; list<h->list_count; list++){
4743                     unsigned int val;
4744                     if(IS_DIR(mb_type, 0, list)){
4745                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4746                         if(val >= h->ref_count[list]){
4747                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4748                             return -1;
4749                         }
4750                     }else
4751                         val= LIST_NOT_USED&0xFF;
4752                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4753             }
4754             for(list=0; list<h->list_count; list++){
4755                 unsigned int val;
4756                 if(IS_DIR(mb_type, 0, list)){
4757                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4758                     mx += get_se_golomb(&s->gb);
4759                     my += get_se_golomb(&s->gb);
4760                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4761
4762                     val= pack16to32(mx,my);
4763                 }else
4764                     val=0;
4765                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4766             }
4767         }
4768         else if(IS_16X8(mb_type)){
4769             for(list=0; list<h->list_count; list++){
4770                     for(i=0; i<2; i++){
4771                         unsigned int val;
4772                         if(IS_DIR(mb_type, i, list)){
4773                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4774                             if(val >= h->ref_count[list]){
4775                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4776                                 return -1;
4777                             }
4778                         }else
4779                             val= LIST_NOT_USED&0xFF;
4780                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4781                     }
4782             }
4783             for(list=0; list<h->list_count; list++){
4784                 for(i=0; i<2; i++){
4785                     unsigned int val;
4786                     if(IS_DIR(mb_type, i, list)){
4787                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4788                         mx += get_se_golomb(&s->gb);
4789                         my += get_se_golomb(&s->gb);
4790                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4791
4792                         val= pack16to32(mx,my);
4793                     }else
4794                         val=0;
4795                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4796                 }
4797             }
4798         }else{
4799             assert(IS_8X16(mb_type));
4800             for(list=0; list<h->list_count; list++){
4801                     for(i=0; i<2; i++){
4802                         unsigned int val;
4803                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4804                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4805                             if(val >= h->ref_count[list]){
4806                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4807                                 return -1;
4808                             }
4809                         }else
4810                             val= LIST_NOT_USED&0xFF;
4811                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4812                     }
4813             }
4814             for(list=0; list<h->list_count; list++){
4815                 for(i=0; i<2; i++){
4816                     unsigned int val;
4817                     if(IS_DIR(mb_type, i, list)){
4818                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4819                         mx += get_se_golomb(&s->gb);
4820                         my += get_se_golomb(&s->gb);
4821                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4822
4823                         val= pack16to32(mx,my);
4824                     }else
4825                         val=0;
4826                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4827                 }
4828             }
4829         }
4830     }
4831
4832     if(IS_INTER(mb_type))
4833         write_back_motion(h, mb_type);
4834
4835     if(!IS_INTRA16x16(mb_type)){
4836         cbp= get_ue_golomb(&s->gb);
4837         if(cbp > 47){
4838             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4839             return -1;
4840         }
4841
4842         if(IS_INTRA4x4(mb_type))
4843             cbp= golomb_to_intra4x4_cbp[cbp];
4844         else
4845             cbp= golomb_to_inter_cbp[cbp];
4846     }
4847     h->cbp = cbp;
4848
4849     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4850         if(get_bits1(&s->gb)){
4851             mb_type |= MB_TYPE_8x8DCT;
4852             h->cbp_table[mb_xy]= cbp;
4853         }
4854     }
4855     s->current_picture.mb_type[mb_xy]= mb_type;
4856
4857     if(cbp || IS_INTRA16x16(mb_type)){
4858         int i8x8, i4x4, chroma_idx;
4859         int dquant;
4860         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4861         const uint8_t *scan, *scan8x8, *dc_scan;
4862
4863 //        fill_non_zero_count_cache(h);
4864
4865         if(IS_INTERLACED(mb_type)){
4866             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4867             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4868             dc_scan= luma_dc_field_scan;
4869         }else{
4870             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4871             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4872             dc_scan= luma_dc_zigzag_scan;
4873         }
4874
4875         dquant= get_se_golomb(&s->gb);
4876
4877         if( dquant > 25 || dquant < -26 ){
4878             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4879             return -1;
4880         }
4881
4882         s->qscale += dquant;
4883         if(((unsigned)s->qscale) > 51){
4884             if(s->qscale<0) s->qscale+= 52;
4885             else            s->qscale-= 52;
4886         }
4887
4888         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4889         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4890         if(IS_INTRA16x16(mb_type)){
4891             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4892                 return -1; //FIXME continue if partitioned and other return -1 too
4893             }
4894
4895             assert((cbp&15) == 0 || (cbp&15) == 15);
4896
4897             if(cbp&15){
4898                 for(i8x8=0; i8x8<4; i8x8++){
4899                     for(i4x4=0; i4x4<4; i4x4++){
4900                         const int index= i4x4 + 4*i8x8;
4901                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4902                             return -1;
4903                         }
4904                     }
4905                 }
4906             }else{
4907                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4908             }
4909         }else{
4910             for(i8x8=0; i8x8<4; i8x8++){
4911                 if(cbp & (1<<i8x8)){
4912                     if(IS_8x8DCT(mb_type)){
4913                         DCTELEM *buf = &h->mb[64*i8x8];
4914                         uint8_t *nnz;
4915                         for(i4x4=0; i4x4<4; i4x4++){
4916                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4917                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4918                                 return -1;
4919                         }
4920                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4921                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4922                     }else{
4923                         for(i4x4=0; i4x4<4; i4x4++){
4924                             const int index= i4x4 + 4*i8x8;
4925
4926                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4927                                 return -1;
4928                             }
4929                         }
4930                     }
4931                 }else{
4932                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4933                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4934                 }
4935             }
4936         }
4937
4938         if(cbp&0x30){
4939             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4940                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4941                     return -1;
4942                 }
4943         }
4944
4945         if(cbp&0x20){
4946             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4947                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4948                 for(i4x4=0; i4x4<4; i4x4++){
4949                     const int index= 16 + 4*chroma_idx + i4x4;
4950                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4951                         return -1;
4952                     }
4953                 }
4954             }
4955         }else{
4956             uint8_t * const nnz= &h->non_zero_count_cache[0];
4957             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4958             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4959         }
4960     }else{
4961         uint8_t * const nnz= &h->non_zero_count_cache[0];
4962         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4963         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4964         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4965     }
4966     s->current_picture.qscale_table[mb_xy]= s->qscale;
4967     write_back_non_zero_count(h);
4968
4969     if(MB_MBAFF){
4970         h->ref_count[0] >>= 1;
4971         h->ref_count[1] >>= 1;
4972     }
4973
4974     return 0;
4975 }
4976
4977 static int decode_cabac_field_decoding_flag(H264Context *h) {
4978     MpegEncContext * const s = &h->s;
4979     const int mb_x = s->mb_x;
4980     const int mb_y = s->mb_y & ~1;
4981     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4982     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4983
4984     unsigned int ctx = 0;
4985
4986     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4987         ctx += 1;
4988     }
4989     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4990         ctx += 1;
4991     }
4992
4993     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4994 }
4995
4996 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4997     uint8_t *state= &h->cabac_state[ctx_base];
4998     int mb_type;
4999
5000     if(intra_slice){
5001         MpegEncContext * const s = &h->s;
5002         const int mba_xy = h->left_mb_xy[0];
5003         const int mbb_xy = h->top_mb_xy;
5004         int ctx=0;
5005         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5006             ctx++;
5007         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5008             ctx++;
5009         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5010             return 0;   /* I4x4 */
5011         state += 2;
5012     }else{
5013         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5014             return 0;   /* I4x4 */
5015     }
5016
5017     if( get_cabac_terminate( &h->cabac ) )
5018         return 25;  /* PCM */
5019
5020     mb_type = 1; /* I16x16 */
5021     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5022     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5023         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5024     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5025     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5026     return mb_type;
5027 }
5028
5029 static int decode_cabac_mb_type( H264Context *h ) {
5030     MpegEncContext * const s = &h->s;
5031
5032     if( h->slice_type_nos == FF_I_TYPE ) {
5033         return decode_cabac_intra_mb_type(h, 3, 1);
5034     } else if( h->slice_type_nos == FF_P_TYPE ) {
5035         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5036             /* P-type */
5037             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5038                 /* P_L0_D16x16, P_8x8 */
5039                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5040             } else {
5041                 /* P_L0_D8x16, P_L0_D16x8 */
5042                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5043             }
5044         } else {
5045             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5046         }
5047     } else if( h->slice_type_nos == FF_B_TYPE ) {
5048         const int mba_xy = h->left_mb_xy[0];
5049         const int mbb_xy = h->top_mb_xy;
5050         int ctx = 0;
5051         int bits;
5052
5053         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5054             ctx++;
5055         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5056             ctx++;
5057
5058         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5059             return 0; /* B_Direct_16x16 */
5060
5061         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5062             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5063         }
5064
5065         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5066         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5067         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5068         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5069         if( bits < 8 )
5070             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5071         else if( bits == 13 ) {
5072             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5073         } else if( bits == 14 )
5074             return 11; /* B_L1_L0_8x16 */
5075         else if( bits == 15 )
5076             return 22; /* B_8x8 */
5077
5078         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5079         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5080     } else {
5081         /* TODO SI/SP frames? */
5082         return -1;
5083     }
5084 }
5085
5086 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5087     MpegEncContext * const s = &h->s;
5088     int mba_xy, mbb_xy;
5089     int ctx = 0;
5090
5091     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5092         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5093         mba_xy = mb_xy - 1;
5094         if( (mb_y&1)
5095             && h->slice_table[mba_xy] == h->slice_num
5096             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5097             mba_xy += s->mb_stride;
5098         if( MB_FIELD ){
5099             mbb_xy = mb_xy - s->mb_stride;
5100             if( !(mb_y&1)
5101                 && h->slice_table[mbb_xy] == h->slice_num
5102                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5103                 mbb_xy -= s->mb_stride;
5104         }else
5105             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5106     }else{
5107         int mb_xy = h->mb_xy;
5108         mba_xy = mb_xy - 1;
5109         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5110     }
5111
5112     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5113         ctx++;
5114     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5115         ctx++;
5116
5117     if( h->slice_type_nos == FF_B_TYPE )
5118         ctx += 13;
5119     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5120 }
5121
5122 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5123     int mode = 0;
5124
5125     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5126         return pred_mode;
5127
5128     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5129     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5130     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5131
5132     if( mode >= pred_mode )
5133         return mode + 1;
5134     else
5135         return mode;
5136 }
5137
5138 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5139     const int mba_xy = h->left_mb_xy[0];
5140     const int mbb_xy = h->top_mb_xy;
5141
5142     int ctx = 0;
5143
5144     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5145     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5146         ctx++;
5147
5148     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5149         ctx++;
5150
5151     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5152         return 0;
5153
5154     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5155         return 1;
5156     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5157         return 2;
5158     else
5159         return 3;
5160 }
5161
5162 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5163     int cbp_b, cbp_a, ctx, cbp = 0;
5164
5165     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5166     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5167
5168     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5169     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5170     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5171     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5172     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5173     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5174     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5175     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5176     return cbp;
5177 }
5178 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5179     int ctx;
5180     int cbp_a, cbp_b;
5181
5182     cbp_a = (h->left_cbp>>4)&0x03;
5183     cbp_b = (h-> top_cbp>>4)&0x03;
5184
5185     ctx = 0;
5186     if( cbp_a > 0 ) ctx++;
5187     if( cbp_b > 0 ) ctx += 2;
5188     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5189         return 0;
5190
5191     ctx = 4;
5192     if( cbp_a == 2 ) ctx++;
5193     if( cbp_b == 2 ) ctx += 2;
5194     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5195 }
5196 static int decode_cabac_mb_dqp( H264Context *h) {
5197     int   ctx = 0;
5198     int   val = 0;
5199
5200     if( h->last_qscale_diff != 0 )
5201         ctx++;
5202
5203     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5204         if( ctx < 2 )
5205             ctx = 2;
5206         else
5207             ctx = 3;
5208         val++;
5209         if(val > 102) //prevent infinite loop
5210             return INT_MIN;
5211     }
5212
5213     if( val&0x01 )
5214         return (val + 1)/2;
5215     else
5216         return -(val + 1)/2;
5217 }
5218 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5219     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5220         return 0;   /* 8x8 */
5221     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5222         return 1;   /* 8x4 */
5223     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5224         return 2;   /* 4x8 */
5225     return 3;       /* 4x4 */
5226 }
5227 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5228     int type;
5229     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5230         return 0;   /* B_Direct_8x8 */
5231     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5232         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5233     type = 3;
5234     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5235         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5236             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5237         type += 4;
5238     }
5239     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5240     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5241     return type;
5242 }
5243
5244 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5245     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5246 }
5247
5248 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5249     int refa = h->ref_cache[list][scan8[n] - 1];
5250     int refb = h->ref_cache[list][scan8[n] - 8];
5251     int ref  = 0;
5252     int ctx  = 0;
5253
5254     if( h->slice_type_nos == FF_B_TYPE) {
5255         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5256             ctx++;
5257         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5258             ctx += 2;
5259     } else {
5260         if( refa > 0 )
5261             ctx++;
5262         if( refb > 0 )
5263             ctx += 2;
5264     }
5265
5266     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5267         ref++;
5268         if( ctx < 4 )
5269             ctx = 4;
5270         else
5271             ctx = 5;
5272         if(ref >= 32 /*h->ref_list[list]*/){
5273             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5274             return 0; //FIXME we should return -1 and check the return everywhere
5275         }
5276     }
5277     return ref;
5278 }
5279
5280 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5281     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5282                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5283     int ctxbase = (l == 0) ? 40 : 47;
5284     int ctx, mvd;
5285
5286     if( amvd < 3 )
5287         ctx = 0;
5288     else if( amvd > 32 )
5289         ctx = 2;
5290     else
5291         ctx = 1;
5292
5293     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5294         return 0;
5295
5296     mvd= 1;
5297     ctx= 3;
5298     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5299         mvd++;
5300         if( ctx < 6 )
5301             ctx++;
5302     }
5303
5304     if( mvd >= 9 ) {
5305         int k = 3;
5306         while( get_cabac_bypass( &h->cabac ) ) {
5307             mvd += 1 << k;
5308             k++;
5309             if(k>24){
5310                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5311                 return INT_MIN;
5312             }
5313         }
5314         while( k-- ) {
5315             if( get_cabac_bypass( &h->cabac ) )
5316                 mvd += 1 << k;
5317         }
5318     }
5319     return get_cabac_bypass_sign( &h->cabac, -mvd );
5320 }
5321
5322 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5323     int nza, nzb;
5324     int ctx = 0;
5325
5326     if( is_dc ) {
5327         if( cat == 0 ) {
5328             nza = h->left_cbp&0x100;
5329             nzb = h-> top_cbp&0x100;
5330         } else {
5331             nza = (h->left_cbp>>(6+idx))&0x01;
5332             nzb = (h-> top_cbp>>(6+idx))&0x01;
5333         }
5334     } else {
5335         if( cat == 4 ) {
5336             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5337             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5338         } else {
5339             assert(cat == 1 || cat == 2);
5340             nza = h->non_zero_count_cache[scan8[idx] - 1];
5341             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5342         }
5343     }
5344
5345     if( nza > 0 )
5346         ctx++;
5347
5348     if( nzb > 0 )
5349         ctx += 2;
5350
5351     return ctx + 4 * cat;
5352 }
5353
5354 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5355     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5356     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5357     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5358     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5359 };
5360
5361 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5362     static const int significant_coeff_flag_offset[2][6] = {
5363       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5364       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5365     };
5366     static const int last_coeff_flag_offset[2][6] = {
5367       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5368       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5369     };
5370     static const int coeff_abs_level_m1_offset[6] = {
5371         227+0, 227+10, 227+20, 227+30, 227+39, 426
5372     };
5373     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5374       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5375         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5376         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5377        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5378       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5379         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5380         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5381         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5382     };
5383     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5384      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5385      * map node ctx => cabac ctx for level=1 */
5386     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5387     /* map node ctx => cabac ctx for level>1 */
5388     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5389     static const uint8_t coeff_abs_level_transition[2][8] = {
5390     /* update node ctx after decoding a level=1 */
5391         { 1, 2, 3, 3, 4, 5, 6, 7 },
5392     /* update node ctx after decoding a level>1 */
5393         { 4, 4, 4, 4, 5, 6, 7, 7 }
5394     };
5395
5396     int index[64];
5397
5398     int av_unused last;
5399     int coeff_count = 0;
5400     int node_ctx = 0;
5401
5402     uint8_t *significant_coeff_ctx_base;
5403     uint8_t *last_coeff_ctx_base;
5404     uint8_t *abs_level_m1_ctx_base;
5405
5406 #ifndef ARCH_X86
5407 #define CABAC_ON_STACK
5408 #endif
5409 #ifdef CABAC_ON_STACK
5410 #define CC &cc
5411     CABACContext cc;
5412     cc.range     = h->cabac.range;
5413     cc.low       = h->cabac.low;
5414     cc.bytestream= h->cabac.bytestream;
5415 #else
5416 #define CC &h->cabac
5417 #endif
5418
5419
5420     /* cat: 0-> DC 16x16  n = 0
5421      *      1-> AC 16x16  n = luma4x4idx
5422      *      2-> Luma4x4   n = luma4x4idx
5423      *      3-> DC Chroma n = iCbCr
5424      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5425      *      5-> Luma8x8   n = 4 * luma8x8idx
5426      */
5427
5428     /* read coded block flag */
5429     if( is_dc || cat != 5 ) {
5430         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5431             if( !is_dc ) {
5432                 if( cat == 4 )
5433                     h->non_zero_count_cache[scan8[16+n]] = 0;
5434                 else
5435                     h->non_zero_count_cache[scan8[n]] = 0;
5436             }
5437
5438 #ifdef CABAC_ON_STACK
5439             h->cabac.range     = cc.range     ;
5440             h->cabac.low       = cc.low       ;
5441             h->cabac.bytestream= cc.bytestream;
5442 #endif
5443             return;
5444         }
5445     }
5446
5447     significant_coeff_ctx_base = h->cabac_state
5448         + significant_coeff_flag_offset[MB_FIELD][cat];
5449     last_coeff_ctx_base = h->cabac_state
5450         + last_coeff_flag_offset[MB_FIELD][cat];
5451     abs_level_m1_ctx_base = h->cabac_state
5452         + coeff_abs_level_m1_offset[cat];
5453
5454     if( !is_dc && cat == 5 ) {
5455 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5456         for(last= 0; last < coefs; last++) { \
5457             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5458             if( get_cabac( CC, sig_ctx )) { \
5459                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5460                 index[coeff_count++] = last; \
5461                 if( get_cabac( CC, last_ctx ) ) { \
5462                     last= max_coeff; \
5463                     break; \
5464                 } \
5465             } \
5466         }\
5467         if( last == max_coeff -1 ) {\
5468             index[coeff_count++] = last;\
5469         }
5470         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5471 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5472         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5473     } else {
5474         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5475 #else
5476         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5477     } else {
5478         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5479 #endif
5480     }
5481     assert(coeff_count > 0);
5482
5483     if( is_dc ) {
5484         if( cat == 0 )
5485             h->cbp_table[h->mb_xy] |= 0x100;
5486         else
5487             h->cbp_table[h->mb_xy] |= 0x40 << n;
5488     } else {
5489         if( cat == 5 )
5490             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5491         else if( cat == 4 )
5492             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5493         else {
5494             assert( cat == 1 || cat == 2 );
5495             h->non_zero_count_cache[scan8[n]] = coeff_count;
5496         }
5497     }
5498
5499     while( coeff_count-- ) {
5500         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5501
5502         int j= scantable[index[coeff_count]];
5503
5504         if( get_cabac( CC, ctx ) == 0 ) {
5505             node_ctx = coeff_abs_level_transition[0][node_ctx];
5506             if( is_dc ) {
5507                 block[j] = get_cabac_bypass_sign( CC, -1);
5508             }else{
5509                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5510             }
5511         } else {
5512             int coeff_abs = 2;
5513             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5514             node_ctx = coeff_abs_level_transition[1][node_ctx];
5515
5516             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5517                 coeff_abs++;
5518             }
5519
5520             if( coeff_abs >= 15 ) {
5521                 int j = 0;
5522                 while( get_cabac_bypass( CC ) ) {
5523                     j++;
5524                 }
5525
5526                 coeff_abs=1;
5527                 while( j-- ) {
5528                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5529                 }
5530                 coeff_abs+= 14;
5531             }
5532
5533             if( is_dc ) {
5534                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5535             }else{
5536                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5537             }
5538         }
5539     }
5540 #ifdef CABAC_ON_STACK
5541             h->cabac.range     = cc.range     ;
5542             h->cabac.low       = cc.low       ;
5543             h->cabac.bytestream= cc.bytestream;
5544 #endif
5545
5546 }
5547
5548 #ifndef CONFIG_SMALL
5549 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5550     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5551 }
5552
5553 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5554     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5555 }
5556 #endif
5557
5558 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5559 #ifdef CONFIG_SMALL
5560     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5561 #else
5562     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5563     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5564 #endif
5565 }
5566
5567 static inline void compute_mb_neighbors(H264Context *h)
5568 {
5569     MpegEncContext * const s = &h->s;
5570     const int mb_xy  = h->mb_xy;
5571     h->top_mb_xy     = mb_xy - s->mb_stride;
5572     h->left_mb_xy[0] = mb_xy - 1;
5573     if(FRAME_MBAFF){
5574         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5575         const int top_pair_xy      = pair_xy     - s->mb_stride;
5576         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5577         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5578         const int curr_mb_frame_flag = !MB_FIELD;
5579         const int bottom = (s->mb_y & 1);
5580         if (bottom
5581                 ? !curr_mb_frame_flag // bottom macroblock
5582                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5583                 ) {
5584             h->top_mb_xy -= s->mb_stride;
5585         }
5586         if (left_mb_frame_flag != curr_mb_frame_flag) {
5587             h->left_mb_xy[0] = pair_xy - 1;
5588         }
5589     } else if (FIELD_PICTURE) {
5590         h->top_mb_xy -= s->mb_stride;
5591     }
5592     return;
5593 }
5594
5595 /**
5596  * decodes a macroblock
5597  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5598  */
5599 static int decode_mb_cabac(H264Context *h) {
5600     MpegEncContext * const s = &h->s;
5601     int mb_xy;
5602     int mb_type, partition_count, cbp = 0;
5603     int dct8x8_allowed= h->pps.transform_8x8_mode;
5604
5605     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5606
5607     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5608
5609     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5610     if( h->slice_type_nos != FF_I_TYPE ) {
5611         int skip;
5612         /* a skipped mb needs the aff flag from the following mb */
5613         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5614             predict_field_decoding_flag(h);
5615         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5616             skip = h->next_mb_skipped;
5617         else
5618             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5619         /* read skip flags */
5620         if( skip ) {
5621             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5622                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5623                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5624                 if(h->next_mb_skipped)
5625                     predict_field_decoding_flag(h);
5626                 else
5627                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5628             }
5629
5630             decode_mb_skip(h);
5631
5632             h->cbp_table[mb_xy] = 0;
5633             h->chroma_pred_mode_table[mb_xy] = 0;
5634             h->last_qscale_diff = 0;
5635
5636             return 0;
5637
5638         }
5639     }
5640     if(FRAME_MBAFF){
5641         if( (s->mb_y&1) == 0 )
5642             h->mb_mbaff =
5643             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5644     }else
5645         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5646
5647     h->prev_mb_skipped = 0;
5648
5649     compute_mb_neighbors(h);
5650     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5651         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5652         return -1;
5653     }
5654
5655     if( h->slice_type_nos == FF_B_TYPE ) {
5656         if( mb_type < 23 ){
5657             partition_count= b_mb_type_info[mb_type].partition_count;
5658             mb_type=         b_mb_type_info[mb_type].type;
5659         }else{
5660             mb_type -= 23;
5661             goto decode_intra_mb;
5662         }
5663     } else if( h->slice_type_nos == FF_P_TYPE ) {
5664         if( mb_type < 5) {
5665             partition_count= p_mb_type_info[mb_type].partition_count;
5666             mb_type=         p_mb_type_info[mb_type].type;
5667         } else {
5668             mb_type -= 5;
5669             goto decode_intra_mb;
5670         }
5671     } else {
5672         if(h->slice_type == FF_SI_TYPE && mb_type)
5673             mb_type--;
5674         assert(h->slice_type_nos == FF_I_TYPE);
5675 decode_intra_mb:
5676         partition_count = 0;
5677         cbp= i_mb_type_info[mb_type].cbp;
5678         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5679         mb_type= i_mb_type_info[mb_type].type;
5680     }
5681     if(MB_FIELD)
5682         mb_type |= MB_TYPE_INTERLACED;
5683
5684     h->slice_table[ mb_xy ]= h->slice_num;
5685
5686     if(IS_INTRA_PCM(mb_type)) {
5687         const uint8_t *ptr;
5688         unsigned int x, y;
5689
5690         // We assume these blocks are very rare so we do not optimize it.
5691         // FIXME The two following lines get the bitstream position in the cabac
5692         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5693         ptr= h->cabac.bytestream;
5694         if(h->cabac.low&0x1) ptr--;
5695         if(CABAC_BITS==16){
5696             if(h->cabac.low&0x1FF) ptr--;
5697         }
5698
5699         // The pixels are stored in the same order as levels in h->mb array.
5700         for(y=0; y<16; y++){
5701             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5702             for(x=0; x<16; x++){
5703                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5704                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5705             }
5706         }
5707         for(y=0; y<8; y++){
5708             const int index= 256 + 4*(y&3) + 32*(y>>2);
5709             for(x=0; x<8; x++){
5710                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5711                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5712             }
5713         }
5714         for(y=0; y<8; y++){
5715             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5716             for(x=0; x<8; x++){
5717                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5718                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5719             }
5720         }
5721
5722         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5723
5724         // All blocks are present
5725         h->cbp_table[mb_xy] = 0x1ef;
5726         h->chroma_pred_mode_table[mb_xy] = 0;
5727         // In deblocking, the quantizer is 0
5728         s->current_picture.qscale_table[mb_xy]= 0;
5729         // All coeffs are present
5730         memset(h->non_zero_count[mb_xy], 16, 16);
5731         s->current_picture.mb_type[mb_xy]= mb_type;
5732         h->last_qscale_diff = 0;
5733         return 0;
5734     }
5735
5736     if(MB_MBAFF){
5737         h->ref_count[0] <<= 1;
5738         h->ref_count[1] <<= 1;
5739     }
5740
5741     fill_caches(h, mb_type, 0);
5742
5743     if( IS_INTRA( mb_type ) ) {
5744         int i, pred_mode;
5745         if( IS_INTRA4x4( mb_type ) ) {
5746             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5747                 mb_type |= MB_TYPE_8x8DCT;
5748                 for( i = 0; i < 16; i+=4 ) {
5749                     int pred = pred_intra_mode( h, i );
5750                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5751                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5752                 }
5753             } else {
5754                 for( i = 0; i < 16; i++ ) {
5755                     int pred = pred_intra_mode( h, i );
5756                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5757
5758                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5759                 }
5760             }
5761             write_back_intra_pred_mode(h);
5762             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5763         } else {
5764             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5765             if( h->intra16x16_pred_mode < 0 ) return -1;
5766         }
5767         h->chroma_pred_mode_table[mb_xy] =
5768         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5769
5770         pred_mode= check_intra_pred_mode( h, pred_mode );
5771         if( pred_mode < 0 ) return -1;
5772         h->chroma_pred_mode= pred_mode;
5773     } else if( partition_count == 4 ) {
5774         int i, j, sub_partition_count[4], list, ref[2][4];
5775
5776         if( h->slice_type_nos == FF_B_TYPE ) {
5777             for( i = 0; i < 4; i++ ) {
5778                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5779                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5780                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5781             }
5782             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5783                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5784                 pred_direct_motion(h, &mb_type);
5785                 h->ref_cache[0][scan8[4]] =
5786                 h->ref_cache[1][scan8[4]] =
5787                 h->ref_cache[0][scan8[12]] =
5788                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5789                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5790                     for( i = 0; i < 4; i++ )
5791                         if( IS_DIRECT(h->sub_mb_type[i]) )
5792                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5793                 }
5794             }
5795         } else {
5796             for( i = 0; i < 4; i++ ) {
5797                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5798                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5799                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5800             }
5801         }
5802
5803         for( list = 0; list < h->list_count; list++ ) {
5804                 for( i = 0; i < 4; i++ ) {
5805                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5806                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5807                         if( h->ref_count[list] > 1 )
5808                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5809                         else
5810                             ref[list][i] = 0;
5811                     } else {
5812                         ref[list][i] = -1;
5813                     }
5814                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5815                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5816                 }
5817         }
5818
5819         if(dct8x8_allowed)
5820             dct8x8_allowed = get_dct8x8_allowed(h);
5821
5822         for(list=0; list<h->list_count; list++){
5823             for(i=0; i<4; i++){
5824                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5825                 if(IS_DIRECT(h->sub_mb_type[i])){
5826                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5827                     continue;
5828                 }
5829
5830                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5831                     const int sub_mb_type= h->sub_mb_type[i];
5832                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5833                     for(j=0; j<sub_partition_count[i]; j++){
5834                         int mpx, mpy;
5835                         int mx, my;
5836                         const int index= 4*i + block_width*j;
5837                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5838                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5839                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5840
5841                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5842                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5843                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5844
5845                         if(IS_SUB_8X8(sub_mb_type)){
5846                             mv_cache[ 1 ][0]=
5847                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5848                             mv_cache[ 1 ][1]=
5849                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5850
5851                             mvd_cache[ 1 ][0]=
5852                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5853                             mvd_cache[ 1 ][1]=
5854                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5855                         }else if(IS_SUB_8X4(sub_mb_type)){
5856                             mv_cache[ 1 ][0]= mx;
5857                             mv_cache[ 1 ][1]= my;
5858
5859                             mvd_cache[ 1 ][0]= mx - mpx;
5860                             mvd_cache[ 1 ][1]= my - mpy;
5861                         }else if(IS_SUB_4X8(sub_mb_type)){
5862                             mv_cache[ 8 ][0]= mx;
5863                             mv_cache[ 8 ][1]= my;
5864
5865                             mvd_cache[ 8 ][0]= mx - mpx;
5866                             mvd_cache[ 8 ][1]= my - mpy;
5867                         }
5868                         mv_cache[ 0 ][0]= mx;
5869                         mv_cache[ 0 ][1]= my;
5870
5871                         mvd_cache[ 0 ][0]= mx - mpx;
5872                         mvd_cache[ 0 ][1]= my - mpy;
5873                     }
5874                 }else{
5875                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5876                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5877                     p[0] = p[1] = p[8] = p[9] = 0;
5878                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5879                 }
5880             }
5881         }
5882     } else if( IS_DIRECT(mb_type) ) {
5883         pred_direct_motion(h, &mb_type);
5884         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5885         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5886         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5887     } else {
5888         int list, mx, my, i, mpx, mpy;
5889         if(IS_16X16(mb_type)){
5890             for(list=0; list<h->list_count; list++){
5891                 if(IS_DIR(mb_type, 0, list)){
5892                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5893                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5894                 }else
5895                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5896             }
5897             for(list=0; list<h->list_count; list++){
5898                 if(IS_DIR(mb_type, 0, list)){
5899                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5900
5901                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5902                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5903                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5904
5905                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5906                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5907                 }else
5908                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5909             }
5910         }
5911         else if(IS_16X8(mb_type)){
5912             for(list=0; list<h->list_count; list++){
5913                     for(i=0; i<2; i++){
5914                         if(IS_DIR(mb_type, i, list)){
5915                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5916                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5917                         }else
5918                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5919                     }
5920             }
5921             for(list=0; list<h->list_count; list++){
5922                 for(i=0; i<2; i++){
5923                     if(IS_DIR(mb_type, i, list)){
5924                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5925                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5926                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5927                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5928
5929                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5930                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5931                     }else{
5932                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5933                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5934                     }
5935                 }
5936             }
5937         }else{
5938             assert(IS_8X16(mb_type));
5939             for(list=0; list<h->list_count; list++){
5940                     for(i=0; i<2; i++){
5941                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5942                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5943                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5944                         }else
5945                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5946                     }
5947             }
5948             for(list=0; list<h->list_count; list++){
5949                 for(i=0; i<2; i++){
5950                     if(IS_DIR(mb_type, i, list)){
5951                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5952                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5953                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5954
5955                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5956                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5957                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5958                     }else{
5959                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5960                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5961                     }
5962                 }
5963             }
5964         }
5965     }
5966
5967    if( IS_INTER( mb_type ) ) {
5968         h->chroma_pred_mode_table[mb_xy] = 0;
5969         write_back_motion( h, mb_type );
5970    }
5971
5972     if( !IS_INTRA16x16( mb_type ) ) {
5973         cbp  = decode_cabac_mb_cbp_luma( h );
5974         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5975     }
5976
5977     h->cbp_table[mb_xy] = h->cbp = cbp;
5978
5979     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5980         if( decode_cabac_mb_transform_size( h ) )
5981             mb_type |= MB_TYPE_8x8DCT;
5982     }
5983     s->current_picture.mb_type[mb_xy]= mb_type;
5984
5985     if( cbp || IS_INTRA16x16( mb_type ) ) {
5986         const uint8_t *scan, *scan8x8, *dc_scan;
5987         const uint32_t *qmul;
5988         int dqp;
5989
5990         if(IS_INTERLACED(mb_type)){
5991             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5992             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5993             dc_scan= luma_dc_field_scan;
5994         }else{
5995             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5996             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5997             dc_scan= luma_dc_zigzag_scan;
5998         }
5999
6000         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6001         if( dqp == INT_MIN ){
6002             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6003             return -1;
6004         }
6005         s->qscale += dqp;
6006         if(((unsigned)s->qscale) > 51){
6007             if(s->qscale<0) s->qscale+= 52;
6008             else            s->qscale-= 52;
6009         }
6010         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6011         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6012
6013         if( IS_INTRA16x16( mb_type ) ) {
6014             int i;
6015             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6016             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
6017
6018             if( cbp&15 ) {
6019                 qmul = h->dequant4_coeff[0][s->qscale];
6020                 for( i = 0; i < 16; i++ ) {
6021                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6022                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
6023                 }
6024             } else {
6025                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6026             }
6027         } else {
6028             int i8x8, i4x4;
6029             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6030                 if( cbp & (1<<i8x8) ) {
6031                     if( IS_8x8DCT(mb_type) ) {
6032                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6033                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6034                     } else {
6035                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6036                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6037                             const int index = 4*i8x8 + i4x4;
6038                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6039 //START_TIMER
6040                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6041 //STOP_TIMER("decode_residual")
6042                         }
6043                     }
6044                 } else {
6045                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6046                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6047                 }
6048             }
6049         }
6050
6051         if( cbp&0x30 ){
6052             int c;
6053             for( c = 0; c < 2; c++ ) {
6054                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6055                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6056             }
6057         }
6058
6059         if( cbp&0x20 ) {
6060             int c, i;
6061             for( c = 0; c < 2; c++ ) {
6062                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6063                 for( i = 0; i < 4; i++ ) {
6064                     const int index = 16 + 4 * c + i;
6065                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6066                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6067                 }
6068             }
6069         } else {
6070             uint8_t * const nnz= &h->non_zero_count_cache[0];
6071             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6072             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6073         }
6074     } else {
6075         uint8_t * const nnz= &h->non_zero_count_cache[0];
6076         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6077         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6078         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6079         h->last_qscale_diff = 0;
6080     }
6081
6082     s->current_picture.qscale_table[mb_xy]= s->qscale;
6083     write_back_non_zero_count(h);
6084
6085     if(MB_MBAFF){
6086         h->ref_count[0] >>= 1;
6087         h->ref_count[1] >>= 1;
6088     }
6089
6090     return 0;
6091 }
6092
6093
6094 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6095     int i, d;
6096     const int index_a = qp + h->slice_alpha_c0_offset;
6097     const int alpha = (alpha_table+52)[index_a];
6098     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6099
6100     if( bS[0] < 4 ) {
6101         int8_t tc[4];
6102         for(i=0; i<4; i++)
6103             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6104         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6105     } else {
6106         /* 16px edge length, because bS=4 is triggered by being at
6107          * the edge of an intra MB, so all 4 bS are the same */
6108             for( d = 0; d < 16; d++ ) {
6109                 const int p0 = pix[-1];
6110                 const int p1 = pix[-2];
6111                 const int p2 = pix[-3];
6112
6113                 const int q0 = pix[0];
6114                 const int q1 = pix[1];
6115                 const int q2 = pix[2];
6116
6117                 if( FFABS( p0 - q0 ) < alpha &&
6118                     FFABS( p1 - p0 ) < beta &&
6119                     FFABS( q1 - q0 ) < beta ) {
6120
6121                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6122                         if( FFABS( p2 - p0 ) < beta)
6123                         {
6124                             const int p3 = pix[-4];
6125                             /* p0', p1', p2' */
6126                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6127                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6128                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6129                         } else {
6130                             /* p0' */
6131                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6132                         }
6133                         if( FFABS( q2 - q0 ) < beta)
6134                         {
6135                             const int q3 = pix[3];
6136                             /* q0', q1', q2' */
6137                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6138                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6139                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6140                         } else {
6141                             /* q0' */
6142                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6143                         }
6144                     }else{
6145                         /* p0', q0' */
6146                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6147                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6148                     }
6149                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6150                 }
6151                 pix += stride;
6152             }
6153     }
6154 }
6155 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6156     int i;
6157     const int index_a = qp + h->slice_alpha_c0_offset;
6158     const int alpha = (alpha_table+52)[index_a];
6159     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6160
6161     if( bS[0] < 4 ) {
6162         int8_t tc[4];
6163         for(i=0; i<4; i++)
6164             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6165         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6166     } else {
6167         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6168     }
6169 }
6170
6171 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6172     int i;
6173     for( i = 0; i < 16; i++, pix += stride) {
6174         int index_a;
6175         int alpha;
6176         int beta;
6177
6178         int qp_index;
6179         int bS_index = (i >> 1);
6180         if (!MB_FIELD) {
6181             bS_index &= ~1;
6182             bS_index |= (i & 1);
6183         }
6184
6185         if( bS[bS_index] == 0 ) {
6186             continue;
6187         }
6188
6189         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6190         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6191         alpha = (alpha_table+52)[index_a];
6192         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6193
6194         if( bS[bS_index] < 4 ) {
6195             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6196             const int p0 = pix[-1];
6197             const int p1 = pix[-2];
6198             const int p2 = pix[-3];
6199             const int q0 = pix[0];
6200             const int q1 = pix[1];
6201             const int q2 = pix[2];
6202
6203             if( FFABS( p0 - q0 ) < alpha &&
6204                 FFABS( p1 - p0 ) < beta &&
6205                 FFABS( q1 - q0 ) < beta ) {
6206                 int tc = tc0;
6207                 int i_delta;
6208
6209                 if( FFABS( p2 - p0 ) < beta ) {
6210                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6211                     tc++;
6212                 }
6213                 if( FFABS( q2 - q0 ) < beta ) {
6214                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6215                     tc++;
6216                 }
6217
6218                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6219                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6220                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6221                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6222             }
6223         }else{
6224             const int p0 = pix[-1];
6225             const int p1 = pix[-2];
6226             const int p2 = pix[-3];
6227
6228             const int q0 = pix[0];
6229             const int q1 = pix[1];
6230             const int q2 = pix[2];
6231
6232             if( FFABS( p0 - q0 ) < alpha &&
6233                 FFABS( p1 - p0 ) < beta &&
6234                 FFABS( q1 - q0 ) < beta ) {
6235
6236                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6237                     if( FFABS( p2 - p0 ) < beta)
6238                     {
6239                         const int p3 = pix[-4];
6240                         /* p0', p1', p2' */
6241                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6242                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6243                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6244                     } else {
6245                         /* p0' */
6246                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6247                     }
6248                     if( FFABS( q2 - q0 ) < beta)
6249                     {
6250                         const int q3 = pix[3];
6251                         /* q0', q1', q2' */
6252                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6253                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6254                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6255                     } else {
6256                         /* q0' */
6257                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6258                     }
6259                 }else{
6260                     /* p0', q0' */
6261                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6262                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6263                 }
6264                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6265             }
6266         }
6267     }
6268 }
6269 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6270     int i;
6271     for( i = 0; i < 8; i++, pix += stride) {
6272         int index_a;
6273         int alpha;
6274         int beta;
6275
6276         int qp_index;
6277         int bS_index = i;
6278
6279         if( bS[bS_index] == 0 ) {
6280             continue;
6281         }
6282
6283         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6284         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6285         alpha = (alpha_table+52)[index_a];
6286         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6287
6288         if( bS[bS_index] < 4 ) {
6289             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6290             const int p0 = pix[-1];
6291             const int p1 = pix[-2];
6292             const int q0 = pix[0];
6293             const int q1 = pix[1];
6294
6295             if( FFABS( p0 - q0 ) < alpha &&
6296                 FFABS( p1 - p0 ) < beta &&
6297                 FFABS( q1 - q0 ) < beta ) {
6298                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6299
6300                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6301                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6302                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6303             }
6304         }else{
6305             const int p0 = pix[-1];
6306             const int p1 = pix[-2];
6307             const int q0 = pix[0];
6308             const int q1 = pix[1];
6309
6310             if( FFABS( p0 - q0 ) < alpha &&
6311                 FFABS( p1 - p0 ) < beta &&
6312                 FFABS( q1 - q0 ) < beta ) {
6313
6314                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6315                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6316                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6317             }
6318         }
6319     }
6320 }
6321
6322 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6323     int i, d;
6324     const int index_a = qp + h->slice_alpha_c0_offset;
6325     const int alpha = (alpha_table+52)[index_a];
6326     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6327     const int pix_next  = stride;
6328
6329     if( bS[0] < 4 ) {
6330         int8_t tc[4];
6331         for(i=0; i<4; i++)
6332             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6333         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6334     } else {
6335         /* 16px edge length, see filter_mb_edgev */
6336             for( d = 0; d < 16; d++ ) {
6337                 const int p0 = pix[-1*pix_next];
6338                 const int p1 = pix[-2*pix_next];
6339                 const int p2 = pix[-3*pix_next];
6340                 const int q0 = pix[0];
6341                 const int q1 = pix[1*pix_next];
6342                 const int q2 = pix[2*pix_next];
6343
6344                 if( FFABS( p0 - q0 ) < alpha &&
6345                     FFABS( p1 - p0 ) < beta &&
6346                     FFABS( q1 - q0 ) < beta ) {
6347
6348                     const int p3 = pix[-4*pix_next];
6349                     const int q3 = pix[ 3*pix_next];
6350
6351                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6352                         if( FFABS( p2 - p0 ) < beta) {
6353                             /* p0', p1', p2' */
6354                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6355                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6356                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6357                         } else {
6358                             /* p0' */
6359                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6360                         }
6361                         if( FFABS( q2 - q0 ) < beta) {
6362                             /* q0', q1', q2' */
6363                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6364                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6365                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6366                         } else {
6367                             /* q0' */
6368                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6369                         }
6370                     }else{
6371                         /* p0', q0' */
6372                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6373                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6374                     }
6375                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6376                 }
6377                 pix++;
6378             }
6379     }
6380 }
6381
6382 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6383     int i;
6384     const int index_a = qp + h->slice_alpha_c0_offset;
6385     const int alpha = (alpha_table+52)[index_a];
6386     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6387
6388     if( bS[0] < 4 ) {
6389         int8_t tc[4];
6390         for(i=0; i<4; i++)
6391             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6392         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6393     } else {
6394         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6395     }
6396 }
6397
6398 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6399     MpegEncContext * const s = &h->s;
6400     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6401     int mb_xy, mb_type;
6402     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6403
6404     mb_xy = h->mb_xy;
6405
6406     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6407 1 ||
6408        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6409                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6410         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6411         return;
6412     }
6413     assert(!FRAME_MBAFF);
6414
6415     mb_type = s->current_picture.mb_type[mb_xy];
6416     qp = s->current_picture.qscale_table[mb_xy];
6417     qp0 = s->current_picture.qscale_table[mb_xy-1];
6418     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6419     qpc = get_chroma_qp( h, 0, qp );
6420     qpc0 = get_chroma_qp( h, 0, qp0 );
6421     qpc1 = get_chroma_qp( h, 0, qp1 );
6422     qp0 = (qp + qp0 + 1) >> 1;
6423     qp1 = (qp + qp1 + 1) >> 1;
6424     qpc0 = (qpc + qpc0 + 1) >> 1;
6425     qpc1 = (qpc + qpc1 + 1) >> 1;
6426     qp_thresh = 15 - h->slice_alpha_c0_offset;
6427     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6428        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6429         return;
6430
6431     if( IS_INTRA(mb_type) ) {
6432         int16_t bS4[4] = {4,4,4,4};
6433         int16_t bS3[4] = {3,3,3,3};
6434         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6435         if( IS_8x8DCT(mb_type) ) {
6436             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6437             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6438             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6439             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6440         } else {
6441             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6442             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6443             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6444             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6445             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6446             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6447             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6448             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6449         }
6450         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6451         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6452         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6453         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6454         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6455         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6456         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6457         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6458         return;
6459     } else {
6460         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6461         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6462         int edges;
6463         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6464             edges = 4;
6465             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6466         } else {
6467             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6468                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6469             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6470                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6471                              ? 3 : 0;
6472             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6473             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6474             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6475                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6476         }
6477         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6478             bSv[0][0] = 0x0004000400040004ULL;
6479         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6480             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6481
6482 #define FILTER(hv,dir,edge)\
6483         if(bSv[dir][edge]) {\
6484             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6485             if(!(edge&1)) {\
6486                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6487                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6488             }\
6489         }
6490         if( edges == 1 ) {
6491             FILTER(v,0,0);
6492             FILTER(h,1,0);
6493         } else if( IS_8x8DCT(mb_type) ) {
6494             FILTER(v,0,0);
6495             FILTER(v,0,2);
6496             FILTER(h,1,0);
6497             FILTER(h,1,2);
6498         } else {
6499             FILTER(v,0,0);
6500             FILTER(v,0,1);
6501             FILTER(v,0,2);
6502             FILTER(v,0,3);
6503             FILTER(h,1,0);
6504             FILTER(h,1,1);
6505             FILTER(h,1,2);
6506             FILTER(h,1,3);
6507         }
6508 #undef FILTER
6509     }
6510 }
6511
6512 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6513     MpegEncContext * const s = &h->s;
6514     const int mb_xy= mb_x + mb_y*s->mb_stride;
6515     const int mb_type = s->current_picture.mb_type[mb_xy];
6516     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6517     int first_vertical_edge_done = 0;
6518     int dir;
6519
6520     //for sufficiently low qp, filtering wouldn't do anything
6521     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6522     if(!FRAME_MBAFF){
6523         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6524         int qp = s->current_picture.qscale_table[mb_xy];
6525         if(qp <= qp_thresh
6526            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6527            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6528             return;
6529         }
6530     }
6531
6532     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6533     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6534         int top_type, left_type[2];
6535         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6536         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6537         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6538
6539         if(IS_8x8DCT(top_type)){
6540             h->non_zero_count_cache[4+8*0]=
6541             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6542             h->non_zero_count_cache[6+8*0]=
6543             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6544         }
6545         if(IS_8x8DCT(left_type[0])){
6546             h->non_zero_count_cache[3+8*1]=
6547             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6548         }
6549         if(IS_8x8DCT(left_type[1])){
6550             h->non_zero_count_cache[3+8*3]=
6551             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6552         }
6553
6554         if(IS_8x8DCT(mb_type)){
6555             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6556             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp_table[mb_xy] & 1;
6557
6558             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6559             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
6560
6561             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6562             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
6563
6564             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6565             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
6566         }
6567     }
6568
6569     if (FRAME_MBAFF
6570             // left mb is in picture
6571             && h->slice_table[mb_xy-1] != 255
6572             // and current and left pair do not have the same interlaced type
6573             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6574             // and left mb is in the same slice if deblocking_filter == 2
6575             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6576         /* First vertical edge is different in MBAFF frames
6577          * There are 8 different bS to compute and 2 different Qp
6578          */
6579         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6580         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6581         int16_t bS[8];
6582         int qp[2];
6583         int bqp[2];
6584         int rqp[2];
6585         int mb_qp, mbn0_qp, mbn1_qp;
6586         int i;
6587         first_vertical_edge_done = 1;
6588
6589         if( IS_INTRA(mb_type) )
6590             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6591         else {
6592             for( i = 0; i < 8; i++ ) {
6593                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6594
6595                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6596                     bS[i] = 4;
6597                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6598                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6599                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6600                     bS[i] = 2;
6601                 else
6602                     bS[i] = 1;
6603             }
6604         }
6605
6606         mb_qp = s->current_picture.qscale_table[mb_xy];
6607         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6608         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6609         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6610         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6611                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6612         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6613                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6614         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6615         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6616                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6617         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6618                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6619
6620         /* Filter edge */
6621         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6622         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6623         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6624         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6625         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6626     }
6627     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6628     for( dir = 0; dir < 2; dir++ )
6629     {
6630         int edge;
6631         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6632         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6633         int (*ref2frm) [48+2] = h->ref2frm[ h->slice_num          &15 ];
6634         int (*ref2frmm)[48+2] = h->ref2frm[ h->slice_table[mbm_xy]&15 ];
6635         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6636
6637         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6638                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6639         // how often to recheck mv-based bS when iterating between edges
6640         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6641                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6642         // how often to recheck mv-based bS when iterating along each edge
6643         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6644
6645         if (first_vertical_edge_done) {
6646             start = 1;
6647             first_vertical_edge_done = 0;
6648         }
6649
6650         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6651             start = 1;
6652
6653         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6654             && !IS_INTERLACED(mb_type)
6655             && IS_INTERLACED(mbm_type)
6656             ) {
6657             // This is a special case in the norm where the filtering must
6658             // be done twice (one each of the field) even if we are in a
6659             // frame macroblock.
6660             //
6661             static const int nnz_idx[4] = {4,5,6,3};
6662             unsigned int tmp_linesize   = 2 *   linesize;
6663             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6664             int mbn_xy = mb_xy - 2 * s->mb_stride;
6665             int qp;
6666             int i, j;
6667             int16_t bS[4];
6668
6669             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6670                 if( IS_INTRA(mb_type) ||
6671                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6672                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6673                 } else {
6674                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6675                     for( i = 0; i < 4; i++ ) {
6676                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6677                             mbn_nnz[nnz_idx[i]] != 0 )
6678                             bS[i] = 2;
6679                         else
6680                             bS[i] = 1;
6681                     }
6682                 }
6683                 // Do not use s->qscale as luma quantizer because it has not the same
6684                 // value in IPCM macroblocks.
6685                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6686                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6687                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6688                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6689                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6690                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6691                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6692                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6693             }
6694
6695             start = 1;
6696         }
6697
6698         /* Calculate bS */
6699         for( edge = start; edge < edges; edge++ ) {
6700             /* mbn_xy: neighbor macroblock */
6701             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6702             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6703             int (*ref2frmn)[48+2] = edge > 0 ? ref2frm : ref2frmm;
6704             int16_t bS[4];
6705             int qp;
6706
6707             if( (edge&1) && IS_8x8DCT(mb_type) )
6708                 continue;
6709
6710             if( IS_INTRA(mb_type) ||
6711                 IS_INTRA(mbn_type) ) {
6712                 int value;
6713                 if (edge == 0) {
6714                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6715                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6716                     ) {
6717                         value = 4;
6718                     } else {
6719                         value = 3;
6720                     }
6721                 } else {
6722                     value = 3;
6723                 }
6724                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6725             } else {
6726                 int i, l;
6727                 int mv_done;
6728
6729                 if( edge & mask_edge ) {
6730                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6731                     mv_done = 1;
6732                 }
6733                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6734                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6735                     mv_done = 1;
6736                 }
6737                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6738                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6739                     int bn_idx= b_idx - (dir ? 8:1);
6740                     int v = 0;
6741
6742                     for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6743                         v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6744                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6745                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6746                     }
6747
6748                     if(h->slice_type_nos == FF_B_TYPE && v){
6749                         v=0;
6750                         for( l = 0; !v && l < 2; l++ ) {
6751                             int ln= 1-l;
6752                             v |= ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6753                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6754                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6755                         }
6756                     }
6757
6758                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6759                     mv_done = 1;
6760                 }
6761                 else
6762                     mv_done = 0;
6763
6764                 for( i = 0; i < 4; i++ ) {
6765                     int x = dir == 0 ? edge : i;
6766                     int y = dir == 0 ? i    : edge;
6767                     int b_idx= 8 + 4 + x + 8*y;
6768                     int bn_idx= b_idx - (dir ? 8:1);
6769
6770                     if( h->non_zero_count_cache[b_idx] != 0 ||
6771                         h->non_zero_count_cache[bn_idx] != 0 ) {
6772                         bS[i] = 2;
6773                     }
6774                     else if(!mv_done)
6775                     {
6776                         bS[i] = 0;
6777                         for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6778                             if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[l][h->ref_cache[l][bn_idx]+2] ||
6779                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6780                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6781                                 bS[i] = 1;
6782                                 break;
6783                             }
6784                         }
6785
6786                         if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6787                             bS[i] = 0;
6788                             for( l = 0; l < 2; l++ ) {
6789                                 int ln= 1-l;
6790                                 if( ref2frm[l][h->ref_cache[l][b_idx]+2] != ref2frmn[ln][h->ref_cache[ln][bn_idx]+2] ||
6791                                     FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6792                                     FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6793                                     bS[i] = 1;
6794                                     break;
6795                                 }
6796                             }
6797                         }
6798                     }
6799                 }
6800
6801                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6802                     continue;
6803             }
6804
6805             /* Filter edge */
6806             // Do not use s->qscale as luma quantizer because it has not the same
6807             // value in IPCM macroblocks.
6808             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6809             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6810             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6811             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6812             if( dir == 0 ) {
6813                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6814                 if( (edge&1) == 0 ) {
6815                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6816                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6817                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6818                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6819                 }
6820             } else {
6821                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6822                 if( (edge&1) == 0 ) {
6823                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6824                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6825                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6826                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6827                 }
6828             }
6829         }
6830     }
6831 }
6832
6833 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6834     MpegEncContext * const s = &h->s;
6835     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6836
6837     s->mb_skip_run= -1;
6838
6839     if( h->pps.cabac ) {
6840         int i;
6841
6842         /* realign */
6843         align_get_bits( &s->gb );
6844
6845         /* init cabac */
6846         ff_init_cabac_states( &h->cabac);
6847         ff_init_cabac_decoder( &h->cabac,
6848                                s->gb.buffer + get_bits_count(&s->gb)/8,
6849                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6850         /* calculate pre-state */
6851         for( i= 0; i < 460; i++ ) {
6852             int pre;
6853             if( h->slice_type_nos == FF_I_TYPE )
6854                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6855             else
6856                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6857
6858             if( pre <= 63 )
6859                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6860             else
6861                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6862         }
6863
6864         for(;;){
6865 //START_TIMER
6866             int ret = decode_mb_cabac(h);
6867             int eos;
6868 //STOP_TIMER("decode_mb_cabac")
6869
6870             if(ret>=0) hl_decode_mb(h);
6871
6872             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6873                 s->mb_y++;
6874
6875                 if(ret>=0) ret = decode_mb_cabac(h);
6876
6877                 if(ret>=0) hl_decode_mb(h);
6878                 s->mb_y--;
6879             }
6880             eos = get_cabac_terminate( &h->cabac );
6881
6882             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6883                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6884                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6885                 return -1;
6886             }
6887
6888             if( ++s->mb_x >= s->mb_width ) {
6889                 s->mb_x = 0;
6890                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6891                 ++s->mb_y;
6892                 if(FIELD_OR_MBAFF_PICTURE) {
6893                     ++s->mb_y;
6894                 }
6895             }
6896
6897             if( eos || s->mb_y >= s->mb_height ) {
6898                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6899                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6900                 return 0;
6901             }
6902         }
6903
6904     } else {
6905         for(;;){
6906             int ret = decode_mb_cavlc(h);
6907
6908             if(ret>=0) hl_decode_mb(h);
6909
6910             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6911                 s->mb_y++;
6912                 ret = decode_mb_cavlc(h);
6913
6914                 if(ret>=0) hl_decode_mb(h);
6915                 s->mb_y--;
6916             }
6917
6918             if(ret<0){
6919                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6920                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6921
6922                 return -1;
6923             }
6924
6925             if(++s->mb_x >= s->mb_width){
6926                 s->mb_x=0;
6927                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6928                 ++s->mb_y;
6929                 if(FIELD_OR_MBAFF_PICTURE) {
6930                     ++s->mb_y;
6931                 }
6932                 if(s->mb_y >= s->mb_height){
6933                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6934
6935                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6936                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6937
6938                         return 0;
6939                     }else{
6940                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6941
6942                         return -1;
6943                     }
6944                 }
6945             }
6946
6947             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6948                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6949                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6950                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6951
6952                     return 0;
6953                 }else{
6954                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6955
6956                     return -1;
6957                 }
6958             }
6959         }
6960     }
6961
6962 #if 0
6963     for(;s->mb_y < s->mb_height; s->mb_y++){
6964         for(;s->mb_x < s->mb_width; s->mb_x++){
6965             int ret= decode_mb(h);
6966
6967             hl_decode_mb(h);
6968
6969             if(ret<0){
6970                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6971                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6972
6973                 return -1;
6974             }
6975
6976             if(++s->mb_x >= s->mb_width){
6977                 s->mb_x=0;
6978                 if(++s->mb_y >= s->mb_height){
6979                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6980                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6981
6982                         return 0;
6983                     }else{
6984                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6985
6986                         return -1;
6987                     }
6988                 }
6989             }
6990
6991             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6992                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6993                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6994
6995                     return 0;
6996                 }else{
6997                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6998
6999                     return -1;
7000                 }
7001             }
7002         }
7003         s->mb_x=0;
7004         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7005     }
7006 #endif
7007     return -1; //not reached
7008 }
7009
7010 static int decode_unregistered_user_data(H264Context *h, int size){
7011     MpegEncContext * const s = &h->s;
7012     uint8_t user_data[16+256];
7013     int e, build, i;
7014
7015     if(size<16)
7016         return -1;
7017
7018     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7019         user_data[i]= get_bits(&s->gb, 8);
7020     }
7021
7022     user_data[i]= 0;
7023     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7024     if(e==1 && build>=0)
7025         h->x264_build= build;
7026
7027     if(s->avctx->debug & FF_DEBUG_BUGS)
7028         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7029
7030     for(; i<size; i++)
7031         skip_bits(&s->gb, 8);
7032
7033     return 0;
7034 }
7035
7036 static int decode_sei(H264Context *h){
7037     MpegEncContext * const s = &h->s;
7038
7039     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7040         int size, type;
7041
7042         type=0;
7043         do{
7044             type+= show_bits(&s->gb, 8);
7045         }while(get_bits(&s->gb, 8) == 255);
7046
7047         size=0;
7048         do{
7049             size+= show_bits(&s->gb, 8);
7050         }while(get_bits(&s->gb, 8) == 255);
7051
7052         switch(type){
7053         case 5:
7054             if(decode_unregistered_user_data(h, size) < 0)
7055                 return -1;
7056             break;
7057         default:
7058             skip_bits(&s->gb, 8*size);
7059         }
7060
7061         //FIXME check bits here
7062         align_get_bits(&s->gb);
7063     }
7064
7065     return 0;
7066 }
7067
7068 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7069     MpegEncContext * const s = &h->s;
7070     int cpb_count, i;
7071     cpb_count = get_ue_golomb(&s->gb) + 1;
7072     get_bits(&s->gb, 4); /* bit_rate_scale */
7073     get_bits(&s->gb, 4); /* cpb_size_scale */
7074     for(i=0; i<cpb_count; i++){
7075         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7076         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7077         get_bits1(&s->gb);     /* cbr_flag */
7078     }
7079     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7080     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7081     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7082     get_bits(&s->gb, 5); /* time_offset_length */
7083 }
7084
7085 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7086     MpegEncContext * const s = &h->s;
7087     int aspect_ratio_info_present_flag;
7088     unsigned int aspect_ratio_idc;
7089     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7090
7091     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7092
7093     if( aspect_ratio_info_present_flag ) {
7094         aspect_ratio_idc= get_bits(&s->gb, 8);
7095         if( aspect_ratio_idc == EXTENDED_SAR ) {
7096             sps->sar.num= get_bits(&s->gb, 16);
7097             sps->sar.den= get_bits(&s->gb, 16);
7098         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7099             sps->sar=  pixel_aspect[aspect_ratio_idc];
7100         }else{
7101             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7102             return -1;
7103         }
7104     }else{
7105         sps->sar.num=
7106         sps->sar.den= 0;
7107     }
7108 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7109
7110     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7111         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7112     }
7113
7114     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7115         get_bits(&s->gb, 3);    /* video_format */
7116         get_bits1(&s->gb);      /* video_full_range_flag */
7117         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7118             get_bits(&s->gb, 8); /* colour_primaries */
7119             get_bits(&s->gb, 8); /* transfer_characteristics */
7120             get_bits(&s->gb, 8); /* matrix_coefficients */
7121         }
7122     }
7123
7124     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7125         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7126         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7127     }
7128
7129     sps->timing_info_present_flag = get_bits1(&s->gb);
7130     if(sps->timing_info_present_flag){
7131         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7132         sps->time_scale = get_bits_long(&s->gb, 32);
7133         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7134     }
7135
7136     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7137     if(nal_hrd_parameters_present_flag)
7138         decode_hrd_parameters(h, sps);
7139     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7140     if(vcl_hrd_parameters_present_flag)
7141         decode_hrd_parameters(h, sps);
7142     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7143         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7144     get_bits1(&s->gb);         /* pic_struct_present_flag */
7145
7146     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7147     if(sps->bitstream_restriction_flag){
7148         unsigned int num_reorder_frames;
7149         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7150         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7151         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7152         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7153         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7154         num_reorder_frames= get_ue_golomb(&s->gb);
7155         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7156
7157         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7158             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7159             return -1;
7160         }
7161
7162         sps->num_reorder_frames= num_reorder_frames;
7163     }
7164
7165     return 0;
7166 }
7167
7168 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7169                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7170     MpegEncContext * const s = &h->s;
7171     int i, last = 8, next = 8;
7172     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7173     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7174         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7175     else
7176     for(i=0;i<size;i++){
7177         if(next)
7178             next = (last + get_se_golomb(&s->gb)) & 0xff;
7179         if(!i && !next){ /* matrix not written, we use the preset one */
7180             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7181             break;
7182         }
7183         last = factors[scan[i]] = next ? next : last;
7184     }
7185 }
7186
7187 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7188                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7189     MpegEncContext * const s = &h->s;
7190     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7191     const uint8_t *fallback[4] = {
7192         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7193         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7194         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7195         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7196     };
7197     if(get_bits1(&s->gb)){
7198         sps->scaling_matrix_present |= is_sps;
7199         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7200         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7201         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7202         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7203         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7204         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7205         if(is_sps || pps->transform_8x8_mode){
7206             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7207             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7208         }
7209     } else if(fallback_sps) {
7210         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7211         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7212     }
7213 }
7214
7215 /**
7216  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7217  */
7218 static void *
7219 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7220                     const size_t size, const char *name)
7221 {
7222     if(id>=max) {
7223         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7224         return NULL;
7225     }
7226
7227     if(!vec[id]) {
7228         vec[id] = av_mallocz(size);
7229         if(vec[id] == NULL)
7230             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7231     }
7232     return vec[id];
7233 }
7234
7235 static inline int decode_seq_parameter_set(H264Context *h){
7236     MpegEncContext * const s = &h->s;
7237     int profile_idc, level_idc;
7238     unsigned int sps_id, tmp, mb_width, mb_height;
7239     int i;
7240     SPS *sps;
7241
7242     profile_idc= get_bits(&s->gb, 8);
7243     get_bits1(&s->gb);   //constraint_set0_flag
7244     get_bits1(&s->gb);   //constraint_set1_flag
7245     get_bits1(&s->gb);   //constraint_set2_flag
7246     get_bits1(&s->gb);   //constraint_set3_flag
7247     get_bits(&s->gb, 4); // reserved
7248     level_idc= get_bits(&s->gb, 8);
7249     sps_id= get_ue_golomb(&s->gb);
7250
7251     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7252     if(sps == NULL)
7253         return -1;
7254
7255     sps->profile_idc= profile_idc;
7256     sps->level_idc= level_idc;
7257
7258     if(sps->profile_idc >= 100){ //high profile
7259         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7260             get_bits1(&s->gb);  //residual_color_transform_flag
7261         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7262         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7263         sps->transform_bypass = get_bits1(&s->gb);
7264         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7265     }else
7266         sps->scaling_matrix_present = 0;
7267
7268     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7269     sps->poc_type= get_ue_golomb(&s->gb);
7270
7271     if(sps->poc_type == 0){ //FIXME #define
7272         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7273     } else if(sps->poc_type == 1){//FIXME #define
7274         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7275         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7276         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7277         tmp= get_ue_golomb(&s->gb);
7278
7279         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7280             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7281             return -1;
7282         }
7283         sps->poc_cycle_length= tmp;
7284
7285         for(i=0; i<sps->poc_cycle_length; i++)
7286             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7287     }else if(sps->poc_type != 2){
7288         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7289         return -1;
7290     }
7291
7292     tmp= get_ue_golomb(&s->gb);
7293     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7294         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7295         return -1;
7296     }
7297     sps->ref_frame_count= tmp;
7298     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7299     mb_width= get_ue_golomb(&s->gb) + 1;
7300     mb_height= get_ue_golomb(&s->gb) + 1;
7301     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7302        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7303         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7304         return -1;
7305     }
7306     sps->mb_width = mb_width;
7307     sps->mb_height= mb_height;
7308
7309     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7310     if(!sps->frame_mbs_only_flag)
7311         sps->mb_aff= get_bits1(&s->gb);
7312     else
7313         sps->mb_aff= 0;
7314
7315     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7316
7317 #ifndef ALLOW_INTERLACE
7318     if(sps->mb_aff)
7319         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7320 #endif
7321     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7322         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7323
7324     sps->crop= get_bits1(&s->gb);
7325     if(sps->crop){
7326         sps->crop_left  = get_ue_golomb(&s->gb);
7327         sps->crop_right = get_ue_golomb(&s->gb);
7328         sps->crop_top   = get_ue_golomb(&s->gb);
7329         sps->crop_bottom= get_ue_golomb(&s->gb);
7330         if(sps->crop_left || sps->crop_top){
7331             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7332         }
7333         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !h->sps.frame_mbs_only_flag)){
7334             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7335         }
7336     }else{
7337         sps->crop_left  =
7338         sps->crop_right =
7339         sps->crop_top   =
7340         sps->crop_bottom= 0;
7341     }
7342
7343     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7344     if( sps->vui_parameters_present_flag )
7345         decode_vui_parameters(h, sps);
7346
7347     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7348         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7349                sps_id, sps->profile_idc, sps->level_idc,
7350                sps->poc_type,
7351                sps->ref_frame_count,
7352                sps->mb_width, sps->mb_height,
7353                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7354                sps->direct_8x8_inference_flag ? "8B8" : "",
7355                sps->crop_left, sps->crop_right,
7356                sps->crop_top, sps->crop_bottom,
7357                sps->vui_parameters_present_flag ? "VUI" : ""
7358                );
7359     }
7360     return 0;
7361 }
7362
7363 static void
7364 build_qp_table(PPS *pps, int t, int index)
7365 {
7366     int i;
7367     for(i = 0; i < 52; i++)
7368         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7369 }
7370
7371 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7372     MpegEncContext * const s = &h->s;
7373     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7374     PPS *pps;
7375
7376     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7377     if(pps == NULL)
7378         return -1;
7379
7380     tmp= get_ue_golomb(&s->gb);
7381     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7382         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7383         return -1;
7384     }
7385     pps->sps_id= tmp;
7386
7387     pps->cabac= get_bits1(&s->gb);
7388     pps->pic_order_present= get_bits1(&s->gb);
7389     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7390     if(pps->slice_group_count > 1 ){
7391         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7392         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7393         switch(pps->mb_slice_group_map_type){
7394         case 0:
7395 #if 0
7396 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7397 |    run_length[ i ]                                |1  |ue(v)   |
7398 #endif
7399             break;
7400         case 2:
7401 #if 0
7402 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7403 |{                                                  |   |        |
7404 |    top_left_mb[ i ]                               |1  |ue(v)   |
7405 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7406 |   }                                               |   |        |
7407 #endif
7408             break;
7409         case 3:
7410         case 4:
7411         case 5:
7412 #if 0
7413 |   slice_group_change_direction_flag               |1  |u(1)    |
7414 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7415 #endif
7416             break;
7417         case 6:
7418 #if 0
7419 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7420 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7421 |)                                                  |   |        |
7422 |    slice_group_id[ i ]                            |1  |u(v)    |
7423 #endif
7424             break;
7425         }
7426     }
7427     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7428     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7429     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7430         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7431         pps->ref_count[0]= pps->ref_count[1]= 1;
7432         return -1;
7433     }
7434
7435     pps->weighted_pred= get_bits1(&s->gb);
7436     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7437     pps->init_qp= get_se_golomb(&s->gb) + 26;
7438     pps->init_qs= get_se_golomb(&s->gb) + 26;
7439     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7440     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7441     pps->constrained_intra_pred= get_bits1(&s->gb);
7442     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7443
7444     pps->transform_8x8_mode= 0;
7445     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7446     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7447     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7448
7449     if(get_bits_count(&s->gb) < bit_length){
7450         pps->transform_8x8_mode= get_bits1(&s->gb);
7451         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7452         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7453     } else {
7454         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7455     }
7456
7457     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7458     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7459     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7460         h->pps.chroma_qp_diff= 1;
7461
7462     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7463         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7464                pps_id, pps->sps_id,
7465                pps->cabac ? "CABAC" : "CAVLC",
7466                pps->slice_group_count,
7467                pps->ref_count[0], pps->ref_count[1],
7468                pps->weighted_pred ? "weighted" : "",
7469                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7470                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7471                pps->constrained_intra_pred ? "CONSTR" : "",
7472                pps->redundant_pic_cnt_present ? "REDU" : "",
7473                pps->transform_8x8_mode ? "8x8DCT" : ""
7474                );
7475     }
7476
7477     return 0;
7478 }
7479
7480 /**
7481  * Call decode_slice() for each context.
7482  *
7483  * @param h h264 master context
7484  * @param context_count number of contexts to execute
7485  */
7486 static void execute_decode_slices(H264Context *h, int context_count){
7487     MpegEncContext * const s = &h->s;
7488     AVCodecContext * const avctx= s->avctx;
7489     H264Context *hx;
7490     int i;
7491
7492     if(context_count == 1) {
7493         decode_slice(avctx, h);
7494     } else {
7495         for(i = 1; i < context_count; i++) {
7496             hx = h->thread_context[i];
7497             hx->s.error_resilience = avctx->error_resilience;
7498             hx->s.error_count = 0;
7499         }
7500
7501         avctx->execute(avctx, (void *)decode_slice,
7502                        (void **)h->thread_context, NULL, context_count);
7503
7504         /* pull back stuff from slices to master context */
7505         hx = h->thread_context[context_count - 1];
7506         s->mb_x = hx->s.mb_x;
7507         s->mb_y = hx->s.mb_y;
7508         s->dropable = hx->s.dropable;
7509         s->picture_structure = hx->s.picture_structure;
7510         for(i = 1; i < context_count; i++)
7511             h->s.error_count += h->thread_context[i]->s.error_count;
7512     }
7513 }
7514
7515
7516 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7517     MpegEncContext * const s = &h->s;
7518     AVCodecContext * const avctx= s->avctx;
7519     int buf_index=0;
7520     H264Context *hx; ///< thread context
7521     int context_count = 0;
7522
7523     h->max_contexts = avctx->thread_count;
7524 #if 0
7525     int i;
7526     for(i=0; i<50; i++){
7527         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7528     }
7529 #endif
7530     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7531         h->current_slice = 0;
7532         if (!s->first_field)
7533             s->current_picture_ptr= NULL;
7534     }
7535
7536     for(;;){
7537         int consumed;
7538         int dst_length;
7539         int bit_length;
7540         const uint8_t *ptr;
7541         int i, nalsize = 0;
7542         int err;
7543
7544         if(h->is_avc) {
7545             if(buf_index >= buf_size) break;
7546             nalsize = 0;
7547             for(i = 0; i < h->nal_length_size; i++)
7548                 nalsize = (nalsize << 8) | buf[buf_index++];
7549             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7550                 if(nalsize == 1){
7551                     buf_index++;
7552                     continue;
7553                 }else{
7554                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7555                     break;
7556                 }
7557             }
7558         } else {
7559             // start code prefix search
7560             for(; buf_index + 3 < buf_size; buf_index++){
7561                 // This should always succeed in the first iteration.
7562                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7563                     break;
7564             }
7565
7566             if(buf_index+3 >= buf_size) break;
7567
7568             buf_index+=3;
7569         }
7570
7571         hx = h->thread_context[context_count];
7572
7573         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7574         if (ptr==NULL || dst_length < 0){
7575             return -1;
7576         }
7577         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7578             dst_length--;
7579         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7580
7581         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7582             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7583         }
7584
7585         if (h->is_avc && (nalsize != consumed)){
7586             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7587             consumed= nalsize;
7588         }
7589
7590         buf_index += consumed;
7591
7592         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7593            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7594             continue;
7595
7596       again:
7597         err = 0;
7598         switch(hx->nal_unit_type){
7599         case NAL_IDR_SLICE:
7600             if (h->nal_unit_type != NAL_IDR_SLICE) {
7601                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7602                 return -1;
7603             }
7604             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7605         case NAL_SLICE:
7606             init_get_bits(&hx->s.gb, ptr, bit_length);
7607             hx->intra_gb_ptr=
7608             hx->inter_gb_ptr= &hx->s.gb;
7609             hx->s.data_partitioning = 0;
7610
7611             if((err = decode_slice_header(hx, h)))
7612                break;
7613
7614             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7615             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7616                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7617                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7618                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7619                && avctx->skip_frame < AVDISCARD_ALL)
7620                 context_count++;
7621             break;
7622         case NAL_DPA:
7623             init_get_bits(&hx->s.gb, ptr, bit_length);
7624             hx->intra_gb_ptr=
7625             hx->inter_gb_ptr= NULL;
7626             hx->s.data_partitioning = 1;
7627
7628             err = decode_slice_header(hx, h);
7629             break;
7630         case NAL_DPB:
7631             init_get_bits(&hx->intra_gb, ptr, bit_length);
7632             hx->intra_gb_ptr= &hx->intra_gb;
7633             break;
7634         case NAL_DPC:
7635             init_get_bits(&hx->inter_gb, ptr, bit_length);
7636             hx->inter_gb_ptr= &hx->inter_gb;
7637
7638             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7639                && s->context_initialized
7640                && s->hurry_up < 5
7641                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7642                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7643                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7644                && avctx->skip_frame < AVDISCARD_ALL)
7645                 context_count++;
7646             break;
7647         case NAL_SEI:
7648             init_get_bits(&s->gb, ptr, bit_length);
7649             decode_sei(h);
7650             break;
7651         case NAL_SPS:
7652             init_get_bits(&s->gb, ptr, bit_length);
7653             decode_seq_parameter_set(h);
7654
7655             if(s->flags& CODEC_FLAG_LOW_DELAY)
7656                 s->low_delay=1;
7657
7658             if(avctx->has_b_frames < 2)
7659                 avctx->has_b_frames= !s->low_delay;
7660             break;
7661         case NAL_PPS:
7662             init_get_bits(&s->gb, ptr, bit_length);
7663
7664             decode_picture_parameter_set(h, bit_length);
7665
7666             break;
7667         case NAL_AUD:
7668         case NAL_END_SEQUENCE:
7669         case NAL_END_STREAM:
7670         case NAL_FILLER_DATA:
7671         case NAL_SPS_EXT:
7672         case NAL_AUXILIARY_SLICE:
7673             break;
7674         default:
7675             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7676         }
7677
7678         if(context_count == h->max_contexts) {
7679             execute_decode_slices(h, context_count);
7680             context_count = 0;
7681         }
7682
7683         if (err < 0)
7684             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7685         else if(err == 1) {
7686             /* Slice could not be decoded in parallel mode, copy down
7687              * NAL unit stuff to context 0 and restart. Note that
7688              * rbsp_buffer is not transferred, but since we no longer
7689              * run in parallel mode this should not be an issue. */
7690             h->nal_unit_type = hx->nal_unit_type;
7691             h->nal_ref_idc   = hx->nal_ref_idc;
7692             hx = h;
7693             goto again;
7694         }
7695     }
7696     if(context_count)
7697         execute_decode_slices(h, context_count);
7698     return buf_index;
7699 }
7700
7701 /**
7702  * returns the number of bytes consumed for building the current frame
7703  */
7704 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7705         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7706         if(pos+10>buf_size) pos=buf_size; // oops ;)
7707
7708         return pos;
7709 }
7710
7711 static int decode_frame(AVCodecContext *avctx,
7712                              void *data, int *data_size,
7713                              const uint8_t *buf, int buf_size)
7714 {
7715     H264Context *h = avctx->priv_data;
7716     MpegEncContext *s = &h->s;
7717     AVFrame *pict = data;
7718     int buf_index;
7719
7720     s->flags= avctx->flags;
7721     s->flags2= avctx->flags2;
7722
7723    /* end of stream, output what is still in the buffers */
7724     if (buf_size == 0) {
7725         Picture *out;
7726         int i, out_idx;
7727
7728 //FIXME factorize this with the output code below
7729         out = h->delayed_pic[0];
7730         out_idx = 0;
7731         for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7732             if(h->delayed_pic[i]->poc < out->poc){
7733                 out = h->delayed_pic[i];
7734                 out_idx = i;
7735             }
7736
7737         for(i=out_idx; h->delayed_pic[i]; i++)
7738             h->delayed_pic[i] = h->delayed_pic[i+1];
7739
7740         if(out){
7741             *data_size = sizeof(AVFrame);
7742             *pict= *(AVFrame*)out;
7743         }
7744
7745         return 0;
7746     }
7747
7748     if(h->is_avc && !h->got_avcC) {
7749         int i, cnt, nalsize;
7750         unsigned char *p = avctx->extradata;
7751         if(avctx->extradata_size < 7) {
7752             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7753             return -1;
7754         }
7755         if(*p != 1) {
7756             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7757             return -1;
7758         }
7759         /* sps and pps in the avcC always have length coded with 2 bytes,
7760            so put a fake nal_length_size = 2 while parsing them */
7761         h->nal_length_size = 2;
7762         // Decode sps from avcC
7763         cnt = *(p+5) & 0x1f; // Number of sps
7764         p += 6;
7765         for (i = 0; i < cnt; i++) {
7766             nalsize = AV_RB16(p) + 2;
7767             if(decode_nal_units(h, p, nalsize) < 0) {
7768                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7769                 return -1;
7770             }
7771             p += nalsize;
7772         }
7773         // Decode pps from avcC
7774         cnt = *(p++); // Number of pps
7775         for (i = 0; i < cnt; i++) {
7776             nalsize = AV_RB16(p) + 2;
7777             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7778                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7779                 return -1;
7780             }
7781             p += nalsize;
7782         }
7783         // Now store right nal length size, that will be use to parse all other nals
7784         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7785         // Do not reparse avcC
7786         h->got_avcC = 1;
7787     }
7788
7789     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7790         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7791             return -1;
7792     }
7793
7794     buf_index=decode_nal_units(h, buf, buf_size);
7795     if(buf_index < 0)
7796         return -1;
7797
7798     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7799         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7800         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7801         return -1;
7802     }
7803
7804     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7805         Picture *out = s->current_picture_ptr;
7806         Picture *cur = s->current_picture_ptr;
7807         int i, pics, cross_idr, out_of_order, out_idx;
7808
7809         s->mb_y= 0;
7810
7811         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7812         s->current_picture_ptr->pict_type= s->pict_type;
7813
7814         if(!s->dropable) {
7815             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7816             h->prev_poc_msb= h->poc_msb;
7817             h->prev_poc_lsb= h->poc_lsb;
7818         }
7819         h->prev_frame_num_offset= h->frame_num_offset;
7820         h->prev_frame_num= h->frame_num;
7821
7822         /*
7823          * FIXME: Error handling code does not seem to support interlaced
7824          * when slices span multiple rows
7825          * The ff_er_add_slice calls don't work right for bottom
7826          * fields; they cause massive erroneous error concealing
7827          * Error marking covers both fields (top and bottom).
7828          * This causes a mismatched s->error_count
7829          * and a bad error table. Further, the error count goes to
7830          * INT_MAX when called for bottom field, because mb_y is
7831          * past end by one (callers fault) and resync_mb_y != 0
7832          * causes problems for the first MB line, too.
7833          */
7834         if (!FIELD_PICTURE)
7835             ff_er_frame_end(s);
7836
7837         MPV_frame_end(s);
7838
7839         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7840             /* Wait for second field. */
7841             *data_size = 0;
7842
7843         } else {
7844             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7845             /* Derive top_field_first from field pocs. */
7846             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7847
7848         //FIXME do something with unavailable reference frames
7849
7850             /* Sort B-frames into display order */
7851
7852             if(h->sps.bitstream_restriction_flag
7853                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7854                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7855                 s->low_delay = 0;
7856             }
7857
7858             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7859                && !h->sps.bitstream_restriction_flag){
7860                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7861                 s->low_delay= 0;
7862             }
7863
7864             pics = 0;
7865             while(h->delayed_pic[pics]) pics++;
7866
7867             assert(pics <= MAX_DELAYED_PIC_COUNT);
7868
7869             h->delayed_pic[pics++] = cur;
7870             if(cur->reference == 0)
7871                 cur->reference = DELAYED_PIC_REF;
7872
7873             out = h->delayed_pic[0];
7874             out_idx = 0;
7875             for(i=1; h->delayed_pic[i] && h->delayed_pic[i]->poc; i++)
7876                 if(h->delayed_pic[i]->poc < out->poc){
7877                     out = h->delayed_pic[i];
7878                     out_idx = i;
7879                 }
7880             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i];
7881
7882             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7883
7884             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7885                 { }
7886             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7887                || (s->low_delay &&
7888                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7889                  || cur->pict_type == FF_B_TYPE)))
7890             {
7891                 s->low_delay = 0;
7892                 s->avctx->has_b_frames++;
7893             }
7894
7895             if(out_of_order || pics > s->avctx->has_b_frames){
7896                 out->reference &= ~DELAYED_PIC_REF;
7897                 for(i=out_idx; h->delayed_pic[i]; i++)
7898                     h->delayed_pic[i] = h->delayed_pic[i+1];
7899             }
7900             if(!out_of_order && pics > s->avctx->has_b_frames){
7901                 *data_size = sizeof(AVFrame);
7902
7903                 h->outputed_poc = out->poc;
7904                 *pict= *(AVFrame*)out;
7905             }else{
7906                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7907             }
7908         }
7909     }
7910
7911     assert(pict->data[0] || !*data_size);
7912     ff_print_debug_info(s, pict);
7913 //printf("out %d\n", (int)pict->data[0]);
7914 #if 0 //?
7915
7916     /* Return the Picture timestamp as the frame number */
7917     /* we subtract 1 because it is added on utils.c     */
7918     avctx->frame_number = s->picture_number - 1;
7919 #endif
7920     return get_consumed_bytes(s, buf_index, buf_size);
7921 }
7922 #if 0
7923 static inline void fill_mb_avail(H264Context *h){
7924     MpegEncContext * const s = &h->s;
7925     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7926
7927     if(s->mb_y){
7928         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7929         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7930         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7931     }else{
7932         h->mb_avail[0]=
7933         h->mb_avail[1]=
7934         h->mb_avail[2]= 0;
7935     }
7936     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7937     h->mb_avail[4]= 1; //FIXME move out
7938     h->mb_avail[5]= 0; //FIXME move out
7939 }
7940 #endif
7941
7942 #ifdef TEST
7943 #undef printf
7944 #undef random
7945 #define COUNT 8000
7946 #define SIZE (COUNT*40)
7947 int main(void){
7948     int i;
7949     uint8_t temp[SIZE];
7950     PutBitContext pb;
7951     GetBitContext gb;
7952 //    int int_temp[10000];
7953     DSPContext dsp;
7954     AVCodecContext avctx;
7955
7956     dsputil_init(&dsp, &avctx);
7957
7958     init_put_bits(&pb, temp, SIZE);
7959     printf("testing unsigned exp golomb\n");
7960     for(i=0; i<COUNT; i++){
7961         START_TIMER
7962         set_ue_golomb(&pb, i);
7963         STOP_TIMER("set_ue_golomb");
7964     }
7965     flush_put_bits(&pb);
7966
7967     init_get_bits(&gb, temp, 8*SIZE);
7968     for(i=0; i<COUNT; i++){
7969         int j, s;
7970
7971         s= show_bits(&gb, 24);
7972
7973         START_TIMER
7974         j= get_ue_golomb(&gb);
7975         if(j != i){
7976             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7977 //            return -1;
7978         }
7979         STOP_TIMER("get_ue_golomb");
7980     }
7981
7982
7983     init_put_bits(&pb, temp, SIZE);
7984     printf("testing signed exp golomb\n");
7985     for(i=0; i<COUNT; i++){
7986         START_TIMER
7987         set_se_golomb(&pb, i - COUNT/2);
7988         STOP_TIMER("set_se_golomb");
7989     }
7990     flush_put_bits(&pb);
7991
7992     init_get_bits(&gb, temp, 8*SIZE);
7993     for(i=0; i<COUNT; i++){
7994         int j, s;
7995
7996         s= show_bits(&gb, 24);
7997
7998         START_TIMER
7999         j= get_se_golomb(&gb);
8000         if(j != i - COUNT/2){
8001             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8002 //            return -1;
8003         }
8004         STOP_TIMER("get_se_golomb");
8005     }
8006
8007 #if 0
8008     printf("testing 4x4 (I)DCT\n");
8009
8010     DCTELEM block[16];
8011     uint8_t src[16], ref[16];
8012     uint64_t error= 0, max_error=0;
8013
8014     for(i=0; i<COUNT; i++){
8015         int j;
8016 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8017         for(j=0; j<16; j++){
8018             ref[j]= random()%255;
8019             src[j]= random()%255;
8020         }
8021
8022         h264_diff_dct_c(block, src, ref, 4);
8023
8024         //normalize
8025         for(j=0; j<16; j++){
8026 //            printf("%d ", block[j]);
8027             block[j]= block[j]*4;
8028             if(j&1) block[j]= (block[j]*4 + 2)/5;
8029             if(j&4) block[j]= (block[j]*4 + 2)/5;
8030         }
8031 //        printf("\n");
8032
8033         s->dsp.h264_idct_add(ref, block, 4);
8034 /*        for(j=0; j<16; j++){
8035             printf("%d ", ref[j]);
8036         }
8037         printf("\n");*/
8038
8039         for(j=0; j<16; j++){
8040             int diff= FFABS(src[j] - ref[j]);
8041
8042             error+= diff*diff;
8043             max_error= FFMAX(max_error, diff);
8044         }
8045     }
8046     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8047     printf("testing quantizer\n");
8048     for(qp=0; qp<52; qp++){
8049         for(i=0; i<16; i++)
8050             src1_block[i]= src2_block[i]= random()%255;
8051
8052     }
8053     printf("Testing NAL layer\n");
8054
8055     uint8_t bitstream[COUNT];
8056     uint8_t nal[COUNT*2];
8057     H264Context h;
8058     memset(&h, 0, sizeof(H264Context));
8059
8060     for(i=0; i<COUNT; i++){
8061         int zeros= i;
8062         int nal_length;
8063         int consumed;
8064         int out_length;
8065         uint8_t *out;
8066         int j;
8067
8068         for(j=0; j<COUNT; j++){
8069             bitstream[j]= (random() % 255) + 1;
8070         }
8071
8072         for(j=0; j<zeros; j++){
8073             int pos= random() % COUNT;
8074             while(bitstream[pos] == 0){
8075                 pos++;
8076                 pos %= COUNT;
8077             }
8078             bitstream[pos]=0;
8079         }
8080
8081         START_TIMER
8082
8083         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8084         if(nal_length<0){
8085             printf("encoding failed\n");
8086             return -1;
8087         }
8088
8089         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8090
8091         STOP_TIMER("NAL")
8092
8093         if(out_length != COUNT){
8094             printf("incorrect length %d %d\n", out_length, COUNT);
8095             return -1;
8096         }
8097
8098         if(consumed != nal_length){
8099             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8100             return -1;
8101         }
8102
8103         if(memcmp(bitstream, out, COUNT)){
8104             printf("mismatch\n");
8105             return -1;
8106         }
8107     }
8108 #endif
8109
8110     printf("Testing RBSP\n");
8111
8112
8113     return 0;
8114 }
8115 #endif /* TEST */
8116
8117
8118 static av_cold int decode_end(AVCodecContext *avctx)
8119 {
8120     H264Context *h = avctx->priv_data;
8121     MpegEncContext *s = &h->s;
8122
8123     av_freep(&h->rbsp_buffer[0]);
8124     av_freep(&h->rbsp_buffer[1]);
8125     free_tables(h); //FIXME cleanup init stuff perhaps
8126     MPV_common_end(s);
8127
8128 //    memset(h, 0, sizeof(H264Context));
8129
8130     return 0;
8131 }
8132
8133
8134 AVCodec h264_decoder = {
8135     "h264",
8136     CODEC_TYPE_VIDEO,
8137     CODEC_ID_H264,
8138     sizeof(H264Context),
8139     decode_init,
8140     NULL,
8141     decode_end,
8142     decode_frame,
8143     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8144     .flush= flush_dpb,
8145     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8146 };
8147
8148 #include "svq3.c"