git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 /**
  42  * Value of Picture.reference when Picture is not a reference picture, but
  43  * is held for delayed output.
  44  */
  45 #define DELAYED_PIC_REF 4
  46
     /*
      * File-scope VLC tables. They are only declared here; their initialisation
      * is not part of this section of the file.
      * NOTE(review): the names suggest the CAVLC coeff_token, total_zeros and
      * run_before code tables -- confirm against the table setup code.
      */
  47 static VLC coeff_token_vlc[4];
  48 static VLC chroma_dc_coeff_token_vlc;
  49
  50 static VLC total_zeros_vlc[15];
  51 static VLC chroma_dc_total_zeros_vlc[3];
  52
  53 static VLC run_vlc[6];
  54 static VLC run7_vlc;
  55
     /* Forward declarations of static functions whose definitions are not in this section. */
  56 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  57 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  58 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  59 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60
  /**
   * Pack the low 16 bits of two integers into one 32-bit word.
   *
   * Without WORDS_BIGENDIAN, a occupies bits 0..15 and b bits 16..31; with
   * WORDS_BIGENDIAN the roles are swapped. Either way a is the half that lands
   * at the lower address when the word is stored to memory.
   *
   * The arithmetic is carried out in uint32_t: left-shifting a negative int
   * (callers pass reference indices that may be negative) is undefined
   * behaviour in C, whereas the unsigned shift simply discards the high bits.
   *
   * @param a first value, only its low 16 bits are used
   * @param b second value, only its low 16 bits are used
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return ((uint32_t)b & 0xFFFF) + ((uint32_t)a << 16);
  #else
     return ((uint32_t)a & 0xFFFF) + ((uint32_t)b << 16);
  #endif
  }
  68
  /**
   * Remainder lookup: ff_rem6[i] == i % 6 for i in 0..51.
   * NOTE(review): presumably indexed by a quantiser parameter -- confirm at the call sites.
   */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };

  /**
   * Quotient lookup: ff_div6[i] == i / 6 for i in 0..51.
   */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  76
  77
  /**
   * Fill a small rectangle with a constant value using aligned word stores.
   *
   * @param vp     address of the top-left element; must be aligned to
   *               FFMIN(w*size, STRIDE_ALIGN) bytes
   * @param w      width of the rectangle in elements (at most 4), should be a
   *               constant; w*size must be 2, 4, 8 or 16 bytes
   * @param h      height of the rectangle in rows (1, 2 or 4), should be a constant
   * @param stride distance between rows in elements (scaled by size internally);
   *               in bytes it must be a multiple of w*size
   * @param val    value to store; with size==1 it is replicated across the row
   *               by multiplication, with size==4 it is written as a 32-bit word
   * @param size   the size of val in bytes (1 or 4), should be a constant
   */
  static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
      uint8_t *p= (uint8_t*)vp;
      assert(size==1 || size==4);
      assert(w<=4);

      /* from here on w and stride are measured in bytes */
      w      *= size;
      stride *= size;

      /* uintptr_t instead of long: long is narrower than a pointer on some ABIs */
      assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
      assert((stride&(w-1))==0);
      if(w==2){
          const uint16_t v= size==4 ? val : val*0x0101;
          *(uint16_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint16_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint16_t*)(p + 2*stride)= v;
          *(uint16_t*)(p + 3*stride)= v;
      }else if(w==4){
          const uint32_t v= size==4 ? val : val*0x01010101;
          *(uint32_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint32_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint32_t*)(p + 2*stride)= v;
          *(uint32_t*)(p + 3*stride)= v;
      }else if(w==8){
      /* Given assert(w<=4) above, 8 and 16 byte rows are only reachable with
       * size==4, so val already is a complete 32-bit element. */
      //gcc can't optimize 64bit math on x86_32
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint64_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 2*stride)= v;
          *(uint64_t*)(p + 3*stride)= v;
      }else if(w==16){
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0+0*stride)= v;
          *(uint64_t*)(p + 8+0*stride)= v;
          /* a one-row request must stop here, like every other width does */
          if(h==1) return;
          *(uint64_t*)(p + 0+1*stride)= v;
          *(uint64_t*)(p + 8+1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 0+2*stride)= v;
          *(uint64_t*)(p + 8+2*stride)= v;
          *(uint64_t*)(p + 0+3*stride)= v;
          *(uint64_t*)(p + 8+3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          if(h==1) return;
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
      }else if(w==16){
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          *(uint32_t*)(p + 8+0*stride)= val;
          *(uint32_t*)(p +12+0*stride)= val;
          /* a one-row request must stop here, like every other width does */
          if(h==1) return;
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          *(uint32_t*)(p + 8+1*stride)= val;
          *(uint32_t*)(p +12+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 8+2*stride)= val;
          *(uint32_t*)(p +12+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
          *(uint32_t*)(p + 8+3*stride)= val;
          *(uint32_t*)(p +12+3*stride)= val;
  #endif
      }else
          assert(0); /* unsupported row width */
      /* heights 1 and 2 returned early; only 4 may reach this point */
      assert(h==4);
  }
 165
 /**
  * Fill the per-macroblock neighbour caches of the H264Context for the
  * macroblock at (s->mb_x, s->mb_y).
  *
  * The function locates the top, left, top-left and top-right neighbours
  * (re-deriving them per macroblock pair when FRAME_MBAFF is set), stores
  * h->top_mb_xy / h->left_mb_xy, and then fills, as applicable: the intra
  * sample availability masks, intra4x4_pred_mode_cache, non_zero_count_cache,
  * top_cbp / left_cbp (CABAC only), mv_cache, ref_cache, mvd_cache,
  * direct_cache and neighbor_transform_size.
  *
  * @param h           decoder context; read and written
  * @param mb_type     type flags of the current macroblock
  * @param for_deblock non-zero when called on behalf of the deblocking filter:
  *                    neighbours then count as available when their
  *                    slice_table entry is < 255 (instead of equal to
  *                    h->slice_num), top-left/top-right are ignored, and the
  *                    function may return early without touching the caches
  */
 166 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 167     MpegEncContext * const s = &h->s;
 168     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 169     int topleft_xy, top_xy, topright_xy, left_xy[2];
 170     int topleft_type, top_type, topright_type, left_type[2];
 171     int left_block[8];
 172     int i;
 173
         /* macroblock above: one row up, with the row stride shifted left by FIELD_PICTURE */
 174     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 175
         /* Deblocking shortcut: without MBAFF nothing is refilled when slice_num is 1
          * or the top macroblock carries the same slice_table entry as this one. */
 176     //FIXME deblocking could skip the intra and nnz parts.
 177     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 178         return;
 179
 180     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 181
 182     topleft_xy = top_xy - 1;
 183     topright_xy= top_xy + 1;
 184     left_xy[1] = left_xy[0] = mb_xy-1;
         /* Default mapping into the left neighbour's per-macroblock arrays:
          * entries 0..3 select what feeds cache rows 0..3 (pred mode, nnz, mv),
          * entries 4..7 select the non_zero_count entries copied into cache column 0
          * (NOTE(review): presumably the chroma blocks -- confirm). */
 185     left_block[0]= 0;
 186     left_block[1]= 1;
 187     left_block[2]= 2;
 188     left_block[3]= 3;
 189     left_block[4]= 7;
 190     left_block[5]= 10;
 191     left_block[6]= 8;
 192     left_block[7]= 11;
         /* MBAFF: neighbours are addressed relative to macroblock pairs; pair_xy is the
          * macroblock of the current pair with bit 0 of mb_y cleared. */
 193     if(FRAME_MBAFF){
 194         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 195         const int top_pair_xy      = pair_xy     - s->mb_stride;
 196         const int topleft_pair_xy  = top_pair_xy - 1;
 197         const int topright_pair_xy = top_pair_xy + 1;
 198         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 199         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 200         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 201         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 202         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 203         const int bottom = (s->mb_y & 1);
 204         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             /* The three tests below move the respective upper neighbour one more row up
              * when the current macroblock is field coded and either it is the bottom one
              * of its pair or the neighbouring pair is field coded as well. */
 205         if (bottom
 206                 ? !curr_mb_frame_flag // bottom macroblock
 207                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 208                 ) {
 209             top_xy -= s->mb_stride;
 210         }
 211         if (bottom
 212                 ? !curr_mb_frame_flag // bottom macroblock
 213                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 214                 ) {
 215             topleft_xy -= s->mb_stride;
 216         }
 217         if (bottom
 218                 ? !curr_mb_frame_flag // bottom macroblock
 219                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 220                 ) {
 221             topright_xy -= s->mb_stride;
 222         }
             /* Left pair coded differently (frame vs. field) from the current macroblock:
              * both left references start at the top macroblock of the left pair and the
              * row mapping in left_block[] is rewritten accordingly. */
 223         if (left_mb_frame_flag != curr_mb_frame_flag) {
 224             left_xy[1] = left_xy[0] = pair_xy - 1;
 225             if (curr_mb_frame_flag) {
 226                 if (bottom) {
 227                     left_block[0]= 2;
 228                     left_block[1]= 2;
 229                     left_block[2]= 3;
 230                     left_block[3]= 3;
 231                     left_block[4]= 8;
 232                     left_block[5]= 11;
 233                     left_block[6]= 8;
 234                     left_block[7]= 11;
 235                 } else {
 236                     left_block[0]= 0;
 237                     left_block[1]= 0;
 238                     left_block[2]= 1;
 239                     left_block[3]= 1;
 240                     left_block[4]= 7;
 241                     left_block[5]= 10;
 242                     left_block[6]= 7;
 243                     left_block[7]= 10;
 244                 }
 245             } else {
                     /* current is field, left pair is frame: the second left reference is
                      * the bottom macroblock of the left pair */
 246                 left_xy[1] += s->mb_stride;
 247                 //left_block[0]= 0;
 248                 left_block[1]= 2;
 249                 left_block[2]= 0;
 250                 left_block[3]= 2;
 251                 //left_block[4]= 7;
 252                 left_block[5]= 10;
 253                 left_block[6]= 7;
 254                 left_block[7]= 10;
 255             }
 256         }
 257     }
 258
 259     h->top_mb_xy = top_xy;
 260     h->left_mb_xy[0] = left_xy[0];
 261     h->left_mb_xy[1] = left_xy[1];
         /* Neighbour macroblock types; 0 marks a neighbour as unavailable. */
 262     if(for_deblock){
 263         topleft_type = 0;
 264         topright_type = 0;
 265         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 266         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 267         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 268
             /* MBAFF deblocking of a non-intra macroblock: reload this macroblock's own
              * data from the picture-wide tables. The 16-bit word read from
              * non_zero_count[mb_xy][14] is expanded one bit per block into the cache. */
 269         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 270             int list;
 271             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 272             for(i=0; i<16; i++)
 273                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 274             for(list=0; list<h->list_count; list++){
 275                 if(USES_LIST(mb_type,list)){
 276                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 277                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 278                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 279                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 280                         dst[0] = src[0];
 281                         dst[1] = src[1];
 282                         dst[2] = src[2];
 283                         dst[3] = src[3];
 284                     }
                         /* each of the two reference indices per row pair is duplicated into
                          * two adjacent bytes (the *0x0101) and written to two cache rows */
 285                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 286                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 287                     ref += h->b8_stride;
 288                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 289                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 290                 }else{
 291                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 292                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 293                 }
 294             }
 295         }
 296     }else{
             /* decoding: only macroblocks of the current slice count as available */
 297         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 298         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 299         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 300         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 301         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 302     }
 303
         /* Intra macroblock: start from "everything available" masks and clear bits for
          * each neighbour that is missing, or that is non-intra while
          * constrained_intra_pred is set. */
 304     if(IS_INTRA(mb_type)){
 305         h->topleft_samples_available=
 306         h->top_samples_available=
 307         h->left_samples_available= 0xFFFF;
 308         h->topright_samples_available= 0xEEEA;
 309
 310         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 311             h->topleft_samples_available= 0xB3FF;
 312             h->top_samples_available= 0x33FF;
 313             h->topright_samples_available= 0x26EA;
 314         }
 315         for(i=0; i<2; i++){
 316             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 317                 h->topleft_samples_available&= 0xDF5F;
 318                 h->left_samples_available&= 0x5F5F;
 319             }
 320         }
 321
 322         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 323             h->topleft_samples_available&= 0x7FFF;
 324
 325         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 326             h->topright_samples_available&= 0xFBFF;
 327
             /* Intra 4x4: import the neighbours' prediction modes. A neighbour that is not
              * intra 4x4 contributes -1 when it is missing (or inter with
              * constrained_intra_pred), otherwise 2. */
 328         if(IS_INTRA4x4(mb_type)){
 329             if(IS_INTRA4x4(top_type)){
 330                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 331                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 332                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 333                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 334             }else{
 335                 int pred;
 336                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 337                     pred= -1;
 338                 else{
 339                     pred= 2;
 340                 }
 341                 h->intra4x4_pred_mode_cache[4+8*0]=
 342                 h->intra4x4_pred_mode_cache[5+8*0]=
 343                 h->intra4x4_pred_mode_cache[6+8*0]=
 344                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 345             }
 346             for(i=0; i<2; i++){
 347                 if(IS_INTRA4x4(left_type[i])){
 348                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 349                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 350                 }else{
 351                     int pred;
 352                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 353                         pred= -1;
 354                     else{
 355                         pred= 2;
 356                     }
 357                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 358                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 359                 }
 360             }
 361         }
 362     }
 363
 364
 365 /*
 366 0 . T T. T T T T
 367 1 L . .L . . . .
 368 2 L . .L . . . .
 369 3 . T TL . . . .
 370 4 L . .L . . . .
 371 5 L . .. . . . .
 372 */
     /* Non-zero coefficient counts of the top neighbour. When it is unavailable the
      * entries default to 0 for a non-intra macroblock under CABAC and to 64 otherwise. */
 373 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 374     if(top_type){
 375         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 376         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 377         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 378         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 379
 380         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 381         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 382
 383         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 384         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 385
 386     }else{
 387         h->non_zero_count_cache[4+8*0]=
 388         h->non_zero_count_cache[5+8*0]=
 389         h->non_zero_count_cache[6+8*0]=
 390         h->non_zero_count_cache[7+8*0]=
 391
 392         h->non_zero_count_cache[1+8*0]=
 393         h->non_zero_count_cache[2+8*0]=
 394
 395         h->non_zero_count_cache[1+8*3]=
 396         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 397
 398     }
 399
         /* Same for the two left references, using the left_block[] row mapping. */
 400     for (i=0; i<2; i++) {
 401         if(left_type[i]){
 402             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 403             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 404             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 405             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 406         }else{
 407             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 408             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 409             h->non_zero_count_cache[0+8*1 +   8*i]=
 410             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 411         }
 412     }
 413
         /* CABAC only: coded block patterns of the top and left neighbours. An
          * unavailable neighbour yields 0x1C0 for an intra macroblock and 0 otherwise. */
 414     if( h->pps.cabac ) {
 415         // top_cbp
 416         if(top_type) {
 417             h->top_cbp = h->cbp_table[top_xy];
 418         } else if(IS_INTRA(mb_type)) {
 419             h->top_cbp = 0x1C0;
 420         } else {
 421             h->top_cbp = 0;
 422         }
 423         // left_cbp
 424         if (left_type[0]) {
 425             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 426         } else if(IS_INTRA(mb_type)) {
 427             h->left_cbp = 0x1C0;
 428         } else {
 429             h->left_cbp = 0;
 430         }
             /* bits 1 and 3 of left_cbp are taken from one cbp bit of each left
              * reference, selected through left_block[0] and left_block[2] */
 431         if (left_type[0]) {
 432             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 433         }
 434         if (left_type[1]) {
 435             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 436         }
 437     }
 438
     /* Inter / direct macroblocks: import motion vectors and reference indices of the
      * neighbours, per reference list. */
 439 #if 1
 440     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 441         int list;
 442         for(list=0; list<h->list_count; list++){
 443             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 444                 /*if(!h->mv_cache_clean[list]){
 445                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 446                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 447                     h->mv_cache_clean[list]= 1;
 448                 }*/
 449                 continue;
 450             }
 451             h->mv_cache_clean[list]= 0;
 452
                 /* row above the macroblock: bottom row of the top neighbour */
 453             if(USES_LIST(top_type, list)){
 454                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 456                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 457                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 458                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 459                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 460                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 461                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 462                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 463                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 464             }else{
 465                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 466                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 467                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 468                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 469                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 470             }
 471
                 /* column left of the macroblock: right column of the left reference(s) */
 472             for(i=0; i<2; i++){
 473                 int cache_idx = scan8[0] - 1 + i*2*8;
 474                 if(USES_LIST(left_type[i], list)){
 475                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 476                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 477                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 478                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 479                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 480                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 481                 }else{
 482                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 483                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 484                     h->ref_cache[list][cache_idx  ]=
 485                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 486                 }
 487             }
 488
                 /* Without MBAFF, deblocking and temporal-direct macroblocks need nothing
                  * beyond the top row and left column loaded above. */
 489             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 490                 continue;
 491
 492             if(USES_LIST(topleft_type, list)){
 493                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 494                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 495                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 496                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 497             }else{
 498                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 499                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 500             }
 501
 502             if(USES_LIST(topright_type, list)){
 503                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 504                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 505                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 506                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 507             }else{
 508                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 509                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 510             }
 511
 512             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 513                 continue;
 514
                 /* mark a fixed set of cache positions as not available and zero their vectors */
 515             h->ref_cache[list][scan8[5 ]+1] =
 516             h->ref_cache[list][scan8[7 ]+1] =
 517             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 518             h->ref_cache[list][scan8[4 ]] =
 519             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 520             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 521             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 522             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 523             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 524             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 525
                 /* CABAC additionally needs the neighbours' motion vector differences and,
                  * in B slices, their direct flags. */
 526             if( h->pps.cabac ) {
 527                 /* XXX beurk, Load mvd */
 528                 if(USES_LIST(top_type, list)){
 529                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 530                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 531                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 532                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 533                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 534                 }else{
 535                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 536                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 537                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 538                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 539                 }
 540                 if(USES_LIST(left_type[0], list)){
 541                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 542                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 543                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 544                 }else{
 545                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 546                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 547                 }
 548                 if(USES_LIST(left_type[1], list)){
 549                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 550                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 551                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 552                 }else{
 553                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 554                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 555                 }
 556                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 557                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 558                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 559                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 560                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 561
 562                 if(h->slice_type == B_TYPE){
                         /* own blocks start as "not direct"; neighbours are 1 when the whole
                          * macroblock is direct, or copied from direct_table for 8x8 types */
 563                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 564
 565                     if(IS_DIRECT(top_type)){
 566                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 567                     }else if(IS_8X8(top_type)){
 568                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 569                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 570                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 571                     }else{
 572                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 573                     }
 574
 575                     if(IS_DIRECT(left_type[0]))
 576                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 577                     else if(IS_8X8(left_type[0]))
 578                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 579                     else
 580                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 581
 582                     if(IS_DIRECT(left_type[1]))
 583                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 584                     else if(IS_8X8(left_type[1]))
 585                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 586                     else
 587                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 588                 }
 589             }
 590
                 /* MBAFF: convert neighbour data between frame and field units. MAP_MVS
                  * applies MAP_F2F to the ten neighbour cache positions; MAP_F2F is
                  * defined twice below, once per direction of the conversion. */
 591             if(FRAME_MBAFF){
 592 #define MAP_MVS\
 593                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 594                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 595                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 596                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 597                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 598                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 599                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 600                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 601                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 602                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 603                 if(MB_FIELD){
                         /* field macroblock, frame neighbour: double the reference index,
                          * halve the vertical vector components */
 604 #define MAP_F2F(idx, mb_type)\
 605                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 606                         h->ref_cache[list][idx] <<= 1;\
 607                         h->mv_cache[list][idx][1] /= 2;\
 608                         h->mvd_cache[list][idx][1] /= 2;\
 609                     }
 610                     MAP_MVS
 611 #undef MAP_F2F
 612                 }else{
                         /* frame macroblock, field neighbour: halve the reference index,
                          * double the vertical vector components */
 613 #define MAP_F2F(idx, mb_type)\
 614                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 615                         h->ref_cache[list][idx] >>= 1;\
 616                         h->mv_cache[list][idx][1] <<= 1;\
 617                         h->mvd_cache[list][idx][1] <<= 1;\
 618                     }
 619                     MAP_MVS
 620 #undef MAP_F2F
 621                 }
 622             }
 623         }
 624     }
 625 #endif
 626
         /* number (0..2) of top / first-left neighbours that use the 8x8 transform */
 627     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 628 }
 629
 630 static inline void write_back_intra_pred_mode(H264Context *h){
 631     MpegEncContext * const s = &h->s;
 632     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 633
 634     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 635     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 636     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 637     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 638     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 639     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 640     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 641 }
 642
 643 /**
 644  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 645  */
 646 static inline int check_intra4x4_pred_mode(H264Context *h){
 647     MpegEncContext * const s = &h->s;
 648     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 649     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 650     int i;
 651
 652     if(!(h->top_samples_available&0x8000)){
 653         for(i=0; i<4; i++){
 654             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 655             if(status<0){
 656                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 657                 return -1;
 658             } else if(status){
 659                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 660             }
 661         }
 662     }
 663
 664     if(!(h->left_samples_available&0x8000)){
 665         for(i=0; i<4; i++){
 666             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 667             if(status<0){
 668                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 669                 return -1;
 670             } else if(status){
 671                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 672             }
 673         }
 674     }
 675
 676     return 0;
 677 } //FIXME cleanup like next
 678
 679 /**
 680  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 681  */
 682 static inline int check_intra_pred_mode(H264Context *h, int mode){
 683     MpegEncContext * const s = &h->s;
 684     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 685     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 686
 687     if(mode > 6U) {
 688         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 689         return -1;
 690     }
 691
 692     if(!(h->top_samples_available&0x8000)){
 693         mode= top[ mode ];
 694         if(mode<0){
 695             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 696             return -1;
 697         }
 698     }
 699
 700     if(!(h->left_samples_available&0x8000)){
 701         mode= left[ mode ];
 702         if(mode<0){
 703             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 704             return -1;
 705         }
 706     }
 707
 708     return mode;
 709 }
 710
 711 /**
 712  * gets the predicted intra4x4 prediction mode.
 713  */
 714 static inline int pred_intra_mode(H264Context *h, int n){
 715     const int index8= scan8[n];
 716     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 717     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 718     const int min= FFMIN(left, top);
 719
 720     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 721
 722     if(min<0) return DC_PRED;
 723     else      return min;
 724 }
 725
 726 static inline void write_back_non_zero_count(H264Context *h){
 727     MpegEncContext * const s = &h->s;
 728     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 729
 730     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 731     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 732     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 733     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 734     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 735     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 736     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 737
 738     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 739     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 740     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 741
 742     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 743     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 744     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 745
 746     if(FRAME_MBAFF){
 747         // store all luma nnzs, for deblocking
 748         int v = 0, i;
 749         for(i=0; i<16; i++)
 750             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 751         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 752     }
 753 }
 754
 755 /**
 756  * gets the predicted number of non zero coefficients.
 757  * @param n block index
 758  */
 759 static inline int pred_non_zero_count(H264Context *h, int n){
 760     const int index8= scan8[n];
 761     const int left= h->non_zero_count_cache[index8 - 1];
 762     const int top = h->non_zero_count_cache[index8 - 8];
 763     int i= left + top;
 764
 765     if(i<64) i= (i+1)>>1;
 766
 767     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 768
 769     return i&31;
 770 }
 771
     /**
      * Fetches the diagonal (C) neighbour used for motion vector prediction.
      * The default is the top-right cache entry (i - 8 + part_width); when that
      * entry is PART_NOT_AVAILABLE the top-left entry (i - 8 - 1) is used.
      * With FRAME_MBAFF a few neighbour combinations are instead read directly
      * from the current picture's motion_val / ref_index, rescaled, and staged
      * in the scratch cache slot scan8[0]-2.
      * @param C receives a pointer to the selected motion vector
      * @param i scan8 cache index of the block
      * @param list reference list whose caches are read
      * @param part_width partition width in cache columns
      * @return reference index belonging to *C (can be LIST_NOT_USED or
      *         PART_NOT_AVAILABLE)
      */
 772 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 773     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 774     MpegEncContext *s = &h->s;
 775
 776     /* there is no consistent mapping of mvs to neighboring locations that will
 777      * make mbaff happy, so we can't move all this logic to fill_caches */
 778     if(FRAME_MBAFF){
 779         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 780         const int16_t *mv;
             /* clear the scratch slot and let *C point at it; SET_DIAG_MV below
              * fills it in when one of the special cases applies */
 781         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 782         *C = h->mv_cache[list][scan8[0]-2];
 783
 784         if(!MB_FIELD
 785            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 786             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 787             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV loads the vector at 4x4 position (X4,Y4) of the
                      * current picture, applies MV_OP to its vertical component,
                      * stores it in the scratch slot and returns the matching
                      * ref_index with REF_OP applied. It expands to declarations
                      * followed by return statements, so every use ends the
                      * function and must sit alone in its own block. */
 788 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 789                 const int x4 = X4, y4 = Y4;\
 790                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 791                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 792                     return LIST_NOT_USED;\
 793                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 794                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 795                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 796                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 797
 798                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 799             }
 800         }
             /* top-right missing, block in cache column 4 and left neighbour
              * present: take the vector from the left macroblock when its
              * field/frame coding differs from the current macroblock */
 801         if(topright_ref == PART_NOT_AVAILABLE
 802            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 803            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 804             if(!MB_FIELD
 805                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 806                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 807             }
 808             if(MB_FIELD
 809                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 810                && i >= scan8[0]+8){
 811                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 812                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 813             }
 814         }
 815 #undef SET_DIAG_MV
 816     }
 817
         /* regular case: top-right from the cache, else fall back to top-left */
 818     if(topright_ref != PART_NOT_AVAILABLE){
 819         *C= h->mv_cache[list][ i - 8 + part_width ];
 820         return topright_ref;
 821     }else{
 822         tprintf(s->avctx, "topright MV not available\n");
 823
 824         *C= h->mv_cache[list][ i - 8 - 1 ];
 825         return h->ref_cache[list][ i - 8 - 1 ];
 826     }
 827 }
 828
 829 /**
 830  * gets the predicted MV.
 831  * @param n the block index
 832  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 833  * @param mx the x component of the predicted motion vector
 834  * @param my the y component of the predicted motion vector
 835  */
 836 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 837     const int index8= scan8[n];
 838     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 839     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 840     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 841     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 842     const int16_t * C;
 843     int diagonal_ref, match_count;
 844
 845     assert(part_width==1 || part_width==2 || part_width==4);
 846
 847 /* mv_cache
 848   B . . A T T T T
 849   U . . L . . , .
 850   U . . L . . . .
 851   U . . L . . , .
 852   . . . L . . . .
 853 */
 854
 855     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 856     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 857     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 858     if(match_count > 1){ //most common
 859         *mx= mid_pred(A[0], B[0], C[0]);
 860         *my= mid_pred(A[1], B[1], C[1]);
 861     }else if(match_count==1){
 862         if(left_ref==ref){
 863             *mx= A[0];
 864             *my= A[1];
 865         }else if(top_ref==ref){
 866             *mx= B[0];
 867             *my= B[1];
 868         }else{
 869             *mx= C[0];
 870             *my= C[1];
 871         }
 872     }else{
 873         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 874             *mx= A[0];
 875             *my= A[1];
 876         }else{
 877             *mx= mid_pred(A[0], B[0], C[0]);
 878             *my= mid_pred(A[1], B[1], C[1]);
 879         }
 880     }
 881
 882     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 883 }
 884
 885 /**
 886  * gets the directionally predicted 16x8 MV.
 887  * @param n the block index
 888  * @param mx the x component of the predicted motion vector
 889  * @param my the y component of the predicted motion vector
 890  */
 891 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 892     if(n==0){
 893         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 894         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 895
 896         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 897
 898         if(top_ref == ref){
 899             *mx= B[0];
 900             *my= B[1];
 901             return;
 902         }
 903     }else{
 904         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 905         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 906
 907         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 908
 909         if(left_ref == ref){
 910             *mx= A[0];
 911             *my= A[1];
 912             return;
 913         }
 914     }
 915
 916     //RARE
 917     pred_motion(h, n, 4, list, ref, mx, my);
 918 }
 919
 920 /**
 921  * gets the directionally predicted 8x16 MV.
 922  * @param n the block index
 923  * @param mx the x component of the predicted motion vector
 924  * @param my the y component of the predicted motion vector
 925  */
 926 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 927     if(n==0){
 928         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 929         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 930
 931         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 932
 933         if(left_ref == ref){
 934             *mx= A[0];
 935             *my= A[1];
 936             return;
 937         }
 938     }else{
 939         const int16_t * C;
 940         int diagonal_ref;
 941
 942         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 943
 944         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 945
 946         if(diagonal_ref == ref){
 947             *mx= C[0];
 948             *my= C[1];
 949             return;
 950         }
 951     }
 952
 953     //RARE
 954     pred_motion(h, n, 2, list, ref, mx, my);
 955 }
 956
 957 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 958     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 959     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 960
 961     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 962
 963     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 964        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 965        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 966
 967         *mx = *my = 0;
 968         return;
 969     }
 970
 971     pred_motion(h, 0, 4, 0, 0, mx, my);
 972
 973     return;
 974 }
 975
 976 static inline void direct_dist_scale_factor(H264Context * const h){
 977     const int poc = h->s.current_picture_ptr->poc;
 978     const int poc1 = h->ref_list[1][0].poc;
 979     int i;
 980     for(i=0; i<h->ref_count[0]; i++){
 981         int poc0 = h->ref_list[0][i].poc;
 982         int td = av_clip(poc1 - poc0, -128, 127);
 983         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 984             h->dist_scale_factor[i] = 256;
 985         }else{
 986             int tb = av_clip(poc - poc0, -128, 127);
 987             int tx = (16384 + (FFABS(td) >> 1)) / td;
 988             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 989         }
 990     }
 991     if(FRAME_MBAFF){
 992         for(i=0; i<h->ref_count[0]; i++){
 993             h->dist_scale_factor_field[2*i] =
 994             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 995         }
 996     }
 997 }
 998 static inline void direct_ref_list_init(H264Context * const h){
 999     MpegEncContext * const s = &h->s;
1000     Picture * const ref1 = &h->ref_list[1][0];
1001     Picture * const cur = s->current_picture_ptr;
1002     int list, i, j;
1003     if(cur->pict_type == I_TYPE)
1004         cur->ref_count[0] = 0;
1005     if(cur->pict_type != B_TYPE)
1006         cur->ref_count[1] = 0;
1007     for(list=0; list<2; list++){
1008         cur->ref_count[list] = h->ref_count[list];
1009         for(j=0; j<h->ref_count[list]; j++)
1010             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1011     }
1012     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1013         return;
1014     for(list=0; list<2; list++){
1015         for(i=0; i<ref1->ref_count[list]; i++){
1016             const int poc = ref1->ref_poc[list][i];
1017             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1018             for(j=0; j<h->ref_count[list]; j++)
1019                 if(h->ref_list[list][j].poc == poc){
1020                     h->map_col_to_list0[list][i] = j;
1021                     break;
1022                 }
1023         }
1024     }
1025     if(FRAME_MBAFF){
1026         for(list=0; list<2; list++){
1027             for(i=0; i<ref1->ref_count[list]; i++){
1028                 j = h->map_col_to_list0[list][i];
1029                 h->map_col_to_list0_field[list][2*i] = 2*j;
1030                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1031             }
1032         }
1033     }
1034 }
1035
     /**
      * Derives the motion vectors and reference indices of a direct predicted
      * macroblock (or of the direct sub-blocks of an 8x8 macroblock) and writes
      * them into h->mv_cache / h->ref_cache for both lists.
      * Uses the co-located data of the first list 1 reference picture
      * (h->ref_list[1][0]). Spatial prediction is used when
      * h->direct_spatial_mv_pred is set, temporal prediction otherwise.
      * @param mb_type in/out: macroblock type; the partition size, list usage,
      *                DIRECT2 and INTERLACED flags may be rewritten. For direct
      *                sub-blocks h->sub_mb_type[] is rewritten as well.
      */
1036 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1037     MpegEncContext * const s = &h->s;
1038     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1039     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1040     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices of the co-located macroblock */
1041     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1042     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1043     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1044     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1045     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1046     const int is_b8x8 = IS_8X8(*mb_type);
1047     unsigned int sub_mb_type;
1048     int i8, i4;
1049
         /* choose the partitioning of the prediction from the co-located type */
1050 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1051     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1052         /* FIXME save sub mb types from previous frames (or derive from MVs)
1053          * so we know exactly what block size to use */
1054         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1055         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1056     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1057         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1058         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1059     }else{
1060         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1061         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1062     }
1063     if(!is_b8x8)
1064         *mb_type |= MB_TYPE_DIRECT2;
1065     if(MB_FIELD)
1066         *mb_type |= MB_TYPE_INTERLACED;
1067
1068     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1069
1070     if(h->direct_spatial_mv_pred){
1071         int ref[2];
1072         int mv[2][2];
1073         int list;
1074
1075         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1076
1077         /* ref = min(neighbors) */
1078         for(list=0; list<2; list++){
1079             int refa = h->ref_cache[list][scan8[0] - 1];
1080             int refb = h->ref_cache[list][scan8[0] - 8];
1081             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* top-right unavailable: use the top-left neighbour instead */
1082             if(refc == -2)
1083                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1084             ref[list] = refa;
1085             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1086                 ref[list] = refb;
1087             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1088                 ref[list] = refc;
1089             if(ref[list] < 0)
1090                 ref[list] = -1;
1091         }
1092
             /* no neighbour supplies a reference in either list: use reference 0
              * with zero vectors for both lists */
1093         if(ref[0] < 0 && ref[1] < 0){
1094             ref[0] = ref[1] = 0;
1095             mv[0][0] = mv[0][1] =
1096             mv[1][0] = mv[1][1] = 0;
1097         }else{
1098             for(list=0; list<2; list++){
1099                 if(ref[list] >= 0)
1100                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1101                 else
1102                     mv[list][0] = mv[list][1] = 0;
1103             }
1104         }
1105
             /* a list without a usable reference is dropped from the type flags */
1106         if(ref[1] < 0){
1107             *mb_type &= ~MB_TYPE_P0L1;
1108             sub_mb_type &= ~MB_TYPE_P0L1;
1109         }else if(ref[0] < 0){
1110             *mb_type &= ~MB_TYPE_P0L0;
1111             sub_mb_type &= ~MB_TYPE_P0L0;
1112         }
1113
1114         if(IS_16X16(*mb_type)){
1115             int a=0, b=0;
1116
1117             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1118             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* when the co-located block uses reference 0 with |mv| <= 1, only
                  * lists with ref > 0 receive the predicted vector; a and b stay 0
                  * for the others */
1119             if(!IS_INTRA(mb_type_col)
1120                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1121                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1122                        && (h->x264_build>33 || !h->x264_build)))){
1123                 if(ref[0] > 0)
1124                     a= pack16to32(mv[0][0],mv[0][1]);
1125                 if(ref[1] > 0)
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127             }else{
1128                 a= pack16to32(mv[0][0],mv[0][1]);
1129                 b= pack16to32(mv[1][0],mv[1][1]);
1130             }
1131             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1132             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1133         }else{
1134             for(i8=0; i8<4; i8++){
1135                 const int x8 = i8&1;
1136                 const int y8 = i8>>1;
1137
                     /* in an 8x8 macroblock only the direct sub-blocks are predicted */
1138                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1139                     continue;
1140                 h->sub_mb_type[i8] = sub_mb_type;
1141
1142                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1143                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1144                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1145                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1146
1147                 /* col_zero_flag */
1148                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1149                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1150                                                   && (h->x264_build>33 || !h->x264_build)))){
1151                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1152                     if(IS_SUB_8X8(sub_mb_type)){
1153                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1154                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1155                             if(ref[0] == 0)
1156                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1157                             if(ref[1] == 0)
1158                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1159                         }
1160                     }else
1161                     for(i4=0; i4<4; i4++){
1162                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1163                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1164                             if(ref[0] == 0)
1165                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1166                             if(ref[1] == 0)
1167                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1168                         }
1169                     }
1170                 }
1171             }
1172         }
1173     }else{ /* direct temporal mv pred */
1174         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1175         const int *dist_scale_factor = h->dist_scale_factor;
1176
1177         if(FRAME_MBAFF){
                 /* field macroblocks use the per-field maps and scale factors */
1178             if(IS_INTERLACED(*mb_type)){
1179                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1180                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1181                 dist_scale_factor = h->dist_scale_factor_field;
1182             }
                 /* current and co-located macroblock differ in field/frame coding:
                  * handled completely inside this branch, which returns */
1183             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1184                 /* FIXME assumes direct_8x8_inference == 1 */
1185                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1186                 int mb_types_col[2];
1187                 int y_shift;
1188
1189                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1190                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1191                          | (*mb_type & MB_TYPE_INTERLACED);
1192                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1193
1194                 if(IS_INTERLACED(*mb_type)){
1195                     /* frame to field scaling */
1196                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1197                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                         /* for the bottom macroblock of a pair, move the co-located
                          * pointers back to the top macroblock of the pair */
1198                     if(s->mb_y&1){
1199                         l1ref0 -= 2*h->b8_stride;
1200                         l1ref1 -= 2*h->b8_stride;
1201                         l1mv0 -= 4*h->b_stride;
1202                         l1mv1 -= 4*h->b_stride;
1203                     }
1204                     y_shift = 0;
1205
1206                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1207                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1208                        && !is_b8x8)
1209                         *mb_type |= MB_TYPE_16x8;
1210                     else
1211                         *mb_type |= MB_TYPE_8x8;
1212                 }else{
1213                     /* field to frame scaling */
1214                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1215                      * but in MBAFF, top and bottom POC are equal */
1216                     int dy = (s->mb_y&1) ? 1 : 2;
1217                     mb_types_col[0] =
1218                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1219                     l1ref0 += dy*h->b8_stride;
1220                     l1ref1 += dy*h->b8_stride;
1221                     l1mv0 += 2*dy*h->b_stride;
1222                     l1mv1 += 2*dy*h->b_stride;
1223                     y_shift = 2;
1224
1225                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1226                        && !is_b8x8)
1227                         *mb_type |= MB_TYPE_16x16;
1228                     else
1229                         *mb_type |= MB_TYPE_8x8;
1230                 }
1231
1232                 for(i8=0; i8<4; i8++){
1233                     const int x8 = i8&1;
1234                     const int y8 = i8>>1;
1235                     int ref0, scale;
1236                     const int16_t (*l1mv)[2]= l1mv0;
1237
1238                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1239                         continue;
1240                     h->sub_mb_type[i8] = sub_mb_type;
1241
1242                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located block: reference 0 and zero vectors */
1243                     if(IS_INTRA(mb_types_col[y8])){
1244                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1245                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1246                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1247                         continue;
1248                     }
1249
                         /* take the co-located list 0 reference if it exists, otherwise
                          * the list 1 one, and map it to an index of the current list 0 */
1250                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1251                     if(ref0 >= 0)
1252                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1253                     else{
1254                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1255                         l1mv= l1mv1;
1256                     }
1257                     scale = dist_scale_factor[ref0];
1258                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1259
1260                     {
1261                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component is halved (y_shift 0) or doubled
                              * (y_shift 2); list 1 vector = list 0 vector - co-located one */
1262                         int my_col = (mv_col[1]<<y_shift)/2;
1263                         int mx = (scale * mv_col[0] + 128) >> 8;
1264                         int my = (scale * my_col + 128) >> 8;
1265                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1266                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1267                     }
1268                 }
1269                 return;
1270             }
1271         }
1272
1273         /* one-to-one mv scaling */
1274
1275         if(IS_16X16(*mb_type)){
1276             int ref, mv0, mv1;
1277
1278             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1279             if(IS_INTRA(mb_type_col)){
1280                 ref=mv0=mv1=0;
1281             }else{
1282                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1283                                                 : map_col_to_list0[1][l1ref1[0]];
1284                 const int scale = dist_scale_factor[ref0];
1285                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1286                 int mv_l0[2];
                     /* scale carries 8 fractional bits; +128 rounds to nearest */
1287                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1288                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1289                 ref= ref0;
1290                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1291                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1292             }
1293             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1294             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1295             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1296         }else{
1297             for(i8=0; i8<4; i8++){
1298                 const int x8 = i8&1;
1299                 const int y8 = i8>>1;
1300                 int ref0, scale;
1301                 const int16_t (*l1mv)[2]= l1mv0;
1302
1303                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1304                     continue;
1305                 h->sub_mb_type[i8] = sub_mb_type;
1306                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1307                 if(IS_INTRA(mb_type_col)){
1308                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1309                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1310                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1311                     continue;
1312                 }
1313
1314                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1315                 if(ref0 >= 0)
1316                     ref0 = map_col_to_list0[0][ref0];
1317                 else{
1318                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1319                     l1mv= l1mv1;
1320                 }
1321                 scale = dist_scale_factor[ref0];
1322
1323                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one vector per 8x8 block, or one per 4x4 block otherwise */
1324                 if(IS_SUB_8X8(sub_mb_type)){
1325                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1326                     int mx = (scale * mv_col[0] + 128) >> 8;
1327                     int my = (scale * mv_col[1] + 128) >> 8;
1328                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1329                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1330                 }else
1331                 for(i4=0; i4<4; i4++){
1332                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1333                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1334                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1335                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1336                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1337                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1338                 }
1339             }
1340         }
1341     }
1342 }
1343
1344 static inline void write_back_motion(H264Context *h, int mb_type){
1345     MpegEncContext * const s = &h->s;
1346     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1347     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1348     int list;
1349
1350     if(!USES_LIST(mb_type, 0))
1351         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1352
1353     for(list=0; list<h->list_count; list++){
1354         int y;
1355         if(!USES_LIST(mb_type, list))
1356             continue;
1357
1358         for(y=0; y<4; y++){
1359             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1360             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1361         }
1362         if( h->pps.cabac ) {
1363             if(IS_SKIP(mb_type))
1364                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1365             else
1366             for(y=0; y<4; y++){
1367                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1368                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1369             }
1370         }
1371
1372         {
1373             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1374             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1375             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1376             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1377             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1378         }
1379     }
1380
1381     if(h->slice_type == B_TYPE && h->pps.cabac){
1382         if(IS_8X8(mb_type)){
1383             uint8_t *direct_table = &h->direct_table[b8_xy];
1384             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1385             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1386             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1387         }
1388     }
1389 }
1390
1391 /**
1392  * Decodes a network abstraction layer unit.
1393  * @param consumed is the number of bytes used as input
1394  * @param length is the length of the array
1395  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1396  * @returns decoded bytes, might be src+1 if no escapes
1397  */
1398 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1399     int i, si, di;
1400     uint8_t *dst;
1401     int bufidx;
1402
1403 //    src[0]&0x80;                //forbidden bit
1404     h->nal_ref_idc= src[0]>>5;
1405     h->nal_unit_type= src[0]&0x1F;
1406
1407     src++; length--;
1408 #if 0
1409     for(i=0; i<length; i++)
1410         printf("%2X ", src[i]);
1411 #endif
1412     for(i=0; i+1<length; i+=2){
1413         if(src[i]) continue;
1414         if(i>0 && src[i-1]==0) i--;
1415         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1416             if(src[i+2]!=3){
1417                 /* startcode, so we must be past the end */
1418                 length=i;
1419             }
1420             break;
1421         }
1422     }
1423
1424     if(i>=length-1){ //no escaped 0
1425         *dst_length= length;
1426         *consumed= length+1; //+1 for the header
1427         return src;
1428     }
1429
1430     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1431     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1432     dst= h->rbsp_buffer[bufidx];
1433
1434     if (dst == NULL){
1435         return NULL;
1436     }
1437
1438 //printf("decoding esc\n");
1439     si=di=0;
1440     while(si<length){
1441         //remove escapes (very rare 1:2^22)
1442         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1443             if(src[si+2]==3){ //escape
1444                 dst[di++]= 0;
1445                 dst[di++]= 0;
1446                 si+=3;
1447                 continue;
1448             }else //next start code
1449                 break;
1450         }
1451
1452         dst[di++]= src[si++];
1453     }
1454
1455     *dst_length= di;
1456     *consumed= si + 1;//+1 for the header
1457 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1458     return dst;
1459 }
1460
1461 /**
1462  * identifies the exact end of the bitstream
1463  * @return the length of the trailing, or 0 if damaged
1464  */
1465 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1466     int v= *src;
1467     int r;
1468
1469     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1470
1471     for(r=1; r<9; r++){
1472         if(v&1) return r;
1473         v>>=1;
1474     }
1475     return 0;
1476 }
1477
1478 /**
1479  * idct tranforms the 16 dc values and dequantize them.
1480  * @param qp quantization parameter
1481  */
1482 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1483 #define stride 16
1484     int i;
1485     int temp[16]; //FIXME check if this is a good idea
1486     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1487     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1488
1489 //memset(block, 64, 2*256);
1490 //return;
1491     for(i=0; i<4; i++){
1492         const int offset= y_offset[i];
1493         const int z0= block[offset+stride*0] + block[offset+stride*4];
1494         const int z1= block[offset+stride*0] - block[offset+stride*4];
1495         const int z2= block[offset+stride*1] - block[offset+stride*5];
1496         const int z3= block[offset+stride*1] + block[offset+stride*5];
1497
1498         temp[4*i+0]= z0+z3;
1499         temp[4*i+1]= z1+z2;
1500         temp[4*i+2]= z1-z2;
1501         temp[4*i+3]= z0-z3;
1502     }
1503
1504     for(i=0; i<4; i++){
1505         const int offset= x_offset[i];
1506         const int z0= temp[4*0+i] + temp[4*2+i];
1507         const int z1= temp[4*0+i] - temp[4*2+i];
1508         const int z2= temp[4*1+i] - temp[4*3+i];
1509         const int z3= temp[4*1+i] + temp[4*3+i];
1510
1511         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1512         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1513         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1514         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1515     }
1516 }
1517
1518 #if 0
1519 /**
1520  * DCT-transforms the 16 DC values in place, halving each result.
1521  * Disabled code: compiled out by the enclosing #if 0. It uses the `stride`
1521  * macro, which is only #undef'd after this block.
1521  * @param qp quantization parameter ??? FIXME
1522  */
1523 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1524 //    const int qmul= dequant_coeff[qp][0];
1525     int i;
1526     int temp[16]; //FIXME check if this is a good idea
1527     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1528     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1529
         /* first pass: butterflies from block[] into temp[] */
1530     for(i=0; i<4; i++){
1531         const int offset= y_offset[i];
1532         const int z0= block[offset+stride*0] + block[offset+stride*4];
1533         const int z1= block[offset+stride*0] - block[offset+stride*4];
1534         const int z2= block[offset+stride*1] - block[offset+stride*5];
1535         const int z3= block[offset+stride*1] + block[offset+stride*5];
1536
1537         temp[4*i+0]= z0+z3;
1538         temp[4*i+1]= z1+z2;
1539         temp[4*i+2]= z1-z2;
1540         temp[4*i+3]= z0-z3;
1541     }
1542
         /* second pass: butterflies from temp[] back into block[], >>1 */
1543     for(i=0; i<4; i++){
1544         const int offset= x_offset[i];
1545         const int z0= temp[4*0+i] + temp[4*2+i];
1546         const int z1= temp[4*0+i] - temp[4*2+i];
1547         const int z2= temp[4*1+i] - temp[4*3+i];
1548         const int z3= temp[4*1+i] + temp[4*3+i];
1549
1550         block[stride*0 +offset]= (z0 + z3)>>1;
1551         block[stride*2 +offset]= (z1 + z2)>>1;
1552         block[stride*8 +offset]= (z1 - z2)>>1;
1553         block[stride*10+offset]= (z0 - z3)>>1;
1554     }
1555 }
1556 #endif
1557
1558 #undef xStride
1559 #undef stride
1560
1561 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1562     const int stride= 16*2;
1563     const int xStride= 16;
1564     int a,b,c,d,e;
1565
1566     a= block[stride*0 + xStride*0];
1567     b= block[stride*0 + xStride*1];
1568     c= block[stride*1 + xStride*0];
1569     d= block[stride*1 + xStride*1];
1570
1571     e= a-b;
1572     a= a+b;
1573     b= c-d;
1574     c= c+d;
1575
1576     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1577     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1578     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1579     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1580 }
1581
1582 #if 0
     /* Disabled code (compiled out by the enclosing #if 0): applies a 2x2
      * butterfly to the four chroma DC values at offsets 0, 16, 32 and 48 of
      * block, in place, without any scaling. */
1583 static void chroma_dc_dct_c(DCTELEM *block){
1584     const int stride= 16*2;
1585     const int xStride= 16;
1586     int a,b,c,d,e;
1587
1588     a= block[stride*0 + xStride*0];
1589     b= block[stride*0 + xStride*1];
1590     c= block[stride*1 + xStride*0];
1591     d= block[stride*1 + xStride*1];
1592
         /* a,e: sum/difference of the first pair; c,b: of the second pair */
1593     e= a-b;
1594     a= a+b;
1595     b= c-d;
1596     c= c+d;
1597
1598     block[stride*0 + xStride*0]= (a+c);
1599     block[stride*0 + xStride*1]= (e+b);
1600     block[stride*1 + xStride*0]= (a-c);
1601     block[stride*1 + xStride*1]= (e-b);
1602 }
1603 #endif
1604
1605 /**
1606  * gets the chroma qp.
1607  */
1608 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1609     return h->pps.chroma_qp_table[t][qscale & 0xff];
1610 }
1611
1612 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1613 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1614 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1615     int i;
1616     const int * const quant_table= quant_coeff[qscale];
1617     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1618     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1619     const unsigned int threshold2= (threshold1<<1);
1620     int last_non_zero;
1621
1622     if(separate_dc){
1623         if(qscale<=18){
1624             //avoid overflows
1625             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1626             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1627             const unsigned int dc_threshold2= (dc_threshold1<<1);
1628
1629             int level= block[0]*quant_coeff[qscale+18][0];
1630             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1631                 if(level>0){
1632                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1633                     block[0]= level;
1634                 }else{
1635                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1636                     block[0]= -level;
1637                 }
1638 //                last_non_zero = i;
1639             }else{
1640                 block[0]=0;
1641             }
1642         }else{
1643             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1644             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1645             const unsigned int dc_threshold2= (dc_threshold1<<1);
1646
1647             int level= block[0]*quant_table[0];
1648             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1649                 if(level>0){
1650                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1651                     block[0]= level;
1652                 }else{
1653                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1654                     block[0]= -level;
1655                 }
1656 //                last_non_zero = i;
1657             }else{
1658                 block[0]=0;
1659             }
1660         }
1661         last_non_zero= 0;
1662         i=1;
1663     }else{
1664         last_non_zero= -1;
1665         i=0;
1666     }
1667
1668     for(; i<16; i++){
1669         const int j= scantable[i];
1670         int level= block[j]*quant_table[j];
1671
1672 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1673 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1674         if(((unsigned)(level+threshold1))>threshold2){
1675             if(level>0){
1676                 level= (bias + level)>>QUANT_SHIFT;
1677                 block[j]= level;
1678             }else{
1679                 level= (bias - level)>>QUANT_SHIFT;
1680                 block[j]= -level;
1681             }
1682             last_non_zero = i;
1683         }else{
1684             block[j]=0;
1685         }
1686     }
1687
1688     return last_non_zero;
1689 }
1690
1691 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1692                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1693                            int src_x_offset, int src_y_offset,
1694                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1695     MpegEncContext * const s = &h->s;
1696     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1697     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1698     const int luma_xy= (mx&3) + ((my&3)<<2);
1699     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1700     uint8_t * src_cb, * src_cr;
1701     int extra_width= h->emu_edge_width;
1702     int extra_height= h->emu_edge_height;
1703     int emu=0;
1704     const int full_mx= mx>>2;
1705     const int full_my= my>>2;
1706     const int pic_width  = 16*s->mb_width;
1707     const int pic_height = 16*s->mb_height >> MB_FIELD;
1708
1709     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1710         return;
1711
1712     if(mx&7) extra_width -= 3;
1713     if(my&7) extra_height -= 3;
1714
1715     if(   full_mx < 0-extra_width
1716        || full_my < 0-extra_height
1717        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1718        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1719         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1720             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1721         emu=1;
1722     }
1723
1724     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1725     if(!square){
1726         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1727     }
1728
1729     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1730
1731     if(MB_FIELD){
1732         // chroma offset when predicting from a field of opposite parity
1733         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1734         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1735     }
1736     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1737     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1738
1739     if(emu){
1740         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1741             src_cb= s->edge_emu_buffer;
1742     }
1743     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1744
1745     if(emu){
1746         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1747             src_cr= s->edge_emu_buffer;
1748     }
1749     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1750 }
1751
1752 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1753                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1754                            int x_offset, int y_offset,
1755                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1756                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1757                            int list0, int list1){
1758     MpegEncContext * const s = &h->s;
1759     qpel_mc_func *qpix_op=  qpix_put;
1760     h264_chroma_mc_func chroma_op= chroma_put;
1761
1762     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1763     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1764     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1765     x_offset += 8*s->mb_x;
1766     y_offset += 8*(s->mb_y >> MB_FIELD);
1767
1768     if(list0){
1769         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1770         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1771                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1772                            qpix_op, chroma_op);
1773
1774         qpix_op=  qpix_avg;
1775         chroma_op= chroma_avg;
1776     }
1777
1778     if(list1){
1779         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1780         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1781                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1782                            qpix_op, chroma_op);
1783     }
1784 }
1785
1786 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1787                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1788                            int x_offset, int y_offset,
1789                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1790                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1791                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1792                            int list0, int list1){
1793     MpegEncContext * const s = &h->s;
1794
1795     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1796     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1797     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1798     x_offset += 8*s->mb_x;
1799     y_offset += 8*(s->mb_y >> MB_FIELD);
1800
1801     if(list0 && list1){
1802         /* don't optimize for luma-only case, since B-frames usually
1803          * use implicit weights => chroma too. */
1804         uint8_t *tmp_cb = s->obmc_scratchpad;
1805         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1806         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1807         int refn0 = h->ref_cache[0][ scan8[n] ];
1808         int refn1 = h->ref_cache[1][ scan8[n] ];
1809
1810         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1811                     dest_y, dest_cb, dest_cr,
1812                     x_offset, y_offset, qpix_put, chroma_put);
1813         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1814                     tmp_y, tmp_cb, tmp_cr,
1815                     x_offset, y_offset, qpix_put, chroma_put);
1816
1817         if(h->use_weight == 2){
1818             int weight0 = h->implicit_weight[refn0][refn1];
1819             int weight1 = 64 - weight0;
1820             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1821             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1822             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1823         }else{
1824             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1825                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1826                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1827             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1828                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1829                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1830             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1831                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1832                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1833         }
1834     }else{
1835         int list = list1 ? 1 : 0;
1836         int refn = h->ref_cache[list][ scan8[n] ];
1837         Picture *ref= &h->ref_list[list][refn];
1838         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1839                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1840                     qpix_put, chroma_put);
1841
1842         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1843                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1844         if(h->use_weight_chroma){
1845             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1846                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1847             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1848                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1849         }
1850     }
1851 }
1852
1853 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1854                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1855                            int x_offset, int y_offset,
1856                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1857                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1858                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1859                            int list0, int list1){
1860     if((h->use_weight==2 && list0 && list1
1861         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1862        || h->use_weight==1)
1863         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1864                          x_offset, y_offset, qpix_put, chroma_put,
1865                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1866     else
1867         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1868                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1869 }
1870
1871 static inline void prefetch_motion(H264Context *h, int list){
1872     /* fetch pixels for estimated mv 4 macroblocks ahead
1873      * optimized for 64byte cache lines */
1874     MpegEncContext * const s = &h->s;
1875     const int refn = h->ref_cache[list][scan8[0]];
1876     if(refn >= 0){
1877         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1878         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1879         uint8_t **src= h->ref_list[list][refn].data;
1880         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1881         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1882         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1883         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1884     }
1885 }
1886
     /**
      * Performs inter prediction for the current macroblock.
      *
      * Dispatches on the partitioning of the macroblock (16x16, 16x8, 8x16 or
      * 8x8 with per-sub-macroblock 8x8/8x4/4x8/4x4 splits) and calls mc_part()
      * once per partition. prefetch_motion() is called for list 0 before and
      * for list 1 after the prediction.
      *
      * @param dest_y, dest_cb, dest_cr destination planes, passed to mc_part()
      * @param qpix_put, qpix_avg luma function tables; row 0 is used for 16x16,
      *        row 1 for 16x8/8x16/8x8 and row 2 for the smaller partitions
      * @param chroma_put, chroma_avg chroma function tables, indexed 0..2
      * @param weight_op, weight_avg weighting function tables; the entry at
      *        index 0..6 (one per partition shape) is passed on as base pointer
      */
1887 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1888                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1889                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1890                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1891     MpegEncContext * const s = &h->s;
1892     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1893     const int mb_type= s->current_picture.mb_type[mb_xy];
1894
1895     assert(IS_INTER(mb_type));
1896
1897     prefetch_motion(h, 0);
1898
         /* mc_part() arguments after h: n, square, chroma_height, delta,
          * the three destinations, x_offset, y_offset */
1899     if(IS_16X16(mb_type)){
1900         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1901                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1902                 &weight_op[0], &weight_avg[0],
1903                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1904     }else if(IS_16X8(mb_type)){
             /* two partitions, the second one at n=8 / y_offset=4 */
1905         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1906                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1907                 &weight_op[1], &weight_avg[1],
1908                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1909         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1910                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1911                 &weight_op[1], &weight_avg[1],
1912                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1913     }else if(IS_8X16(mb_type)){
             /* two partitions, the second one at n=4 / x_offset=4 */
1914         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1915                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1916                 &weight_op[2], &weight_avg[2],
1917                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1918         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1919                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1920                 &weight_op[2], &weight_avg[2],
1921                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1922     }else{
1923         int i;
1924
1925         assert(IS_8X8(mb_type));
1926
             /* four sub-macroblocks, each with its own sub_mb_type */
1927         for(i=0; i<4; i++){
1928             const int sub_mb_type= h->sub_mb_type[i];
1929             const int n= 4*i;
1930             int x_offset= (i&1)<<2;
1931             int y_offset= (i&2)<<1;
1932
1933             if(IS_SUB_8X8(sub_mb_type)){
1934                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1935                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1936                     &weight_op[3], &weight_avg[3],
1937                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1938             }else if(IS_SUB_8X4(sub_mb_type)){
1939                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1940                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1941                     &weight_op[4], &weight_avg[4],
1942                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1943                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1944                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1945                     &weight_op[4], &weight_avg[4],
1946                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1947             }else if(IS_SUB_4X8(sub_mb_type)){
1948                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1949                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1950                     &weight_op[5], &weight_avg[5],
1951                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1952                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1953                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1954                     &weight_op[5], &weight_avg[5],
1955                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1956             }else{
1957                 int j;
1958                 assert(IS_SUB_4X4(sub_mb_type));
1959                 for(j=0; j<4; j++){
1960                     int sub_x_offset= x_offset + 2*(j&1);
1961                     int sub_y_offset= y_offset +   (j&2);
1962                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1963                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1964                         &weight_op[6], &weight_avg[6],
1965                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1966                 }
1967             }
1968         }
1969     }
1970
1971     prefetch_motion(h, 1);
1972 }
1973
1974 static void decode_init_vlc(void){
1975     static int done = 0;
1976
1977     if (!done) {
1978         int i;
1979         done = 1;
1980
1981         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1982                  &chroma_dc_coeff_token_len [0], 1, 1,
1983                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1984
1985         for(i=0; i<4; i++){
1986             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1987                      &coeff_token_len [i][0], 1, 1,
1988                      &coeff_token_bits[i][0], 1, 1, 1);
1989         }
1990
1991         for(i=0; i<3; i++){
1992             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1993                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1994                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1995         }
1996         for(i=0; i<15; i++){
1997             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1998                      &total_zeros_len [i][0], 1, 1,
1999                      &total_zeros_bits[i][0], 1, 1, 1);
2000         }
2001
2002         for(i=0; i<6; i++){
2003             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2004                      &run_len [i][0], 1, 1,
2005                      &run_bits[i][0], 1, 1, 1);
2006         }
2007         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2008                  &run_len [6][0], 1, 1,
2009                  &run_bits[6][0], 1, 1, 1);
2010     }
2011 }
2012
2013 static void free_tables(H264Context *h){
2014     int i;
2015     H264Context *hx;
2016     av_freep(&h->intra4x4_pred_mode);
2017     av_freep(&h->chroma_pred_mode_table);
2018     av_freep(&h->cbp_table);
2019     av_freep(&h->mvd_table[0]);
2020     av_freep(&h->mvd_table[1]);
2021     av_freep(&h->direct_table);
2022     av_freep(&h->non_zero_count);
2023     av_freep(&h->slice_table_base);
2024     h->slice_table= NULL;
2025
2026     av_freep(&h->mb2b_xy);
2027     av_freep(&h->mb2b8_xy);
2028
2029     for(i = 0; i < MAX_SPS_COUNT; i++)
2030         av_freep(h->sps_buffers + i);
2031
2032     for(i = 0; i < MAX_PPS_COUNT; i++)
2033         av_freep(h->pps_buffers + i);
2034
2035     for(i = 0; i < h->s.avctx->thread_count; i++) {
2036         hx = h->thread_context[i];
2037         if(!hx) continue;
2038         av_freep(&hx->top_borders[1]);
2039         av_freep(&hx->top_borders[0]);
2040         av_freep(&hx->s.obmc_scratchpad);
2041         av_freep(&hx->s.allocated_edge_emu_buffer);
2042     }
2043 }
2044
2045 static void init_dequant8_coeff_table(H264Context *h){
2046     int i,q,x;
2047     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2048     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2049     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2050
2051     for(i=0; i<2; i++ ){
2052         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2053             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2054             break;
2055         }
2056
2057         for(q=0; q<52; q++){
2058             int shift = ff_div6[q];
2059             int idx = ff_rem6[q];
2060             for(x=0; x<64; x++)
2061                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2062                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2063                     h->pps.scaling_matrix8[i][x]) << shift;
2064         }
2065     }
2066 }
2067
2068 static void init_dequant4_coeff_table(H264Context *h){
2069     int i,j,q,x;
2070     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2071     for(i=0; i<6; i++ ){
2072         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2073         for(j=0; j<i; j++){
2074             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2075                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2076                 break;
2077             }
2078         }
2079         if(j<i)
2080             continue;
2081
2082         for(q=0; q<52; q++){
2083             int shift = ff_div6[q] + 2;
2084             int idx = ff_rem6[q];
2085             for(x=0; x<16; x++)
2086                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2087                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2088                     h->pps.scaling_matrix4[i][x]) << shift;
2089         }
2090     }
2091 }
2092
2093 static void init_dequant_tables(H264Context *h){
2094     int i,x;
2095     init_dequant4_coeff_table(h);
2096     if(h->pps.transform_8x8_mode)
2097         init_dequant8_coeff_table(h);
2098     if(h->sps.transform_bypass){
2099         for(i=0; i<6; i++)
2100             for(x=0; x<16; x++)
2101                 h->dequant4_coeff[i][0][x] = 1<<6;
2102         if(h->pps.transform_8x8_mode)
2103             for(i=0; i<2; i++)
2104                 for(x=0; x<64; x++)
2105                     h->dequant8_coeff[i][0][x] = 1<<6;
2106     }
2107 }
2108
2109
2110 /**
2111  * allocates tables.
2112  * needs width/height
2113  */
2114 static int alloc_tables(H264Context *h){
2115     MpegEncContext * const s = &h->s;
2116     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2117     int x,y;
2118
2119     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2120
2121     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2122     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2123     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2124
2125     if( h->pps.cabac ) {
2126         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2127         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2128         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2129         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2130     }
2131
2132     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2133     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2134
2135     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2136     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2137     for(y=0; y<s->mb_height; y++){
2138         for(x=0; x<s->mb_width; x++){
2139             const int mb_xy= x + y*s->mb_stride;
2140             const int b_xy = 4*x + 4*y*h->b_stride;
2141             const int b8_xy= 2*x + 2*y*h->b8_stride;
2142
2143             h->mb2b_xy [mb_xy]= b_xy;
2144             h->mb2b8_xy[mb_xy]= b8_xy;
2145         }
2146     }
2147
2148     s->obmc_scratchpad = NULL;
2149
2150     if(!h->dequant4_coeff[0])
2151         init_dequant_tables(h);
2152
2153     return 0;
2154 fail:
2155     free_tables(h);
2156     return -1;
2157 }
2158
2159 /**
2160  * Mimic alloc_tables(), but for every context thread.
2161  */
2162 static void clone_tables(H264Context *dst, H264Context *src){
2163     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2164     dst->non_zero_count           = src->non_zero_count;
2165     dst->slice_table              = src->slice_table;
2166     dst->cbp_table                = src->cbp_table;
2167     dst->mb2b_xy                  = src->mb2b_xy;
2168     dst->mb2b8_xy                 = src->mb2b8_xy;
2169     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2170     dst->mvd_table[0]             = src->mvd_table[0];
2171     dst->mvd_table[1]             = src->mvd_table[1];
2172     dst->direct_table             = src->direct_table;
2173
2174     dst->s.obmc_scratchpad = NULL;
2175     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2176 }
2177
2178 /**
2179  * Init context
2180  * Allocate buffers which are not shared amongst multiple threads.
2181  */
2182 static int context_init(H264Context *h){
2183     MpegEncContext * const s = &h->s;
2184
2185     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2186     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2187
2188     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2189     CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2190                    (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2191     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
2192     return 0;
2193 fail:
2194     return -1; // free_tables will clean up for us
2195 }
2196
2197 static void common_init(H264Context *h){
2198     MpegEncContext * const s = &h->s;
2199
2200     s->width = s->avctx->width;
2201     s->height = s->avctx->height;
2202     s->codec_id= s->avctx->codec->id;
2203
2204     ff_h264_pred_init(&h->hpc, s->codec_id);
2205
2206     h->dequant_coeff_pps= -1;
2207     s->unrestricted_mv=1;
2208     s->decode=1; //FIXME
2209
2210     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2211     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2212 }
2213
2214 static int decode_init(AVCodecContext *avctx){
2215     H264Context *h= avctx->priv_data;
2216     MpegEncContext * const s = &h->s;
2217
2218     MPV_decode_defaults(s);
2219
2220     s->avctx = avctx;
2221     common_init(h);
2222
2223     s->out_format = FMT_H264;
2224     s->workaround_bugs= avctx->workaround_bugs;
2225
2226     // set defaults
2227 //    s->decode_mb= ff_h263_decode_mb;
2228     s->quarter_sample = 1;
2229     s->low_delay= 1;
2230     avctx->pix_fmt= PIX_FMT_YUV420P;
2231
2232     decode_init_vlc();
2233
2234     if(avctx->extradata_size > 0 && avctx->extradata &&
2235        *(char *)avctx->extradata == 1){
2236         h->is_avc = 1;
2237         h->got_avcC = 0;
2238     } else {
2239         h->is_avc = 0;
2240     }
2241
2242     h->thread_context[0] = h;
2243     return 0;
2244 }
2245
2246 static int frame_start(H264Context *h){
2247     MpegEncContext * const s = &h->s;
2248     int i;
2249
2250     if(MPV_frame_start(s, s->avctx) < 0)
2251         return -1;
2252     ff_er_frame_start(s);
2253     /*
2254      * MPV_frame_start uses pict_type to derive key_frame.
2255      * This is incorrect for H.264; IDR markings must be used.
2256      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2257      * See decode_nal_units().
2258      */
2259     s->current_picture_ptr->key_frame= 0;
2260
2261     assert(s->linesize && s->uvlinesize);
2262
2263     for(i=0; i<16; i++){
2264         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2265         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2266     }
2267     for(i=0; i<4; i++){
2268         h->block_offset[16+i]=
2269         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2270         h->block_offset[24+16+i]=
2271         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2272     }
2273
2274     /* can't be in alloc_tables because linesize isn't known there.
2275      * FIXME: redo bipred weight to not require extra buffer? */
2276     for(i = 0; i < s->avctx->thread_count; i++)
2277         if(!h->thread_context[i]->s.obmc_scratchpad)
2278             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2279
2280     /* some macroblocks will be accessed before they're available */
2281     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2282         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2283
2284 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2285     return 0;
2286 }
2287
2288 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2289     MpegEncContext * const s = &h->s;
2290     int i;
2291
2292     src_y  -=   linesize;
2293     src_cb -= uvlinesize;
2294     src_cr -= uvlinesize;
2295
2296     // There are two lines saved, the line above the the top macroblock of a pair,
2297     // and the line above the bottom macroblock
2298     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2299     for(i=1; i<17; i++){
2300         h->left_border[i]= src_y[15+i*  linesize];
2301     }
2302
2303     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2304     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2305
2306     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2307         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2308         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2309         for(i=1; i<9; i++){
2310             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2311             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2312         }
2313         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2314         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2315     }
2316 }
2317
2318 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2319     MpegEncContext * const s = &h->s;
2320     int temp8, i;
2321     uint64_t temp64;
2322     int deblock_left;
2323     int deblock_top;
2324     int mb_xy;
2325
2326     if(h->deblocking_filter == 2) {
2327         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2328         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2329         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2330     } else {
2331         deblock_left = (s->mb_x > 0);
2332         deblock_top =  (s->mb_y > 0);
2333     }
2334
2335     src_y  -=   linesize + 1;
2336     src_cb -= uvlinesize + 1;
2337     src_cr -= uvlinesize + 1;
2338
2339 #define XCHG(a,b,t,xchg)\
2340 t= a;\
2341 if(xchg)\
2342     a= b;\
2343 b= t;
2344
2345     if(deblock_left){
2346         for(i = !deblock_top; i<17; i++){
2347             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2348         }
2349     }
2350
2351     if(deblock_top){
2352         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2353         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2354         if(s->mb_x+1 < s->mb_width){
2355             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2356         }
2357     }
2358
2359     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2360         if(deblock_left){
2361             for(i = !deblock_top; i<9; i++){
2362                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2363                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2364             }
2365         }
2366         if(deblock_top){
2367             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2368             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2369         }
2370     }
2371 }
2372
2373 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2374     MpegEncContext * const s = &h->s;
2375     int i;
2376
2377     src_y  -= 2 *   linesize;
2378     src_cb -= 2 * uvlinesize;
2379     src_cr -= 2 * uvlinesize;
2380
2381     // There are two lines saved, the line above the the top macroblock of a pair,
2382     // and the line above the bottom macroblock
2383     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2384     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2385     for(i=2; i<34; i++){
2386         h->left_border[i]= src_y[15+i*  linesize];
2387     }
2388
2389     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2390     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2391     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2392     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2393
2394     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2395         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2396         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2397         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2398         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2399         for(i=2; i<18; i++){
2400             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2401             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2402         }
2403         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2404         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2405         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2406         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2407     }
2408 }
2409
2410 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2411     MpegEncContext * const s = &h->s;
2412     int temp8, i;
2413     uint64_t temp64;
2414     int deblock_left = (s->mb_x > 0);
2415     int deblock_top  = (s->mb_y > 1);
2416
2417     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2418
2419     src_y  -= 2 *   linesize + 1;
2420     src_cb -= 2 * uvlinesize + 1;
2421     src_cr -= 2 * uvlinesize + 1;
2422
2423 #define XCHG(a,b,t,xchg)\
2424 t= a;\
2425 if(xchg)\
2426     a= b;\
2427 b= t;
2428
2429     if(deblock_left){
2430         for(i = (!deblock_top)<<1; i<34; i++){
2431             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2432         }
2433     }
2434
2435     if(deblock_top){
2436         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2437         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2438         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2439         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2440         if(s->mb_x+1 < s->mb_width){
2441             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2442             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2443         }
2444     }
2445
2446     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2447         if(deblock_left){
2448             for(i = (!deblock_top) << 1; i<18; i++){
2449                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2450                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2451             }
2452         }
2453         if(deblock_top){
2454             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2455             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2456             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2457             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2458         }
2459     }
2460 }
2461
/**
 * Reconstructs the macroblock at (s->mb_x, s->mb_y) into s->current_picture.
 *
 * Steps, in order: compute the destination pointers, select line sizes and
 * block offsets (field macroblocks use doubled strides), pick the residual
 * add functions, run intra prediction or inter motion compensation, add the
 * luma and chroma residuals, and finally run the deblocking filter when
 * h->deblocking_filter is set.
 *
 * @param h      decoder context; the macroblock position, type and residual
 *               (h->mb) are read from it
 * @param simple when non-zero, every path guarded by "!simple" is skipped
 *               (field/MBAFF handling, PCM macroblocks) and the checks
 *               short-circuited by "simple ||" (gray flag, encoding flag,
 *               codec id) are not evaluated. Both wrappers pass a constant.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const unsigned int bottom = mb_y & 1;
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: doubled strides and the second set of block
        // offsets (entries 24.., built in frame_start() with 8*linesize steps)
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this func?
            // odd mb_y: rewind 15 luma / 7 chroma lines
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            // rewrite the cached reference indices as (16+ref)^(mb_y&1)
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    // select the luma residual add functions: plain pixel add for transform
    // bypass, otherwise the 8x8 or 4x4 idct
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    // MBAFF intra with deblocking: exchange the pair borders (xchg=1) before
    // predicting; the matching xchg=0 call is in the deblocking section below
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
        int mbt_y = mb_y&~1;
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        unsigned int x, y;

        // The pixels are stored in h->mb array in the same order as levels,
        // copy them in output in the correct order.
        for(i=0; i<16; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=16; i<16+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=20; i<20+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
    } else {
        if(IS_INTRA(mb_type)){
            // non-MBAFF: exchange the borders around intra prediction
            // (xchg=1 here, xchg=0 once prediction is done)
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        // 8x8 intra: predict each block, then add its residual
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    // 4x4 intra: predict each block, then add its residual
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                // no top-right block: replicate the last sample of the row above
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                // 16x16 intra: predict the whole block, then process the luma DC
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensation
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything except 4x4 intra (handled above)
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    // step over four 4x4 indices at a time when the 8x8 transform is used
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual: blocks 16..19 go to cb, 20..23 to cr
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        if (!simple && FRAME_MBAFF) {
            //FIXME try deblocking one mb at a time?
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            // note: from here on mb_y / mb_xy are shadowed and refer to the row above
            const int mb_y = s->mb_y - 1;
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
            const int mb_xy= mb_x + mb_y*s->mb_stride;
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            // the pair is only filtered once its bottom macroblock has been decoded
            if (!bottom) return;
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

            if(IS_INTRA(mb_type_top | mb_type_bottom))
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);

            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
            // deblock a pair
            // top
            s->mb_y--;
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
            // bottom
            s->mb_y++;
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            tprintf(h->s.avctx, "call filter_mb\n");
            // save the unfiltered borders first, then filter this macroblock
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2737
/**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * Calls hl_decode_mb_internal() with simple=1, which disables every path
 * guarded by "!simple" there. hl_decode_mb() selects this variant when none
 * of its is_complex conditions holds.
 */
static void hl_decode_mb_simple(H264Context *h){
    hl_decode_mb_internal(h, 1);
}
2744
/**
 * Process a macroblock; this handles edge cases, such as interlacing.
 * Calls hl_decode_mb_internal() with simple=0, so all of its optional paths
 * stay enabled. Marked av_noinline, unlike the simple variant.
 */
static void av_noinline hl_decode_mb_complex(H264Context *h){
    hl_decode_mb_internal(h, 0);
}
2751
2752 static void hl_decode_mb(H264Context *h){
2753     MpegEncContext * const s = &h->s;
2754     const int mb_x= s->mb_x;
2755     const int mb_y= s->mb_y;
2756     const int mb_xy= mb_x + mb_y*s->mb_stride;
2757     const int mb_type= s->current_picture.mb_type[mb_xy];
2758     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2759
2760     if(!s->decode)
2761         return;
2762
2763     if (is_complex)
2764         hl_decode_mb_complex(h);
2765     else hl_decode_mb_simple(h);
2766 }
2767
2768 static void pic_as_field(Picture *pic, const int parity){
2769     int i;
2770     for (i = 0; i < 4; ++i) {
2771         if (parity == PICT_BOTTOM_FIELD)
2772             pic->data[i] += pic->linesize[i];
2773         pic->reference = parity;
2774         pic->linesize[i] *= 2;
2775     }
2776 }
2777
2778 static int split_field_copy(Picture *dest, Picture *src,
2779                             int parity, int id_add){
2780     int match = !!(src->reference & parity);
2781
2782     if (match) {
2783         *dest = *src;
2784         pic_as_field(dest, parity);
2785         dest->pic_id *= 2;
2786         dest->pic_id += id_add;
2787     }
2788
2789     return match;
2790 }
2791
2792 /**
2793  * Split one reference list into field parts, interleaving by parity
2794  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2795  * set to look at the actual start of data for that field.
2796  *
2797  * @param dest output list
2798  * @param dest_len maximum number of fields to put in dest
2799  * @param src the source reference list containing fields and/or field pairs
2800  *            (aka short_ref/long_ref, or
2801  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2802  * @param src_len number of Picture's in source (pairs and unmatched fields)
2803  * @param parity the parity of the picture being decoded/needing
2804  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2805  * @return number of fields placed in dest
2806  */
2807 static int split_field_half_ref_list(Picture *dest, int dest_len,
2808                                      Picture *src,  int src_len,  int parity){
2809     int same_parity   = 1;
2810     int same_i        = 0;
2811     int opp_i         = 0;
2812     int out_i;
2813     int field_output;
2814
2815     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2816         if (same_parity && same_i < src_len) {
2817             field_output = split_field_copy(dest + out_i, src + same_i,
2818                                             parity, 1);
2819             same_parity = !field_output;
2820             same_i++;
2821
2822         } else if (opp_i < src_len) {
2823             field_output = split_field_copy(dest + out_i, src + opp_i,
2824                                             PICT_FRAME - parity, 0);
2825             same_parity = field_output;
2826             opp_i++;
2827
2828         } else {
2829             break;
2830         }
2831     }
2832
2833     return out_i;
2834 }
2835
2836 /**
2837  * Split the reference frame list into a reference field list.
2838  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2839  * The input list contains both reference field pairs and
2840  * unmatched reference fields; it is ordered as spec describes
2841  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2842  * unmatched field pairs are also present. Conceptually this is equivalent
2843  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2844  *
2845  * @param dest output reference list where ordered fields are to be placed
2846  * @param dest_len max number of fields to place at dest
2847  * @param src source reference list, as described above
2848  * @param src_len number of pictures (pairs and unmatched fields) in src
2849  * @param parity parity of field being currently decoded
2850  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2851  * @param long_i index into src array that holds first long reference picture,
2852  *        or src_len if no long refs present.
2853  */
2854 static int split_field_ref_list(Picture *dest, int dest_len,
2855                                 Picture *src,  int src_len,
2856                                 int parity,    int long_i){
2857
2858     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2859     dest += i;
2860     dest_len -= i;
2861
2862     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2863                                    src_len - long_i, parity);
2864     return i;
2865 }
2866
2867 /**
2868  * fills the default_ref_list.
2869  */
2870 static int fill_default_ref_list(H264Context *h){
2871     MpegEncContext * const s = &h->s;
2872     int i;
2873     int smallest_poc_greater_than_current = -1;
2874     int structure_sel;
2875     Picture sorted_short_ref[32];
2876     Picture field_entry_list[2][32];
2877     Picture *frame_list[2];
2878
2879     if (FIELD_PICTURE) {
2880         structure_sel = PICT_FRAME;
2881         frame_list[0] = field_entry_list[0];
2882         frame_list[1] = field_entry_list[1];
2883     } else {
2884         structure_sel = 0;
2885         frame_list[0] = h->default_ref_list[0];
2886         frame_list[1] = h->default_ref_list[1];
2887     }
2888
2889     if(h->slice_type==B_TYPE){
2890         int list;
2891         int len[2];
2892         int short_len[2];
2893         int out_i;
2894         int limit= INT_MIN;
2895
2896         /* sort frame according to poc in B slice */
2897         for(out_i=0; out_i<h->short_ref_count; out_i++){
2898             int best_i=INT_MIN;
2899             int best_poc=INT_MAX;
2900
2901             for(i=0; i<h->short_ref_count; i++){
2902                 const int poc= h->short_ref[i]->poc;
2903                 if(poc > limit && poc < best_poc){
2904                     best_poc= poc;
2905                     best_i= i;
2906                 }
2907             }
2908
2909             assert(best_i != INT_MIN);
2910
2911             limit= best_poc;
2912             sorted_short_ref[out_i]= *h->short_ref[best_i];
2913             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2914             if (-1 == smallest_poc_greater_than_current) {
2915                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2916                     smallest_poc_greater_than_current = out_i;
2917                 }
2918             }
2919         }
2920
2921         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2922
2923         // find the largest poc
2924         for(list=0; list<2; list++){
2925             int index = 0;
2926             int j= -99;
2927             int step= list ? -1 : 1;
2928
2929             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2930                 int sel;
2931                 while(j<0 || j>= h->short_ref_count){
2932                     if(j != -99 && step == (list ? -1 : 1))
2933                         return -1;
2934                     step = -step;
2935                     j= smallest_poc_greater_than_current + (step>>1);
2936                 }
2937                 sel = sorted_short_ref[j].reference | structure_sel;
2938                 if(sel != PICT_FRAME) continue;
2939                 frame_list[list][index  ]= sorted_short_ref[j];
2940                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2941             }
2942             short_len[list] = index;
2943
2944             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2945                 int sel;
2946                 if(h->long_ref[i] == NULL) continue;
2947                 sel = h->long_ref[i]->reference | structure_sel;
2948                 if(sel != PICT_FRAME) continue;
2949
2950                 frame_list[ list ][index  ]= *h->long_ref[i];
2951                 frame_list[ list ][index++].pic_id= i;;
2952             }
2953             len[list] = index;
2954
2955             if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2956                 // swap the two first elements of L1 when
2957                 // L0 and L1 are identical
2958                 Picture temp= frame_list[1][0];
2959                 frame_list[1][0] = frame_list[1][1];
2960                 frame_list[1][1] = temp;
2961             }
2962
2963         }
2964
2965         for(list=0; list<2; list++){
2966             if (FIELD_PICTURE)
2967                 len[list] = split_field_ref_list(h->default_ref_list[list],
2968                                                  h->ref_count[list],
2969                                                  frame_list[list],
2970                                                  len[list],
2971                                                  s->picture_structure,
2972                                                  short_len[list]);
2973
2974             if(len[list] < h->ref_count[ list ])
2975                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2976         }
2977
2978
2979     }else{
2980         int index=0;
2981         int short_len;
2982         for(i=0; i<h->short_ref_count; i++){
2983             int sel;
2984             sel = h->short_ref[i]->reference | structure_sel;
2985             if(sel != PICT_FRAME) continue;
2986             frame_list[0][index  ]= *h->short_ref[i];
2987             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2988         }
2989         short_len = index;
2990         for(i = 0; i < 16; i++){
2991             int sel;
2992             if(h->long_ref[i] == NULL) continue;
2993             sel = h->long_ref[i]->reference | structure_sel;
2994             if(sel != PICT_FRAME) continue;
2995             frame_list[0][index  ]= *h->long_ref[i];
2996             frame_list[0][index++].pic_id= i;;
2997         }
2998
2999         if (FIELD_PICTURE)
3000             index = split_field_ref_list(h->default_ref_list[0],
3001                                          h->ref_count[0], frame_list[0],
3002                                          index, s->picture_structure,
3003                                          short_len);
3004
3005         if(index < h->ref_count[0])
3006             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3007     }
3008 #ifdef TRACE
3009     for (i=0; i<h->ref_count[0]; i++) {
3010         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3011     }
3012     if(h->slice_type==B_TYPE){
3013         for (i=0; i<h->ref_count[1]; i++) {
3014             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3015         }
3016     }
3017 #endif
3018     return 0;
3019 }
3020
3021 static void print_short_term(H264Context *h);
3022 static void print_long_term(H264Context *h);
3023
3024 /**
3025  * Extract structure information about the picture described by pic_num in
3026  * the current decoding context (frame or field). Note that pic_num is
3027  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3028  * @param pic_num picture number for which to extract structure information
3029  * @param structure one of PICT_XXX describing structure of picture
3030  *                      with pic_num
3031  * @return frame number (short term) or long term index of picture
3032  *         described by pic_num
3033  */
3034 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3035     MpegEncContext * const s = &h->s;
3036
3037     *structure = s->picture_structure;
3038     if(FIELD_PICTURE){
3039         if (!(pic_num & 1))
3040             /* opposite field */
3041             *structure ^= PICT_FRAME;
3042         pic_num >>= 1;
3043     }
3044
3045     return pic_num;
3046 }
3047
3048 static int decode_ref_pic_list_reordering(H264Context *h){
3049     MpegEncContext * const s = &h->s;
3050     int list, index, pic_structure;
3051
3052     print_short_term(h);
3053     print_long_term(h);
3054     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3055
3056     for(list=0; list<h->list_count; list++){
3057         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3058
3059         if(get_bits1(&s->gb)){
3060             int pred= h->curr_pic_num;
3061
3062             for(index=0; ; index++){
3063                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3064                 unsigned int pic_id;
3065                 int i;
3066                 Picture *ref = NULL;
3067
3068                 if(reordering_of_pic_nums_idc==3)
3069                     break;
3070
3071                 if(index >= h->ref_count[list]){
3072                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3073                     return -1;
3074                 }
3075
3076                 if(reordering_of_pic_nums_idc<3){
3077                     if(reordering_of_pic_nums_idc<2){
3078                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3079                         int frame_num;
3080
3081                         if(abs_diff_pic_num > h->max_pic_num){
3082                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3083                             return -1;
3084                         }
3085
3086                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3087                         else                                pred+= abs_diff_pic_num;
3088                         pred &= h->max_pic_num - 1;
3089
3090                         frame_num = pic_num_extract(h, pred, &pic_structure);
3091
3092                         for(i= h->short_ref_count-1; i>=0; i--){
3093                             ref = h->short_ref[i];
3094                             assert(ref->reference);
3095                             assert(!ref->long_ref);
3096                             if(ref->data[0] != NULL &&
3097                                    ref->frame_num == frame_num &&
3098                                    (ref->reference & pic_structure) &&
3099                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3100                                 break;
3101                         }
3102                         if(i>=0)
3103                             ref->pic_id= pred;
3104                     }else{
3105                         int long_idx;
3106                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3107
3108                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3109
3110                         if(long_idx>31){
3111                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3112                             return -1;
3113                         }
3114                         ref = h->long_ref[long_idx];
3115                         assert(!(ref && !ref->reference));
3116                         if(ref && (ref->reference & pic_structure)){
3117                             ref->pic_id= pic_id;
3118                             assert(ref->long_ref);
3119                             i=0;
3120                         }else{
3121                             i=-1;
3122                         }
3123                     }
3124
3125                     if (i < 0) {
3126                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3127                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3128                     } else {
3129                         for(i=index; i+1<h->ref_count[list]; i++){
3130                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3131                                 break;
3132                         }
3133                         for(; i > index; i--){
3134                             h->ref_list[list][i]= h->ref_list[list][i-1];
3135                         }
3136                         h->ref_list[list][index]= *ref;
3137                         if (FIELD_PICTURE){
3138                             pic_as_field(&h->ref_list[list][index], pic_structure);
3139                         }
3140                     }
3141                 }else{
3142                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3143                     return -1;
3144                 }
3145             }
3146         }
3147     }
3148     for(list=0; list<h->list_count; list++){
3149         for(index= 0; index < h->ref_count[list]; index++){
3150             if(!h->ref_list[list][index].data[0])
3151                 h->ref_list[list][index]= s->current_picture;
3152         }
3153     }
3154
3155     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3156         direct_dist_scale_factor(h);
3157     direct_ref_list_init(h);
3158     return 0;
3159 }
3160
3161 static void fill_mbaff_ref_list(H264Context *h){
3162     int list, i, j;
3163     for(list=0; list<2; list++){ //FIXME try list_count
3164         for(i=0; i<h->ref_count[list]; i++){
3165             Picture *frame = &h->ref_list[list][i];
3166             Picture *field = &h->ref_list[list][16+2*i];
3167             field[0] = *frame;
3168             for(j=0; j<3; j++)
3169                 field[0].linesize[j] <<= 1;
3170             field[0].reference = PICT_TOP_FIELD;
3171             field[1] = field[0];
3172             for(j=0; j<3; j++)
3173                 field[1].data[j] += frame->linesize[j];
3174             field[1].reference = PICT_BOTTOM_FIELD;
3175
3176             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3177             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3178             for(j=0; j<2; j++){
3179                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3180                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3181             }
3182         }
3183     }
3184     for(j=0; j<h->ref_count[1]; j++){
3185         for(i=0; i<h->ref_count[0]; i++)
3186             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3187         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3188         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3189     }
3190 }
3191
3192 static int pred_weight_table(H264Context *h){
3193     MpegEncContext * const s = &h->s;
3194     int list, i;
3195     int luma_def, chroma_def;
3196
3197     h->use_weight= 0;
3198     h->use_weight_chroma= 0;
3199     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3200     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3201     luma_def = 1<<h->luma_log2_weight_denom;
3202     chroma_def = 1<<h->chroma_log2_weight_denom;
3203
3204     for(list=0; list<2; list++){
3205         for(i=0; i<h->ref_count[list]; i++){
3206             int luma_weight_flag, chroma_weight_flag;
3207
3208             luma_weight_flag= get_bits1(&s->gb);
3209             if(luma_weight_flag){
3210                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3211                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3212                 if(   h->luma_weight[list][i] != luma_def
3213                    || h->luma_offset[list][i] != 0)
3214                     h->use_weight= 1;
3215             }else{
3216                 h->luma_weight[list][i]= luma_def;
3217                 h->luma_offset[list][i]= 0;
3218             }
3219
3220             chroma_weight_flag= get_bits1(&s->gb);
3221             if(chroma_weight_flag){
3222                 int j;
3223                 for(j=0; j<2; j++){
3224                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3225                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3226                     if(   h->chroma_weight[list][i][j] != chroma_def
3227                        || h->chroma_offset[list][i][j] != 0)
3228                         h->use_weight_chroma= 1;
3229                 }
3230             }else{
3231                 int j;
3232                 for(j=0; j<2; j++){
3233                     h->chroma_weight[list][i][j]= chroma_def;
3234                     h->chroma_offset[list][i][j]= 0;
3235                 }
3236             }
3237         }
3238         if(h->slice_type != B_TYPE) break;
3239     }
3240     h->use_weight= h->use_weight || h->use_weight_chroma;
3241     return 0;
3242 }
3243
3244 static void implicit_weight_table(H264Context *h){
3245     MpegEncContext * const s = &h->s;
3246     int ref0, ref1;
3247     int cur_poc = s->current_picture_ptr->poc;
3248
3249     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3250        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3251         h->use_weight= 0;
3252         h->use_weight_chroma= 0;
3253         return;
3254     }
3255
3256     h->use_weight= 2;
3257     h->use_weight_chroma= 2;
3258     h->luma_log2_weight_denom= 5;
3259     h->chroma_log2_weight_denom= 5;
3260
3261     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3262         int poc0 = h->ref_list[0][ref0].poc;
3263         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3264             int poc1 = h->ref_list[1][ref1].poc;
3265             int td = av_clip(poc1 - poc0, -128, 127);
3266             if(td){
3267                 int tb = av_clip(cur_poc - poc0, -128, 127);
3268                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3269                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3270                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3271                     h->implicit_weight[ref0][ref1] = 32;
3272                 else
3273                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3274             }else
3275                 h->implicit_weight[ref0][ref1] = 32;
3276         }
3277     }
3278 }
3279
3280 /**
3281  * Mark a picture as no longer needed for reference. The refmask
3282  * argument allows unreferencing of individual fields or the whole frame.
3283  * If the picture becomes entirely unreferenced, but is being held for
3284  * display purposes, it is marked as such.
3285  * @param refmask mask of fields to unreference; the mask is bitwise
3286  *                anded with the reference marking of pic
3287  * @return non-zero if pic becomes entirely unreferenced (except possibly
3288  *         for display purposes) zero if one of the fields remains in
3289  *         reference
3290  */
3291 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3292     int i;
3293     if (pic->reference &= refmask) {
3294         return 0;
3295     } else {
3296         if(pic == h->delayed_output_pic)
3297             pic->reference=DELAYED_PIC_REF;
3298         else{
3299             for(i = 0; h->delayed_pic[i]; i++)
3300                 if(pic == h->delayed_pic[i]){
3301                     pic->reference=DELAYED_PIC_REF;
3302                     break;
3303                 }
3304         }
3305         return 1;
3306     }
3307 }
3308
3309 /**
3310  * instantaneous decoder refresh.
3311  */
3312 static void idr(H264Context *h){
3313     int i;
3314
3315     for(i=0; i<16; i++){
3316         if (h->long_ref[i] != NULL) {
3317             unreference_pic(h, h->long_ref[i], 0);
3318             h->long_ref[i]= NULL;
3319         }
3320     }
3321     h->long_ref_count=0;
3322
3323     for(i=0; i<h->short_ref_count; i++){
3324         unreference_pic(h, h->short_ref[i], 0);
3325         h->short_ref[i]= NULL;
3326     }
3327     h->short_ref_count=0;
3328 }
3329
3330 /* forget old pics after a seek */
3331 static void flush_dpb(AVCodecContext *avctx){
3332     H264Context *h= avctx->priv_data;
3333     int i;
3334     for(i=0; i<16; i++) {
3335         if(h->delayed_pic[i])
3336             h->delayed_pic[i]->reference= 0;
3337         h->delayed_pic[i]= NULL;
3338     }
3339     if(h->delayed_output_pic)
3340         h->delayed_output_pic->reference= 0;
3341     h->delayed_output_pic= NULL;
3342     idr(h);
3343     if(h->s.current_picture_ptr)
3344         h->s.current_picture_ptr->reference= 0;
3345     h->s.first_field= 0;
3346     ff_mpeg_flush(avctx);
3347 }
3348
3349 /**
3350  * Find a Picture in the short term reference list by frame number.
3351  * @param frame_num frame number to search for
3352  * @param idx the index into h->short_ref where returned picture is found
3353  *            undefined if no picture found.
3354  * @return pointer to the found picture, or NULL if no pic with the provided
3355  *                 frame number is found
3356  */
3357 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3358     MpegEncContext * const s = &h->s;
3359     int i;
3360
3361     for(i=0; i<h->short_ref_count; i++){
3362         Picture *pic= h->short_ref[i];
3363         if(s->avctx->debug&FF_DEBUG_MMCO)
3364             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3365         if(pic->frame_num == frame_num) {
3366             *idx = i;
3367             return pic;
3368         }
3369     }
3370     return NULL;
3371 }
3372
3373 /**
3374  * Remove a picture from the short term reference list by its index in
3375  * that list.  This does no checking on the provided index; it is assumed
3376  * to be valid. Other list entries are shifted down.
3377  * @param i index into h->short_ref of picture to remove.
3378  */
3379 static void remove_short_at_index(H264Context *h, int i){
3380     assert(i > 0 && i < h->short_ref_count);
3381     h->short_ref[i]= NULL;
3382     if (--h->short_ref_count)
3383         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3384 }
3385
3386 /**
3387  *
3388  * @return the removed picture or NULL if an error occurs
3389  */
3390 static Picture * remove_short(H264Context *h, int frame_num){
3391     MpegEncContext * const s = &h->s;
3392     Picture *pic;
3393     int i;
3394
3395     if(s->avctx->debug&FF_DEBUG_MMCO)
3396         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3397
3398     pic = find_short(h, frame_num, &i);
3399     if (pic)
3400         remove_short_at_index(h, i);
3401
3402     return pic;
3403 }
3404
3405 /**
3406  * Remove a picture from the long term reference list by its index in
3407  * that list.  This does no checking on the provided index; it is assumed
3408  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3409  * @param i index into h->long_ref of picture to remove.
3410  */
3411 static void remove_long_at_index(H264Context *h, int i){
3412     h->long_ref[i]= NULL;
3413     h->long_ref_count--;
3414 }
3415
3416 /**
3417  *
3418  * @return the removed picture or NULL if an error occurs
3419  */
3420 static Picture * remove_long(H264Context *h, int i){
3421     Picture *pic;
3422
3423     pic= h->long_ref[i];
3424     if (pic)
3425         remove_long_at_index(h, i);
3426
3427     return pic;
3428 }
3429
3430 /**
3431  * print short term list
3432  */
3433 static void print_short_term(H264Context *h) {
3434     uint32_t i;
3435     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3436         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3437         for(i=0; i<h->short_ref_count; i++){
3438             Picture *pic= h->short_ref[i];
3439             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3440         }
3441     }
3442 }
3443
3444 /**
3445  * print long term list
3446  */
3447 static void print_long_term(H264Context *h) {
3448     uint32_t i;
3449     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3450         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3451         for(i = 0; i < 16; i++){
3452             Picture *pic= h->long_ref[i];
3453             if (pic) {
3454                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3455             }
3456         }
3457     }
3458 }
3459
3460 /**
3461  * Executes the reference picture marking (memory management control operations).
3462  */
3463 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3464     MpegEncContext * const s = &h->s;
3465     int i, j;
3466     int current_ref_assigned=0;
3467     Picture *pic;
3468
3469     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3470         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3471
3472     for(i=0; i<mmco_count; i++){
3473         int structure, frame_num, unref_pic;
3474         if(s->avctx->debug&FF_DEBUG_MMCO)
3475             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3476
3477         switch(mmco[i].opcode){
3478         case MMCO_SHORT2UNUSED:
3479             if(s->avctx->debug&FF_DEBUG_MMCO)
3480                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3481             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3482             pic = find_short(h, frame_num, &j);
3483             if (pic) {
3484                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3485                     remove_short_at_index(h, j);
3486             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3487                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3488             break;
3489         case MMCO_SHORT2LONG:
3490             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3491                     h->long_ref[mmco[i].long_arg]->frame_num ==
3492                                               mmco[i].short_pic_num / 2) {
3493                 /* do nothing, we've already moved this field pair. */
3494             } else {
3495                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3496
3497                 pic= remove_long(h, mmco[i].long_arg);
3498                 if(pic) unreference_pic(h, pic, 0);
3499
3500                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3501                 if (h->long_ref[ mmco[i].long_arg ]){
3502                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3503                     h->long_ref_count++;
3504                 }
3505             }
3506             break;
3507         case MMCO_LONG2UNUSED:
3508             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3509             pic = h->long_ref[j];
3510             if (pic) {
3511                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3512                     remove_long_at_index(h, j);
3513             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3514                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3515             break;
3516         case MMCO_LONG:
3517             unref_pic = 1;
3518             if (FIELD_PICTURE && !s->first_field) {
3519                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3520                     /* Just mark second field as referenced */
3521                     unref_pic = 0;
3522                 } else if (s->current_picture_ptr->reference) {
3523                     /* First field in pair is in short term list or
3524                      * at a different long term index.
3525                      * This is not allowed; see 7.4.3, notes 2 and 3.
3526                      * Report the problem and keep the pair where it is,
3527                      * and mark this field valid.
3528                      */
3529                     av_log(h->s.avctx, AV_LOG_ERROR,
3530                         "illegal long term reference assignment for second "
3531                         "field in complementary field pair (first field is "
3532                         "short term or has non-matching long index)\n");
3533                     unref_pic = 0;
3534                 }
3535             }
3536
3537             if (unref_pic) {
3538                 pic= remove_long(h, mmco[i].long_arg);
3539                 if(pic) unreference_pic(h, pic, 0);
3540
3541                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3542                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3543                 h->long_ref_count++;
3544             }
3545
3546             s->current_picture_ptr->reference |= s->picture_structure;
3547             current_ref_assigned=1;
3548             break;
3549         case MMCO_SET_MAX_LONG:
3550             assert(mmco[i].long_arg <= 16);
3551             // just remove the long term which index is greater than new max
3552             for(j = mmco[i].long_arg; j<16; j++){
3553                 pic = remove_long(h, j);
3554                 if (pic) unreference_pic(h, pic, 0);
3555             }
3556             break;
3557         case MMCO_RESET:
3558             while(h->short_ref_count){
3559                 pic= remove_short(h, h->short_ref[0]->frame_num);
3560                 if(pic) unreference_pic(h, pic, 0);
3561             }
3562             for(j = 0; j < 16; j++) {
3563                 pic= remove_long(h, j);
3564                 if(pic) unreference_pic(h, pic, 0);
3565             }
3566             break;
3567         default: assert(0);
3568         }
3569     }
3570
3571     if (!current_ref_assigned && FIELD_PICTURE &&
3572             !s->first_field && s->current_picture_ptr->reference) {
3573
3574         /* Second field of complementary field pair; the first field of
3575          * which is already referenced. If short referenced, it
3576          * should be first entry in short_ref. If not, it must exist
3577          * in long_ref; trying to put it on the short list here is an
3578          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3579          */
3580         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3581             /* Just mark the second field valid */
3582             s->current_picture_ptr->reference = PICT_FRAME;
3583         } else if (s->current_picture_ptr->long_ref) {
3584             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3585                                              "assignment for second field "
3586                                              "in complementary field pair "
3587                                              "(first field is long term)\n");
3588         } else {
3589             /*
3590              * First field in reference, but not in any sensible place on our
3591              * reference lists. This shouldn't happen unless reference
3592              * handling somewhere else is wrong.
3593              */
3594             assert(0);
3595         }
3596         current_ref_assigned = 1;
3597     }
3598
3599     if(!current_ref_assigned){
3600         pic= remove_short(h, s->current_picture_ptr->frame_num);
3601         if(pic){
3602             unreference_pic(h, pic, 0);
3603             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3604         }
3605
3606         if(h->short_ref_count)
3607             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3608
3609         h->short_ref[0]= s->current_picture_ptr;
3610         h->short_ref[0]->long_ref=0;
3611         h->short_ref_count++;
3612         s->current_picture_ptr->reference |= s->picture_structure;
3613     }
3614
3615     print_short_term(h);
3616     print_long_term(h);
3617     return 0;
3618 }
3619
3620 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3621     MpegEncContext * const s = &h->s;
3622     int i;
3623
3624     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3625         s->broken_link= get_bits1(gb) -1;
3626         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3627         if(h->mmco[0].long_arg == -1)
3628             h->mmco_index= 0;
3629         else{
3630             h->mmco[0].opcode= MMCO_LONG;
3631             h->mmco_index= 1;
3632         }
3633     }else{
3634         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3635             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3636                 MMCOOpcode opcode= get_ue_golomb(gb);
3637
3638                 h->mmco[i].opcode= opcode;
3639                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3640                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3641 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3642                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3643                         return -1;
3644                     }*/
3645                 }
3646                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3647                     unsigned int long_arg= get_ue_golomb(gb);
3648                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3649                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3650                         return -1;
3651                     }
3652                     h->mmco[i].long_arg= long_arg;
3653                 }
3654
3655                 if(opcode > (unsigned)MMCO_LONG){
3656                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3657                     return -1;
3658                 }
3659                 if(opcode == MMCO_END)
3660                     break;
3661             }
3662             h->mmco_index= i;
3663         }else{
3664             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3665
3666             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3667                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3668                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3669                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3670                 h->mmco_index= 1;
3671                 if (FIELD_PICTURE) {
3672                     h->mmco[0].short_pic_num *= 2;
3673                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3674                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3675                     h->mmco_index= 2;
3676                 }
3677             }else
3678                 h->mmco_index= 0;
3679         }
3680     }
3681
3682     return 0;
3683 }
3684
3685 static int init_poc(H264Context *h){
3686     MpegEncContext * const s = &h->s;
3687     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3688     int field_poc[2];
3689
3690     if(h->nal_unit_type == NAL_IDR_SLICE){
3691         h->frame_num_offset= 0;
3692     }else{
3693         if(h->frame_num < h->prev_frame_num)
3694             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3695         else
3696             h->frame_num_offset= h->prev_frame_num_offset;
3697     }
3698
3699     if(h->sps.poc_type==0){
3700         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3701
3702         if(h->nal_unit_type == NAL_IDR_SLICE){
3703              h->prev_poc_msb=
3704              h->prev_poc_lsb= 0;
3705         }
3706
3707         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3708             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3709         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3710             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3711         else
3712             h->poc_msb = h->prev_poc_msb;
3713 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3714         field_poc[0] =
3715         field_poc[1] = h->poc_msb + h->poc_lsb;
3716         if(s->picture_structure == PICT_FRAME)
3717             field_poc[1] += h->delta_poc_bottom;
3718     }else if(h->sps.poc_type==1){
3719         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3720         int i;
3721
3722         if(h->sps.poc_cycle_length != 0)
3723             abs_frame_num = h->frame_num_offset + h->frame_num;
3724         else
3725             abs_frame_num = 0;
3726
3727         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3728             abs_frame_num--;
3729
3730         expected_delta_per_poc_cycle = 0;
3731         for(i=0; i < h->sps.poc_cycle_length; i++)
3732             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3733
3734         if(abs_frame_num > 0){
3735             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3736             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3737
3738             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3739             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3740                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3741         } else
3742             expectedpoc = 0;
3743
3744         if(h->nal_ref_idc == 0)
3745             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3746
3747         field_poc[0] = expectedpoc + h->delta_poc[0];
3748         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3749
3750         if(s->picture_structure == PICT_FRAME)
3751             field_poc[1] += h->delta_poc[1];
3752     }else{
3753         int poc;
3754         if(h->nal_unit_type == NAL_IDR_SLICE){
3755             poc= 0;
3756         }else{
3757             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3758             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3759         }
3760         field_poc[0]= poc;
3761         field_poc[1]= poc;
3762     }
3763
3764     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3765         s->current_picture_ptr->field_poc[0]= field_poc[0];
3766         s->current_picture_ptr->poc = field_poc[0];
3767     }
3768     if(s->picture_structure != PICT_TOP_FIELD) {
3769         s->current_picture_ptr->field_poc[1]= field_poc[1];
3770         s->current_picture_ptr->poc = field_poc[1];
3771     }
3772     if(!FIELD_PICTURE || !s->first_field) {
3773         Picture *cur = s->current_picture_ptr;
3774         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3775     }
3776
3777     return 0;
3778 }
3779
3780
3781 /**
3782  * initialize scan tables
3783  */
3784 static void init_scan_tables(H264Context *h){
3785     MpegEncContext * const s = &h->s;
3786     int i;
3787     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3788         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3789         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3790     }else{
3791         for(i=0; i<16; i++){
3792 #define T(x) (x>>2) | ((x<<2) & 0xF)
3793             h->zigzag_scan[i] = T(zigzag_scan[i]);
3794             h-> field_scan[i] = T( field_scan[i]);
3795 #undef T
3796         }
3797     }
3798     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3799         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3800         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3801         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3802         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3803     }else{
3804         for(i=0; i<64; i++){
3805 #define T(x) (x>>3) | ((x&7)<<3)
3806             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3807             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3808             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3809             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3810 #undef T
3811         }
3812     }
3813     if(h->sps.transform_bypass){ //FIXME same ugly
3814         h->zigzag_scan_q0          = zigzag_scan;
3815         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3816         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3817         h->field_scan_q0           = field_scan;
3818         h->field_scan8x8_q0        = field_scan8x8;
3819         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3820     }else{
3821         h->zigzag_scan_q0          = h->zigzag_scan;
3822         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3823         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3824         h->field_scan_q0           = h->field_scan;
3825         h->field_scan8x8_q0        = h->field_scan8x8;
3826         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3827     }
3828 }
3829
3830 /**
3831  * Replicates H264 "master" context to thread contexts.
3832  */
3833 static void clone_slice(H264Context *dst, H264Context *src)
3834 {
3835     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3836     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3837     dst->s.current_picture      = src->s.current_picture;
3838     dst->s.linesize             = src->s.linesize;
3839     dst->s.uvlinesize           = src->s.uvlinesize;
3840     dst->s.first_field          = src->s.first_field;
3841
3842     dst->prev_poc_msb           = src->prev_poc_msb;
3843     dst->prev_poc_lsb           = src->prev_poc_lsb;
3844     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3845     dst->prev_frame_num         = src->prev_frame_num;
3846     dst->short_ref_count        = src->short_ref_count;
3847
3848     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3849     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3850     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3851     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3852
3853     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3854     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3855 }
3856
3857 /**
3858  * decodes a slice header.
3859  * this will allso call MPV_common_init() and frame_start() as needed
3860  *
3861  * @param h h264context
3862  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3863  *
3864  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3865  */
3866 static int decode_slice_header(H264Context *h, H264Context *h0){
3867     MpegEncContext * const s = &h->s;
3868     MpegEncContext * const s0 = &h0->s;
3869     unsigned int first_mb_in_slice;
3870     unsigned int pps_id;
3871     int num_ref_idx_active_override_flag;
3872     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3873     unsigned int slice_type, tmp, i;
3874     int default_ref_list_done = 0;
3875     int last_pic_structure;
3876
3877     s->dropable= h->nal_ref_idc == 0;
3878
3879     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3880         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3881         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3882     }else{
3883         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3884         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3885     }
3886
3887     first_mb_in_slice= get_ue_golomb(&s->gb);
3888
3889     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3890         h0->current_slice = 0;
3891         if (!s0->first_field)
3892             s->current_picture_ptr= NULL;
3893     }
3894
3895     slice_type= get_ue_golomb(&s->gb);
3896     if(slice_type > 9){
3897         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3898         return -1;
3899     }
3900     if(slice_type > 4){
3901         slice_type -= 5;
3902         h->slice_type_fixed=1;
3903     }else
3904         h->slice_type_fixed=0;
3905
3906     slice_type= slice_type_map[ slice_type ];
3907     if (slice_type == I_TYPE
3908         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3909         default_ref_list_done = 1;
3910     }
3911     h->slice_type= slice_type;
3912
3913     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3914
3915     pps_id= get_ue_golomb(&s->gb);
3916     if(pps_id>=MAX_PPS_COUNT){
3917         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3918         return -1;
3919     }
3920     if(!h0->pps_buffers[pps_id]) {
3921         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3922         return -1;
3923     }
3924     h->pps= *h0->pps_buffers[pps_id];
3925
3926     if(!h0->sps_buffers[h->pps.sps_id]) {
3927         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3928         return -1;
3929     }
3930     h->sps = *h0->sps_buffers[h->pps.sps_id];
3931
3932     if(h == h0 && h->dequant_coeff_pps != pps_id){
3933         h->dequant_coeff_pps = pps_id;
3934         init_dequant_tables(h);
3935     }
3936
3937     s->mb_width= h->sps.mb_width;
3938     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3939
3940     h->b_stride=  s->mb_width*4;
3941     h->b8_stride= s->mb_width*2;
3942
3943     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3944     if(h->sps.frame_mbs_only_flag)
3945         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3946     else
3947         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3948
3949     if (s->context_initialized
3950         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3951         if(h != h0)
3952             return -1;   // width / height changed during parallelized decoding
3953         free_tables(h);
3954         MPV_common_end(s);
3955     }
3956     if (!s->context_initialized) {
3957         if(h != h0)
3958             return -1;  // we cant (re-)initialize context during parallel decoding
3959         if (MPV_common_init(s) < 0)
3960             return -1;
3961         s->first_field = 0;
3962
3963         init_scan_tables(h);
3964         alloc_tables(h);
3965
3966         for(i = 1; i < s->avctx->thread_count; i++) {
3967             H264Context *c;
3968             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3969             memcpy(c, h, sizeof(MpegEncContext));
3970             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3971             c->sps = h->sps;
3972             c->pps = h->pps;
3973             init_scan_tables(c);
3974             clone_tables(c, h);
3975         }
3976
3977         for(i = 0; i < s->avctx->thread_count; i++)
3978             if(context_init(h->thread_context[i]) < 0)
3979                 return -1;
3980
3981         s->avctx->width = s->width;
3982         s->avctx->height = s->height;
3983         s->avctx->sample_aspect_ratio= h->sps.sar;
3984         if(!s->avctx->sample_aspect_ratio.den)
3985             s->avctx->sample_aspect_ratio.den = 1;
3986
3987         if(h->sps.timing_info_present_flag){
3988             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3989             if(h->x264_build > 0 && h->x264_build < 44)
3990                 s->avctx->time_base.den *= 2;
3991             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3992                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3993         }
3994     }
3995
3996     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3997
3998     h->mb_mbaff = 0;
3999     h->mb_aff_frame = 0;
4000     last_pic_structure = s0->picture_structure;
4001     if(h->sps.frame_mbs_only_flag){
4002         s->picture_structure= PICT_FRAME;
4003     }else{
4004         if(get_bits1(&s->gb)) { //field_pic_flag
4005             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4006         } else {
4007             s->picture_structure= PICT_FRAME;
4008             h->mb_aff_frame = h->sps.mb_aff;
4009         }
4010     }
4011
4012     if(h0->current_slice == 0){
4013         /* See if we have a decoded first field looking for a pair... */
4014         if (s0->first_field) {
4015             assert(s0->current_picture_ptr);
4016             assert(s0->current_picture_ptr->data[0]);
4017             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4018
4019             /* figure out if we have a complementary field pair */
4020             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4021                 /*
4022                  * Previous field is unmatched. Don't display it, but let it
4023                  * remain for reference if marked as such.
4024                  */
4025                 s0->current_picture_ptr = NULL;
4026                 s0->first_field = FIELD_PICTURE;
4027
4028             } else {
4029                 if (h->nal_ref_idc &&
4030                         s0->current_picture_ptr->reference &&
4031                         s0->current_picture_ptr->frame_num != h->frame_num) {
4032                     /*
4033                      * This and previous field were reference, but had
4034                      * different frame_nums. Consider this field first in
4035                      * pair. Throw away previous field except for reference
4036                      * purposes.
4037                      */
4038                     s0->first_field = 1;
4039                     s0->current_picture_ptr = NULL;
4040
4041                 } else {
4042                     /* Second field in complementary pair */
4043                     s0->first_field = 0;
4044                 }
4045             }
4046
4047         } else {
4048             /* Frame or first field in a potentially complementary pair */
4049             assert(!s0->current_picture_ptr);
4050             s0->first_field = FIELD_PICTURE;
4051         }
4052
4053         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4054             s0->first_field = 0;
4055             return -1;
4056         }
4057     }
4058     if(h != h0)
4059         clone_slice(h, h0);
4060
4061     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4062
4063     assert(s->mb_num == s->mb_width * s->mb_height);
4064     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4065        first_mb_in_slice                    >= s->mb_num){
4066         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4067         return -1;
4068     }
4069     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4070     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4071     if (s->picture_structure == PICT_BOTTOM_FIELD)
4072         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4073     assert(s->mb_y < s->mb_height);
4074
4075     if(s->picture_structure==PICT_FRAME){
4076         h->curr_pic_num=   h->frame_num;
4077         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4078     }else{
4079         h->curr_pic_num= 2*h->frame_num + 1;
4080         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4081     }
4082
4083     if(h->nal_unit_type == NAL_IDR_SLICE){
4084         get_ue_golomb(&s->gb); /* idr_pic_id */
4085     }
4086
4087     if(h->sps.poc_type==0){
4088         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4089
4090         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4091             h->delta_poc_bottom= get_se_golomb(&s->gb);
4092         }
4093     }
4094
4095     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4096         h->delta_poc[0]= get_se_golomb(&s->gb);
4097
4098         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4099             h->delta_poc[1]= get_se_golomb(&s->gb);
4100     }
4101
4102     init_poc(h);
4103
4104     if(h->pps.redundant_pic_cnt_present){
4105         h->redundant_pic_count= get_ue_golomb(&s->gb);
4106     }
4107
4108     //set defaults, might be overriden a few line later
4109     h->ref_count[0]= h->pps.ref_count[0];
4110     h->ref_count[1]= h->pps.ref_count[1];
4111
4112     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4113         if(h->slice_type == B_TYPE){
4114             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4115             if(FIELD_OR_MBAFF_PICTURE && h->direct_spatial_mv_pred)
4116                 av_log(h->s.avctx, AV_LOG_ERROR, "Interlaced pictures + spatial direct mode is not implemented\n");
4117         }
4118         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4119
4120         if(num_ref_idx_active_override_flag){
4121             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4122             if(h->slice_type==B_TYPE)
4123                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4124
4125             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4126                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4127                 h->ref_count[0]= h->ref_count[1]= 1;
4128                 return -1;
4129             }
4130         }
4131         if(h->slice_type == B_TYPE)
4132             h->list_count= 2;
4133         else
4134             h->list_count= 1;
4135     }else
4136         h->list_count= 0;
4137
4138     if(!default_ref_list_done){
4139         fill_default_ref_list(h);
4140     }
4141
4142     if(decode_ref_pic_list_reordering(h) < 0)
4143         return -1;
4144
4145     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4146        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4147         pred_weight_table(h);
4148     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4149         implicit_weight_table(h);
4150     else
4151         h->use_weight = 0;
4152
4153     if(h->nal_ref_idc)
4154         decode_ref_pic_marking(h0, &s->gb);
4155
4156     if(FRAME_MBAFF)
4157         fill_mbaff_ref_list(h);
4158
4159     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4160         tmp = get_ue_golomb(&s->gb);
4161         if(tmp > 2){
4162             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4163             return -1;
4164         }
4165         h->cabac_init_idc= tmp;
4166     }
4167
4168     h->last_qscale_diff = 0;
4169     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4170     if(tmp>51){
4171         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4172         return -1;
4173     }
4174     s->qscale= tmp;
4175     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4176     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4177     //FIXME qscale / qp ... stuff
4178     if(h->slice_type == SP_TYPE){
4179         get_bits1(&s->gb); /* sp_for_switch_flag */
4180     }
4181     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4182         get_se_golomb(&s->gb); /* slice_qs_delta */
4183     }
4184
4185     h->deblocking_filter = 1;
4186     h->slice_alpha_c0_offset = 0;
4187     h->slice_beta_offset = 0;
4188     if( h->pps.deblocking_filter_parameters_present ) {
4189         tmp= get_ue_golomb(&s->gb);
4190         if(tmp > 2){
4191             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4192             return -1;
4193         }
4194         h->deblocking_filter= tmp;
4195         if(h->deblocking_filter < 2)
4196             h->deblocking_filter^= 1; // 1<->0
4197
4198         if( h->deblocking_filter ) {
4199             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4200             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4201         }
4202     }
4203
4204     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4205        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4206        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4207        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4208         h->deblocking_filter= 0;
4209
4210     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4211         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4212             /* Cheat slightly for speed:
4213                Dont bother to deblock across slices */
4214             h->deblocking_filter = 2;
4215         } else {
4216             h0->max_contexts = 1;
4217             if(!h0->single_decode_warning) {
4218                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4219                 h0->single_decode_warning = 1;
4220             }
4221             if(h != h0)
4222                 return 1; // deblocking switched inside frame
4223         }
4224     }
4225
4226 #if 0 //FMO
4227     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4228         slice_group_change_cycle= get_bits(&s->gb, ?);
4229 #endif
4230
4231     h0->last_slice_type = slice_type;
4232     h->slice_num = ++h0->current_slice;
4233
4234     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4235     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4236
4237     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4238         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4239                h->slice_num,
4240                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4241                first_mb_in_slice,
4242                av_get_pict_type_char(h->slice_type),
4243                pps_id, h->frame_num,
4244                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4245                h->ref_count[0], h->ref_count[1],
4246                s->qscale,
4247                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4248                h->use_weight,
4249                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4250                );
4251     }
4252
4253     return 0;
4254 }
4255
4256 /**
4257  *
4258  */
4259 static inline int get_level_prefix(GetBitContext *gb){
4260     unsigned int buf;
4261     int log;
4262
4263     OPEN_READER(re, gb);
4264     UPDATE_CACHE(re, gb);
4265     buf=GET_CACHE(re, gb);
4266
4267     log= 32 - av_log2(buf);
4268 #ifdef TRACE
4269     print_bin(buf>>(32-log), log);
4270     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4271 #endif
4272
4273     LAST_SKIP_BITS(re, gb, log);
4274     CLOSE_READER(re, gb);
4275
4276     return log-1;
4277 }
4278
4279 static inline int get_dct8x8_allowed(H264Context *h){
4280     int i;
4281     for(i=0; i<4; i++){
4282         if(!IS_SUB_8X8(h->sub_mb_type[i])
4283            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4284             return 0;
4285     }
4286     return 1;
4287 }
4288
4289 /**
4290  * decodes a residual block.
4291  * @param n block index
4292  * @param scantable scantable
4293  * @param max_coeff number of coefficients in the block
4294  * @return <0 if an error occured
4295  */
4296 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4297     MpegEncContext * const s = &h->s;
4298     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4299     int level[16];
4300     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4301
4302     //FIXME put trailing_onex into the context
4303
4304     if(n == CHROMA_DC_BLOCK_INDEX){
4305         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4306         total_coeff= coeff_token>>2;
4307     }else{
4308         if(n == LUMA_DC_BLOCK_INDEX){
4309             total_coeff= pred_non_zero_count(h, 0);
4310             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4311             total_coeff= coeff_token>>2;
4312         }else{
4313             total_coeff= pred_non_zero_count(h, n);
4314             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4315             total_coeff= coeff_token>>2;
4316             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4317         }
4318     }
4319
4320     //FIXME set last_non_zero?
4321
4322     if(total_coeff==0)
4323         return 0;
4324     if(total_coeff > (unsigned)max_coeff) {
4325         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4326         return -1;
4327     }
4328
4329     trailing_ones= coeff_token&3;
4330     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4331     assert(total_coeff<=16);
4332
4333     for(i=0; i<trailing_ones; i++){
4334         level[i]= 1 - 2*get_bits1(gb);
4335     }
4336
4337     if(i<total_coeff) {
4338         int level_code, mask;
4339         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4340         int prefix= get_level_prefix(gb);
4341
4342         //first coefficient has suffix_length equal to 0 or 1
4343         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4344             if(suffix_length)
4345                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4346             else
4347                 level_code= (prefix<<suffix_length); //part
4348         }else if(prefix==14){
4349             if(suffix_length)
4350                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4351             else
4352                 level_code= prefix + get_bits(gb, 4); //part
4353         }else if(prefix==15){
4354             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4355             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4356         }else{
4357             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4358             return -1;
4359         }
4360
4361         if(trailing_ones < 3) level_code += 2;
4362
4363         suffix_length = 1;
4364         if(level_code > 5)
4365             suffix_length++;
4366         mask= -(level_code&1);
4367         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4368         i++;
4369
4370         //remaining coefficients have suffix_length > 0
4371         for(;i<total_coeff;i++) {
4372             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4373             prefix = get_level_prefix(gb);
4374             if(prefix<15){
4375                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4376             }else if(prefix==15){
4377                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4378             }else{
4379                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4380                 return -1;
4381             }
4382             mask= -(level_code&1);
4383             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4384             if(level_code > suffix_limit[suffix_length])
4385                 suffix_length++;
4386         }
4387     }
4388
4389     if(total_coeff == max_coeff)
4390         zeros_left=0;
4391     else{
4392         if(n == CHROMA_DC_BLOCK_INDEX)
4393             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4394         else
4395             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4396     }
4397
4398     coeff_num = zeros_left + total_coeff - 1;
4399     j = scantable[coeff_num];
4400     if(n > 24){
4401         block[j] = level[0];
4402         for(i=1;i<total_coeff;i++) {
4403             if(zeros_left <= 0)
4404                 run_before = 0;
4405             else if(zeros_left < 7){
4406                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4407             }else{
4408                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4409             }
4410             zeros_left -= run_before;
4411             coeff_num -= 1 + run_before;
4412             j= scantable[ coeff_num ];
4413
4414             block[j]= level[i];
4415         }
4416     }else{
4417         block[j] = (level[0] * qmul[j] + 32)>>6;
4418         for(i=1;i<total_coeff;i++) {
4419             if(zeros_left <= 0)
4420                 run_before = 0;
4421             else if(zeros_left < 7){
4422                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4423             }else{
4424                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4425             }
4426             zeros_left -= run_before;
4427             coeff_num -= 1 + run_before;
4428             j= scantable[ coeff_num ];
4429
4430             block[j]= (level[i] * qmul[j] + 32)>>6;
4431         }
4432     }
4433
4434     if(zeros_left<0){
4435         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4436         return -1;
4437     }
4438
4439     return 0;
4440 }
4441
4442 static void predict_field_decoding_flag(H264Context *h){
4443     MpegEncContext * const s = &h->s;
4444     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4445     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4446                 ? s->current_picture.mb_type[mb_xy-1]
4447                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4448                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4449                 : 0;
4450     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4451 }
4452
4453 /**
4454  * decodes a P_SKIP or B_SKIP macroblock
4455  */
4456 static void decode_mb_skip(H264Context *h){
4457     MpegEncContext * const s = &h->s;
4458     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4459     int mb_type=0;
4460
4461     memset(h->non_zero_count[mb_xy], 0, 16);
4462     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4463
4464     if(MB_FIELD)
4465         mb_type|= MB_TYPE_INTERLACED;
4466
4467     if( h->slice_type == B_TYPE )
4468     {
4469         // just for fill_caches. pred_direct_motion will set the real mb_type
4470         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4471
4472         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4473         pred_direct_motion(h, &mb_type);
4474         mb_type|= MB_TYPE_SKIP;
4475     }
4476     else
4477     {
4478         int mx, my;
4479         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4480
4481         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4482         pred_pskip_motion(h, &mx, &my);
4483         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4484         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4485     }
4486
4487     write_back_motion(h, mb_type);
4488     s->current_picture.mb_type[mb_xy]= mb_type;
4489     s->current_picture.qscale_table[mb_xy]= s->qscale;
4490     h->slice_table[ mb_xy ]= h->slice_num;
4491     h->prev_mb_skipped= 1;
4492 }
4493
4494 /**
4495  * Decodes one macroblock using the CAVLC (Exp-Golomb / VLC) syntax.
4496  * @returns 0 if ok (including a skipped macroblock), -1 if an error is noticed
4497  */
4498 static int decode_mb_cavlc(H264Context *h){
4499     MpegEncContext * const s = &h->s;
4500     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4501     int partition_count;
4502     unsigned int mb_type, cbp;
4503     int dct8x8_allowed= h->pps.transform_8x8_mode;
4504
4505     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4506
4507     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4508     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4509                 down the code */
         // Skip handling: outside I/SI slices a skip run is read when mb_skip_run
         // is -1 and then counted down once per macroblock.
4510     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4511         if(s->mb_skip_run==-1)
4512             s->mb_skip_run= get_ue_golomb(&s->gb);
4513
4514         if (s->mb_skip_run--) {
                 // On even rows of an MBAFF frame the field flag is read when the run
                 // has just reached 0, otherwise predicted from the neighbours.
4515             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4516                 if(s->mb_skip_run==0)
4517                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4518                 else
4519                     predict_field_decoding_flag(h);
4520             }
4521             decode_mb_skip(h);
4522             return 0;
4523         }
4524     }
4525     if(FRAME_MBAFF){
4526         if( (s->mb_y&1) == 0 )
4527             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4528     }else
4529         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4530
4531     h->prev_mb_skipped= 0;
4532
         // mb_type: values below 23 (B slices) or 5 (P slices) index the inter
         // tables; larger values are rebased and handled as intra types.
4533     mb_type= get_ue_golomb(&s->gb);
4534     if(h->slice_type == B_TYPE){
4535         if(mb_type < 23){
4536             partition_count= b_mb_type_info[mb_type].partition_count;
4537             mb_type=         b_mb_type_info[mb_type].type;
4538         }else{
4539             mb_type -= 23;
4540             goto decode_intra_mb;
4541         }
4542     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4543         if(mb_type < 5){
4544             partition_count= p_mb_type_info[mb_type].partition_count;
4545             mb_type=         p_mb_type_info[mb_type].type;
4546         }else{
4547             mb_type -= 5;
4548             goto decode_intra_mb;
4549         }
4550     }else{
4551        assert(h->slice_type == I_TYPE);
4552 decode_intra_mb:
4553         if(mb_type > 25){
4554             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4555             return -1;
4556         }
4557         partition_count=0;
             // the intra table also supplies cbp and the 16x16 prediction mode
4558         cbp= i_mb_type_info[mb_type].cbp;
4559         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4560         mb_type= i_mb_type_info[mb_type].type;
4561     }
4562
4563     if(MB_FIELD)
4564         mb_type |= MB_TYPE_INTERLACED;
4565
4566     h->slice_table[ mb_xy ]= h->slice_num;
4567
4568     if(IS_INTRA_PCM(mb_type)){
4569         unsigned int x, y;
4570
4571         // We assume these blocks are very rare so we do not optimize it.
4572         align_get_bits(&s->gb);
4573
4574         // The pixels are stored in the same order as levels in h->mb array.
             // 16x16 luma samples, 8 bits each
4575         for(y=0; y<16; y++){
4576             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4577             for(x=0; x<16; x++){
4578                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4579                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4580             }
4581         }
             // first 8x8 chroma plane, stored from h->mb[256]
4582         for(y=0; y<8; y++){
4583             const int index= 256 + 4*(y&3) + 32*(y>>2);
4584             for(x=0; x<8; x++){
4585                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4586                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4587             }
4588         }
             // second 8x8 chroma plane, stored from h->mb[256 + 64]
4589         for(y=0; y<8; y++){
4590             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4591             for(x=0; x<8; x++){
4592                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4593                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4594             }
4595         }
4596
4597         // In deblocking, the quantizer is 0
4598         s->current_picture.qscale_table[mb_xy]= 0;
4599         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4600         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4601         // All coeffs are present
4602         memset(h->non_zero_count[mb_xy], 16, 16);
4603
4604         s->current_picture.mb_type[mb_xy]= mb_type;
4605         return 0;
4606     }
4607
         // ref_count is doubled while an MBAFF macroblock is parsed and halved
         // again just before the final return 0.
         // NOTE(review): the "return -1" paths below leave it doubled - confirm
         // that callers reinitialise ref_count after an error.
4608     if(MB_MBAFF){
4609         h->ref_count[0] <<= 1;
4610         h->ref_count[1] <<= 1;
4611     }
4612
4613     fill_caches(h, mb_type, 0);
4614
4615     //mb_pred
4616     if(IS_INTRA(mb_type)){
4617             int pred_mode;
4618 //            init_top_left_availability(h);
4619             if(IS_INTRA4x4(mb_type)){
4620                 int i;
4621                 int di = 1;
                     // with the 8x8 transform one mode is coded per 8x8 block,
                     // so the loop below advances four 4x4 blocks at a time
4622                 if(dct8x8_allowed && get_bits1(&s->gb)){
4623                     mb_type |= MB_TYPE_8x8DCT;
4624                     di = 4;
4625                 }
4626
4627 //                fill_intra4x4_pred_table(h);
4628                 for(i=0; i<16; i+=di){
4629                     int mode= pred_intra_mode(h, i);
4630
                         // a 0 bit means the predicted mode is not used: a 3 bit
                         // remainder follows, which skips over the predicted mode
4631                     if(!get_bits1(&s->gb)){
4632                         const int rem_mode= get_bits(&s->gb, 3);
4633                         mode = rem_mode + (rem_mode >= mode);
4634                     }
4635
4636                     if(di==4)
4637                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4638                     else
4639                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4640                 }
4641                 write_back_intra_pred_mode(h);
4642                 if( check_intra4x4_pred_mode(h) < 0)
4643                     return -1;
4644             }else{
4645                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4646                 if(h->intra16x16_pred_mode < 0)
4647                     return -1;
4648             }
4649
                 // chroma prediction mode, read for every intra macroblock
4650             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4651             if(pred_mode < 0)
4652                 return -1;
4653             h->chroma_pred_mode= pred_mode;
4654     }else if(partition_count==4){
             // four sub-macroblocks, each with its own sub_mb_type
4655         int i, j, sub_partition_count[4], list, ref[2][4];
4656
4657         if(h->slice_type == B_TYPE){
4658             for(i=0; i<4; i++){
4659                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4660                 if(h->sub_mb_type[i] >=13){
4661                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4662                     return -1;
4663                 }
4664                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4665                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4666             }
4667             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4668                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4669                 pred_direct_motion(h, &mb_type);
4670                 h->ref_cache[0][scan8[4]] =
4671                 h->ref_cache[1][scan8[4]] =
4672                 h->ref_cache[0][scan8[12]] =
4673                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4674             }
4675         }else{
4676             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4677             for(i=0; i<4; i++){
4678                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4679                 if(h->sub_mb_type[i] >=4){
4680                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4681                     return -1;
4682                 }
4683                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4684                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4685             }
4686         }
4687
             // first pass: all reference indices, list by list
4688         for(list=0; list<h->list_count; list++){
4689             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4690             for(i=0; i<4; i++){
4691                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4692                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4693                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4694                     if(tmp>=ref_count){
4695                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4696                         return -1;
4697                     }
4698                     ref[list][i]= tmp;
4699                 }else{
4700                  //FIXME
4701                     ref[list][i] = -1;
4702                 }
4703             }
4704         }
4705
4706         if(dct8x8_allowed)
4707             dct8x8_allowed = get_dct8x8_allowed(h);
4708
             // second pass: store the references in the cache and read the
             // motion vector differences
4709         for(list=0; list<h->list_count; list++){
4710             for(i=0; i<4; i++){
4711                 if(IS_DIRECT(h->sub_mb_type[i])) {
4712                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4713                     continue;
4714                 }
4715                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4716                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4717
4718                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4719                     const int sub_mb_type= h->sub_mb_type[i];
4720                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4721                     for(j=0; j<sub_partition_count[i]; j++){
4722                         int mx, my;
4723                         const int index= 4*i + block_width*j;
4724                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4725                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4726                         mx += get_se_golomb(&s->gb);
4727                         my += get_se_golomb(&s->gb);
4728                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4729
                             // replicate the vector over the cache entries the
                             // sub-partition shape covers
4730                         if(IS_SUB_8X8(sub_mb_type)){
4731                             mv_cache[ 1 ][0]=
4732                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4733                             mv_cache[ 1 ][1]=
4734                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4735                         }else if(IS_SUB_8X4(sub_mb_type)){
4736                             mv_cache[ 1 ][0]= mx;
4737                             mv_cache[ 1 ][1]= my;
4738                         }else if(IS_SUB_4X8(sub_mb_type)){
4739                             mv_cache[ 8 ][0]= mx;
4740                             mv_cache[ 8 ][1]= my;
4741                         }
4742                         mv_cache[ 0 ][0]= mx;
4743                         mv_cache[ 0 ][1]= my;
4744                     }
4745                 }else{
                         // list not used by this sub-macroblock: zero its four vectors
4746                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4747                     p[0] = p[1]=
4748                     p[8] = p[9]= 0;
4749                 }
4750             }
4751         }
4752     }else if(IS_DIRECT(mb_type)){
4753         pred_direct_motion(h, &mb_type);
4754         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4755     }else{
             // 16x16, 16x8 and 8x16 partitions: for each shape all reference
             // indices are read first, then all motion vector differences
4756         int list, mx, my, i;
4757          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4758         if(IS_16X16(mb_type)){
4759             for(list=0; list<h->list_count; list++){
4760                     unsigned int val;
4761                     if(IS_DIR(mb_type, 0, list)){
4762                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4763                         if(val >= h->ref_count[list]){
4764                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4765                             return -1;
4766                         }
4767                     }else
4768                         val= LIST_NOT_USED&0xFF;
4769                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4770             }
4771             for(list=0; list<h->list_count; list++){
4772                 unsigned int val;
4773                 if(IS_DIR(mb_type, 0, list)){
4774                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4775                     mx += get_se_golomb(&s->gb);
4776                     my += get_se_golomb(&s->gb);
4777                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4778
4779                     val= pack16to32(mx,my);
4780                 }else
4781                     val=0;
4782                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4783             }
4784         }
4785         else if(IS_16X8(mb_type)){
4786             for(list=0; list<h->list_count; list++){
4787                     for(i=0; i<2; i++){
4788                         unsigned int val;
4789                         if(IS_DIR(mb_type, i, list)){
4790                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4791                             if(val >= h->ref_count[list]){
4792                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4793                                 return -1;
4794                             }
4795                         }else
4796                             val= LIST_NOT_USED&0xFF;
4797                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4798                     }
4799             }
4800             for(list=0; list<h->list_count; list++){
4801                 for(i=0; i<2; i++){
4802                     unsigned int val;
4803                     if(IS_DIR(mb_type, i, list)){
4804                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4805                         mx += get_se_golomb(&s->gb);
4806                         my += get_se_golomb(&s->gb);
4807                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4808
4809                         val= pack16to32(mx,my);
4810                     }else
4811                         val=0;
4812                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4813                 }
4814             }
4815         }else{
4816             assert(IS_8X16(mb_type));
4817             for(list=0; list<h->list_count; list++){
4818                     for(i=0; i<2; i++){
4819                         unsigned int val;
4820                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4821                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4822                             if(val >= h->ref_count[list]){
4823                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4824                                 return -1;
4825                             }
4826                         }else
4827                             val= LIST_NOT_USED&0xFF;
4828                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4829                     }
4830             }
4831             for(list=0; list<h->list_count; list++){
4832                 for(i=0; i<2; i++){
4833                     unsigned int val;
4834                     if(IS_DIR(mb_type, i, list)){
4835                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4836                         mx += get_se_golomb(&s->gb);
4837                         my += get_se_golomb(&s->gb);
4838                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4839
4840                         val= pack16to32(mx,my);
4841                     }else
4842                         val=0;
4843                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4844                 }
4845             }
4846         }
4847     }
4848
4849     if(IS_INTER(mb_type))
4850         write_back_motion(h, mb_type);
4851
         // Intra16x16 already took cbp from i_mb_type_info above; every other
         // type reads a code (at most 47) that is mapped through a table.
4852     if(!IS_INTRA16x16(mb_type)){
4853         cbp= get_ue_golomb(&s->gb);
4854         if(cbp > 47){
4855             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4856             return -1;
4857         }
4858
4859         if(IS_INTRA4x4(mb_type))
4860             cbp= golomb_to_intra4x4_cbp[cbp];
4861         else
4862             cbp= golomb_to_inter_cbp[cbp];
4863     }
4864     h->cbp = cbp;
4865
         // for non-intra macroblocks with luma residual, one bit selects the
         // 8x8 transform (intra 4x4 read its flag earlier)
4866     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4867         if(get_bits1(&s->gb))
4868             mb_type |= MB_TYPE_8x8DCT;
4869     }
4870     s->current_picture.mb_type[mb_xy]= mb_type;
4871
4872     if(cbp || IS_INTRA16x16(mb_type)){
4873         int i8x8, i4x4, chroma_idx;
4874         int dquant;
4875         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4876         const uint8_t *scan, *scan8x8, *dc_scan;
4877
4878 //        fill_non_zero_count_cache(h);
4879
             // scan tables: field or zigzag order, with separate "_q0" tables
             // selected when qscale is 0
4880         if(IS_INTERLACED(mb_type)){
4881             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4882             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4883             dc_scan= luma_dc_field_scan;
4884         }else{
4885             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4886             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4887             dc_scan= luma_dc_zigzag_scan;
4888         }
4889
4890         dquant= get_se_golomb(&s->gb);
4891
4892         if( dquant > 25 || dquant < -26 ){
4893             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4894             return -1;
4895         }
4896
             // apply the delta and wrap the result back into 0..51
4897         s->qscale += dquant;
4898         if(((unsigned)s->qscale) > 51){
4899             if(s->qscale<0) s->qscale+= 52;
4900             else            s->qscale-= 52;
4901         }
4902
4903         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4904         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4905         if(IS_INTRA16x16(mb_type)){
                 // luma DC block first, then (if coded) 16 AC blocks of 15 coefficients
4906             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4907                 return -1; //FIXME continue if partitioned and other return -1 too
4908             }
4909
4910             assert((cbp&15) == 0 || (cbp&15) == 15);
4911
4912             if(cbp&15){
4913                 for(i8x8=0; i8x8<4; i8x8++){
4914                     for(i4x4=0; i4x4<4; i4x4++){
4915                         const int index= i4x4 + 4*i8x8;
4916                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4917                             return -1;
4918                         }
4919                     }
4920                 }
4921             }else{
4922                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4923             }
4924         }else{
                 // one cbp bit per 8x8 luma block
4925             for(i8x8=0; i8x8<4; i8x8++){
4926                 if(cbp & (1<<i8x8)){
4927                     if(IS_8x8DCT(mb_type)){
                             // the 8x8 block is read as four 16-coefficient parts
                             // into the same buffer; their counts are summed into
                             // the first cache entry afterwards
4928                         DCTELEM *buf = &h->mb[64*i8x8];
4929                         uint8_t *nnz;
4930                         for(i4x4=0; i4x4<4; i4x4++){
4931                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4932                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4933                                 return -1;
4934                         }
4935                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4936                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4937                     }else{
4938                         for(i4x4=0; i4x4<4; i4x4++){
4939                             const int index= i4x4 + 4*i8x8;
4940
4941                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4942                                 return -1;
4943                             }
4944                         }
4945                     }
4946                 }else{
4947                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4948                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4949                 }
4950             }
4951         }
4952
             // chroma DC for both planes when either chroma cbp bit is set
4953         if(cbp&0x30){
4954             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4955                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4956                     return -1;
4957                 }
4958         }
4959
             // chroma AC only when bit 0x20 is set
4960         if(cbp&0x20){
4961             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4962                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4963                 for(i4x4=0; i4x4<4; i4x4++){
4964                     const int index= 16 + 4*chroma_idx + i4x4;
4965                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4966                         return -1;
4967                     }
4968                 }
4969             }
4970         }else{
4971             uint8_t * const nnz= &h->non_zero_count_cache[0];
4972             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4973             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4974         }
4975     }else{
             // no residual at all: clear every non-zero count of the macroblock
4976         uint8_t * const nnz= &h->non_zero_count_cache[0];
4977         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4978         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4979         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4980     }
4981     s->current_picture.qscale_table[mb_xy]= s->qscale;
4982     write_back_non_zero_count(h);
4983
         // undo the ref_count doubling done before fill_caches()
4984     if(MB_MBAFF){
4985         h->ref_count[0] >>= 1;
4986         h->ref_count[1] >>= 1;
4987     }
4988
4989     return 0;
4990 }
4991
4992 static int decode_cabac_field_decoding_flag(H264Context *h) {
4993     MpegEncContext * const s = &h->s;
4994     const int mb_x = s->mb_x;
4995     const int mb_y = s->mb_y & ~1;
4996     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4997     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4998
4999     unsigned int ctx = 0;
5000
5001     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5002         ctx += 1;
5003     }
5004     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5005         ctx += 1;
5006     }
5007
5008     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5009 }
5010
5011 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5012     uint8_t *state= &h->cabac_state[ctx_base];
5013     int mb_type;
5014
5015     if(intra_slice){
5016         MpegEncContext * const s = &h->s;
5017         const int mba_xy = h->left_mb_xy[0];
5018         const int mbb_xy = h->top_mb_xy;
5019         int ctx=0;
5020         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5021             ctx++;
5022         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5023             ctx++;
5024         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5025             return 0;   /* I4x4 */
5026         state += 2;
5027     }else{
5028         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5029             return 0;   /* I4x4 */
5030     }
5031
5032     if( get_cabac_terminate( &h->cabac ) )
5033         return 25;  /* PCM */
5034
5035     mb_type = 1; /* I16x16 */
5036     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5037     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5038         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5039     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5040     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5041     return mb_type;
5042 }
5043
5044 static int decode_cabac_mb_type( H264Context *h ) {
5045     MpegEncContext * const s = &h->s;
5046
5047     if( h->slice_type == I_TYPE ) {
5048         return decode_cabac_intra_mb_type(h, 3, 1);
5049     } else if( h->slice_type == P_TYPE ) {
5050         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5051             /* P-type */
5052             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5053                 /* P_L0_D16x16, P_8x8 */
5054                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5055             } else {
5056                 /* P_L0_D8x16, P_L0_D16x8 */
5057                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5058             }
5059         } else {
5060             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5061         }
5062     } else if( h->slice_type == B_TYPE ) {
5063         const int mba_xy = h->left_mb_xy[0];
5064         const int mbb_xy = h->top_mb_xy;
5065         int ctx = 0;
5066         int bits;
5067
5068         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5069             ctx++;
5070         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5071             ctx++;
5072
5073         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5074             return 0; /* B_Direct_16x16 */
5075
5076         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5077             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5078         }
5079
5080         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5081         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5082         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5083         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5084         if( bits < 8 )
5085             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5086         else if( bits == 13 ) {
5087             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5088         } else if( bits == 14 )
5089             return 11; /* B_L1_L0_8x16 */
5090         else if( bits == 15 )
5091             return 22; /* B_8x8 */
5092
5093         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5094         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5095     } else {
5096         /* TODO SI/SP frames? */
5097         return -1;
5098     }
5099 }
5100
5101 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5102     MpegEncContext * const s = &h->s;
5103     int mba_xy, mbb_xy;
5104     int ctx = 0;
5105
5106     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5107         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5108         mba_xy = mb_xy - 1;
5109         if( (mb_y&1)
5110             && h->slice_table[mba_xy] == h->slice_num
5111             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5112             mba_xy += s->mb_stride;
5113         if( MB_FIELD ){
5114             mbb_xy = mb_xy - s->mb_stride;
5115             if( !(mb_y&1)
5116                 && h->slice_table[mbb_xy] == h->slice_num
5117                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5118                 mbb_xy -= s->mb_stride;
5119         }else
5120             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5121     }else{
5122         int mb_xy = mb_x + mb_y*s->mb_stride;
5123         mba_xy = mb_xy - 1;
5124         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5125     }
5126
5127     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5128         ctx++;
5129     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5130         ctx++;
5131
5132     if( h->slice_type == B_TYPE )
5133         ctx += 13;
5134     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5135 }
5136
5137 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5138     int mode = 0;
5139
5140     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5141         return pred_mode;
5142
5143     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5144     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5145     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5146
5147     if( mode >= pred_mode )
5148         return mode + 1;
5149     else
5150         return mode;
5151 }
5152
5153 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5154     const int mba_xy = h->left_mb_xy[0];
5155     const int mbb_xy = h->top_mb_xy;
5156
5157     int ctx = 0;
5158
5159     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5160     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5161         ctx++;
5162
5163     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5164         ctx++;
5165
5166     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5167         return 0;
5168
5169     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5170         return 1;
5171     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5172         return 2;
5173     else
5174         return 3;
5175 }
5176
5177 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5178     int cbp_b, cbp_a, ctx, cbp = 0;
5179
5180     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5181     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5182
5183     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5184     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5185     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5186     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5187     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5188     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5189     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5190     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5191     return cbp;
5192 }
5193 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5194     int ctx;
5195     int cbp_a, cbp_b;
5196
5197     cbp_a = (h->left_cbp>>4)&0x03;
5198     cbp_b = (h-> top_cbp>>4)&0x03;
5199
5200     ctx = 0;
5201     if( cbp_a > 0 ) ctx++;
5202     if( cbp_b > 0 ) ctx += 2;
5203     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5204         return 0;
5205
5206     ctx = 4;
5207     if( cbp_a == 2 ) ctx++;
5208     if( cbp_b == 2 ) ctx += 2;
5209     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5210 }
5211 static int decode_cabac_mb_dqp( H264Context *h) {
5212     int   ctx = 0;
5213     int   val = 0;
5214
5215     if( h->last_qscale_diff != 0 )
5216         ctx++;
5217
5218     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5219         if( ctx < 2 )
5220             ctx = 2;
5221         else
5222             ctx = 3;
5223         val++;
5224         if(val > 102) //prevent infinite loop
5225             return INT_MIN;
5226     }
5227
5228     if( val&0x01 )
5229         return (val + 1)/2;
5230     else
5231         return -(val + 1)/2;
5232 }
5233 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5234     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5235         return 0;   /* 8x8 */
5236     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5237         return 1;   /* 8x4 */
5238     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5239         return 2;   /* 4x8 */
5240     return 3;       /* 4x4 */
5241 }
5242 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5243     int type;
5244     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5245         return 0;   /* B_Direct_8x8 */
5246     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5247         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5248     type = 3;
5249     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5250         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5251             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5252         type += 4;
5253     }
5254     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5255     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5256     return type;
5257 }
5258
5259 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5260     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5261 }
5262
5263 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5264     int refa = h->ref_cache[list][scan8[n] - 1];
5265     int refb = h->ref_cache[list][scan8[n] - 8];
5266     int ref  = 0;
5267     int ctx  = 0;
5268
5269     if( h->slice_type == B_TYPE) {
5270         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5271             ctx++;
5272         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5273             ctx += 2;
5274     } else {
5275         if( refa > 0 )
5276             ctx++;
5277         if( refb > 0 )
5278             ctx += 2;
5279     }
5280
5281     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5282         ref++;
5283         if( ctx < 4 )
5284             ctx = 4;
5285         else
5286             ctx = 5;
5287         if(ref >= 32 /*h->ref_list[list]*/){
5288             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5289             return 0; //FIXME we should return -1 and check the return everywhere
5290         }
5291     }
5292     return ref;
5293 }
5294
5295 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5296     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5297                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5298     int ctxbase = (l == 0) ? 40 : 47;
5299     int ctx, mvd;
5300
5301     if( amvd < 3 )
5302         ctx = 0;
5303     else if( amvd > 32 )
5304         ctx = 2;
5305     else
5306         ctx = 1;
5307
5308     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5309         return 0;
5310
5311     mvd= 1;
5312     ctx= 3;
5313     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5314         mvd++;
5315         if( ctx < 6 )
5316             ctx++;
5317     }
5318
5319     if( mvd >= 9 ) {
5320         int k = 3;
5321         while( get_cabac_bypass( &h->cabac ) ) {
5322             mvd += 1 << k;
5323             k++;
5324             if(k>24){
5325                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5326                 return INT_MIN;
5327             }
5328         }
5329         while( k-- ) {
5330             if( get_cabac_bypass( &h->cabac ) )
5331                 mvd += 1 << k;
5332         }
5333     }
5334     return get_cabac_bypass_sign( &h->cabac, -mvd );
5335 }
5336
5337 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5338     int nza, nzb;
5339     int ctx = 0;
5340
5341     if( cat == 0 ) {
5342         nza = h->left_cbp&0x100;
5343         nzb = h-> top_cbp&0x100;
5344     } else if( cat == 1 || cat == 2 ) {
5345         nza = h->non_zero_count_cache[scan8[idx] - 1];
5346         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5347     } else if( cat == 3 ) {
5348         nza = (h->left_cbp>>(6+idx))&0x01;
5349         nzb = (h-> top_cbp>>(6+idx))&0x01;
5350     } else {
5351         assert(cat == 4);
5352         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5353         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5354     }
5355
5356     if( nza > 0 )
5357         ctx++;
5358
5359     if( nzb > 0 )
5360         ctx += 2;
5361
5362     return ctx + 4 * cat;
5363 }
5364
/**
 * Context offsets for the "last coefficient" flag of 8x8 blocks.
 * Entry i is added to the last-coefficient context base when scan position i
 * (0..62) is tested in decode_cabac_residual().
 * NOTE(review): marked attribute_used, presumably because the x86 path refers
 * to it from outside plain C code -- confirm.
 */
static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
5371
5372 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5373     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5374     static const int significant_coeff_flag_offset[2][6] = {
5375       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5376       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5377     };
5378     static const int last_coeff_flag_offset[2][6] = {
5379       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5380       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5381     };
5382     static const int coeff_abs_level_m1_offset[6] = {
5383         227+0, 227+10, 227+20, 227+30, 227+39, 426
5384     };
5385     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5386       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5387         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5388         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5389        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5390       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5391         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5392         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5393         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5394     };
5395
5396     int index[64];
5397
5398     int av_unused last;
5399     int coeff_count = 0;
5400
5401     int abslevel1 = 1;
5402     int abslevelgt1 = 0;
5403
5404     uint8_t *significant_coeff_ctx_base;
5405     uint8_t *last_coeff_ctx_base;
5406     uint8_t *abs_level_m1_ctx_base;
5407
5408 #ifndef ARCH_X86
5409 #define CABAC_ON_STACK
5410 #endif
5411 #ifdef CABAC_ON_STACK
5412 #define CC &cc
5413     CABACContext cc;
5414     cc.range     = h->cabac.range;
5415     cc.low       = h->cabac.low;
5416     cc.bytestream= h->cabac.bytestream;
5417 #else
5418 #define CC &h->cabac
5419 #endif
5420
5421
5422     /* cat: 0-> DC 16x16  n = 0
5423      *      1-> AC 16x16  n = luma4x4idx
5424      *      2-> Luma4x4   n = luma4x4idx
5425      *      3-> DC Chroma n = iCbCr
5426      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5427      *      5-> Luma8x8   n = 4 * luma8x8idx
5428      */
5429
5430     /* read coded block flag */
5431     if( cat != 5 ) {
5432         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5433             if( cat == 1 || cat == 2 )
5434                 h->non_zero_count_cache[scan8[n]] = 0;
5435             else if( cat == 4 )
5436                 h->non_zero_count_cache[scan8[16+n]] = 0;
5437 #ifdef CABAC_ON_STACK
5438             h->cabac.range     = cc.range     ;
5439             h->cabac.low       = cc.low       ;
5440             h->cabac.bytestream= cc.bytestream;
5441 #endif
5442             return;
5443         }
5444     }
5445
5446     significant_coeff_ctx_base = h->cabac_state
5447         + significant_coeff_flag_offset[MB_FIELD][cat];
5448     last_coeff_ctx_base = h->cabac_state
5449         + last_coeff_flag_offset[MB_FIELD][cat];
5450     abs_level_m1_ctx_base = h->cabac_state
5451         + coeff_abs_level_m1_offset[cat];
5452
5453     if( cat == 5 ) {
5454 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5455         for(last= 0; last < coefs; last++) { \
5456             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5457             if( get_cabac( CC, sig_ctx )) { \
5458                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5459                 index[coeff_count++] = last; \
5460                 if( get_cabac( CC, last_ctx ) ) { \
5461                     last= max_coeff; \
5462                     break; \
5463                 } \
5464             } \
5465         }\
5466         if( last == max_coeff -1 ) {\
5467             index[coeff_count++] = last;\
5468         }
5469         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5470 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5471         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5472     } else {
5473         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5474 #else
5475         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5476     } else {
5477         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5478 #endif
5479     }
5480     assert(coeff_count > 0);
5481
5482     if( cat == 0 )
5483         h->cbp_table[mb_xy] |= 0x100;
5484     else if( cat == 1 || cat == 2 )
5485         h->non_zero_count_cache[scan8[n]] = coeff_count;
5486     else if( cat == 3 )
5487         h->cbp_table[mb_xy] |= 0x40 << n;
5488     else if( cat == 4 )
5489         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5490     else {
5491         assert( cat == 5 );
5492         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5493     }
5494
5495     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5496         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5497         int j= scantable[index[coeff_count]];
5498
5499         if( get_cabac( CC, ctx ) == 0 ) {
5500             if( !qmul ) {
5501                 block[j] = get_cabac_bypass_sign( CC, -1);
5502             }else{
5503                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5504             }
5505
5506             abslevel1++;
5507         } else {
5508             int coeff_abs = 2;
5509             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5510             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5511                 coeff_abs++;
5512             }
5513
5514             if( coeff_abs >= 15 ) {
5515                 int j = 0;
5516                 while( get_cabac_bypass( CC ) ) {
5517                     j++;
5518                 }
5519
5520                 coeff_abs=1;
5521                 while( j-- ) {
5522                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5523                 }
5524                 coeff_abs+= 14;
5525             }
5526
5527             if( !qmul ) {
5528                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5529                 else                                block[j] =  coeff_abs;
5530             }else{
5531                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5532                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5533             }
5534
5535             abslevelgt1++;
5536         }
5537     }
5538 #ifdef CABAC_ON_STACK
5539             h->cabac.range     = cc.range     ;
5540             h->cabac.low       = cc.low       ;
5541             h->cabac.bytestream= cc.bytestream;
5542 #endif
5543
5544 }
5545
5546 static inline void compute_mb_neighbors(H264Context *h)
5547 {
5548     MpegEncContext * const s = &h->s;
5549     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5550     h->top_mb_xy     = mb_xy - s->mb_stride;
5551     h->left_mb_xy[0] = mb_xy - 1;
5552     if(FRAME_MBAFF){
5553         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5554         const int top_pair_xy      = pair_xy     - s->mb_stride;
5555         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5556         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5557         const int curr_mb_frame_flag = !MB_FIELD;
5558         const int bottom = (s->mb_y & 1);
5559         if (bottom
5560                 ? !curr_mb_frame_flag // bottom macroblock
5561                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5562                 ) {
5563             h->top_mb_xy -= s->mb_stride;
5564         }
5565         if (left_mb_frame_flag != curr_mb_frame_flag) {
5566             h->left_mb_xy[0] = pair_xy - 1;
5567         }
5568     } else if (FIELD_PICTURE) {
5569         h->top_mb_xy -= s->mb_stride;
5570     }
5571     return;
5572 }
5573
5574 /**
5575  * decodes a macroblock
5576  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5577  */
5578 static int decode_mb_cabac(H264Context *h) {
5579     MpegEncContext * const s = &h->s;
5580     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5581     int mb_type, partition_count, cbp = 0;
5582     int dct8x8_allowed= h->pps.transform_8x8_mode;
5583
5584     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5585
5586     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5587     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5588         int skip;
5589         /* a skipped mb needs the aff flag from the following mb */
5590         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5591             predict_field_decoding_flag(h);
5592         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5593             skip = h->next_mb_skipped;
5594         else
5595             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5596         /* read skip flags */
5597         if( skip ) {
5598             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5599                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5600                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5601                 if(h->next_mb_skipped)
5602                     predict_field_decoding_flag(h);
5603                 else
5604                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5605             }
5606
5607             decode_mb_skip(h);
5608
5609             h->cbp_table[mb_xy] = 0;
5610             h->chroma_pred_mode_table[mb_xy] = 0;
5611             h->last_qscale_diff = 0;
5612
5613             return 0;
5614
5615         }
5616     }
5617     if(FRAME_MBAFF){
5618         if( (s->mb_y&1) == 0 )
5619             h->mb_mbaff =
5620             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5621     }else
5622         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5623
5624     h->prev_mb_skipped = 0;
5625
5626     compute_mb_neighbors(h);
5627     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5628         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5629         return -1;
5630     }
5631
5632     if( h->slice_type == B_TYPE ) {
5633         if( mb_type < 23 ){
5634             partition_count= b_mb_type_info[mb_type].partition_count;
5635             mb_type=         b_mb_type_info[mb_type].type;
5636         }else{
5637             mb_type -= 23;
5638             goto decode_intra_mb;
5639         }
5640     } else if( h->slice_type == P_TYPE ) {
5641         if( mb_type < 5) {
5642             partition_count= p_mb_type_info[mb_type].partition_count;
5643             mb_type=         p_mb_type_info[mb_type].type;
5644         } else {
5645             mb_type -= 5;
5646             goto decode_intra_mb;
5647         }
5648     } else {
5649        assert(h->slice_type == I_TYPE);
5650 decode_intra_mb:
5651         partition_count = 0;
5652         cbp= i_mb_type_info[mb_type].cbp;
5653         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5654         mb_type= i_mb_type_info[mb_type].type;
5655     }
5656     if(MB_FIELD)
5657         mb_type |= MB_TYPE_INTERLACED;
5658
5659     h->slice_table[ mb_xy ]= h->slice_num;
5660
5661     if(IS_INTRA_PCM(mb_type)) {
5662         const uint8_t *ptr;
5663         unsigned int x, y;
5664
5665         // We assume these blocks are very rare so we do not optimize it.
5666         // FIXME The two following lines get the bitstream position in the cabac
5667         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5668         ptr= h->cabac.bytestream;
5669         if(h->cabac.low&0x1) ptr--;
5670         if(CABAC_BITS==16){
5671             if(h->cabac.low&0x1FF) ptr--;
5672         }
5673
5674         // The pixels are stored in the same order as levels in h->mb array.
5675         for(y=0; y<16; y++){
5676             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5677             for(x=0; x<16; x++){
5678                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5679                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5680             }
5681         }
5682         for(y=0; y<8; y++){
5683             const int index= 256 + 4*(y&3) + 32*(y>>2);
5684             for(x=0; x<8; x++){
5685                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5686                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5687             }
5688         }
5689         for(y=0; y<8; y++){
5690             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5691             for(x=0; x<8; x++){
5692                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5693                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5694             }
5695         }
5696
5697         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5698
5699         // All blocks are present
5700         h->cbp_table[mb_xy] = 0x1ef;
5701         h->chroma_pred_mode_table[mb_xy] = 0;
5702         // In deblocking, the quantizer is 0
5703         s->current_picture.qscale_table[mb_xy]= 0;
5704         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5705         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5706         // All coeffs are present
5707         memset(h->non_zero_count[mb_xy], 16, 16);
5708         s->current_picture.mb_type[mb_xy]= mb_type;
5709         return 0;
5710     }
5711
5712     if(MB_MBAFF){
5713         h->ref_count[0] <<= 1;
5714         h->ref_count[1] <<= 1;
5715     }
5716
5717     fill_caches(h, mb_type, 0);
5718
5719     if( IS_INTRA( mb_type ) ) {
5720         int i, pred_mode;
5721         if( IS_INTRA4x4( mb_type ) ) {
5722             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5723                 mb_type |= MB_TYPE_8x8DCT;
5724                 for( i = 0; i < 16; i+=4 ) {
5725                     int pred = pred_intra_mode( h, i );
5726                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5727                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5728                 }
5729             } else {
5730                 for( i = 0; i < 16; i++ ) {
5731                     int pred = pred_intra_mode( h, i );
5732                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5733
5734                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5735                 }
5736             }
5737             write_back_intra_pred_mode(h);
5738             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5739         } else {
5740             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5741             if( h->intra16x16_pred_mode < 0 ) return -1;
5742         }
5743         h->chroma_pred_mode_table[mb_xy] =
5744         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5745
5746         pred_mode= check_intra_pred_mode( h, pred_mode );
5747         if( pred_mode < 0 ) return -1;
5748         h->chroma_pred_mode= pred_mode;
5749     } else if( partition_count == 4 ) {
5750         int i, j, sub_partition_count[4], list, ref[2][4];
5751
5752         if( h->slice_type == B_TYPE ) {
5753             for( i = 0; i < 4; i++ ) {
5754                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5755                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5756                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5757             }
5758             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5759                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5760                 pred_direct_motion(h, &mb_type);
5761                 h->ref_cache[0][scan8[4]] =
5762                 h->ref_cache[1][scan8[4]] =
5763                 h->ref_cache[0][scan8[12]] =
5764                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5765                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5766                     for( i = 0; i < 4; i++ )
5767                         if( IS_DIRECT(h->sub_mb_type[i]) )
5768                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5769                 }
5770             }
5771         } else {
5772             for( i = 0; i < 4; i++ ) {
5773                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5774                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5775                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5776             }
5777         }
5778
5779         for( list = 0; list < h->list_count; list++ ) {
5780                 for( i = 0; i < 4; i++ ) {
5781                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5782                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5783                         if( h->ref_count[list] > 1 )
5784                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5785                         else
5786                             ref[list][i] = 0;
5787                     } else {
5788                         ref[list][i] = -1;
5789                     }
5790                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5791                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5792                 }
5793         }
5794
5795         if(dct8x8_allowed)
5796             dct8x8_allowed = get_dct8x8_allowed(h);
5797
5798         for(list=0; list<h->list_count; list++){
5799             for(i=0; i<4; i++){
5800                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5801                 if(IS_DIRECT(h->sub_mb_type[i])){
5802                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5803                     continue;
5804                 }
5805
5806                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5807                     const int sub_mb_type= h->sub_mb_type[i];
5808                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5809                     for(j=0; j<sub_partition_count[i]; j++){
5810                         int mpx, mpy;
5811                         int mx, my;
5812                         const int index= 4*i + block_width*j;
5813                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5814                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5815                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5816
5817                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5818                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5819                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5820
5821                         if(IS_SUB_8X8(sub_mb_type)){
5822                             mv_cache[ 1 ][0]=
5823                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5824                             mv_cache[ 1 ][1]=
5825                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5826
5827                             mvd_cache[ 1 ][0]=
5828                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5829                             mvd_cache[ 1 ][1]=
5830                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5831                         }else if(IS_SUB_8X4(sub_mb_type)){
5832                             mv_cache[ 1 ][0]= mx;
5833                             mv_cache[ 1 ][1]= my;
5834
5835                             mvd_cache[ 1 ][0]= mx - mpx;
5836                             mvd_cache[ 1 ][1]= my - mpy;
5837                         }else if(IS_SUB_4X8(sub_mb_type)){
5838                             mv_cache[ 8 ][0]= mx;
5839                             mv_cache[ 8 ][1]= my;
5840
5841                             mvd_cache[ 8 ][0]= mx - mpx;
5842                             mvd_cache[ 8 ][1]= my - mpy;
5843                         }
5844                         mv_cache[ 0 ][0]= mx;
5845                         mv_cache[ 0 ][1]= my;
5846
5847                         mvd_cache[ 0 ][0]= mx - mpx;
5848                         mvd_cache[ 0 ][1]= my - mpy;
5849                     }
5850                 }else{
5851                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5852                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5853                     p[0] = p[1] = p[8] = p[9] = 0;
5854                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5855                 }
5856             }
5857         }
5858     } else if( IS_DIRECT(mb_type) ) {
5859         pred_direct_motion(h, &mb_type);
5860         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5861         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5862         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5863     } else {
5864         int list, mx, my, i, mpx, mpy;
5865         if(IS_16X16(mb_type)){
5866             for(list=0; list<h->list_count; list++){
5867                 if(IS_DIR(mb_type, 0, list)){
5868                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5869                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5870                 }else
5871                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5872             }
5873             for(list=0; list<h->list_count; list++){
5874                 if(IS_DIR(mb_type, 0, list)){
5875                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5876
5877                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5878                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5879                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5880
5881                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5882                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5883                 }else
5884                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5885             }
5886         }
5887         else if(IS_16X8(mb_type)){
5888             for(list=0; list<h->list_count; list++){
5889                     for(i=0; i<2; i++){
5890                         if(IS_DIR(mb_type, i, list)){
5891                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5892                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5893                         }else
5894                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5895                     }
5896             }
5897             for(list=0; list<h->list_count; list++){
5898                 for(i=0; i<2; i++){
5899                     if(IS_DIR(mb_type, i, list)){
5900                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5901                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5902                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5903                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5904
5905                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5906                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5907                     }else{
5908                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5909                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5910                     }
5911                 }
5912             }
5913         }else{
5914             assert(IS_8X16(mb_type));
5915             for(list=0; list<h->list_count; list++){
5916                     for(i=0; i<2; i++){
5917                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5918                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5919                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5920                         }else
5921                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5922                     }
5923             }
5924             for(list=0; list<h->list_count; list++){
5925                 for(i=0; i<2; i++){
5926                     if(IS_DIR(mb_type, i, list)){
5927                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5928                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5929                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5930
5931                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5932                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5933                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5934                     }else{
5935                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5936                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5937                     }
5938                 }
5939             }
5940         }
5941     }
5942
5943    if( IS_INTER( mb_type ) ) {
5944         h->chroma_pred_mode_table[mb_xy] = 0;
5945         write_back_motion( h, mb_type );
5946    }
5947
5948     if( !IS_INTRA16x16( mb_type ) ) {
5949         cbp  = decode_cabac_mb_cbp_luma( h );
5950         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5951     }
5952
5953     h->cbp_table[mb_xy] = h->cbp = cbp;
5954
5955     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5956         if( decode_cabac_mb_transform_size( h ) )
5957             mb_type |= MB_TYPE_8x8DCT;
5958     }
5959     s->current_picture.mb_type[mb_xy]= mb_type;
5960
5961     if( cbp || IS_INTRA16x16( mb_type ) ) {
5962         const uint8_t *scan, *scan8x8, *dc_scan;
5963         const uint32_t *qmul;
5964         int dqp;
5965
5966         if(IS_INTERLACED(mb_type)){
5967             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5968             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5969             dc_scan= luma_dc_field_scan;
5970         }else{
5971             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5972             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5973             dc_scan= luma_dc_zigzag_scan;
5974         }
5975
5976         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5977         if( dqp == INT_MIN ){
5978             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5979             return -1;
5980         }
5981         s->qscale += dqp;
5982         if(((unsigned)s->qscale) > 51){
5983             if(s->qscale<0) s->qscale+= 52;
5984             else            s->qscale-= 52;
5985         }
5986         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5987         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5988
5989         if( IS_INTRA16x16( mb_type ) ) {
5990             int i;
5991             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5992             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5993
5994             if( cbp&15 ) {
5995                 qmul = h->dequant4_coeff[0][s->qscale];
5996                 for( i = 0; i < 16; i++ ) {
5997                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5998                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5999                 }
6000             } else {
6001                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6002             }
6003         } else {
6004             int i8x8, i4x4;
6005             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6006                 if( cbp & (1<<i8x8) ) {
6007                     if( IS_8x8DCT(mb_type) ) {
6008                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6009                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6010                     } else {
6011                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6012                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6013                             const int index = 4*i8x8 + i4x4;
6014                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6015 //START_TIMER
6016                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6017 //STOP_TIMER("decode_residual")
6018                         }
6019                     }
6020                 } else {
6021                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6022                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6023                 }
6024             }
6025         }
6026
6027         if( cbp&0x30 ){
6028             int c;
6029             for( c = 0; c < 2; c++ ) {
6030                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6031                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6032             }
6033         }
6034
6035         if( cbp&0x20 ) {
6036             int c, i;
6037             for( c = 0; c < 2; c++ ) {
6038                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6039                 for( i = 0; i < 4; i++ ) {
6040                     const int index = 16 + 4 * c + i;
6041                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6042                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6043                 }
6044             }
6045         } else {
6046             uint8_t * const nnz= &h->non_zero_count_cache[0];
6047             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6048             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6049         }
6050     } else {
6051         uint8_t * const nnz= &h->non_zero_count_cache[0];
6052         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6053         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6054         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6055         h->last_qscale_diff = 0;
6056     }
6057
6058     s->current_picture.qscale_table[mb_xy]= s->qscale;
6059     write_back_non_zero_count(h);
6060
6061     if(MB_MBAFF){
6062         h->ref_count[0] >>= 1;
6063         h->ref_count[1] >>= 1;
6064     }
6065
6066     return 0;
6067 }
6068
6069
6070 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6071     int i, d;
6072     const int index_a = qp + h->slice_alpha_c0_offset;
6073     const int alpha = (alpha_table+52)[index_a];
6074     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6075
6076     if( bS[0] < 4 ) {
6077         int8_t tc[4];
6078         for(i=0; i<4; i++)
6079             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6080         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6081     } else {
6082         /* 16px edge length, because bS=4 is triggered by being at
6083          * the edge of an intra MB, so all 4 bS are the same */
6084             for( d = 0; d < 16; d++ ) {
6085                 const int p0 = pix[-1];
6086                 const int p1 = pix[-2];
6087                 const int p2 = pix[-3];
6088
6089                 const int q0 = pix[0];
6090                 const int q1 = pix[1];
6091                 const int q2 = pix[2];
6092
6093                 if( FFABS( p0 - q0 ) < alpha &&
6094                     FFABS( p1 - p0 ) < beta &&
6095                     FFABS( q1 - q0 ) < beta ) {
6096
6097                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6098                         if( FFABS( p2 - p0 ) < beta)
6099                         {
6100                             const int p3 = pix[-4];
6101                             /* p0', p1', p2' */
6102                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6103                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6104                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6105                         } else {
6106                             /* p0' */
6107                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6108                         }
6109                         if( FFABS( q2 - q0 ) < beta)
6110                         {
6111                             const int q3 = pix[3];
6112                             /* q0', q1', q2' */
6113                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6114                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6115                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6116                         } else {
6117                             /* q0' */
6118                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6119                         }
6120                     }else{
6121                         /* p0', q0' */
6122                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6123                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6124                     }
6125                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6126                 }
6127                 pix += stride;
6128             }
6129     }
6130 }
6131 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6132     int i;
6133     const int index_a = qp + h->slice_alpha_c0_offset;
6134     const int alpha = (alpha_table+52)[index_a];
6135     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6136
6137     if( bS[0] < 4 ) {
6138         int8_t tc[4];
6139         for(i=0; i<4; i++)
6140             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6141         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6142     } else {
6143         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6144     }
6145 }
6146
6147 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6148     int i;
6149     for( i = 0; i < 16; i++, pix += stride) {
6150         int index_a;
6151         int alpha;
6152         int beta;
6153
6154         int qp_index;
6155         int bS_index = (i >> 1);
6156         if (!MB_FIELD) {
6157             bS_index &= ~1;
6158             bS_index |= (i & 1);
6159         }
6160
6161         if( bS[bS_index] == 0 ) {
6162             continue;
6163         }
6164
6165         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6166         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6167         alpha = (alpha_table+52)[index_a];
6168         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6169
6170         if( bS[bS_index] < 4 ) {
6171             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6172             const int p0 = pix[-1];
6173             const int p1 = pix[-2];
6174             const int p2 = pix[-3];
6175             const int q0 = pix[0];
6176             const int q1 = pix[1];
6177             const int q2 = pix[2];
6178
6179             if( FFABS( p0 - q0 ) < alpha &&
6180                 FFABS( p1 - p0 ) < beta &&
6181                 FFABS( q1 - q0 ) < beta ) {
6182                 int tc = tc0;
6183                 int i_delta;
6184
6185                 if( FFABS( p2 - p0 ) < beta ) {
6186                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6187                     tc++;
6188                 }
6189                 if( FFABS( q2 - q0 ) < beta ) {
6190                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6191                     tc++;
6192                 }
6193
6194                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6195                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6196                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6197                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6198             }
6199         }else{
6200             const int p0 = pix[-1];
6201             const int p1 = pix[-2];
6202             const int p2 = pix[-3];
6203
6204             const int q0 = pix[0];
6205             const int q1 = pix[1];
6206             const int q2 = pix[2];
6207
6208             if( FFABS( p0 - q0 ) < alpha &&
6209                 FFABS( p1 - p0 ) < beta &&
6210                 FFABS( q1 - q0 ) < beta ) {
6211
6212                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6213                     if( FFABS( p2 - p0 ) < beta)
6214                     {
6215                         const int p3 = pix[-4];
6216                         /* p0', p1', p2' */
6217                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6218                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6219                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6220                     } else {
6221                         /* p0' */
6222                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6223                     }
6224                     if( FFABS( q2 - q0 ) < beta)
6225                     {
6226                         const int q3 = pix[3];
6227                         /* q0', q1', q2' */
6228                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6229                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6230                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6231                     } else {
6232                         /* q0' */
6233                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6234                     }
6235                 }else{
6236                     /* p0', q0' */
6237                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6238                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6239                 }
6240                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6241             }
6242         }
6243     }
6244 }
6245 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6246     int i;
6247     for( i = 0; i < 8; i++, pix += stride) {
6248         int index_a;
6249         int alpha;
6250         int beta;
6251
6252         int qp_index;
6253         int bS_index = i;
6254
6255         if( bS[bS_index] == 0 ) {
6256             continue;
6257         }
6258
6259         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6260         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6261         alpha = (alpha_table+52)[index_a];
6262         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6263
6264         if( bS[bS_index] < 4 ) {
6265             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6266             const int p0 = pix[-1];
6267             const int p1 = pix[-2];
6268             const int q0 = pix[0];
6269             const int q1 = pix[1];
6270
6271             if( FFABS( p0 - q0 ) < alpha &&
6272                 FFABS( p1 - p0 ) < beta &&
6273                 FFABS( q1 - q0 ) < beta ) {
6274                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6275
6276                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6277                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6278                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6279             }
6280         }else{
6281             const int p0 = pix[-1];
6282             const int p1 = pix[-2];
6283             const int q0 = pix[0];
6284             const int q1 = pix[1];
6285
6286             if( FFABS( p0 - q0 ) < alpha &&
6287                 FFABS( p1 - p0 ) < beta &&
6288                 FFABS( q1 - q0 ) < beta ) {
6289
6290                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6291                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6292                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6293             }
6294         }
6295     }
6296 }
6297
6298 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6299     int i, d;
6300     const int index_a = qp + h->slice_alpha_c0_offset;
6301     const int alpha = (alpha_table+52)[index_a];
6302     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6303     const int pix_next  = stride;
6304
6305     if( bS[0] < 4 ) {
6306         int8_t tc[4];
6307         for(i=0; i<4; i++)
6308             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6309         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6310     } else {
6311         /* 16px edge length, see filter_mb_edgev */
6312             for( d = 0; d < 16; d++ ) {
6313                 const int p0 = pix[-1*pix_next];
6314                 const int p1 = pix[-2*pix_next];
6315                 const int p2 = pix[-3*pix_next];
6316                 const int q0 = pix[0];
6317                 const int q1 = pix[1*pix_next];
6318                 const int q2 = pix[2*pix_next];
6319
6320                 if( FFABS( p0 - q0 ) < alpha &&
6321                     FFABS( p1 - p0 ) < beta &&
6322                     FFABS( q1 - q0 ) < beta ) {
6323
6324                     const int p3 = pix[-4*pix_next];
6325                     const int q3 = pix[ 3*pix_next];
6326
6327                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6328                         if( FFABS( p2 - p0 ) < beta) {
6329                             /* p0', p1', p2' */
6330                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6331                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6332                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6333                         } else {
6334                             /* p0' */
6335                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6336                         }
6337                         if( FFABS( q2 - q0 ) < beta) {
6338                             /* q0', q1', q2' */
6339                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6340                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6341                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6342                         } else {
6343                             /* q0' */
6344                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6345                         }
6346                     }else{
6347                         /* p0', q0' */
6348                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6349                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6350                     }
6351                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6352                 }
6353                 pix++;
6354             }
6355     }
6356 }
6357
6358 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6359     int i;
6360     const int index_a = qp + h->slice_alpha_c0_offset;
6361     const int alpha = (alpha_table+52)[index_a];
6362     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6363
6364     if( bS[0] < 4 ) {
6365         int8_t tc[4];
6366         for(i=0; i<4; i++)
6367             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6368         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6369     } else {
6370         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6371     }
6372 }
6373
6374 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6375     MpegEncContext * const s = &h->s;
6376     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6377     int mb_xy, mb_type;
6378     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6379
6380     mb_xy = mb_x + mb_y*s->mb_stride;
6381
6382     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6383        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6384                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6385         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6386         return;
6387     }
6388     assert(!FRAME_MBAFF);
6389
6390     mb_type = s->current_picture.mb_type[mb_xy];
6391     qp = s->current_picture.qscale_table[mb_xy];
6392     qp0 = s->current_picture.qscale_table[mb_xy-1];
6393     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6394     qpc = get_chroma_qp( h, 0, qp );
6395     qpc0 = get_chroma_qp( h, 0, qp0 );
6396     qpc1 = get_chroma_qp( h, 0, qp1 );
6397     qp0 = (qp + qp0 + 1) >> 1;
6398     qp1 = (qp + qp1 + 1) >> 1;
6399     qpc0 = (qpc + qpc0 + 1) >> 1;
6400     qpc1 = (qpc + qpc1 + 1) >> 1;
6401     qp_thresh = 15 - h->slice_alpha_c0_offset;
6402     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6403        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6404         return;
6405
6406     if( IS_INTRA(mb_type) ) {
6407         int16_t bS4[4] = {4,4,4,4};
6408         int16_t bS3[4] = {3,3,3,3};
6409         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6410         if( IS_8x8DCT(mb_type) ) {
6411             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6412             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6413             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6414             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6415         } else {
6416             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6417             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6418             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6419             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6420             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6421             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6422             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6423             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6424         }
6425         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6426         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6427         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6428         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6429         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6430         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6431         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6432         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6433         return;
6434     } else {
6435         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6436         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6437         int edges;
6438         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6439             edges = 4;
6440             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6441         } else {
6442             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6443                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6444             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6445                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6446                              ? 3 : 0;
6447             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6448             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6449             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6450                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6451         }
6452         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6453             bSv[0][0] = 0x0004000400040004ULL;
6454         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6455             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6456
6457 #define FILTER(hv,dir,edge)\
6458         if(bSv[dir][edge]) {\
6459             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6460             if(!(edge&1)) {\
6461                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6462                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6463             }\
6464         }
6465         if( edges == 1 ) {
6466             FILTER(v,0,0);
6467             FILTER(h,1,0);
6468         } else if( IS_8x8DCT(mb_type) ) {
6469             FILTER(v,0,0);
6470             FILTER(v,0,2);
6471             FILTER(h,1,0);
6472             FILTER(h,1,2);
6473         } else {
6474             FILTER(v,0,0);
6475             FILTER(v,0,1);
6476             FILTER(v,0,2);
6477             FILTER(v,0,3);
6478             FILTER(h,1,0);
6479             FILTER(h,1,1);
6480             FILTER(h,1,2);
6481             FILTER(h,1,3);
6482         }
6483 #undef FILTER
6484     }
6485 }
6486
6487 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6488     MpegEncContext * const s = &h->s;
6489     const int mb_xy= mb_x + mb_y*s->mb_stride;
6490     const int mb_type = s->current_picture.mb_type[mb_xy];
6491     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6492     int first_vertical_edge_done = 0;
6493     int dir;
6494     /* FIXME: A given frame may occupy more than one position in
6495      * the reference list. So ref2frm should be populated with
6496      * frame numbers, not indices. */
6497     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6498                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6499
6500     //for sufficiently low qp, filtering wouldn't do anything
6501     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6502     if(!FRAME_MBAFF){
6503         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6504         int qp = s->current_picture.qscale_table[mb_xy];
6505         if(qp <= qp_thresh
6506            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6507            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6508             return;
6509         }
6510     }
6511
6512     if (FRAME_MBAFF
6513             // left mb is in picture
6514             && h->slice_table[mb_xy-1] != 255
6515             // and current and left pair do not have the same interlaced type
6516             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6517             // and left mb is in the same slice if deblocking_filter == 2
6518             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6519         /* First vertical edge is different in MBAFF frames
6520          * There are 8 different bS to compute and 2 different Qp
6521          */
6522         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6523         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6524         int16_t bS[8];
6525         int qp[2];
6526         int bqp[2];
6527         int rqp[2];
6528         int mb_qp, mbn0_qp, mbn1_qp;
6529         int i;
6530         first_vertical_edge_done = 1;
6531
6532         if( IS_INTRA(mb_type) )
6533             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6534         else {
6535             for( i = 0; i < 8; i++ ) {
6536                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6537
6538                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6539                     bS[i] = 4;
6540                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6541                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6542                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6543                     bS[i] = 2;
6544                 else
6545                     bS[i] = 1;
6546             }
6547         }
6548
6549         mb_qp = s->current_picture.qscale_table[mb_xy];
6550         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6551         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6552         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6553         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6554                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6555         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6556                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6557         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6558         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6559                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6560         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6561                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6562
6563         /* Filter edge */
6564         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6565         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6566         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6567         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6568         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6569     }
6570     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6571     for( dir = 0; dir < 2; dir++ )
6572     {
6573         int edge;
6574         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6575         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6576         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6577
6578         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6579                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6580         // how often to recheck mv-based bS when iterating between edges
6581         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6582                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6583         // how often to recheck mv-based bS when iterating along each edge
6584         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6585
6586         if (first_vertical_edge_done) {
6587             start = 1;
6588             first_vertical_edge_done = 0;
6589         }
6590
6591         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6592             start = 1;
6593
6594         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6595             && !IS_INTERLACED(mb_type)
6596             && IS_INTERLACED(mbm_type)
6597             ) {
6598             // This is a special case in the norm where the filtering must
6599             // be done twice (one each of the field) even if we are in a
6600             // frame macroblock.
6601             //
6602             static const int nnz_idx[4] = {4,5,6,3};
6603             unsigned int tmp_linesize   = 2 *   linesize;
6604             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6605             int mbn_xy = mb_xy - 2 * s->mb_stride;
6606             int qp;
6607             int i, j;
6608             int16_t bS[4];
6609
6610             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6611                 if( IS_INTRA(mb_type) ||
6612                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6613                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6614                 } else {
6615                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6616                     for( i = 0; i < 4; i++ ) {
6617                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6618                             mbn_nnz[nnz_idx[i]] != 0 )
6619                             bS[i] = 2;
6620                         else
6621                             bS[i] = 1;
6622                     }
6623                 }
6624                 // Do not use s->qscale as luma quantizer because it has not the same
6625                 // value in IPCM macroblocks.
6626                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6627                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6628                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6629                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6630                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6631                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6632                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6633                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6634             }
6635
6636             start = 1;
6637         }
6638
6639         /* Calculate bS */
6640         for( edge = start; edge < edges; edge++ ) {
6641             /* mbn_xy: neighbor macroblock */
6642             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6643             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6644             int16_t bS[4];
6645             int qp;
6646
6647             if( (edge&1) && IS_8x8DCT(mb_type) )
6648                 continue;
6649
6650             if( IS_INTRA(mb_type) ||
6651                 IS_INTRA(mbn_type) ) {
6652                 int value;
6653                 if (edge == 0) {
6654                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6655                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6656                     ) {
6657                         value = 4;
6658                     } else {
6659                         value = 3;
6660                     }
6661                 } else {
6662                     value = 3;
6663                 }
6664                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6665             } else {
6666                 int i, l;
6667                 int mv_done;
6668
6669                 if( edge & mask_edge ) {
6670                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6671                     mv_done = 1;
6672                 }
6673                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6674                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6675                     mv_done = 1;
6676                 }
6677                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6678                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6679                     int bn_idx= b_idx - (dir ? 8:1);
6680                     int v = 0;
6681                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6682                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6683                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6684                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6685                     }
6686                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6687                     mv_done = 1;
6688                 }
6689                 else
6690                     mv_done = 0;
6691
6692                 for( i = 0; i < 4; i++ ) {
6693                     int x = dir == 0 ? edge : i;
6694                     int y = dir == 0 ? i    : edge;
6695                     int b_idx= 8 + 4 + x + 8*y;
6696                     int bn_idx= b_idx - (dir ? 8:1);
6697
6698                     if( h->non_zero_count_cache[b_idx] != 0 ||
6699                         h->non_zero_count_cache[bn_idx] != 0 ) {
6700                         bS[i] = 2;
6701                     }
6702                     else if(!mv_done)
6703                     {
6704                         bS[i] = 0;
6705                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6706                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6707                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6708                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6709                                 bS[i] = 1;
6710                                 break;
6711                             }
6712                         }
6713                     }
6714                 }
6715
6716                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6717                     continue;
6718             }
6719
6720             /* Filter edge */
6721             // Do not use s->qscale as luma quantizer because it has not the same
6722             // value in IPCM macroblocks.
6723             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6724             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6725             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6726             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6727             if( dir == 0 ) {
6728                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6729                 if( (edge&1) == 0 ) {
6730                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6731                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6732                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6733                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6734                 }
6735             } else {
6736                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6737                 if( (edge&1) == 0 ) {
6738                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6739                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6740                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6741                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6742                 }
6743             }
6744         }
6745     }
6746 }
6747
6748 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6749     MpegEncContext * const s = &h->s;
6750     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6751
6752     s->mb_skip_run= -1;
6753
6754     if( h->pps.cabac ) {
6755         int i;
6756
6757         /* realign */
6758         align_get_bits( &s->gb );
6759
6760         /* init cabac */
6761         ff_init_cabac_states( &h->cabac);
6762         ff_init_cabac_decoder( &h->cabac,
6763                                s->gb.buffer + get_bits_count(&s->gb)/8,
6764                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6765         /* calculate pre-state */
6766         for( i= 0; i < 460; i++ ) {
6767             int pre;
6768             if( h->slice_type == I_TYPE )
6769                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6770             else
6771                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6772
6773             if( pre <= 63 )
6774                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6775             else
6776                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6777         }
6778
6779         for(;;){
6780 //START_TIMER
6781             int ret = decode_mb_cabac(h);
6782             int eos;
6783 //STOP_TIMER("decode_mb_cabac")
6784
6785             if(ret>=0) hl_decode_mb(h);
6786
6787             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6788                 s->mb_y++;
6789
6790                 if(ret>=0) ret = decode_mb_cabac(h);
6791
6792                 if(ret>=0) hl_decode_mb(h);
6793                 s->mb_y--;
6794             }
6795             eos = get_cabac_terminate( &h->cabac );
6796
6797             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6798                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6799                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6800                 return -1;
6801             }
6802
6803             if( ++s->mb_x >= s->mb_width ) {
6804                 s->mb_x = 0;
6805                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6806                 ++s->mb_y;
6807                 if(FIELD_OR_MBAFF_PICTURE) {
6808                     ++s->mb_y;
6809                 }
6810             }
6811
6812             if( eos || s->mb_y >= s->mb_height ) {
6813                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6814                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6815                 return 0;
6816             }
6817         }
6818
6819     } else {
6820         for(;;){
6821             int ret = decode_mb_cavlc(h);
6822
6823             if(ret>=0) hl_decode_mb(h);
6824
6825             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6826                 s->mb_y++;
6827                 ret = decode_mb_cavlc(h);
6828
6829                 if(ret>=0) hl_decode_mb(h);
6830                 s->mb_y--;
6831             }
6832
6833             if(ret<0){
6834                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6835                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6836
6837                 return -1;
6838             }
6839
6840             if(++s->mb_x >= s->mb_width){
6841                 s->mb_x=0;
6842                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6843                 ++s->mb_y;
6844                 if(FIELD_OR_MBAFF_PICTURE) {
6845                     ++s->mb_y;
6846                 }
6847                 if(s->mb_y >= s->mb_height){
6848                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6849
6850                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6851                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6852
6853                         return 0;
6854                     }else{
6855                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6856
6857                         return -1;
6858                     }
6859                 }
6860             }
6861
6862             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6863                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6864                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6865                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6866
6867                     return 0;
6868                 }else{
6869                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6870
6871                     return -1;
6872                 }
6873             }
6874         }
6875     }
6876
6877 #if 0
6878     for(;s->mb_y < s->mb_height; s->mb_y++){
6879         for(;s->mb_x < s->mb_width; s->mb_x++){
6880             int ret= decode_mb(h);
6881
6882             hl_decode_mb(h);
6883
6884             if(ret<0){
6885                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6886                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6887
6888                 return -1;
6889             }
6890
6891             if(++s->mb_x >= s->mb_width){
6892                 s->mb_x=0;
6893                 if(++s->mb_y >= s->mb_height){
6894                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6895                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6896
6897                         return 0;
6898                     }else{
6899                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6900
6901                         return -1;
6902                     }
6903                 }
6904             }
6905
6906             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6907                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6908                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6909
6910                     return 0;
6911                 }else{
6912                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6913
6914                     return -1;
6915                 }
6916             }
6917         }
6918         s->mb_x=0;
6919         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6920     }
6921 #endif
6922     return -1; //not reached
6923 }
6924
6925 static int decode_unregistered_user_data(H264Context *h, int size){
6926     MpegEncContext * const s = &h->s;
6927     uint8_t user_data[16+256];
6928     int e, build, i;
6929
6930     if(size<16)
6931         return -1;
6932
6933     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6934         user_data[i]= get_bits(&s->gb, 8);
6935     }
6936
6937     user_data[i]= 0;
6938     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6939     if(e==1 && build>=0)
6940         h->x264_build= build;
6941
6942     if(s->avctx->debug & FF_DEBUG_BUGS)
6943         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6944
6945     for(; i<size; i++)
6946         skip_bits(&s->gb, 8);
6947
6948     return 0;
6949 }
6950
6951 static int decode_sei(H264Context *h){
6952     MpegEncContext * const s = &h->s;
6953
6954     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6955         int size, type;
6956
6957         type=0;
6958         do{
6959             type+= show_bits(&s->gb, 8);
6960         }while(get_bits(&s->gb, 8) == 255);
6961
6962         size=0;
6963         do{
6964             size+= show_bits(&s->gb, 8);
6965         }while(get_bits(&s->gb, 8) == 255);
6966
6967         switch(type){
6968         case 5:
6969             if(decode_unregistered_user_data(h, size) < 0)
6970                 return -1;
6971             break;
6972         default:
6973             skip_bits(&s->gb, 8*size);
6974         }
6975
6976         //FIXME check bits here
6977         align_get_bits(&s->gb);
6978     }
6979
6980     return 0;
6981 }
6982
6983 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6984     MpegEncContext * const s = &h->s;
6985     int cpb_count, i;
6986     cpb_count = get_ue_golomb(&s->gb) + 1;
6987     get_bits(&s->gb, 4); /* bit_rate_scale */
6988     get_bits(&s->gb, 4); /* cpb_size_scale */
6989     for(i=0; i<cpb_count; i++){
6990         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6991         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6992         get_bits1(&s->gb);     /* cbr_flag */
6993     }
6994     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6995     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6996     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6997     get_bits(&s->gb, 5); /* time_offset_length */
6998 }
6999
7000 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7001     MpegEncContext * const s = &h->s;
7002     int aspect_ratio_info_present_flag;
7003     unsigned int aspect_ratio_idc;
7004     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7005
7006     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7007
7008     if( aspect_ratio_info_present_flag ) {
7009         aspect_ratio_idc= get_bits(&s->gb, 8);
7010         if( aspect_ratio_idc == EXTENDED_SAR ) {
7011             sps->sar.num= get_bits(&s->gb, 16);
7012             sps->sar.den= get_bits(&s->gb, 16);
7013         }else if(aspect_ratio_idc < 14){
7014             sps->sar=  pixel_aspect[aspect_ratio_idc];
7015         }else{
7016             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7017             return -1;
7018         }
7019     }else{
7020         sps->sar.num=
7021         sps->sar.den= 0;
7022     }
7023 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7024
7025     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7026         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7027     }
7028
7029     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7030         get_bits(&s->gb, 3);    /* video_format */
7031         get_bits1(&s->gb);      /* video_full_range_flag */
7032         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7033             get_bits(&s->gb, 8); /* colour_primaries */
7034             get_bits(&s->gb, 8); /* transfer_characteristics */
7035             get_bits(&s->gb, 8); /* matrix_coefficients */
7036         }
7037     }
7038
7039     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7040         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7041         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7042     }
7043
7044     sps->timing_info_present_flag = get_bits1(&s->gb);
7045     if(sps->timing_info_present_flag){
7046         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7047         sps->time_scale = get_bits_long(&s->gb, 32);
7048         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7049     }
7050
7051     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7052     if(nal_hrd_parameters_present_flag)
7053         decode_hrd_parameters(h, sps);
7054     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7055     if(vcl_hrd_parameters_present_flag)
7056         decode_hrd_parameters(h, sps);
7057     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7058         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7059     get_bits1(&s->gb);         /* pic_struct_present_flag */
7060
7061     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7062     if(sps->bitstream_restriction_flag){
7063         unsigned int num_reorder_frames;
7064         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7065         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7066         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7067         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7068         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7069         num_reorder_frames= get_ue_golomb(&s->gb);
7070         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7071
7072         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7073             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7074             return -1;
7075         }
7076
7077         sps->num_reorder_frames= num_reorder_frames;
7078     }
7079
7080     return 0;
7081 }
7082
7083 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7084                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7085     MpegEncContext * const s = &h->s;
7086     int i, last = 8, next = 8;
7087     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7088     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7089         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7090     else
7091     for(i=0;i<size;i++){
7092         if(next)
7093             next = (last + get_se_golomb(&s->gb)) & 0xff;
7094         if(!i && !next){ /* matrix not written, we use the preset one */
7095             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7096             break;
7097         }
7098         last = factors[scan[i]] = next ? next : last;
7099     }
7100 }
7101
7102 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7103                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7104     MpegEncContext * const s = &h->s;
7105     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7106     const uint8_t *fallback[4] = {
7107         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7108         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7109         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7110         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7111     };
7112     if(get_bits1(&s->gb)){
7113         sps->scaling_matrix_present |= is_sps;
7114         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7115         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7116         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7117         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7118         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7119         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7120         if(is_sps || pps->transform_8x8_mode){
7121             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7122             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7123         }
7124     } else if(fallback_sps) {
7125         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7126         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7127     }
7128 }
7129
7130 /**
7131  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7132  */
7133 static void *
7134 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7135                     const size_t size, const char *name)
7136 {
7137     if(id>=max) {
7138         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7139         return NULL;
7140     }
7141
7142     if(!vec[id]) {
7143         vec[id] = av_mallocz(size);
7144         if(vec[id] == NULL)
7145             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7146     }
7147     return vec[id];
7148 }
7149
7150 static inline int decode_seq_parameter_set(H264Context *h){
7151     MpegEncContext * const s = &h->s;
7152     int profile_idc, level_idc;
7153     unsigned int sps_id, tmp, mb_width, mb_height;
7154     int i;
7155     SPS *sps;
7156
7157     profile_idc= get_bits(&s->gb, 8);
7158     get_bits1(&s->gb);   //constraint_set0_flag
7159     get_bits1(&s->gb);   //constraint_set1_flag
7160     get_bits1(&s->gb);   //constraint_set2_flag
7161     get_bits1(&s->gb);   //constraint_set3_flag
7162     get_bits(&s->gb, 4); // reserved
7163     level_idc= get_bits(&s->gb, 8);
7164     sps_id= get_ue_golomb(&s->gb);
7165
7166     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7167     if(sps == NULL)
7168         return -1;
7169
7170     sps->profile_idc= profile_idc;
7171     sps->level_idc= level_idc;
7172
7173     if(sps->profile_idc >= 100){ //high profile
7174         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7175             get_bits1(&s->gb);  //residual_color_transform_flag
7176         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7177         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7178         sps->transform_bypass = get_bits1(&s->gb);
7179         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7180     }else
7181         sps->scaling_matrix_present = 0;
7182
7183     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7184     sps->poc_type= get_ue_golomb(&s->gb);
7185
7186     if(sps->poc_type == 0){ //FIXME #define
7187         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7188     } else if(sps->poc_type == 1){//FIXME #define
7189         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7190         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7191         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7192         tmp= get_ue_golomb(&s->gb);
7193
7194         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7195             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7196             return -1;
7197         }
7198         sps->poc_cycle_length= tmp;
7199
7200         for(i=0; i<sps->poc_cycle_length; i++)
7201             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7202     }else if(sps->poc_type != 2){
7203         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7204         return -1;
7205     }
7206
7207     tmp= get_ue_golomb(&s->gb);
7208     if(tmp > MAX_PICTURE_COUNT-2){
7209         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7210     }
7211     sps->ref_frame_count= tmp;
7212     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7213     mb_width= get_ue_golomb(&s->gb) + 1;
7214     mb_height= get_ue_golomb(&s->gb) + 1;
7215     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7216        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7217         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7218         return -1;
7219     }
7220     sps->mb_width = mb_width;
7221     sps->mb_height= mb_height;
7222
7223     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7224     if(!sps->frame_mbs_only_flag)
7225         sps->mb_aff= get_bits1(&s->gb);
7226     else
7227         sps->mb_aff= 0;
7228
7229     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7230
7231 #ifndef ALLOW_INTERLACE
7232     if(sps->mb_aff)
7233         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7234 #endif
7235     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7236         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7237
7238     sps->crop= get_bits1(&s->gb);
7239     if(sps->crop){
7240         sps->crop_left  = get_ue_golomb(&s->gb);
7241         sps->crop_right = get_ue_golomb(&s->gb);
7242         sps->crop_top   = get_ue_golomb(&s->gb);
7243         sps->crop_bottom= get_ue_golomb(&s->gb);
7244         if(sps->crop_left || sps->crop_top){
7245             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7246         }
7247     }else{
7248         sps->crop_left  =
7249         sps->crop_right =
7250         sps->crop_top   =
7251         sps->crop_bottom= 0;
7252     }
7253
7254     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7255     if( sps->vui_parameters_present_flag )
7256         decode_vui_parameters(h, sps);
7257
7258     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7259         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7260                sps_id, sps->profile_idc, sps->level_idc,
7261                sps->poc_type,
7262                sps->ref_frame_count,
7263                sps->mb_width, sps->mb_height,
7264                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7265                sps->direct_8x8_inference_flag ? "8B8" : "",
7266                sps->crop_left, sps->crop_right,
7267                sps->crop_top, sps->crop_bottom,
7268                sps->vui_parameters_present_flag ? "VUI" : ""
7269                );
7270     }
7271     return 0;
7272 }
7273
7274 static void
7275 build_qp_table(PPS *pps, int t, int index)
7276 {
7277     int i;
7278     for(i = 0; i < 255; i++)
7279         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7280 }
7281
7282 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7283     MpegEncContext * const s = &h->s;
7284     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7285     PPS *pps;
7286
7287     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7288     if(pps == NULL)
7289         return -1;
7290
7291     tmp= get_ue_golomb(&s->gb);
7292     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7293         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7294         return -1;
7295     }
7296     pps->sps_id= tmp;
7297
7298     pps->cabac= get_bits1(&s->gb);
7299     pps->pic_order_present= get_bits1(&s->gb);
7300     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7301     if(pps->slice_group_count > 1 ){
7302         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7303         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7304         switch(pps->mb_slice_group_map_type){
7305         case 0:
7306 #if 0
7307 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7308 |    run_length[ i ]                                |1  |ue(v)   |
7309 #endif
7310             break;
7311         case 2:
7312 #if 0
7313 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7314 |{                                                  |   |        |
7315 |    top_left_mb[ i ]                               |1  |ue(v)   |
7316 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7317 |   }                                               |   |        |
7318 #endif
7319             break;
7320         case 3:
7321         case 4:
7322         case 5:
7323 #if 0
7324 |   slice_group_change_direction_flag               |1  |u(1)    |
7325 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7326 #endif
7327             break;
7328         case 6:
7329 #if 0
7330 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7331 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7332 |)                                                  |   |        |
7333 |    slice_group_id[ i ]                            |1  |u(v)    |
7334 #endif
7335             break;
7336         }
7337     }
7338     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7339     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7340     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7341         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7342         pps->ref_count[0]= pps->ref_count[1]= 1;
7343         return -1;
7344     }
7345
7346     pps->weighted_pred= get_bits1(&s->gb);
7347     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7348     pps->init_qp= get_se_golomb(&s->gb) + 26;
7349     pps->init_qs= get_se_golomb(&s->gb) + 26;
7350     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7351     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7352     pps->constrained_intra_pred= get_bits1(&s->gb);
7353     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7354
7355     pps->transform_8x8_mode= 0;
7356     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7357     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7358     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7359
7360     if(get_bits_count(&s->gb) < bit_length){
7361         pps->transform_8x8_mode= get_bits1(&s->gb);
7362         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7363         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7364     } else {
7365         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7366     }
7367
7368     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7369     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7370         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7371         h->pps.chroma_qp_diff= 1;
7372     } else
7373         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7374
7375     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7376         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7377                pps_id, pps->sps_id,
7378                pps->cabac ? "CABAC" : "CAVLC",
7379                pps->slice_group_count,
7380                pps->ref_count[0], pps->ref_count[1],
7381                pps->weighted_pred ? "weighted" : "",
7382                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7383                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7384                pps->constrained_intra_pred ? "CONSTR" : "",
7385                pps->redundant_pic_cnt_present ? "REDU" : "",
7386                pps->transform_8x8_mode ? "8x8DCT" : ""
7387                );
7388     }
7389
7390     return 0;
7391 }
7392
7393 /**
7394  * Call decode_slice() for each context.
7395  *
7396  * @param h h264 master context
7397  * @param context_count number of contexts to execute
7398  */
7399 static void execute_decode_slices(H264Context *h, int context_count){
7400     MpegEncContext * const s = &h->s;
7401     AVCodecContext * const avctx= s->avctx;
7402     H264Context *hx;
7403     int i;
7404
7405     if(context_count == 1) {
7406         decode_slice(avctx, h);
7407     } else {
7408         for(i = 1; i < context_count; i++) {
7409             hx = h->thread_context[i];
7410             hx->s.error_resilience = avctx->error_resilience;
7411             hx->s.error_count = 0;
7412         }
7413
7414         avctx->execute(avctx, (void *)decode_slice,
7415                        (void **)h->thread_context, NULL, context_count);
7416
7417         /* pull back stuff from slices to master context */
7418         hx = h->thread_context[context_count - 1];
7419         s->mb_x = hx->s.mb_x;
7420         s->mb_y = hx->s.mb_y;
7421         s->dropable = hx->s.dropable;
7422         s->picture_structure = hx->s.picture_structure;
7423         for(i = 1; i < context_count; i++)
7424             h->s.error_count += h->thread_context[i]->s.error_count;
7425     }
7426 }
7427
7428
7429 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7430     MpegEncContext * const s = &h->s;
7431     AVCodecContext * const avctx= s->avctx;
7432     int buf_index=0;
7433     H264Context *hx; ///< thread context
7434     int context_count = 0;
7435
7436     h->max_contexts = avctx->thread_count;
7437 #if 0
7438     int i;
7439     for(i=0; i<50; i++){
7440         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7441     }
7442 #endif
7443     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7444         h->current_slice = 0;
7445         if (!s->first_field)
7446             s->current_picture_ptr= NULL;
7447     }
7448
7449     for(;;){
7450         int consumed;
7451         int dst_length;
7452         int bit_length;
7453         uint8_t *ptr;
7454         int i, nalsize = 0;
7455         int err;
7456
7457         if(h->is_avc) {
7458             if(buf_index >= buf_size) break;
7459             nalsize = 0;
7460             for(i = 0; i < h->nal_length_size; i++)
7461                 nalsize = (nalsize << 8) | buf[buf_index++];
7462             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7463                 if(nalsize == 1){
7464                     buf_index++;
7465                     continue;
7466                 }else{
7467                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7468                     break;
7469                 }
7470             }
7471         } else {
7472             // start code prefix search
7473             for(; buf_index + 3 < buf_size; buf_index++){
7474                 // This should always succeed in the first iteration.
7475                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7476                     break;
7477             }
7478
7479             if(buf_index+3 >= buf_size) break;
7480
7481             buf_index+=3;
7482         }
7483
7484         hx = h->thread_context[context_count];
7485
7486         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7487         if (ptr==NULL || dst_length < 0){
7488             return -1;
7489         }
7490         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7491             dst_length--;
7492         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7493
7494         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7495             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7496         }
7497
7498         if (h->is_avc && (nalsize != consumed))
7499             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7500
7501         buf_index += consumed;
7502
7503         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7504            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7505             continue;
7506
7507       again:
7508         err = 0;
7509         switch(hx->nal_unit_type){
7510         case NAL_IDR_SLICE:
7511             if (h->nal_unit_type != NAL_IDR_SLICE) {
7512                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7513                 return -1;
7514             }
7515             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7516         case NAL_SLICE:
7517             init_get_bits(&hx->s.gb, ptr, bit_length);
7518             hx->intra_gb_ptr=
7519             hx->inter_gb_ptr= &hx->s.gb;
7520             hx->s.data_partitioning = 0;
7521
7522             if((err = decode_slice_header(hx, h)))
7523                break;
7524
7525             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7526             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7527                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7528                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7529                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7530                && avctx->skip_frame < AVDISCARD_ALL)
7531                 context_count++;
7532             break;
7533         case NAL_DPA:
7534             init_get_bits(&hx->s.gb, ptr, bit_length);
7535             hx->intra_gb_ptr=
7536             hx->inter_gb_ptr= NULL;
7537             hx->s.data_partitioning = 1;
7538
7539             err = decode_slice_header(hx, h);
7540             break;
7541         case NAL_DPB:
7542             init_get_bits(&hx->intra_gb, ptr, bit_length);
7543             hx->intra_gb_ptr= &hx->intra_gb;
7544             break;
7545         case NAL_DPC:
7546             init_get_bits(&hx->inter_gb, ptr, bit_length);
7547             hx->inter_gb_ptr= &hx->inter_gb;
7548
7549             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7550                && s->context_initialized
7551                && s->hurry_up < 5
7552                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7553                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7554                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7555                && avctx->skip_frame < AVDISCARD_ALL)
7556                 context_count++;
7557             break;
7558         case NAL_SEI:
7559             init_get_bits(&s->gb, ptr, bit_length);
7560             decode_sei(h);
7561             break;
7562         case NAL_SPS:
7563             init_get_bits(&s->gb, ptr, bit_length);
7564             decode_seq_parameter_set(h);
7565
7566             if(s->flags& CODEC_FLAG_LOW_DELAY)
7567                 s->low_delay=1;
7568
7569             if(avctx->has_b_frames < 2)
7570                 avctx->has_b_frames= !s->low_delay;
7571             break;
7572         case NAL_PPS:
7573             init_get_bits(&s->gb, ptr, bit_length);
7574
7575             decode_picture_parameter_set(h, bit_length);
7576
7577             break;
7578         case NAL_AUD:
7579         case NAL_END_SEQUENCE:
7580         case NAL_END_STREAM:
7581         case NAL_FILLER_DATA:
7582         case NAL_SPS_EXT:
7583         case NAL_AUXILIARY_SLICE:
7584             break;
7585         default:
7586             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7587         }
7588
7589         if(context_count == h->max_contexts) {
7590             execute_decode_slices(h, context_count);
7591             context_count = 0;
7592         }
7593
7594         if (err < 0)
7595             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7596         else if(err == 1) {
7597             /* Slice could not be decoded in parallel mode, copy down
7598              * NAL unit stuff to context 0 and restart. Note that
7599              * rbsp_buffer is not transfered, but since we no longer
7600              * run in parallel mode this should not be an issue. */
7601             h->nal_unit_type = hx->nal_unit_type;
7602             h->nal_ref_idc   = hx->nal_ref_idc;
7603             hx = h;
7604             goto again;
7605         }
7606     }
7607     if(context_count)
7608         execute_decode_slices(h, context_count);
7609     return buf_index;
7610 }
7611
7612 /**
7613  * returns the number of bytes consumed for building the current frame
7614  */
7615 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7616     if(s->flags&CODEC_FLAG_TRUNCATED){
7617         pos -= s->parse_context.last_index;
7618         if(pos<0) pos=0; // FIXME remove (unneeded?)
7619
7620         return pos;
7621     }else{
7622         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7623         if(pos+10>buf_size) pos=buf_size; // oops ;)
7624
7625         return pos;
7626     }
7627 }
7628
7629 static int decode_frame(AVCodecContext *avctx,
7630                              void *data, int *data_size,
7631                              uint8_t *buf, int buf_size)
7632 {
7633     H264Context *h = avctx->priv_data;
7634     MpegEncContext *s = &h->s;
7635     AVFrame *pict = data;
7636     int buf_index;
7637
7638     s->flags= avctx->flags;
7639     s->flags2= avctx->flags2;
7640
7641    /* no supplementary picture */
7642     if (buf_size == 0) {
7643         Picture *out;
7644         int i, out_idx;
7645
7646 //FIXME factorize this with the output code below
7647         out = h->delayed_pic[0];
7648         out_idx = 0;
7649         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7650             if(h->delayed_pic[i]->poc < out->poc){
7651                 out = h->delayed_pic[i];
7652                 out_idx = i;
7653             }
7654
7655         for(i=out_idx; h->delayed_pic[i]; i++)
7656             h->delayed_pic[i] = h->delayed_pic[i+1];
7657
7658         if(out){
7659             *data_size = sizeof(AVFrame);
7660             *pict= *(AVFrame*)out;
7661         }
7662
7663         return 0;
7664     }
7665
7666     if(s->flags&CODEC_FLAG_TRUNCATED){
7667         int next= ff_h264_find_frame_end(h, buf, buf_size);
7668
7669         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7670             return buf_size;
7671 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7672     }
7673
7674     if(h->is_avc && !h->got_avcC) {
7675         int i, cnt, nalsize;
7676         unsigned char *p = avctx->extradata;
7677         if(avctx->extradata_size < 7) {
7678             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7679             return -1;
7680         }
7681         if(*p != 1) {
7682             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7683             return -1;
7684         }
7685         /* sps and pps in the avcC always have length coded with 2 bytes,
7686            so put a fake nal_length_size = 2 while parsing them */
7687         h->nal_length_size = 2;
7688         // Decode sps from avcC
7689         cnt = *(p+5) & 0x1f; // Number of sps
7690         p += 6;
7691         for (i = 0; i < cnt; i++) {
7692             nalsize = AV_RB16(p) + 2;
7693             if(decode_nal_units(h, p, nalsize) < 0) {
7694                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7695                 return -1;
7696             }
7697             p += nalsize;
7698         }
7699         // Decode pps from avcC
7700         cnt = *(p++); // Number of pps
7701         for (i = 0; i < cnt; i++) {
7702             nalsize = AV_RB16(p) + 2;
7703             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7704                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7705                 return -1;
7706             }
7707             p += nalsize;
7708         }
7709         // Now store right nal length size, that will be use to parse all other nals
7710         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7711         // Do not reparse avcC
7712         h->got_avcC = 1;
7713     }
7714
7715     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7716         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7717             return -1;
7718     }
7719
7720     buf_index=decode_nal_units(h, buf, buf_size);
7721     if(buf_index < 0)
7722         return -1;
7723
7724     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7725         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7726         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7727         return -1;
7728     }
7729
7730     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7731         Picture *out = s->current_picture_ptr;
7732         Picture *cur = s->current_picture_ptr;
7733         Picture *prev = h->delayed_output_pic;
7734         int i, pics, cross_idr, out_of_order, out_idx;
7735
7736         s->mb_y= 0;
7737
7738         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7739         s->current_picture_ptr->pict_type= s->pict_type;
7740
7741         h->prev_frame_num_offset= h->frame_num_offset;
7742         h->prev_frame_num= h->frame_num;
7743         if(!s->dropable) {
7744             h->prev_poc_msb= h->poc_msb;
7745             h->prev_poc_lsb= h->poc_lsb;
7746             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7747         }
7748
7749         /*
7750          * FIXME: Error handling code does not seem to support interlaced
7751          * when slices span multiple rows
7752          * The ff_er_add_slice calls don't work right for bottom
7753          * fields; they cause massive erroneous error concealing
7754          * Error marking covers both fields (top and bottom).
7755          * This causes a mismatched s->error_count
7756          * and a bad error table. Further, the error count goes to
7757          * INT_MAX when called for bottom field, because mb_y is
7758          * past end by one (callers fault) and resync_mb_y != 0
7759          * causes problems for the first MB line, too.
7760          */
7761         if (!FIELD_PICTURE)
7762             ff_er_frame_end(s);
7763
7764         MPV_frame_end(s);
7765
7766         if (s->first_field) {
7767             /* Wait for second field. */
7768             *data_size = 0;
7769
7770         } else {
7771             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7772             /* Derive top_field_first from field pocs. */
7773             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7774
7775         //FIXME do something with unavailable reference frames
7776
7777 #if 0 //decode order
7778             *data_size = sizeof(AVFrame);
7779 #else
7780             /* Sort B-frames into display order */
7781
7782             if(h->sps.bitstream_restriction_flag
7783                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7784                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7785                 s->low_delay = 0;
7786             }
7787
7788             pics = 0;
7789             while(h->delayed_pic[pics]) pics++;
7790
7791             assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7792
7793             h->delayed_pic[pics++] = cur;
7794             if(cur->reference == 0)
7795                 cur->reference = DELAYED_PIC_REF;
7796
7797             cross_idr = 0;
7798             for(i=0; h->delayed_pic[i]; i++)
7799                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7800                     cross_idr = 1;
7801
7802             out = h->delayed_pic[0];
7803             out_idx = 0;
7804             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7805                 if(h->delayed_pic[i]->poc < out->poc){
7806                     out = h->delayed_pic[i];
7807                     out_idx = i;
7808                 }
7809
7810             out_of_order = !cross_idr && prev && out->poc < prev->poc;
7811             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7812                 { }
7813             else if(prev && pics <= s->avctx->has_b_frames)
7814                 out = prev;
7815             else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7816                || (s->low_delay &&
7817                 ((!cross_idr && prev && out->poc > prev->poc + 2)
7818                  || cur->pict_type == B_TYPE)))
7819             {
7820                 s->low_delay = 0;
7821                 s->avctx->has_b_frames++;
7822                 out = prev;
7823             }
7824             else if(out_of_order)
7825                 out = prev;
7826
7827             if(out_of_order || pics > s->avctx->has_b_frames){
7828                 for(i=out_idx; h->delayed_pic[i]; i++)
7829                     h->delayed_pic[i] = h->delayed_pic[i+1];
7830             }
7831
7832             if(prev == out)
7833                 *data_size = 0;
7834             else
7835                 *data_size = sizeof(AVFrame);
7836             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7837                 prev->reference = 0;
7838             h->delayed_output_pic = out;
7839 #endif
7840
7841             if(out)
7842                 *pict= *(AVFrame*)out;
7843             else
7844                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7845         }
7846     }
7847
7848     assert(pict->data[0] || !*data_size);
7849     ff_print_debug_info(s, pict);
7850 //printf("out %d\n", (int)pict->data[0]);
7851 #if 0 //?
7852
7853     /* Return the Picture timestamp as the frame number */
7854     /* we subtract 1 because it is added on utils.c     */
7855     avctx->frame_number = s->picture_number - 1;
7856 #endif
7857     return get_consumed_bytes(s, buf_index, buf_size);
7858 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (disabled code).
 * Entries 0..2 are set when the top-left, top and top-right neighbours lie
 * inside the picture and belong to the current slice, entry 3 likewise for
 * the left neighbour; entries 4 and 5 are constants.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]= h->slice_table[top_xy] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7878
7879 #if 0 //selftest
7880 #undef random
7881 #define COUNT 8000
7882 #define SIZE (COUNT*40)
7883 int main(void){
7884     int i;
7885     uint8_t temp[SIZE];
7886     PutBitContext pb;
7887     GetBitContext gb;
7888 //    int int_temp[10000];
7889     DSPContext dsp;
7890     AVCodecContext avctx;
7891
7892     dsputil_init(&dsp, &avctx);
7893
7894     init_put_bits(&pb, temp, SIZE);
7895     printf("testing unsigned exp golomb\n");
7896     for(i=0; i<COUNT; i++){
7897         START_TIMER
7898         set_ue_golomb(&pb, i);
7899         STOP_TIMER("set_ue_golomb");
7900     }
7901     flush_put_bits(&pb);
7902
7903     init_get_bits(&gb, temp, 8*SIZE);
7904     for(i=0; i<COUNT; i++){
7905         int j, s;
7906
7907         s= show_bits(&gb, 24);
7908
7909         START_TIMER
7910         j= get_ue_golomb(&gb);
7911         if(j != i){
7912             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7913 //            return -1;
7914         }
7915         STOP_TIMER("get_ue_golomb");
7916     }
7917
7918
7919     init_put_bits(&pb, temp, SIZE);
7920     printf("testing signed exp golomb\n");
7921     for(i=0; i<COUNT; i++){
7922         START_TIMER
7923         set_se_golomb(&pb, i - COUNT/2);
7924         STOP_TIMER("set_se_golomb");
7925     }
7926     flush_put_bits(&pb);
7927
7928     init_get_bits(&gb, temp, 8*SIZE);
7929     for(i=0; i<COUNT; i++){
7930         int j, s;
7931
7932         s= show_bits(&gb, 24);
7933
7934         START_TIMER
7935         j= get_se_golomb(&gb);
7936         if(j != i - COUNT/2){
7937             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7938 //            return -1;
7939         }
7940         STOP_TIMER("get_se_golomb");
7941     }
7942
7943     printf("testing 4x4 (I)DCT\n");
7944
7945     DCTELEM block[16];
7946     uint8_t src[16], ref[16];
7947     uint64_t error= 0, max_error=0;
7948
7949     for(i=0; i<COUNT; i++){
7950         int j;
7951 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7952         for(j=0; j<16; j++){
7953             ref[j]= random()%255;
7954             src[j]= random()%255;
7955         }
7956
7957         h264_diff_dct_c(block, src, ref, 4);
7958
7959         //normalize
7960         for(j=0; j<16; j++){
7961 //            printf("%d ", block[j]);
7962             block[j]= block[j]*4;
7963             if(j&1) block[j]= (block[j]*4 + 2)/5;
7964             if(j&4) block[j]= (block[j]*4 + 2)/5;
7965         }
7966 //        printf("\n");
7967
7968         s->dsp.h264_idct_add(ref, block, 4);
7969 /*        for(j=0; j<16; j++){
7970             printf("%d ", ref[j]);
7971         }
7972         printf("\n");*/
7973
7974         for(j=0; j<16; j++){
7975             int diff= FFABS(src[j] - ref[j]);
7976
7977             error+= diff*diff;
7978             max_error= FFMAX(max_error, diff);
7979         }
7980     }
7981     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7982 #if 0
7983     printf("testing quantizer\n");
7984     for(qp=0; qp<52; qp++){
7985         for(i=0; i<16; i++)
7986             src1_block[i]= src2_block[i]= random()%255;
7987
7988     }
7989 #endif
7990     printf("Testing NAL layer\n");
7991
7992     uint8_t bitstream[COUNT];
7993     uint8_t nal[COUNT*2];
7994     H264Context h;
7995     memset(&h, 0, sizeof(H264Context));
7996
7997     for(i=0; i<COUNT; i++){
7998         int zeros= i;
7999         int nal_length;
8000         int consumed;
8001         int out_length;
8002         uint8_t *out;
8003         int j;
8004
8005         for(j=0; j<COUNT; j++){
8006             bitstream[j]= (random() % 255) + 1;
8007         }
8008
8009         for(j=0; j<zeros; j++){
8010             int pos= random() % COUNT;
8011             while(bitstream[pos] == 0){
8012                 pos++;
8013                 pos %= COUNT;
8014             }
8015             bitstream[pos]=0;
8016         }
8017
8018         START_TIMER
8019
8020         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8021         if(nal_length<0){
8022             printf("encoding failed\n");
8023             return -1;
8024         }
8025
8026         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8027
8028         STOP_TIMER("NAL")
8029
8030         if(out_length != COUNT){
8031             printf("incorrect length %d %d\n", out_length, COUNT);
8032             return -1;
8033         }
8034
8035         if(consumed != nal_length){
8036             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8037             return -1;
8038         }
8039
8040         if(memcmp(bitstream, out, COUNT)){
8041             printf("mismatch\n");
8042             return -1;
8043         }
8044     }
8045
8046     printf("Testing RBSP\n");
8047
8048
8049     return 0;
8050 }
8051 #endif
8052
8053
8054 static int decode_end(AVCodecContext *avctx)
8055 {
8056     H264Context *h = avctx->priv_data;
8057     MpegEncContext *s = &h->s;
8058
8059     av_freep(&h->rbsp_buffer[0]);
8060     av_freep(&h->rbsp_buffer[1]);
8061     free_tables(h); //FIXME cleanup init stuff perhaps
8062     MPV_common_end(s);
8063
8064 //    memset(h, 0, sizeof(H264Context));
8065
8066     return 0;
8067 }
8068
8069
8070 AVCodec h264_decoder = {
8071     "h264",
8072     CODEC_TYPE_VIDEO,
8073     CODEC_ID_H264,
8074     sizeof(H264Context),
8075     decode_init,
8076     NULL,
8077     decode_end,
8078     decode_frame,
8079     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8080     .flush= flush_dpb,
8081 };
8082
8083 #include "svq3.c"