git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 /**
  42  * Value of Picture.reference when Picture is not a reference picture, but
  43  * is held for delayed output.
  44  */
  45 #define DELAYED_PIC_REF 4
  46 /* File-scope VLC tables. Their initialisation and use are not in this part of
      * the file. NOTE(review): the names suggest the CAVLC coeff_token,
      * total_zeros and run_before code tables -- confirm against the init code. */
  47 static VLC coeff_token_vlc[4];
  48 static VLC chroma_dc_coeff_token_vlc;
  49
  50 static VLC total_zeros_vlc[15];
  51 static VLC chroma_dc_total_zeros_vlc[3];
  52
  53 static VLC run_vlc[6];
  54 static VLC run7_vlc;
  55 /* Forward declarations of file-local (static) functions; their definitions are
      * not in this part of the file. */
  56 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  57 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  58 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  59 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60
  /**
   * Pack two 16-bit values into one 32-bit word such that, once the word is
   * stored to memory, a occupies the first (lower-address) 16 bits and b the
   * following 16 bits, whatever the byte order of the target.
   *
   * Only the low 16 bits of each argument end up in the result. Negative
   * arguments are allowed: the shift is done on an unsigned value, so it is
   * well defined (shifting a negative int left is undefined behaviour in C).
   *
   * @param a value for the first 16 bits in memory
   * @param b value for the second 16 bits in memory
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  68
  /**
   * Remainder-by-six lookup table: ff_rem6[i] == i % 6 for every index i in 0..51.
   */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5, /*  0.. 5 */
      0, 1, 2, 3, 4, 5, /*  6..11 */
      0, 1, 2, 3, 4, 5, /* 12..17 */
      0, 1, 2, 3, 4, 5, /* 18..23 */
      0, 1, 2, 3, 4, 5, /* 24..29 */
      0, 1, 2, 3, 4, 5, /* 30..35 */
      0, 1, 2, 3, 4, 5, /* 36..41 */
      0, 1, 2, 3, 4, 5, /* 42..47 */
      0, 1, 2, 3,       /* 48..51 */
  };
  72
  /**
   * Divide-by-six lookup table: ff_div6[i] == i / 6 for every index i in 0..51.
   */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0, /*  0.. 5 */
      1, 1, 1, 1, 1, 1, /*  6..11 */
      2, 2, 2, 2, 2, 2, /* 12..17 */
      3, 3, 3, 3, 3, 3, /* 18..23 */
      4, 4, 4, 4, 4, 4, /* 24..29 */
      5, 5, 5, 5, 5, 5, /* 30..35 */
      6, 6, 6, 6, 6, 6, /* 36..41 */
      7, 7, 7, 7, 7, 7, /* 42..47 */
      8, 8, 8, 8,       /* 48..51 */
  };
  76
  77
  /**
   * Fill a rectangle of w x h elements with a constant, using fixed-width stores.
   *
   * Supported shapes are those whose row width in bytes (w*size) is 2, 4, 8 or
   * 16 and whose height is 1, 2 or 4 rows; anything else trips an assert().
   *
   * @param vp     first (top-left) element of the rectangle; must satisfy the
   *               alignment assert below
   * @param w      width of the rectangle in elements (at most 4), should be a constant
   * @param h      height of the rectangle in rows, should be a constant
   * @param stride distance between two consecutive rows, in elements
   * @param val    value to store; with size 1 it is replicated into every byte,
   *               so it is expected to fit into 8 bits
   * @param size   the size of val in bytes (1 or 4), should be a constant
   */
  static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
      uint8_t *p= (uint8_t*)vp;
      assert(size==1 || size==4);
      assert(w<=4);

      /* from here on w and stride are in bytes */
      w      *= size;
      stride *= size;

      assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
      assert((stride&(w-1))==0);
      if(w==2){
          const uint16_t v= size==4 ? val : val*0x0101;
          *(uint16_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint16_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint16_t*)(p + 2*stride)= v;
          *(uint16_t*)(p + 3*stride)= v;
      }else if(w==4){
          const uint32_t v= size==4 ? val : val*0x01010101;
          *(uint32_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint32_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint32_t*)(p + 2*stride)= v;
          *(uint32_t*)(p + 3*stride)= v;
      }else if(w==8){
      //gcc can't optimize 64bit math on x86_32
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint64_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 2*stride)= v;
          *(uint64_t*)(p + 3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          if(h==1) return;
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
  #endif
      }else if(w==16){
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0+0*stride)= v;
          *(uint64_t*)(p + 8+0*stride)= v;
          if(h==1) return; // a single-row fill must not touch the rows below
          *(uint64_t*)(p + 0+1*stride)= v;
          *(uint64_t*)(p + 8+1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 0+2*stride)= v;
          *(uint64_t*)(p + 8+2*stride)= v;
          *(uint64_t*)(p + 0+3*stride)= v;
          *(uint64_t*)(p + 8+3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          *(uint32_t*)(p + 8+0*stride)= val;
          *(uint32_t*)(p +12+0*stride)= val;
          if(h==1) return; // a single-row fill must not touch the rows below
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          *(uint32_t*)(p + 8+1*stride)= val;
          *(uint32_t*)(p +12+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 8+2*stride)= val;
          *(uint32_t*)(p +12+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
          *(uint32_t*)(p + 8+3*stride)= val;
          *(uint32_t*)(p +12+3*stride)= val;
  #endif
      }else
          assert(0);
      /* heights 1 and 2 returned above; only a full 4-row fill reaches this point */
      assert(h==4);
  }
 165 /**
      * Loads the neighbour information needed for the current macroblock
      * (s->mb_x, s->mb_y) into the per-macroblock caches of the H264Context.
      *
      * It works out which macroblocks act as top, top-left, top-right and left
      * neighbours (with extra adjustments when FRAME_MBAFF is set), stores the
      * top and left positions in h->top_mb_xy / h->left_mb_xy[], and then fills,
      * as far as they apply: the intra sample-availability masks, the border of
      * intra4x4_pred_mode_cache, the border of non_zero_count_cache, top_cbp /
      * left_cbp (CABAC only), the borders of mv_cache / ref_cache / mvd_cache /
      * direct_cache, and neighbor_transform_size.
      *
      * @param h           decoder context; supplies the current position and receives the caches
      * @param mb_type     type flags of the current macroblock
      * @param for_deblock non-zero when called for deblocking: neighbours are then
      *                    accepted from any macroblock whose slice_table entry is
      *                    below 255 instead of only from the current slice, the
      *                    top-left/top-right neighbours are ignored, and (unless
      *                    FRAME_MBAFF) the call returns immediately when
      *                    h->slice_num is 1 or the macroblock one row up has the
      *                    same slice_table entry
      */
 166 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 167     MpegEncContext * const s = &h->s;
 168     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 169     int topleft_xy, top_xy, topright_xy, left_xy[2];
 170     int topleft_type, top_type, topright_type, left_type[2];
 171     int left_block[8];
 172     int i;
 173
 174     //FIXME deblocking could skip the intra and nnz parts.
 175     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 176         return;
 177
 178     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 179
          // Default neighbour positions. mb_stride is shifted by FIELD_PICTURE, i.e. the
          // row offset doubles when that macro evaluates to 1.
 180     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 181     topleft_xy = top_xy - 1;
 182     topright_xy= top_xy + 1;
 183     left_xy[1] = left_xy[0] = mb_xy-1;
          // left_block[] selects which entries of the left neighbour are read below:
          // [0..3] index its intra4x4_pred_mode / non_zero_count entries and its motion
          // rows for the four rows of this MB, [4..7] index further non_zero_count
          // entries (presumably the chroma blocks -- TODO confirm).
 184     left_block[0]= 0;
 185     left_block[1]= 1;
 186     left_block[2]= 2;
 187     left_block[3]= 3;
 188     left_block[4]= 7;
 189     left_block[5]= 10;
 190     left_block[6]= 8;
 191     left_block[7]= 11;
          // MBAFF: neighbours are addressed through macroblock pairs (pair_xy uses the
          // even row), and the frame/field flags of the current MB and of the
          // neighbouring pairs decide which MB of a pair is used.
 192     if(FRAME_MBAFF){
 193         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 194         const int top_pair_xy      = pair_xy     - s->mb_stride;
 195         const int topleft_pair_xy  = top_pair_xy - 1;
 196         const int topright_pair_xy = top_pair_xy + 1;
 197         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 198         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 199         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 200         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 201         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 202         const int bottom = (s->mb_y & 1);
 203         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
              // Each of top / top-left / top-right moves one more row up when the current
              // MB is interlaced and, for the top MB of a pair, the respective neighbouring
              // pair is interlaced as well.
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 207                 ) {
 208             top_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 213                 ) {
 214             topleft_xy -= s->mb_stride;
 215         }
 216         if (bottom
 217                 ? !curr_mb_frame_flag // bottom macroblock
 218                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 219                 ) {
 220             topright_xy -= s->mb_stride;
 221         }
              // Left pair differs in frame/field type from the current MB: start both
              // left halves at the top MB of the left pair and remap left_block[].
 222         if (left_mb_frame_flag != curr_mb_frame_flag) {
 223             left_xy[1] = left_xy[0] = pair_xy - 1;
 224             if (curr_mb_frame_flag) {
 225                 if (bottom) {
 226                     left_block[0]= 2;
 227                     left_block[1]= 2;
 228                     left_block[2]= 3;
 229                     left_block[3]= 3;
 230                     left_block[4]= 8;
 231                     left_block[5]= 11;
 232                     left_block[6]= 8;
 233                     left_block[7]= 11;
 234                 } else {
 235                     left_block[0]= 0;
 236                     left_block[1]= 0;
 237                     left_block[2]= 1;
 238                     left_block[3]= 1;
 239                     left_block[4]= 7;
 240                     left_block[5]= 10;
 241                     left_block[6]= 7;
 242                     left_block[7]= 10;
 243                 }
 244             } else {
                      // current MB is interlaced: the second left half comes from the
                      // bottom MB of the left pair
 245                 left_xy[1] += s->mb_stride;
 246                 //left_block[0]= 0;
 247                 left_block[1]= 2;
 248                 left_block[2]= 0;
 249                 left_block[3]= 2;
 250                 //left_block[4]= 7;
 251                 left_block[5]= 10;
 252                 left_block[6]= 7;
 253                 left_block[7]= 10;
 254             }
 255         }
 256     }
 257
 258     h->top_mb_xy = top_xy;
 259     h->left_mb_xy[0] = left_xy[0];
 260     h->left_mb_xy[1] = left_xy[1];
 261     if(for_deblock){
              // Deblocking: a neighbour counts as present when its slice_table entry is
              // below 255, whichever slice it belongs to; corner neighbours are not used.
 262         topleft_type = 0;
 263         topright_type = 0;
 264         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 265         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 266         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 267
              // MBAFF, non-intra MB: refill the caches for this macroblock itself from the
              // stored non_zero_count, motion_val and ref_index tables. The 16 bits read at
              // non_zero_count[mb_xy][14] are expanded into one 0/1 flag per cache entry.
 268         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 269             int list;
 270             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 271             for(i=0; i<16; i++)
 272                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 273             for(list=0; list<h->list_count; list++){
 274                 if(USES_LIST(mb_type,list)){
 275                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 276                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 277                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 278                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 279                         dst[0] = src[0];
 280                         dst[1] = src[1];
 281                         dst[2] = src[2];
 282                         dst[3] = src[3];
 283                     }
 284                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 285                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 286                     ref += h->b8_stride;
 287                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 288                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 289                 }else{
 290                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 291                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 292                 }
 293             }
 294         }
 295     }else{
              // Decoding: only neighbours that belong to the current slice (h->slice_num)
              // are used; everything else gets type 0.
 296         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 297         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 298         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 299         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 300         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 301     }
 302
          // Intra MB: start from "all neighbouring samples available" masks and clear
          // bits for every neighbour that is not intra and is either missing (type 0)
          // or excluded by constrained_intra_pred.
          // NOTE(review): the bit layout of the masks is not visible here.
 303     if(IS_INTRA(mb_type)){
 304         h->topleft_samples_available=
 305         h->top_samples_available=
 306         h->left_samples_available= 0xFFFF;
 307         h->topright_samples_available= 0xEEEA;
 308
 309         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 310             h->topleft_samples_available= 0xB3FF;
 311             h->top_samples_available= 0x33FF;
 312             h->topright_samples_available= 0x26EA;
 313         }
 314         for(i=0; i<2; i++){
 315             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 316                 h->topleft_samples_available&= 0xDF5F;
 317                 h->left_samples_available&= 0x5F5F;
 318             }
 319         }
 320
 321         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 322             h->topleft_samples_available&= 0x7FFF;
 323
 324         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 325             h->topright_samples_available&= 0xFBFF;
 326
              // Border of the intra4x4 pred-mode cache: copy the modes of intra4x4
              // neighbours; for other neighbours use -1 when the neighbour is missing or
              // is inter with constrained_intra_pred, and 2 otherwise.
 327         if(IS_INTRA4x4(mb_type)){
 328             if(IS_INTRA4x4(top_type)){
 329                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 330                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 331                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 332                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 333             }else{
 334                 int pred;
 335                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 336                     pred= -1;
 337                 else{
 338                     pred= 2;
 339                 }
 340                 h->intra4x4_pred_mode_cache[4+8*0]=
 341                 h->intra4x4_pred_mode_cache[5+8*0]=
 342                 h->intra4x4_pred_mode_cache[6+8*0]=
 343                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 344             }
 345             for(i=0; i<2; i++){
 346                 if(IS_INTRA4x4(left_type[i])){
 347                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 348                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 349                 }else{
 350                     int pred;
 351                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 352                         pred= -1;
 353                     else{
 354                         pred= 2;
 355                     }
 356                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 357                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 358                 }
 359             }
 360         }
 361     }
 362
 363
 364 /*
 365 0 . T T. T T T T
 366 1 L . .L . . . .
 367 2 L . .L . . . .
 368 3 . T TL . . . .
 369 4 L . .L . . . .
 370 5 L . .. . . . .
 371 */
 372 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
          // Top border of the non-zero-count cache (positions marked T above). A missing
          // top neighbour yields 0 when CABAC is used on a non-intra MB, 64 otherwise.
 373     if(top_type){
 374         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 375         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 376         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 377         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 378
 379         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 380         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 381
 382         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 383         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 384
 385     }else{
 386         h->non_zero_count_cache[4+8*0]=
 387         h->non_zero_count_cache[5+8*0]=
 388         h->non_zero_count_cache[6+8*0]=
 389         h->non_zero_count_cache[7+8*0]=
 390
 391         h->non_zero_count_cache[1+8*0]=
 392         h->non_zero_count_cache[2+8*0]=
 393
 394         h->non_zero_count_cache[1+8*3]=
 395         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 396
 397     }
 398
          // Left border of the non-zero-count cache (positions marked L above), one
          // pass per left half; same fallback value as for the top border.
 399     for (i=0; i<2; i++) {
 400         if(left_type[i]){
 401             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 402             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 403             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 404             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 405         }else{
 406             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 407             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 408             h->non_zero_count_cache[0+8*1 +   8*i]=
 409             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 410         }
 411     }
 412
          // CABAC only: coded-block-pattern values of the top and left neighbours. A
          // missing neighbour gives 0x1C0 for an intra MB and 0 otherwise.
 413     if( h->pps.cabac ) {
 414         // top_cbp
 415         if(top_type) {
 416             h->top_cbp = h->cbp_table[top_xy];
 417         } else if(IS_INTRA(mb_type)) {
 418             h->top_cbp = 0x1C0;
 419         } else {
 420             h->top_cbp = 0;
 421         }
 422         // left_cbp
 423         if (left_type[0]) {
 424             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 425         } else if(IS_INTRA(mb_type)) {
 426             h->left_cbp = 0x1C0;
 427         } else {
 428             h->left_cbp = 0;
 429         }
              // bits 1 and 3 of left_cbp are taken from the cbp bit selected through
              // left_block[0] / left_block[2] of the respective left half
 430         if (left_type[0]) {
 431             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 432         }
 433         if (left_type[1]) {
 434             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 435         }
 436     }
 437
 438 #if 1
          // Motion data borders are only loaded for inter or direct macroblocks.
 439     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 440         int list;
 441         for(list=0; list<h->list_count; list++){
                  // List not used by this MB (and MB not direct, deblocking filter off):
                  // nothing to load for it.
 442             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 443                 /*if(!h->mv_cache_clean[list]){
 444                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 445                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 446                     h->mv_cache_clean[list]= 1;
 447                 }*/
 448                 continue;
 449             }
 450             h->mv_cache_clean[list]= 0;
 451
                  // Row above the MB: motion vectors and reference indices from the last
                  // row of the top neighbour, or zero mv plus LIST_NOT_USED /
                  // PART_NOT_AVAILABLE when it does not use this list / is missing.
 452             if(USES_LIST(top_type, list)){
 453                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 454                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 455                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 456                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 457                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 458                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 459                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 460                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 461                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 462                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 463             }else{
 464                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 465                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 466                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 467                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 468                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 469             }
 470
                  // Column left of the MB, two cache rows per left half, taken from the
                  // last column of the left neighbour at the rows given by left_block[].
 471             for(i=0; i<2; i++){
 472                 int cache_idx = scan8[0] - 1 + i*2*8;
 473                 if(USES_LIST(left_type[i], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 475                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 476                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 477                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 478                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 479                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 480                 }else{
 481                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 482                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 483                     h->ref_cache[list][cache_idx  ]=
 484                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 485                 }
 486             }
 487
                  // Outside MBAFF the rest of this iteration is skipped for deblocking and
                  // for direct MBs when direct_spatial_mv_pred is not set.
 488             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 489                 continue;
 490
                  // Corner neighbours: top-left and top-right.
 491             if(USES_LIST(topleft_type, list)){
 492                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 493                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 494                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 495                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 496             }else{
 497                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 498                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 499             }
 500
 501             if(USES_LIST(topright_type, list)){
 502                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 503                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 504                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 505                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 506             }else{
 507                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 508                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 509             }
 510
                  // Skip and direct MBs need nothing further, except in MBAFF.
 511             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 512                 continue;
 513
                  // Fixed cache positions that are always marked unavailable with a zero mv.
 514             h->ref_cache[list][scan8[5 ]+1] =
 515             h->ref_cache[list][scan8[7 ]+1] =
 516             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 517             h->ref_cache[list][scan8[4 ]] =
 518             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 519             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 520             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 521             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 522             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 523             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 524
 525             if( h->pps.cabac ) {
 526                 /* XXX yuck, load mvd */
 527                 if(USES_LIST(top_type, list)){
 528                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 529                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 530                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 531                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 532                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 533                 }else{
 534                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 535                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 536                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 537                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 538                 }
 539                 if(USES_LIST(left_type[0], list)){
 540                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 541                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 542                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 543                 }else{
 544                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 545                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 546                 }
 547                 if(USES_LIST(left_type[1], list)){
 548                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 549                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 550                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 551                 }else{
 552                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 553                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 554                 }
 555                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 556                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 557                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 558                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 559                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 560
                      // B slices: clear this MB's direct flags, then load the top and left
                      // borders of direct_cache (1 for direct neighbours, direct_table
                      // entries for 8x8 neighbours, 0 otherwise).
 561                 if(h->slice_type == B_TYPE){
 562                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 563
 564                     if(IS_DIRECT(top_type)){
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 566                     }else if(IS_8X8(top_type)){
 567                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 568                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 569                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 570                     }else{
 571                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 572                     }
 573
 574                     if(IS_DIRECT(left_type[0]))
 575                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 576                     else if(IS_8X8(left_type[0]))
 577                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 578                     else
 579                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 580
 581                     if(IS_DIRECT(left_type[1]))
 582                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 583                     else if(IS_8X8(left_type[1]))
 584                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 585                     else
 586                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 587                 }
 588             }
 589
                  // MBAFF: rescale the loaded border entries whose neighbour has a different
                  // interlaced flag than the current MB (MB_FIELD). For a field MB, entries
                  // from non-interlaced neighbours get the reference index doubled and
                  // component [1] of mv/mvd halved; for a frame MB, entries from interlaced
                  // neighbours get the inverse. Entries with a negative reference are left alone.
 590             if(FRAME_MBAFF){
 591 #define MAP_MVS\
 592                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 593                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 594                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 595                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 596                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 597                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 598                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 599                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 600                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 601                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 602                 if(MB_FIELD){
 603 #define MAP_F2F(idx, mb_type)\
 604                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 605                         h->ref_cache[list][idx] <<= 1;\
 606                         h->mv_cache[list][idx][1] /= 2;\
 607                         h->mvd_cache[list][idx][1] /= 2;\
 608                     }
 609                     MAP_MVS
 610 #undef MAP_F2F
 611                 }else{
 612 #define MAP_F2F(idx, mb_type)\
 613                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 614                         h->ref_cache[list][idx] >>= 1;\
 615                         h->mv_cache[list][idx][1] <<= 1;\
 616                         h->mvd_cache[list][idx][1] <<= 1;\
 617                     }
 618                     MAP_MVS
 619 #undef MAP_F2F
 620                 }
 621             }
 622         }
 623     }
 624 #endif
 625
          // Number of 8x8-DCT macroblocks among the top and the first left neighbour (0..2).
 626     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 627 }
 628
 629 static inline void write_back_intra_pred_mode(H264Context *h){
 630     MpegEncContext * const s = &h->s;
 631     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 632
 633     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 634     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 635     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 636     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 637     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 638     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 639     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 640 }
 641
 642 /**
 643  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 644  */
 645 static inline int check_intra4x4_pred_mode(H264Context *h){
 646     MpegEncContext * const s = &h->s;
 647     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 648     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 649     int i;
 650
 651     if(!(h->top_samples_available&0x8000)){
 652         for(i=0; i<4; i++){
 653             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 654             if(status<0){
 655                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 656                 return -1;
 657             } else if(status){
 658                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 659             }
 660         }
 661     }
 662
 663     if(!(h->left_samples_available&0x8000)){
 664         for(i=0; i<4; i++){
 665             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 666             if(status<0){
 667                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 668                 return -1;
 669             } else if(status){
 670                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 671             }
 672         }
 673     }
 674
 675     return 0;
 676 } //FIXME cleanup like next
 677
 678 /**
 679  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 680  */
 681 static inline int check_intra_pred_mode(H264Context *h, int mode){
 682     MpegEncContext * const s = &h->s;
 683     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 684     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 685
 686     if(mode > 6U) {
 687         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 688         return -1;
 689     }
 690
 691     if(!(h->top_samples_available&0x8000)){
 692         mode= top[ mode ];
 693         if(mode<0){
 694             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 695             return -1;
 696         }
 697     }
 698
 699     if(!(h->left_samples_available&0x8000)){
 700         mode= left[ mode ];
 701         if(mode<0){
 702             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 703             return -1;
 704         }
 705     }
 706
 707     return mode;
 708 }
 709
 710 /**
 711  * gets the predicted intra4x4 prediction mode.
 712  */
 713 static inline int pred_intra_mode(H264Context *h, int n){
 714     const int index8= scan8[n];
 715     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 716     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 717     const int min= FFMIN(left, top);
 718
 719     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 720
 721     if(min<0) return DC_PRED;
 722     else      return min;
 723 }
 724
 725 static inline void write_back_non_zero_count(H264Context *h){
 726     MpegEncContext * const s = &h->s;
 727     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 728
 729     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 730     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 731     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 732     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 733     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 734     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 735     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 736
 737     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 738     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 739     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 740
 741     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 742     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 743     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 744
 745     if(FRAME_MBAFF){
 746         // store all luma nnzs, for deblocking
 747         int v = 0, i;
 748         for(i=0; i<16; i++)
 749             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 750         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 751     }
 752 }
 753
 754 /**
 755  * gets the predicted number of non zero coefficients.
 756  * @param n block index
 757  */
 758 static inline int pred_non_zero_count(H264Context *h, int n){
 759     const int index8= scan8[n];
 760     const int left= h->non_zero_count_cache[index8 - 1];
 761     const int top = h->non_zero_count_cache[index8 - 8];
 762     int i= left + top;
 763
 764     if(i<64) i= (i+1)>>1;
 765
 766     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 767
 768     return i&31;
 769 }
 770
 /**
  * Fetches the motion vector and reference index of the diagonal neighbour
  * of a partition.
  * In the regular case the top-right cache entry (i - 8 + part_width) is
  * used unless its reference is PART_NOT_AVAILABLE, in which case the
  * top-left entry (i - 8 - 1) is used instead.
  * With FRAME_MBAFF, some neighbour configurations are instead read directly
  * from the current picture's motion_val / ref_index tables by SET_DIAG_MV,
  * which rescales the vertical MV component and the reference index, stores
  * the MV in the scratch cache entry scan8[0]-2 and returns from this
  * function (possibly with LIST_NOT_USED).
  * @param C set to point at the selected motion vector
  * @param i cache position of the partition (callers pass scan8[n])
  * @param part_width partition width in cache entries
  * @return the reference index belonging to the selected motion vector
  */
 771 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 772     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 773     MpegEncContext *s = &h->s;
 774
 775     /* there is no consistent mapping of mvs to neighboring locations that will
 776      * make mbaff happy, so we can't move all this logic to fill_caches */
 777     if(FRAME_MBAFF){
 778         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 779         const int16_t *mv;
         /* zero the scratch entry and point *C at it, so that a return from
          * inside SET_DIAG_MV leaves the caller with the MV written there */
 780         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 781         *C = h->mv_cache[list][scan8[0]-2];
 782
 783         if(!MB_FIELD
 784            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 785             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 786             if(IS_INTERLACED(mb_types[topright_xy])){
 787 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 788                 const int x4 = X4, y4 = Y4;\
 789                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 790                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 791                     return LIST_NOT_USED;\
 792                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 793                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 794                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 795                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 796
 797                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 798             }
 799         }
         /* top-right unavailable: for blocks in cache column 4, the left
          * macroblock is consulted when its field/frame type differs from
          * the current macroblock's */
 800         if(topright_ref == PART_NOT_AVAILABLE
 801            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 802            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 803             if(!MB_FIELD
 804                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 805                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 806             }
 807             if(MB_FIELD
 808                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 809                && i >= scan8[0]+8){
 810                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 811                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 812             }
 813         }
 814 #undef SET_DIAG_MV
 815     }
 816
     /* regular case: top-right cache entry if available, otherwise top-left */
 817     if(topright_ref != PART_NOT_AVAILABLE){
 818         *C= h->mv_cache[list][ i - 8 + part_width ];
 819         return topright_ref;
 820     }else{
 821         tprintf(s->avctx, "topright MV not available\n");
 822
 823         *C= h->mv_cache[list][ i - 8 - 1 ];
 824         return h->ref_cache[list][ i - 8 - 1 ];
 825     }
 826 }
 827
 828 /**
 829  * gets the predicted MV.
 830  * @param n the block index
 831  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 832  * @param mx the x component of the predicted motion vector
 833  * @param my the y component of the predicted motion vector
 834  */
 835 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 836     const int index8= scan8[n];
 837     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 838     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 839     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 840     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 841     const int16_t * C;
 842     int diagonal_ref, match_count;
 843
 844     assert(part_width==1 || part_width==2 || part_width==4);
 845
 846 /* mv_cache
 847   B . . A T T T T
 848   U . . L . . , .
 849   U . . L . . . .
 850   U . . L . . , .
 851   . . . L . . . .
 852 */
 853
 854     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 855     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 856     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 857     if(match_count > 1){ //most common
 858         *mx= mid_pred(A[0], B[0], C[0]);
 859         *my= mid_pred(A[1], B[1], C[1]);
 860     }else if(match_count==1){
 861         if(left_ref==ref){
 862             *mx= A[0];
 863             *my= A[1];
 864         }else if(top_ref==ref){
 865             *mx= B[0];
 866             *my= B[1];
 867         }else{
 868             *mx= C[0];
 869             *my= C[1];
 870         }
 871     }else{
 872         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 873             *mx= A[0];
 874             *my= A[1];
 875         }else{
 876             *mx= mid_pred(A[0], B[0], C[0]);
 877             *my= mid_pred(A[1], B[1], C[1]);
 878         }
 879     }
 880
 881     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 882 }
 883
 884 /**
 885  * gets the directionally predicted 16x8 MV.
 886  * @param n the block index
 887  * @param mx the x component of the predicted motion vector
 888  * @param my the y component of the predicted motion vector
 889  */
 890 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 891     if(n==0){
 892         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 893         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 894
 895         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 896
 897         if(top_ref == ref){
 898             *mx= B[0];
 899             *my= B[1];
 900             return;
 901         }
 902     }else{
 903         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 904         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 905
 906         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 907
 908         if(left_ref == ref){
 909             *mx= A[0];
 910             *my= A[1];
 911             return;
 912         }
 913     }
 914
 915     //RARE
 916     pred_motion(h, n, 4, list, ref, mx, my);
 917 }
 918
 919 /**
 920  * gets the directionally predicted 8x16 MV.
 921  * @param n the block index
 922  * @param mx the x component of the predicted motion vector
 923  * @param my the y component of the predicted motion vector
 924  */
 925 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 926     if(n==0){
 927         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 928         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 929
 930         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 931
 932         if(left_ref == ref){
 933             *mx= A[0];
 934             *my= A[1];
 935             return;
 936         }
 937     }else{
 938         const int16_t * C;
 939         int diagonal_ref;
 940
 941         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 942
 943         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 944
 945         if(diagonal_ref == ref){
 946             *mx= C[0];
 947             *my= C[1];
 948             return;
 949         }
 950     }
 951
 952     //RARE
 953     pred_motion(h, n, 2, list, ref, mx, my);
 954 }
 955
 956 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 957     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 958     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 959
 960     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 961
 962     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 963        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 964        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 965
 966         *mx = *my = 0;
 967         return;
 968     }
 969
 970     pred_motion(h, 0, 4, 0, 0, mx, my);
 971
 972     return;
 973 }
 974
 975 static inline void direct_dist_scale_factor(H264Context * const h){
 976     const int poc = h->s.current_picture_ptr->poc;
 977     const int poc1 = h->ref_list[1][0].poc;
 978     int i;
 979     for(i=0; i<h->ref_count[0]; i++){
 980         int poc0 = h->ref_list[0][i].poc;
 981         int td = av_clip(poc1 - poc0, -128, 127);
 982         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 983             h->dist_scale_factor[i] = 256;
 984         }else{
 985             int tb = av_clip(poc - poc0, -128, 127);
 986             int tx = (16384 + (FFABS(td) >> 1)) / td;
 987             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 988         }
 989     }
 990     if(FRAME_MBAFF){
 991         for(i=0; i<h->ref_count[0]; i++){
 992             h->dist_scale_factor_field[2*i] =
 993             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 994         }
 995     }
 996 }
 997 static inline void direct_ref_list_init(H264Context * const h){
 998     MpegEncContext * const s = &h->s;
 999     Picture * const ref1 = &h->ref_list[1][0];
1000     Picture * const cur = s->current_picture_ptr;
1001     int list, i, j;
1002     if(cur->pict_type == I_TYPE)
1003         cur->ref_count[0] = 0;
1004     if(cur->pict_type != B_TYPE)
1005         cur->ref_count[1] = 0;
1006     for(list=0; list<2; list++){
1007         cur->ref_count[list] = h->ref_count[list];
1008         for(j=0; j<h->ref_count[list]; j++)
1009             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1010     }
1011     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1012         return;
1013     for(list=0; list<2; list++){
1014         for(i=0; i<ref1->ref_count[list]; i++){
1015             const int poc = ref1->ref_poc[list][i];
1016             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1017             for(j=0; j<h->ref_count[list]; j++)
1018                 if(h->ref_list[list][j].poc == poc){
1019                     h->map_col_to_list0[list][i] = j;
1020                     break;
1021                 }
1022         }
1023     }
1024     if(FRAME_MBAFF){
1025         for(list=0; list<2; list++){
1026             for(i=0; i<ref1->ref_count[list]; i++){
1027                 j = h->map_col_to_list0[list][i];
1028                 h->map_col_to_list0_field[list][2*i] = 2*j;
1029                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1030             }
1031         }
1032     }
1033 }
1034
/**
 * Derives the motion vectors and reference indices of a direct predicted
 * macroblock and stores them in h->mv_cache / h->ref_cache for both lists.
 * Spatial prediction is used when h->direct_spatial_mv_pred is set,
 * otherwise temporal prediction from the colocated macroblock of
 * ref_list[1][0].
 * If the macroblock is 8x8 on entry, only the sub-blocks whose
 * h->sub_mb_type is direct are processed; h->sub_mb_type[] of every
 * processed sub-block is overwritten.
 * @param mb_type in/out: macroblock type, updated with the partitioning and
 *                list usage that was selected
 */
1035 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1036     MpegEncContext * const s = &h->s;
1037     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1038     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1039     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1040     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1041     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1042     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1043     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1044     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1045     const int is_b8x8 = IS_8X8(*mb_type);
1046     unsigned int sub_mb_type;
1047     int i8, i4;
1048
     /* select the partitioning of this macroblock from the type of the
      * colocated macroblock */
1049 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1050     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1051         /* FIXME save sub mb types from previous frames (or derive from MVs)
1052          * so we know exactly what block size to use */
1053         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1054         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1055     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1056         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1057         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1058     }else{
1059         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1060         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1061     }
1062     if(!is_b8x8)
1063         *mb_type |= MB_TYPE_DIRECT2;
1064     if(MB_FIELD)
1065         *mb_type |= MB_TYPE_INTERLACED;
1066
1067     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1068
1069     if(h->direct_spatial_mv_pred){
1070         int ref[2];
1071         int mv[2][2];
1072         int list;
1073
1074         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1075
1076         /* ref = min(neighbors) */
1077         for(list=0; list<2; list++){
1078             int refa = h->ref_cache[list][scan8[0] - 1];
1079             int refb = h->ref_cache[list][scan8[0] - 8];
1080             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
             /* -2 is presumably PART_NOT_AVAILABLE (TODO confirm): fall
              * back from the top-right to the top-left neighbour */
1081             if(refc == -2)
1082                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1083             ref[list] = refa;
1084             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1085                 ref[list] = refb;
1086             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1087                 ref[list] = refc;
1088             if(ref[list] < 0)
1089                 ref[list] = -1;
1090         }
1091
         /* no neighbour uses either list: reference 0 with zero MVs in both */
1092         if(ref[0] < 0 && ref[1] < 0){
1093             ref[0] = ref[1] = 0;
1094             mv[0][0] = mv[0][1] =
1095             mv[1][0] = mv[1][1] = 0;
1096         }else{
1097             for(list=0; list<2; list++){
1098                 if(ref[list] >= 0)
1099                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1100                 else
1101                     mv[list][0] = mv[list][1] = 0;
1102             }
1103         }
1104
         /* drop the list whose reference is negative from the types */
1105         if(ref[1] < 0){
1106             *mb_type &= ~MB_TYPE_P0L1;
1107             sub_mb_type &= ~MB_TYPE_P0L1;
1108         }else if(ref[0] < 0){
1109             *mb_type &= ~MB_TYPE_P0L0;
1110             sub_mb_type &= ~MB_TYPE_P0L0;
1111         }
1112
1113         if(IS_16X16(*mb_type)){
1114             int a=0, b=0;
1115
1116             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1117             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
             /* when the colocated block has reference 0 and an MV within
              * +-1, the predicted MV is kept only for lists with ref > 0;
              * a and b stay zero otherwise */
1118             if(!IS_INTRA(mb_type_col)
1119                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1120                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1121                        && (h->x264_build>33 || !h->x264_build)))){
1122                 if(ref[0] > 0)
1123                     a= pack16to32(mv[0][0],mv[0][1]);
1124                 if(ref[1] > 0)
1125                     b= pack16to32(mv[1][0],mv[1][1]);
1126             }else{
1127                 a= pack16to32(mv[0][0],mv[0][1]);
1128                 b= pack16to32(mv[1][0],mv[1][1]);
1129             }
1130             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1131             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1132         }else{
1133             for(i8=0; i8<4; i8++){
1134                 const int x8 = i8&1;
1135                 const int y8 = i8>>1;
1136
1137                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1138                     continue;
1139                 h->sub_mb_type[i8] = sub_mb_type;
1140
1141                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1142                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1143                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1144                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1145
1146                 /* col_zero_flag */
1147                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1148                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1149                                                   && (h->x264_build>33 || !h->x264_build)))){
1150                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1151                     if(IS_SUB_8X8(sub_mb_type)){
1152                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1153                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1154                             if(ref[0] == 0)
1155                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1156                             if(ref[1] == 0)
1157                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1158                         }
1159                     }else
1160                     for(i4=0; i4<4; i4++){
1161                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1162                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1163                             if(ref[0] == 0)
1164                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1165                             if(ref[1] == 0)
1166                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1167                         }
1168                     }
1169                 }
1170             }
1171         }
1172     }else{ /* direct temporal mv pred */
1173         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1174         const int *dist_scale_factor = h->dist_scale_factor;
1175
         /* MBAFF: interlaced macroblocks use the field variants of the
          * tables; a field/frame mismatch with the colocated macroblock is
          * handled completely inside this branch (it returns) */
1176         if(FRAME_MBAFF){
1177             if(IS_INTERLACED(*mb_type)){
1178                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1179                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1180                 dist_scale_factor = h->dist_scale_factor_field;
1181             }
1182             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1183                 /* FIXME assumes direct_8x8_inference == 1 */
1184                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1185                 int mb_types_col[2];
1186                 int y_shift;
1187
1188                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1189                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1190                          | (*mb_type & MB_TYPE_INTERLACED);
1191                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1192
1193                 if(IS_INTERLACED(*mb_type)){
1194                     /* frame to field scaling */
1195                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1196                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                     /* for the bottom macroblock of the pair, move the
                      * colocated pointers back to the top macroblock */
1197                     if(s->mb_y&1){
1198                         l1ref0 -= 2*h->b8_stride;
1199                         l1ref1 -= 2*h->b8_stride;
1200                         l1mv0 -= 4*h->b_stride;
1201                         l1mv1 -= 4*h->b_stride;
1202                     }
1203                     y_shift = 0;
1204
1205                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1206                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1207                        && !is_b8x8)
1208                         *mb_type |= MB_TYPE_16x8;
1209                     else
1210                         *mb_type |= MB_TYPE_8x8;
1211                 }else{
1212                     /* field to frame scaling */
1213                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1214                      * but in MBAFF, top and bottom POC are equal */
1215                     int dy = (s->mb_y&1) ? 1 : 2;
1216                     mb_types_col[0] =
1217                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1218                     l1ref0 += dy*h->b8_stride;
1219                     l1ref1 += dy*h->b8_stride;
1220                     l1mv0 += 2*dy*h->b_stride;
1221                     l1mv1 += 2*dy*h->b_stride;
1222                     y_shift = 2;
1223
1224                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1225                        && !is_b8x8)
1226                         *mb_type |= MB_TYPE_16x16;
1227                     else
1228                         *mb_type |= MB_TYPE_8x8;
1229                 }
1230
1231                 for(i8=0; i8<4; i8++){
1232                     const int x8 = i8&1;
1233                     const int y8 = i8>>1;
1234                     int ref0, scale;
1235                     const int16_t (*l1mv)[2]= l1mv0;
1236
1237                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1238                         continue;
1239                     h->sub_mb_type[i8] = sub_mb_type;
1240
1241                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1242                     if(IS_INTRA(mb_types_col[y8])){
1243                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1244                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1245                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1246                         continue;
1247                     }
1248
                     /* take the colocated list 0 reference if there is one,
                      * otherwise the list 1 reference (and its MVs) */
1249                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1250                     if(ref0 >= 0)
1251                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1252                     else{
1253                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1254                         l1mv= l1mv1;
1255                     }
1256                     scale = dist_scale_factor[ref0];
1257                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1258
1259                     {
                         /* the vertical component is first rescaled by
                          * (1<<y_shift)/2; list 0 MV = scaled colocated MV,
                          * list 1 MV = list 0 MV - colocated MV */
1260                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1261                         int my_col = (mv_col[1]<<y_shift)/2;
1262                         int mx = (scale * mv_col[0] + 128) >> 8;
1263                         int my = (scale * my_col + 128) >> 8;
1264                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1265                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1266                     }
1267                 }
1268                 return;
1269             }
1270         }
1271
1272         /* one-to-one mv scaling */
1273
1274         if(IS_16X16(*mb_type)){
1275             int ref, mv0, mv1;
1276
1277             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1278             if(IS_INTRA(mb_type_col)){
1279                 ref=mv0=mv1=0;
1280             }else{
1281                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1282                                                 : map_col_to_list0[1][l1ref1[0]];
1283                 const int scale = dist_scale_factor[ref0];
1284                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1285                 int mv_l0[2];
                 /* list 0 MV = scaled colocated MV,
                  * list 1 MV = list 0 MV - colocated MV */
1286                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1287                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1288                 ref= ref0;
1289                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1290                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1291             }
1292             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1293             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1294             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1295         }else{
1296             for(i8=0; i8<4; i8++){
1297                 const int x8 = i8&1;
1298                 const int y8 = i8>>1;
1299                 int ref0, scale;
1300                 const int16_t (*l1mv)[2]= l1mv0;
1301
1302                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1303                     continue;
1304                 h->sub_mb_type[i8] = sub_mb_type;
1305                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1306                 if(IS_INTRA(mb_type_col)){
1307                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1308                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1309                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1310                     continue;
1311                 }
1312
1313                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1314                 if(ref0 >= 0)
1315                     ref0 = map_col_to_list0[0][ref0];
1316                 else{
1317                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1318                     l1mv= l1mv1;
1319                 }
1320                 scale = dist_scale_factor[ref0];
1321
1322                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1323                 if(IS_SUB_8X8(sub_mb_type)){
1324                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1325                     int mx = (scale * mv_col[0] + 128) >> 8;
1326                     int my = (scale * mv_col[1] + 128) >> 8;
1327                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1328                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1329                 }else
1330                 for(i4=0; i4<4; i4++){
1331                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1332                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1333                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1334                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1335                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1336                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1337                 }
1338             }
1339         }
1340     }
1341 }
1342
1343 static inline void write_back_motion(H264Context *h, int mb_type){
1344     MpegEncContext * const s = &h->s;
1345     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1346     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1347     int list;
1348
1349     if(!USES_LIST(mb_type, 0))
1350         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1351
1352     for(list=0; list<h->list_count; list++){
1353         int y;
1354         if(!USES_LIST(mb_type, list))
1355             continue;
1356
1357         for(y=0; y<4; y++){
1358             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1359             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1360         }
1361         if( h->pps.cabac ) {
1362             if(IS_SKIP(mb_type))
1363                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1364             else
1365             for(y=0; y<4; y++){
1366                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1367                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1368             }
1369         }
1370
1371         {
1372             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1373             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1374             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1375             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1376             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1377         }
1378     }
1379
1380     if(h->slice_type == B_TYPE && h->pps.cabac){
1381         if(IS_8X8(mb_type)){
1382             uint8_t *direct_table = &h->direct_table[b8_xy];
1383             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1384             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1385             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1386         }
1387     }
1388 }
1389
1390 /**
1391  * Decodes a network abstraction layer unit.
1392  * @param consumed is the number of bytes used as input
1393  * @param length is the length of the array
1394  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1395  * @returns decoded bytes, might be src+1 if no escapes
1396  */
1397 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1398     int i, si, di;
1399     uint8_t *dst;
1400     int bufidx;
1401
1402 //    src[0]&0x80;                //forbidden bit
1403     h->nal_ref_idc= src[0]>>5;
1404     h->nal_unit_type= src[0]&0x1F;
1405
1406     src++; length--;
1407 #if 0
1408     for(i=0; i<length; i++)
1409         printf("%2X ", src[i]);
1410 #endif
1411     for(i=0; i+1<length; i+=2){
1412         if(src[i]) continue;
1413         if(i>0 && src[i-1]==0) i--;
1414         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1415             if(src[i+2]!=3){
1416                 /* startcode, so we must be past the end */
1417                 length=i;
1418             }
1419             break;
1420         }
1421     }
1422
1423     if(i>=length-1){ //no escaped 0
1424         *dst_length= length;
1425         *consumed= length+1; //+1 for the header
1426         return src;
1427     }
1428
1429     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1430     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1431     dst= h->rbsp_buffer[bufidx];
1432
1433     if (dst == NULL){
1434         return NULL;
1435     }
1436
1437 //printf("decoding esc\n");
1438     si=di=0;
1439     while(si<length){
1440         //remove escapes (very rare 1:2^22)
1441         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1442             if(src[si+2]==3){ //escape
1443                 dst[di++]= 0;
1444                 dst[di++]= 0;
1445                 si+=3;
1446                 continue;
1447             }else //next start code
1448                 break;
1449         }
1450
1451         dst[di++]= src[si++];
1452     }
1453
1454     *dst_length= di;
1455     *consumed= si + 1;//+1 for the header
1456 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1457     return dst;
1458 }
1459
1460 /**
1461  * identifies the exact end of the bitstream
1462  * @return the length of the trailing, or 0 if damaged
1463  */
1464 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1465     int v= *src;
1466     int r;
1467
1468     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1469
1470     for(r=1; r<9; r++){
1471         if(v&1) return r;
1472         v>>=1;
1473     }
1474     return 0;
1475 }
1476
1477 /**
1478  * idct tranforms the 16 dc values and dequantize them.
1479  * @param qp quantization parameter
1480  */
1481 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1482 #define stride 16
1483     int i;
1484     int temp[16]; //FIXME check if this is a good idea
1485     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1486     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1487
1488 //memset(block, 64, 2*256);
1489 //return;
1490     for(i=0; i<4; i++){
1491         const int offset= y_offset[i];
1492         const int z0= block[offset+stride*0] + block[offset+stride*4];
1493         const int z1= block[offset+stride*0] - block[offset+stride*4];
1494         const int z2= block[offset+stride*1] - block[offset+stride*5];
1495         const int z3= block[offset+stride*1] + block[offset+stride*5];
1496
1497         temp[4*i+0]= z0+z3;
1498         temp[4*i+1]= z1+z2;
1499         temp[4*i+2]= z1-z2;
1500         temp[4*i+3]= z0-z3;
1501     }
1502
1503     for(i=0; i<4; i++){
1504         const int offset= x_offset[i];
1505         const int z0= temp[4*0+i] + temp[4*2+i];
1506         const int z1= temp[4*0+i] - temp[4*2+i];
1507         const int z2= temp[4*1+i] - temp[4*3+i];
1508         const int z3= temp[4*1+i] + temp[4*3+i];
1509
1510         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1511         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1512         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1513         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1514     }
1515 }
1516
#if 0
/**
 * Forward transforms the 16 luma DC values in place.
 * Disabled code; it relies on the "stride" macro being defined.
 * @param qp quantization parameter ??? FIXME (commented out in the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int col_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int row_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies on four values per row_offset, into temp[] */
    for(i=0; i<4; i++){
        const DCTELEM * const in= block + row_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        temp[4*i+0]= sum04  + sum15;
        temp[4*i+1]= diff04 + diff15;
        temp[4*i+2]= diff04 - diff15;
        temp[4*i+3]= sum04  - sum15;
    }

    /* second pass across temp[], results are halved */
    for(i=0; i<4; i++){
        DCTELEM * const out= block + col_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1556
1557 #undef xStride
1558 #undef stride
1559
1560 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1561     const int stride= 16*2;
1562     const int xStride= 16;
1563     int a,b,c,d,e;
1564
1565     a= block[stride*0 + xStride*0];
1566     b= block[stride*0 + xStride*1];
1567     c= block[stride*1 + xStride*0];
1568     d= block[stride*1 + xStride*1];
1569
1570     e= a-b;
1571     a= a+b;
1572     b= c-d;
1573     c= c+d;
1574
1575     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1576     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1577     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1578     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1579 }
1580
#if 0
/**
 * Forward transforms the 2x2 chroma DC values in place (disabled code).
 * The four values sit 16 coefficients apart horizontally and 32 vertically.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row= 16*2;
    const int col= 16;
    const int top_left    = block[0];
    const int top_right   = block[col];
    const int bottom_left = block[row];
    const int bottom_right= block[row + col];
    const int top_sum    = top_left    + top_right;
    const int top_diff   = top_left    - top_right;
    const int bottom_sum = bottom_left + bottom_right;
    const int bottom_diff= bottom_left - bottom_right;

    block[0        ]= top_sum  + bottom_sum;
    block[col      ]= top_diff + bottom_diff;
    block[row      ]= top_sum  - bottom_sum;
    block[row + col]= top_diff - bottom_diff;
}
#endif
1603
1604 /**
1605  * gets the chroma qp.
1606  */
1607 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1608     return h->pps.chroma_qp_table[t][qscale & 0xff];
1609 }
1610
1611 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1612 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1613 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1614     int i;
1615     const int * const quant_table= quant_coeff[qscale];
1616     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1617     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1618     const unsigned int threshold2= (threshold1<<1);
1619     int last_non_zero;
1620
1621     if(separate_dc){
1622         if(qscale<=18){
1623             //avoid overflows
1624             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1625             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1626             const unsigned int dc_threshold2= (dc_threshold1<<1);
1627
1628             int level= block[0]*quant_coeff[qscale+18][0];
1629             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1630                 if(level>0){
1631                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1632                     block[0]= level;
1633                 }else{
1634                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1635                     block[0]= -level;
1636                 }
1637 //                last_non_zero = i;
1638             }else{
1639                 block[0]=0;
1640             }
1641         }else{
1642             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1643             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1644             const unsigned int dc_threshold2= (dc_threshold1<<1);
1645
1646             int level= block[0]*quant_table[0];
1647             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1648                 if(level>0){
1649                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1650                     block[0]= level;
1651                 }else{
1652                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1653                     block[0]= -level;
1654                 }
1655 //                last_non_zero = i;
1656             }else{
1657                 block[0]=0;
1658             }
1659         }
1660         last_non_zero= 0;
1661         i=1;
1662     }else{
1663         last_non_zero= -1;
1664         i=0;
1665     }
1666
1667     for(; i<16; i++){
1668         const int j= scantable[i];
1669         int level= block[j]*quant_table[j];
1670
1671 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1672 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1673         if(((unsigned)(level+threshold1))>threshold2){
1674             if(level>0){
1675                 level= (bias + level)>>QUANT_SHIFT;
1676                 block[j]= level;
1677             }else{
1678                 level= (bias - level)>>QUANT_SHIFT;
1679                 block[j]= -level;
1680             }
1681             last_non_zero = i;
1682         }else{
1683             block[j]=0;
1684         }
1685     }
1686
1687     return last_non_zero;
1688 }
1689
1690 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1691                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1692                            int src_x_offset, int src_y_offset,
1693                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1694     MpegEncContext * const s = &h->s;
1695     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1696     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1697     const int luma_xy= (mx&3) + ((my&3)<<2);
1698     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1699     uint8_t * src_cb, * src_cr;
1700     int extra_width= h->emu_edge_width;
1701     int extra_height= h->emu_edge_height;
1702     int emu=0;
1703     const int full_mx= mx>>2;
1704     const int full_my= my>>2;
1705     const int pic_width  = 16*s->mb_width;
1706     const int pic_height = 16*s->mb_height >> (MB_MBAFF || FIELD_PICTURE);
1707
1708     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1709         return;
1710
1711     if(mx&7) extra_width -= 3;
1712     if(my&7) extra_height -= 3;
1713
1714     if(   full_mx < 0-extra_width
1715        || full_my < 0-extra_height
1716        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1717        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1719             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1720         emu=1;
1721     }
1722
1723     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1724     if(!square){
1725         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1726     }
1727
1728     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1729
1730     if(MB_MBAFF || FIELD_PICTURE){
1731         // chroma offset when predicting from a field of opposite parity
1732         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
1733         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1734     }
1735     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1736     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1737
1738     if(emu){
1739         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1740             src_cb= s->edge_emu_buffer;
1741     }
1742     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1743
1744     if(emu){
1745         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1746             src_cr= s->edge_emu_buffer;
1747     }
1748     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1749 }
1750
1751 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            int list0, int list1){
1757     MpegEncContext * const s = &h->s;
1758     qpel_mc_func *qpix_op=  qpix_put;
1759     h264_chroma_mc_func chroma_op= chroma_put;
1760
1761     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1762     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1763     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1764     x_offset += 8*s->mb_x;
1765     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1766
1767     if(list0){
1768         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1769         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1770                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1771                            qpix_op, chroma_op);
1772
1773         qpix_op=  qpix_avg;
1774         chroma_op= chroma_avg;
1775     }
1776
1777     if(list1){
1778         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1779         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1780                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1781                            qpix_op, chroma_op);
1782     }
1783 }
1784
1785 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1786                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787                            int x_offset, int y_offset,
1788                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1789                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1790                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1791                            int list0, int list1){
1792     MpegEncContext * const s = &h->s;
1793
1794     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1795     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1796     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1797     x_offset += 8*s->mb_x;
1798     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1799
1800     if(list0 && list1){
1801         /* don't optimize for luma-only case, since B-frames usually
1802          * use implicit weights => chroma too. */
1803         uint8_t *tmp_cb = s->obmc_scratchpad;
1804         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1805         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1806         int refn0 = h->ref_cache[0][ scan8[n] ];
1807         int refn1 = h->ref_cache[1][ scan8[n] ];
1808
1809         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1810                     dest_y, dest_cb, dest_cr,
1811                     x_offset, y_offset, qpix_put, chroma_put);
1812         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1813                     tmp_y, tmp_cb, tmp_cr,
1814                     x_offset, y_offset, qpix_put, chroma_put);
1815
1816         if(h->use_weight == 2){
1817             int weight0 = h->implicit_weight[refn0][refn1];
1818             int weight1 = 64 - weight0;
1819             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1820             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1821             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1822         }else{
1823             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1824                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1825                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1826             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1827                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1828                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1829             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1830                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1831                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1832         }
1833     }else{
1834         int list = list1 ? 1 : 0;
1835         int refn = h->ref_cache[list][ scan8[n] ];
1836         Picture *ref= &h->ref_list[list][refn];
1837         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1838                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839                     qpix_put, chroma_put);
1840
1841         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1842                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1843         if(h->use_weight_chroma){
1844             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1845                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1846             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1847                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1848         }
1849     }
1850 }
1851
1852 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1853                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1854                            int x_offset, int y_offset,
1855                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1856                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1857                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1858                            int list0, int list1){
1859     if((h->use_weight==2 && list0 && list1
1860         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1861        || h->use_weight==1)
1862         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1863                          x_offset, y_offset, qpix_put, chroma_put,
1864                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1865     else
1866         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1867                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1868 }
1869
1870 static inline void prefetch_motion(H264Context *h, int list){
1871     /* fetch pixels for estimated mv 4 macroblocks ahead
1872      * optimized for 64byte cache lines */
1873     MpegEncContext * const s = &h->s;
1874     const int refn = h->ref_cache[list][scan8[0]];
1875     if(refn >= 0){
1876         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1877         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1878         uint8_t **src= h->ref_list[list][refn].data;
1879         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1880         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1881         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1882         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1883     }
1884 }
1885
1886 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1887                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1888                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1889                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1890     MpegEncContext * const s = &h->s;
1891     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1892     const int mb_type= s->current_picture.mb_type[mb_xy];
1893
1894     assert(IS_INTER(mb_type));
1895
1896     prefetch_motion(h, 0);
1897
1898     if(IS_16X16(mb_type)){
1899         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1900                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1901                 &weight_op[0], &weight_avg[0],
1902                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1903     }else if(IS_16X8(mb_type)){
1904         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1905                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1906                 &weight_op[1], &weight_avg[1],
1907                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1908         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1909                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1910                 &weight_op[1], &weight_avg[1],
1911                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1912     }else if(IS_8X16(mb_type)){
1913         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1914                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1915                 &weight_op[2], &weight_avg[2],
1916                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1917         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1918                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1919                 &weight_op[2], &weight_avg[2],
1920                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1921     }else{
1922         int i;
1923
1924         assert(IS_8X8(mb_type));
1925
1926         for(i=0; i<4; i++){
1927             const int sub_mb_type= h->sub_mb_type[i];
1928             const int n= 4*i;
1929             int x_offset= (i&1)<<2;
1930             int y_offset= (i&2)<<1;
1931
1932             if(IS_SUB_8X8(sub_mb_type)){
1933                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1934                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1935                     &weight_op[3], &weight_avg[3],
1936                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1937             }else if(IS_SUB_8X4(sub_mb_type)){
1938                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1939                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1940                     &weight_op[4], &weight_avg[4],
1941                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1942                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1943                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1944                     &weight_op[4], &weight_avg[4],
1945                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1946             }else if(IS_SUB_4X8(sub_mb_type)){
1947                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1948                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1949                     &weight_op[5], &weight_avg[5],
1950                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1951                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1952                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1953                     &weight_op[5], &weight_avg[5],
1954                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1955             }else{
1956                 int j;
1957                 assert(IS_SUB_4X4(sub_mb_type));
1958                 for(j=0; j<4; j++){
1959                     int sub_x_offset= x_offset + 2*(j&1);
1960                     int sub_y_offset= y_offset +   (j&2);
1961                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1962                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1963                         &weight_op[6], &weight_avg[6],
1964                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1965                 }
1966             }
1967         }
1968     }
1969
1970     prefetch_motion(h, 1);
1971 }
1972
1973 static void decode_init_vlc(void){
1974     static int done = 0;
1975
1976     if (!done) {
1977         int i;
1978         done = 1;
1979
1980         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1981                  &chroma_dc_coeff_token_len [0], 1, 1,
1982                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1983
1984         for(i=0; i<4; i++){
1985             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1986                      &coeff_token_len [i][0], 1, 1,
1987                      &coeff_token_bits[i][0], 1, 1, 1);
1988         }
1989
1990         for(i=0; i<3; i++){
1991             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1992                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1993                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1994         }
1995         for(i=0; i<15; i++){
1996             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1997                      &total_zeros_len [i][0], 1, 1,
1998                      &total_zeros_bits[i][0], 1, 1, 1);
1999         }
2000
2001         for(i=0; i<6; i++){
2002             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2003                      &run_len [i][0], 1, 1,
2004                      &run_bits[i][0], 1, 1, 1);
2005         }
2006         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2007                  &run_len [6][0], 1, 1,
2008                  &run_bits[6][0], 1, 1, 1);
2009     }
2010 }
2011
2012 static void free_tables(H264Context *h){
2013     int i;
2014     H264Context *hx;
2015     av_freep(&h->intra4x4_pred_mode);
2016     av_freep(&h->chroma_pred_mode_table);
2017     av_freep(&h->cbp_table);
2018     av_freep(&h->mvd_table[0]);
2019     av_freep(&h->mvd_table[1]);
2020     av_freep(&h->direct_table);
2021     av_freep(&h->non_zero_count);
2022     av_freep(&h->slice_table_base);
2023     h->slice_table= NULL;
2024
2025     av_freep(&h->mb2b_xy);
2026     av_freep(&h->mb2b8_xy);
2027
2028     for(i = 0; i < MAX_SPS_COUNT; i++)
2029         av_freep(h->sps_buffers + i);
2030
2031     for(i = 0; i < MAX_PPS_COUNT; i++)
2032         av_freep(h->pps_buffers + i);
2033
2034     for(i = 0; i < h->s.avctx->thread_count; i++) {
2035         hx = h->thread_context[i];
2036         if(!hx) continue;
2037         av_freep(&hx->top_borders[1]);
2038         av_freep(&hx->top_borders[0]);
2039         av_freep(&hx->s.obmc_scratchpad);
2040         av_freep(&hx->s.allocated_edge_emu_buffer);
2041     }
2042 }
2043
2044 static void init_dequant8_coeff_table(H264Context *h){ // fills the 8x8 dequant tables (2 matrices x 52 q values x 64 coeffs) from h->pps.scaling_matrix8
2045     int i,q,x;
2046     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); // store transposed when the dsp IDCT is not the C version; FIXME ugly
2047     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2048     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2049
2050     for(i=0; i<2; i++ ){
2051         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2052             h->dequant8_coeff[1] = h->dequant8_buffer[0]; // identical scaling matrices: share table 0 instead of computing a copy
2053             break;
2054         }
2055         // ff_div6 / ff_rem6 are defined elsewhere; presumably q/6 and q%6 lookups -- TODO confirm
2056         for(q=0; q<52; q++){
2057             int shift = ff_div6[q];
2058             int idx = ff_rem6[q];
2059             for(x=0; x<64; x++) // when transposing, (x>>3)|((x&7)<<3) swaps the row and column parts of the 6-bit index
2060                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2061                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2062                     h->pps.scaling_matrix8[i][x]) << shift;
2063         }
2064     }
2065 }
2066
2067 static void init_dequant4_coeff_table(H264Context *h){ // fills the 4x4 dequant tables (6 matrices x 52 q values x 16 coeffs) from h->pps.scaling_matrix4
2068     int i,j,q,x;
2069     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); // store transposed when the dsp IDCT is not the C version; FIXME ugly
2070     for(i=0; i<6; i++ ){
2071         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2072         for(j=0; j<i; j++){ // reuse the table of an earlier matrix with identical contents
2073             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2074                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2075                 break;
2076             }
2077         }
2078         if(j<i) // the inner loop stopped early, i.e. a duplicate was found: nothing to compute
2079             continue;
2080         // ff_div6 / ff_rem6 are defined elsewhere; presumably q/6 and q%6 lookups -- TODO confirm
2081         for(q=0; q<52; q++){
2082             int shift = ff_div6[q] + 2;
2083             int idx = ff_rem6[q];
2084             for(x=0; x<16; x++) // when transposing, (x>>2)|((x<<2)&0xF) swaps the row and column parts of the 4-bit index
2085                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2086                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2087                     h->pps.scaling_matrix4[i][x]) << shift;
2088         }
2089     }
2090 }
2091
2092 static void init_dequant_tables(H264Context *h){ // builds the 4x4 tables, and the 8x8 tables when pps.transform_8x8_mode is set
2093     int i,x;
2094     init_dequant4_coeff_table(h);
2095     if(h->pps.transform_8x8_mode)
2096         init_dequant8_coeff_table(h);
2097     if(h->sps.transform_bypass){ // with transform bypass the q==0 entries are replaced by the constant 1<<6
2098         for(i=0; i<6; i++)
2099             for(x=0; x<16; x++)
2100                 h->dequant4_coeff[i][0][x] = 1<<6;
2101         if(h->pps.transform_8x8_mode)
2102             for(i=0; i<2; i++)
2103                 for(x=0; x<64; x++)
2104                     h->dequant8_coeff[i][0][x] = 1<<6;
2105     }
2106 }
2107
2108
2109 /**
2110  * Allocates the per-macroblock tables of the context; returns 0 on success, -1 (after free_tables()) on failure.
2111  * Needs s->mb_width, s->mb_height and s->mb_stride as well as h->b_stride and h->b8_stride to be set.
2112  */
2113 static int alloc_tables(H264Context *h){
2114     MpegEncContext * const s = &h->s;
2115     const int big_mb_num= s->mb_stride * (s->mb_height+1); // one macroblock row more than the picture height
2116     int x,y;
2117     // CHECKED_ALLOCZ is defined elsewhere; presumably it zero-allocates and jumps to fail: on error -- TODO confirm
2118     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2119
2120     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2121     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2122     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2123
2124     if( h->pps.cabac ) { // these four tables are only allocated when pps.cabac is set
2125         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2126         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2127         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2128         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2129     }
2130     // every byte of the slice table becomes 0xFF; slice_table itself starts two rows plus one entry into the base buffer
2131     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2132     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2133
2134     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2135     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136     for(y=0; y<s->mb_height; y++){ // precompute, per macroblock, its index in the 4-per-MB (b_stride) and 2-per-MB (b8_stride) grids
2137         for(x=0; x<s->mb_width; x++){
2138             const int mb_xy= x + y*s->mb_stride;
2139             const int b_xy = 4*x + 4*y*h->b_stride;
2140             const int b8_xy= 2*x + 2*y*h->b8_stride;
2141
2142             h->mb2b_xy [mb_xy]= b_xy;
2143             h->mb2b8_xy[mb_xy]= b8_xy;
2144         }
2145     }
2146
2147     s->obmc_scratchpad = NULL; // allocated later in frame_start(), once linesize is known
2148
2149     if(!h->dequant4_coeff[0]) // build the dequant tables only if they have not been set up yet
2150         init_dequant_tables(h);
2151
2152     return 0;
2153 fail:
2154     free_tables(h);
2155     return -1;
2156 }
2157
2158 /**
2159  * Mimic alloc_tables(), but for every context thread: dst gets copies of src's table pointers, nothing is allocated.
2160  */
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2163     dst->non_zero_count           = src->non_zero_count;
2164     dst->slice_table              = src->slice_table; // slice_table_base is not copied here
2165     dst->cbp_table                = src->cbp_table;
2166     dst->mb2b_xy                  = src->mb2b_xy;
2167     dst->mb2b8_xy                 = src->mb2b8_xy;
2168     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2169     dst->mvd_table[0]             = src->mvd_table[0];
2170     dst->mvd_table[1]             = src->mvd_table[1];
2171     dst->direct_table             = src->direct_table;
2172
2173     dst->s.obmc_scratchpad = NULL; // not shared with src; frame_start() allocates one per thread context
2174     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2175 }
2176
2177 /**
2178  * Init context
2179  * Allocate buffers which are not shared amongst multiple threads; returns 0 on success, -1 on failure.
2180  */
2181 static int context_init(H264Context *h){
2182     MpegEncContext * const s = &h->s;
2183     // 16+8+8 bytes per macroblock column; backup_mb_border() stores luma at +0, cb at +16 and cr at +24
2184     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2185     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2186
2187     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2188     CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2189                    (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2190     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21; // points at the middle of the allocation
2191     return 0;
2192 fail:
2193     return -1; // free_tables will clean up for us
2194 }
2195
2196 static void common_init(H264Context *h){ // copies size and codec id from s->avctx and sets the decoder defaults below
2197     MpegEncContext * const s = &h->s;
2198
2199     s->width = s->avctx->width;
2200     s->height = s->avctx->height;
2201     s->codec_id= s->avctx->codec->id;
2202
2203     ff_h264_pred_init(&h->hpc, s->codec_id);
2204
2205     h->dequant_coeff_pps= -1;
2206     s->unrestricted_mv=1;
2207     s->decode=1; //FIXME
2208     // every entry of all six 4x4 and both 8x8 scaling matrices starts out as 16
2209     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2210     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2211 }
2212
2213 static int decode_init(AVCodecContext *avctx){ // sets up the H264Context stored in avctx->priv_data; always returns 0
2214     H264Context *h= avctx->priv_data;
2215     MpegEncContext * const s = &h->s;
2216
2217     MPV_decode_defaults(s);
2218
2219     s->avctx = avctx;
2220     common_init(h);
2221
2222     s->out_format = FMT_H264;
2223     s->workaround_bugs= avctx->workaround_bugs;
2224
2225     // set defaults
2226 //    s->decode_mb= ff_h263_decode_mb;
2227     s->quarter_sample = 1;
2228     s->low_delay= 1;
2229     avctx->pix_fmt= PIX_FMT_YUV420P;
2230
2231     decode_init_vlc();
2232     // non-empty extradata whose first byte is 1 selects the is_avc mode; got_avcC is cleared in that case
2233     if(avctx->extradata_size > 0 && avctx->extradata &&
2234        *(char *)avctx->extradata == 1){
2235         h->is_avc = 1;
2236         h->got_avcC = 0;
2237     } else {
2238         h->is_avc = 0;
2239     }
2240
2241     h->thread_context[0] = h; // slot 0 of the thread contexts is the main context itself
2242     return 0;
2243 }
2244
2245 static int frame_start(H264Context *h){ // per-frame setup; returns -1 if MPV_frame_start() fails, 0 otherwise
2246     MpegEncContext * const s = &h->s;
2247     int i;
2248
2249     if(MPV_frame_start(s, s->avctx) < 0)
2250         return -1;
2251     ff_er_frame_start(s);
2252     /*
2253      * MPV_frame_start uses pict_type to derive key_frame.
2254      * This is incorrect for H.264; IDR markings must be used.
2255      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2256      * See decode_nal_units().
2257      */
2258     s->current_picture_ptr->key_frame= 0;
2259
2260     assert(s->linesize && s->uvlinesize);
2261     // block_offset[0..23] use the plain line strides; block_offset[24..47] use doubled strides (selected for MB_FIELD in hl_decode_mb_internal())
2262     for(i=0; i<16; i++){
2263         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2265     }
2266     for(i=0; i<4; i++){ // entries 16..19 and 20..23 receive the same values, computed with uvlinesize
2267         h->block_offset[16+i]=
2268         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269         h->block_offset[24+16+i]=
2270         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2271     }
2272
2273     /* can't be in alloc_tables because linesize isn't known there.
2274      * FIXME: redo bipred weight to not require extra buffer? */
2275     for(i = 0; i < s->avctx->thread_count; i++) // NOTE(review): thread_context[i] is dereferenced without a NULL check
2276         if(!h->thread_context[i]->s.obmc_scratchpad)
2277             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize); // NOTE(review): result is not checked for NULL here
2278
2279     /* some macroblocks will be accessed before they're available */
2280     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2282
2283 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2284     return 0;
2285 }
2286
2287 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ // saves the right column and bottom row of the macroblock at src_* into h->left_border / h->top_borders[0]
2288     MpegEncContext * const s = &h->s;
2289     int i;
2290     // step one line up so that row index i (1-based) below addresses macroblock row i-1
2291     src_y  -=   linesize;
2292     src_cb -= uvlinesize;
2293     src_cr -= uvlinesize;
2294
2295     // There are two lines saved, the line above the top macroblock of a pair,
2296     // and the line above the bottom macroblock
2297     h->left_border[0]= h->top_borders[0][s->mb_x][15]; // taken from the old top border before it is overwritten below
2298     for(i=1; i<17; i++){
2299         h->left_border[i]= src_y[15+i*  linesize];
2300     }
2301     // copy the last luma row (16 bytes) as two 64-bit words
2302     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2303     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2304
2305     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ // chroma: cb at left_border[17..25] / top border +16, cr at left_border[26..34] / top border +24
2306         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2307         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2308         for(i=1; i<9; i++){
2309             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2310             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2311         }
2312         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2313         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2314     }
2315 }
2316
2317 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){ // exchanges the saved border samples with the picture samples left of / above the macroblock at src_*
2318     MpegEncContext * const s = &h->s;
2319     int temp8, i;
2320     uint64_t temp64;
2321     int deblock_left;
2322     int deblock_top;
2323     int mb_xy;
2324
2325     if(h->deblocking_filter == 2) { // mode 2: a neighbour only counts when its slice_table entry equals that of the current macroblock
2326         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2327         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2328         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2329     } else { // otherwise any macroblock that is not in the first column / first row has the neighbour
2330         deblock_left = (s->mb_x > 0);
2331         deblock_top =  (s->mb_y > 0);
2332     }
2333     // move to the sample one line above and one column left of the macroblock
2334     src_y  -=   linesize + 1;
2335     src_cb -= uvlinesize + 1;
2336     src_cr -= uvlinesize + 1;
2337     // XCHG(a,b,t,xchg): b always receives the old value of a; a receives the old value of b only when xchg is nonzero
2338 #define XCHG(a,b,t,xchg)\
2339 t= a;\
2340 if(xchg)\
2341     a= b;\
2342 b= t;
2343
2344     if(deblock_left){
2345         for(i = !deblock_top; i<17; i++){ // the corner entry (i==0) is skipped when there is no top neighbour
2346             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2347         }
2348     }
2349
2350     if(deblock_top){
2351         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2352         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2353         if(s->mb_x+1 < s->mb_width){ // also the first 8 luma samples above the macroblock to the right, if there is one
2354             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2355         }
2356     }
2357
2358     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ // same for both chroma planes
2359         if(deblock_left){
2360             for(i = !deblock_top; i<9; i++){
2361                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2362                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2363             }
2364         }
2365         if(deblock_top){
2366             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2367             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2368         }
2369     }
2370 }
2371
2372 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ // macroblock-pair variant of backup_mb_border(): saves 32 luma / 16 chroma rows and two bottom lines
2373     MpegEncContext * const s = &h->s;
2374     int i;
2375     // step two lines up so that row index i (2-based) below addresses pair row i-2
2376     src_y  -= 2 *   linesize;
2377     src_cb -= 2 * uvlinesize;
2378     src_cr -= 2 * uvlinesize;
2379
2380     // There are two lines saved, the line above the top macroblock of a pair,
2381     // and the line above the bottom macroblock
2382     h->left_border[0]= h->top_borders[0][s->mb_x][15]; // both corner entries come from the old top borders before they are overwritten below
2383     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2384     for(i=2; i<34; i++){
2385         h->left_border[i]= src_y[15+i*  linesize];
2386     }
2387     // the last two luma rows of the pair go to top_borders[0] and top_borders[1] respectively
2388     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2389     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2390     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2391     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2392
2393     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ // chroma: cb at left_border[34..51], cr at left_border[52..69]
2394         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2395         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2396         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2397         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2398         for(i=2; i<18; i++){
2399             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2400             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2401         }
2402         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2403         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2404         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2405         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2406     }
2407 }
2408
2409 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ // macroblock-pair variant of xchg_mb_border(): two top lines, 34 luma / 18 chroma left entries
2410     MpegEncContext * const s = &h->s;
2411     int temp8, i;
2412     uint64_t temp64;
2413     int deblock_left = (s->mb_x > 0);
2414     int deblock_top  = (s->mb_y > 1); // a pair spans two macroblock rows, so rows 0 and 1 have nothing above
2415
2416     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2417     // move to the sample two lines above and one column left of the pair
2418     src_y  -= 2 *   linesize + 1;
2419     src_cb -= 2 * uvlinesize + 1;
2420     src_cr -= 2 * uvlinesize + 1;
2421     // XCHG(a,b,t,xchg): b always receives the old value of a; a receives the old value of b only when xchg is nonzero
2422 #define XCHG(a,b,t,xchg)\
2423 t= a;\
2424 if(xchg)\
2425     a= b;\
2426 b= t;
2427
2428     if(deblock_left){
2429         for(i = (!deblock_top)<<1; i<34; i++){ // the two corner entries are skipped when there is nothing above
2430             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2431         }
2432     }
2433
2434     if(deblock_top){
2435         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2436         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2437         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2438         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2439         if(s->mb_x+1 < s->mb_width){ // also the first 8 luma samples of both lines above the pair to the right, if there is one
2440             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2441             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2442         }
2443     }
2444
2445     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ // same for both chroma planes
2446         if(deblock_left){
2447             for(i = (!deblock_top) << 1; i<18; i++){
2448                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2449                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2450             }
2451         }
2452         if(deblock_top){
2453             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2454             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2455             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2456             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2457         }
2458     }
2459 }
2460
2461 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ // reconstructs the macroblock at (s->mb_x, s->mb_y) into s->current_picture; simple!=0 compiles out the MBAFF/field, PCM, gray and non-H.264 paths
2462     MpegEncContext * const s = &h->s;
2463     const int mb_x= s->mb_x;
2464     const int mb_y= s->mb_y;
2465     const int mb_xy= mb_x + mb_y*s->mb_stride;
2466     const int mb_type= s->current_picture.mb_type[mb_xy];
2467     uint8_t  *dest_y, *dest_cb, *dest_cr;
2468     int linesize, uvlinesize /*dct_offset*/;
2469     int i;
2470     int *block_offset = &h->block_offset[0];
2471     const unsigned int bottom = mb_y & 1;
2472     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2473     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2474     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2475     // top-left sample of this macroblock in each plane: 16x16 luma, 8x8 per chroma plane
2476     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2477     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2478     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2479
2480     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2481     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2482     // field macroblock: double the line strides and use the second half of block_offset[] (filled in frame_start())
2483     if (!simple && MB_FIELD) {
2484         linesize   = h->mb_linesize   = s->linesize * 2;
2485         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2486         block_offset = &h->block_offset[24];
2487         if(mb_y&1){ //FIXME move out of this func?
2488             dest_y -= s->linesize*15;
2489             dest_cb-= s->uvlinesize*7;
2490             dest_cr-= s->uvlinesize*7;
2491         }
2492         if(FRAME_MBAFF) {
2493             int list;
2494             for(list=0; list<h->list_count; list++){
2495                 if(!USES_LIST(mb_type, list))
2496                     continue;
2497                 if(IS_16X16(mb_type)){
2498                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2499                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1); // note: + binds tighter than ^, i.e. (16+*ref)^(s->mb_y&1)
2500                 }else{
2501                     for(i=0; i<16; i+=4){
2502                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2503                         int ref = h->ref_cache[list][scan8[i]];
2504                         if(ref >= 0)
2505                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
2506                     }
2507                 }
2508             }
2509         }
2510     } else {
2511         linesize   = h->mb_linesize   = s->linesize;
2512         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2513 //        dct_offset = s->linesize * 16;
2514     }
2515     // pick the residual-add functions: plain pixel add for transform bypass, otherwise the 8x8 or 4x4 IDCT
2516     if(transform_bypass){
2517         idct_dc_add =
2518         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2519     }else if(IS_8x8DCT(mb_type)){
2520         idct_dc_add = s->dsp.h264_idct8_dc_add;
2521         idct_add = s->dsp.h264_idct8_add;
2522     }else{
2523         idct_dc_add = s->dsp.h264_idct_dc_add;
2524         idct_add = s->dsp.h264_idct_add;
2525     }
2526
2527     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2528        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2529         int mbt_y = mb_y&~1; // row of the top macroblock of this pair
2530         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2531         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2532         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2533         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2534     }
2535
2536     if (!simple && IS_INTRA_PCM(mb_type)) {
2537         unsigned int x, y;
2538
2539         // The pixels are stored in h->mb array in the same order as levels,
2540         // copy them in output in the correct order.
2541         for(i=0; i<16; i++) {
2542             for (y=0; y<4; y++) {
2543                 for (x=0; x<4; x++) {
2544                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2545                 }
2546             }
2547         }
2548         for(i=16; i<16+4; i++) {
2549             for (y=0; y<4; y++) {
2550                 for (x=0; x<4; x++) {
2551                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2552                 }
2553             }
2554         }
2555         for(i=20; i<20+4; i++) {
2556             for (y=0; y<4; y++) {
2557                 for (x=0; x<4; x++) {
2558                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2559                 }
2560             }
2561         }
2562     } else {
2563         if(IS_INTRA(mb_type)){
2564             if(h->deblocking_filter && (simple || !FRAME_MBAFF)) // swap the saved border samples into the picture for prediction; undone by the xchg=0 call below
2565                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2566
2567             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2568                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2569                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2570             }
2571
2572             if(IS_INTRA4x4(mb_type)){
2573                 if(simple || !s->encoding){
2574                     if(IS_8x8DCT(mb_type)){
2575                         for(i=0; i<16; i+=4){ // four 8x8 blocks: predict, then add the residual right away
2576                             uint8_t * const ptr= dest_y + block_offset[i];
2577                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2578                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2579                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2580                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2581                             if(nnz){
2582                                 if(nnz == 1 && h->mb[i*16])
2583                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2584                                 else
2585                                     idct_add(ptr, h->mb + i*16, linesize);
2586                             }
2587                         }
2588                     }else
2589                     for(i=0; i<16; i++){
2590                         uint8_t * const ptr= dest_y + block_offset[i];
2591                         uint8_t *topright;
2592                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2593                         int nnz, tr;
2594
2595                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2596                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2597                             assert(mb_y || linesize <= block_offset[i]);
2598                             if(!topright_avail){ // no top-right samples: replicate the sample above the block's last column four times
2599                                 tr= ptr[3 - linesize]*0x01010101;
2600                                 topright= (uint8_t*) &tr;
2601                             }else
2602                                 topright= ptr + 4 - linesize;
2603                         }else
2604                             topright= NULL;
2605
2606                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2607                         nnz = h->non_zero_count_cache[ scan8[i] ];
2608                         if(nnz){
2609                             if(is_h264){
2610                                 if(nnz == 1 && h->mb[i*16])
2611                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2612                                 else
2613                                     idct_add(ptr, h->mb + i*16, linesize);
2614                             }else
2615                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2616                         }
2617                     }
2618                 }
2619             }else{
2620                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2621                 if(is_h264){
2622                     if(!transform_bypass)
2623                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2624                 }else
2625                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2626             }
2627             if(h->deblocking_filter && (simple || !FRAME_MBAFF)) // xchg=0: write the border values back into the picture
2628                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2629         }else if(is_h264){
2630             hl_motion(h, dest_y, dest_cb, dest_cr,
2631                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2632                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2633                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2634         }
2635
2636         // luma residual for everything except intra4x4, whose residual was already added block by block above
2637         if(!IS_INTRA4x4(mb_type)){
2638             if(is_h264){
2639                 if(IS_INTRA16x16(mb_type)){
2640                     for(i=0; i<16; i++){
2641                         if(h->non_zero_count_cache[ scan8[i] ])
2642                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2643                         else if(h->mb[i*16])
2644                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2645                     }
2646                 }else{
2647                     const int di = IS_8x8DCT(mb_type) ? 4 : 1; // step over four 4x4 indices per 8x8 block
2648                     for(i=0; i<16; i+=di){
2649                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2650                         if(nnz){
2651                             if(nnz==1 && h->mb[i*16])
2652                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2653                             else
2654                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2655                         }
2656                     }
2657                 }
2658             }else{
2659                 for(i=0; i<16; i++){
2660                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2661                         uint8_t * const ptr= dest_y + block_offset[i];
2662                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2663                     }
2664                 }
2665             }
2666         }
2667         // chroma residual: blocks 16..19 go to dest_cb, 20..23 to dest_cr (selected by (i&4)>>2)
2668         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2669             uint8_t *dest[2] = {dest_cb, dest_cr};
2670             if(transform_bypass){
2671                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2672             }else{
2673                 idct_add = s->dsp.h264_idct_add;
2674                 idct_dc_add = s->dsp.h264_idct_dc_add;
2675                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2676                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2677             }
2678             if(is_h264){
2679                 for(i=16; i<16+8; i++){
2680                     if(h->non_zero_count_cache[ scan8[i] ])
2681                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2682                     else if(h->mb[i*16])
2683                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2684                 }
2685             }else{
2686                 for(i=16; i<16+8; i++){
2687                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2688                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2689                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2690                     }
2691                 }
2692             }
2693         }
2694     }
2695     if(h->deblocking_filter) {
2696         if (!simple && FRAME_MBAFF) {
2697             //FIXME try deblocking one mb at a time?
2698             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2699             const int mb_y = s->mb_y - 1;
2700             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2701             const int mb_xy= mb_x + mb_y*s->mb_stride;
2702             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2703             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2704             if (!bottom) return; // MBAFF: the pair is deblocked only after its bottom macroblock has been reconstructed
2705             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2706             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2707             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2708
2709             if(IS_INTRA(mb_type_top | mb_type_bottom))
2710                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2711
2712             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2713             // deblock a pair
2714             // top
2715             s->mb_y--; // temporarily point the context at the top macroblock; restored by the s->mb_y++ below
2716             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2717             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2718             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2719             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2720             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2721             // bottom
2722             s->mb_y++;
2723             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2724             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2725             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2726             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2727             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2728         } else {
2729             tprintf(h->s.avctx, "call filter_mb\n");
2730             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); // save the unfiltered border samples before the filter changes them
2731             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2732             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2733         }
2734     }
2735 }
2736
2737 /**
2738  * Process a macroblock; this case avoids checks for expensive uncommon cases (calls hl_decode_mb_internal() with simple=1).
2739  */
2740 static void hl_decode_mb_simple(H264Context *h){
2741     hl_decode_mb_internal(h, 1);
2742 }
2743
2744 /**
2745  * Process a macroblock; this handles edge cases, such as interlacing (calls hl_decode_mb_internal() with simple=0).
2746  */
2747 static void av_noinline hl_decode_mb_complex(H264Context *h){
2748     hl_decode_mb_internal(h, 0);
2749 }
2750
2751 static void hl_decode_mb(H264Context *h){ // reconstructs the current macroblock via the simple or the complex variant; does nothing when s->decode is 0
2752     MpegEncContext * const s = &h->s;
2753     const int mb_x= s->mb_x;
2754     const int mb_y= s->mb_y;
2755     const int mb_xy= mb_x + mb_y*s->mb_stride;
2756     const int mb_type= s->current_picture.mb_type[mb_xy];
2757     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding; // any case the simple variant skips
2758
2759     if(!s->decode)
2760         return;
2761
2762     if (is_complex)
2763         hl_decode_mb_complex(h);
2764     else hl_decode_mb_simple(h);
2765 }
2766
2767 static void pic_as_field(Picture *pic, const int bottom){
2768     int i;
2769     for (i = 0; i < 4; ++i) {
2770         if (bottom)
2771             pic->data[i] += pic->linesize[i];
2772         pic->linesize[i] *= 2;
2773     }
2774 }
2775
2776 static int split_field_copy(Picture *dest, Picture *src,
2777                             int parity, int id_add){
2778     int match = !!(src->reference & parity);
2779
2780     if (match) {
2781         *dest = *src;
2782         pic_as_field(dest, parity == PICT_BOTTOM_FIELD);
2783         dest->pic_id *= 2;
2784         dest->pic_id += id_add;
2785     }
2786
2787     return match;
2788 }
2789
2790 /**
2791  * Split one reference list into field parts, interleaving by parity
2792  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2793  * set to look at the actual start of data for that field.
2794  *
2795  * @param dest output list
2796  * @param dest_len maximum number of fields to put in dest
2797  * @param src the source reference list containing fields and/or field pairs
2798  *            (aka short_ref/long_ref, or
2799  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2800  * @param src_len number of Picture's in source (pairs and unmatched fields)
2801  * @param parity the parity of the picture being decoded/needing
2802  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2803  * @return number of fields placed in dest
2804  */
2805 static int split_field_half_ref_list(Picture *dest, int dest_len,
2806                                      Picture *src,  int src_len,  int parity){
2807     int same_parity   = 1;
2808     int same_i        = 0;
2809     int opp_i         = 0;
2810     int out_i;
2811     int field_output;
2812
2813     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2814         if (same_parity && same_i < src_len) {
2815             field_output = split_field_copy(dest + out_i, src + same_i,
2816                                             parity, 1);
2817             same_parity = !field_output;
2818             same_i++;
2819
2820         } else if (opp_i < src_len) {
2821             field_output = split_field_copy(dest + out_i, src + opp_i,
2822                                             PICT_FRAME - parity, 0);
2823             same_parity = field_output;
2824             opp_i++;
2825
2826         } else {
2827             break;
2828         }
2829     }
2830
2831     return out_i;
2832 }
2833
2834 /**
2835  * Split the reference frame list into a reference field list.
2836  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2837  * The input list contains both reference field pairs and
2838  * unmatched reference fields; it is ordered as spec describes
2839  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2840  * unmatched field pairs are also present. Conceptually this is equivalent
2841  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2842  *
2843  * @param dest output reference list where ordered fields are to be placed
2844  * @param dest_len max number of fields to place at dest
2845  * @param src source reference list, as described above
2846  * @param src_len number of pictures (pairs and unmatched fields) in src
2847  * @param parity parity of field being currently decoded
2848  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2849  * @param long_i index into src array that holds first long reference picture,
2850  *        or src_len if no long refs present.
2851  */
2852 static int split_field_ref_list(Picture *dest, int dest_len,
2853                                 Picture *src,  int src_len,
2854                                 int parity,    int long_i){
2855
2856     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2857     dest += i;
2858     dest_len -= i;
2859
2860     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2861                                    src_len - long_i, parity);
2862     return i;
2863 }
2864
2865 /**
2866  * fills the default_ref_list.
2867  */
2868 static int fill_default_ref_list(H264Context *h){
2869     MpegEncContext * const s = &h->s;
2870     int i;
2871     int smallest_poc_greater_than_current = -1;
2872     int structure_sel;
2873     Picture sorted_short_ref[32];
2874     Picture field_entry_list[2][32];
2875     Picture *frame_list[2];
2876
2877     if (FIELD_PICTURE) {
2878         structure_sel = PICT_FRAME;
2879         frame_list[0] = field_entry_list[0];
2880         frame_list[1] = field_entry_list[1];
2881     } else {
2882         structure_sel = 0;
2883         frame_list[0] = h->default_ref_list[0];
2884         frame_list[1] = h->default_ref_list[1];
2885     }
2886
2887     if(h->slice_type==B_TYPE){
2888         int list;
2889         int len[2];
2890         int short_len[2];
2891         int out_i;
2892         int limit= INT_MIN;
2893
2894         /* sort frame according to poc in B slice */
2895         for(out_i=0; out_i<h->short_ref_count; out_i++){
2896             int best_i=INT_MIN;
2897             int best_poc=INT_MAX;
2898
2899             for(i=0; i<h->short_ref_count; i++){
2900                 const int poc= h->short_ref[i]->poc;
2901                 if(poc > limit && poc < best_poc){
2902                     best_poc= poc;
2903                     best_i= i;
2904                 }
2905             }
2906
2907             assert(best_i != INT_MIN);
2908
2909             limit= best_poc;
2910             sorted_short_ref[out_i]= *h->short_ref[best_i];
2911             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2912             if (-1 == smallest_poc_greater_than_current) {
2913                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2914                     smallest_poc_greater_than_current = out_i;
2915                 }
2916             }
2917         }
2918
2919         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2920
2921         // find the largest poc
2922         for(list=0; list<2; list++){
2923             int index = 0;
2924             int j= -99;
2925             int step= list ? -1 : 1;
2926
2927             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2928                 int sel;
2929                 while(j<0 || j>= h->short_ref_count){
2930                     if(j != -99 && step == (list ? -1 : 1))
2931                         return -1;
2932                     step = -step;
2933                     j= smallest_poc_greater_than_current + (step>>1);
2934                 }
2935                 sel = sorted_short_ref[j].reference | structure_sel;
2936                 if(sel != PICT_FRAME) continue;
2937                 frame_list[list][index  ]= sorted_short_ref[j];
2938                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2939             }
2940             short_len[list] = index;
2941
2942             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2943                 int sel;
2944                 if(h->long_ref[i] == NULL) continue;
2945                 sel = h->long_ref[i]->reference | structure_sel;
2946                 if(sel != PICT_FRAME) continue;
2947
2948                 frame_list[ list ][index  ]= *h->long_ref[i];
2949                 frame_list[ list ][index++].pic_id= i;;
2950             }
2951             len[list] = index;
2952
2953             if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2954                 // swap the two first elements of L1 when
2955                 // L0 and L1 are identical
2956                 Picture temp= frame_list[1][0];
2957                 frame_list[1][0] = frame_list[1][1];
2958                 frame_list[1][1] = temp;
2959             }
2960
2961         }
2962
2963         for(list=0; list<2; list++){
2964             if (FIELD_PICTURE)
2965                 len[list] = split_field_ref_list(h->default_ref_list[list],
2966                                                  h->ref_count[list],
2967                                                  frame_list[list],
2968                                                  len[list],
2969                                                  s->picture_structure,
2970                                                  short_len[list]);
2971
2972             if(len[list] < h->ref_count[ list ])
2973                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2974         }
2975
2976
2977     }else{
2978         int index=0;
2979         int short_len;
2980         for(i=0; i<h->short_ref_count; i++){
2981             int sel;
2982             sel = h->short_ref[i]->reference | structure_sel;
2983             if(sel != PICT_FRAME) continue;
2984             frame_list[0][index  ]= *h->short_ref[i];
2985             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2986         }
2987         short_len = index;
2988         for(i = 0; i < 16; i++){
2989             int sel;
2990             if(h->long_ref[i] == NULL) continue;
2991             sel = h->long_ref[i]->reference | structure_sel;
2992             if(sel != PICT_FRAME) continue;
2993             frame_list[0][index  ]= *h->long_ref[i];
2994             frame_list[0][index++].pic_id= i;;
2995         }
2996
2997         if (FIELD_PICTURE)
2998             index = split_field_ref_list(h->default_ref_list[0],
2999                                          h->ref_count[0], frame_list[0],
3000                                          index, s->picture_structure,
3001                                          short_len);
3002
3003         if(index < h->ref_count[0])
3004             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3005     }
3006 #ifdef TRACE
3007     for (i=0; i<h->ref_count[0]; i++) {
3008         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3009     }
3010     if(h->slice_type==B_TYPE){
3011         for (i=0; i<h->ref_count[1]; i++) {
3012             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3013         }
3014     }
3015 #endif
3016     return 0;
3017 }
3018
3019 static void print_short_term(H264Context *h);
3020 static void print_long_term(H264Context *h);
3021
3022 /**
3023  * Extract structure information about the picture described by pic_num in
3024  * the current decoding context (frame or field). Note that pic_num is
3025  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3026  * @param pic_num picture number for which to extract structure information
3027  * @param structure one of PICT_XXX describing structure of picture
3028  *                      with pic_num
3029  * @return frame number (short term) or long term index of picture
3030  *         described by pic_num
3031  */
3032 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3033     MpegEncContext * const s = &h->s;
3034
3035     *structure = s->picture_structure;
3036     if(FIELD_PICTURE){
3037         if (!(pic_num & 1))
3038             /* opposite field */
3039             *structure ^= PICT_FRAME;
3040         pic_num >>= 1;
3041     }
3042
3043     return pic_num;
3044 }
3045
3046 static int decode_ref_pic_list_reordering(H264Context *h){
3047     MpegEncContext * const s = &h->s;
3048     int list, index, pic_structure;
3049
3050     print_short_term(h);
3051     print_long_term(h);
3052     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3053
3054     for(list=0; list<h->list_count; list++){
3055         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3056
3057         if(get_bits1(&s->gb)){
3058             int pred= h->curr_pic_num;
3059
3060             for(index=0; ; index++){
3061                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3062                 unsigned int pic_id;
3063                 int i;
3064                 Picture *ref = NULL;
3065
3066                 if(reordering_of_pic_nums_idc==3)
3067                     break;
3068
3069                 if(index >= h->ref_count[list]){
3070                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3071                     return -1;
3072                 }
3073
3074                 if(reordering_of_pic_nums_idc<3){
3075                     if(reordering_of_pic_nums_idc<2){
3076                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3077                         int frame_num;
3078
3079                         if(abs_diff_pic_num >= h->max_pic_num){
3080                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3081                             return -1;
3082                         }
3083
3084                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3085                         else                                pred+= abs_diff_pic_num;
3086                         pred &= h->max_pic_num - 1;
3087
3088                         frame_num = pic_num_extract(h, pred, &pic_structure);
3089
3090                         for(i= h->short_ref_count-1; i>=0; i--){
3091                             ref = h->short_ref[i];
3092                             assert(ref->reference);
3093                             assert(!ref->long_ref);
3094                             if(ref->data[0] != NULL &&
3095                                    ref->frame_num == frame_num &&
3096                                    (ref->reference & pic_structure) &&
3097                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3098                                 break;
3099                         }
3100                         if(i>=0)
3101                             ref->pic_id= pred;
3102                     }else{
3103                         int long_idx;
3104                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3105
3106                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3107
3108                         if(long_idx>31){
3109                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3110                             return -1;
3111                         }
3112                         ref = h->long_ref[long_idx];
3113                         assert(!(ref && !ref->reference));
3114                         if(ref && (ref->reference & pic_structure)){
3115                             ref->pic_id= pic_id;
3116                             assert(ref->long_ref);
3117                             i=0;
3118                         }else{
3119                             i=-1;
3120                         }
3121                     }
3122
3123                     if (i < 0) {
3124                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3125                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3126                     } else {
3127                         for(i=index; i+1<h->ref_count[list]; i++){
3128                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3129                                 break;
3130                         }
3131                         for(; i > index; i--){
3132                             h->ref_list[list][i]= h->ref_list[list][i-1];
3133                         }
3134                         h->ref_list[list][index]= *ref;
3135                         if (FIELD_PICTURE){
3136                             int bot = pic_structure == PICT_BOTTOM_FIELD;
3137                             pic_as_field(&h->ref_list[list][index], bot);
3138                         }
3139                     }
3140                 }else{
3141                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3142                     return -1;
3143                 }
3144             }
3145         }
3146     }
3147     for(list=0; list<h->list_count; list++){
3148         for(index= 0; index < h->ref_count[list]; index++){
3149             if(!h->ref_list[list][index].data[0])
3150                 h->ref_list[list][index]= s->current_picture;
3151         }
3152     }
3153
3154     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3155         direct_dist_scale_factor(h);
3156     direct_ref_list_init(h);
3157     return 0;
3158 }
3159
3160 static void fill_mbaff_ref_list(H264Context *h){
3161     int list, i, j;
3162     for(list=0; list<2; list++){ //FIXME try list_count
3163         for(i=0; i<h->ref_count[list]; i++){
3164             Picture *frame = &h->ref_list[list][i];
3165             Picture *field = &h->ref_list[list][16+2*i];
3166             field[0] = *frame;
3167             for(j=0; j<3; j++)
3168                 field[0].linesize[j] <<= 1;
3169             field[1] = field[0];
3170             for(j=0; j<3; j++)
3171                 field[1].data[j] += frame->linesize[j];
3172
3173             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3174             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3175             for(j=0; j<2; j++){
3176                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3177                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3178             }
3179         }
3180     }
3181     for(j=0; j<h->ref_count[1]; j++){
3182         for(i=0; i<h->ref_count[0]; i++)
3183             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3184         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3185         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3186     }
3187 }
3188
3189 static int pred_weight_table(H264Context *h){
3190     MpegEncContext * const s = &h->s;
3191     int list, i;
3192     int luma_def, chroma_def;
3193
3194     h->use_weight= 0;
3195     h->use_weight_chroma= 0;
3196     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3197     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3198     luma_def = 1<<h->luma_log2_weight_denom;
3199     chroma_def = 1<<h->chroma_log2_weight_denom;
3200
3201     for(list=0; list<2; list++){
3202         for(i=0; i<h->ref_count[list]; i++){
3203             int luma_weight_flag, chroma_weight_flag;
3204
3205             luma_weight_flag= get_bits1(&s->gb);
3206             if(luma_weight_flag){
3207                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3208                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3209                 if(   h->luma_weight[list][i] != luma_def
3210                    || h->luma_offset[list][i] != 0)
3211                     h->use_weight= 1;
3212             }else{
3213                 h->luma_weight[list][i]= luma_def;
3214                 h->luma_offset[list][i]= 0;
3215             }
3216
3217             chroma_weight_flag= get_bits1(&s->gb);
3218             if(chroma_weight_flag){
3219                 int j;
3220                 for(j=0; j<2; j++){
3221                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3222                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3223                     if(   h->chroma_weight[list][i][j] != chroma_def
3224                        || h->chroma_offset[list][i][j] != 0)
3225                         h->use_weight_chroma= 1;
3226                 }
3227             }else{
3228                 int j;
3229                 for(j=0; j<2; j++){
3230                     h->chroma_weight[list][i][j]= chroma_def;
3231                     h->chroma_offset[list][i][j]= 0;
3232                 }
3233             }
3234         }
3235         if(h->slice_type != B_TYPE) break;
3236     }
3237     h->use_weight= h->use_weight || h->use_weight_chroma;
3238     return 0;
3239 }
3240
3241 static void implicit_weight_table(H264Context *h){
3242     MpegEncContext * const s = &h->s;
3243     int ref0, ref1;
3244     int cur_poc = s->current_picture_ptr->poc;
3245
3246     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3247        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3248         h->use_weight= 0;
3249         h->use_weight_chroma= 0;
3250         return;
3251     }
3252
3253     h->use_weight= 2;
3254     h->use_weight_chroma= 2;
3255     h->luma_log2_weight_denom= 5;
3256     h->chroma_log2_weight_denom= 5;
3257
3258     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3259         int poc0 = h->ref_list[0][ref0].poc;
3260         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3261             int poc1 = h->ref_list[1][ref1].poc;
3262             int td = av_clip(poc1 - poc0, -128, 127);
3263             if(td){
3264                 int tb = av_clip(cur_poc - poc0, -128, 127);
3265                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3266                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3267                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3268                     h->implicit_weight[ref0][ref1] = 32;
3269                 else
3270                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3271             }else
3272                 h->implicit_weight[ref0][ref1] = 32;
3273         }
3274     }
3275 }
3276
3277 /**
3278  * Mark a picture as no longer needed for reference. The refmask
3279  * argument allows unreferencing of individual fields or the whole frame.
3280  * If the picture becomes entirely unreferenced, but is being held for
3281  * display purposes, it is marked as such.
3282  * @param refmask mask of fields to unreference; the mask is bitwise
3283  *                anded with the reference marking of pic
3284  * @return non-zero if pic becomes entirely unreferenced (except possibly
3285  *         for display purposes) zero if one of the fields remains in
3286  *         reference
3287  */
3288 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3289     int i;
3290     if (pic->reference &= refmask) {
3291         return 0;
3292     } else {
3293         if(pic == h->delayed_output_pic)
3294             pic->reference=DELAYED_PIC_REF;
3295         else{
3296             for(i = 0; h->delayed_pic[i]; i++)
3297                 if(pic == h->delayed_pic[i]){
3298                     pic->reference=DELAYED_PIC_REF;
3299                     break;
3300                 }
3301         }
3302         return 1;
3303     }
3304 }
3305
3306 /**
3307  * instantaneous decoder refresh.
3308  */
3309 static void idr(H264Context *h){
3310     int i;
3311
3312     for(i=0; i<16; i++){
3313         if (h->long_ref[i] != NULL) {
3314             unreference_pic(h, h->long_ref[i], 0);
3315             h->long_ref[i]= NULL;
3316         }
3317     }
3318     h->long_ref_count=0;
3319
3320     for(i=0; i<h->short_ref_count; i++){
3321         unreference_pic(h, h->short_ref[i], 0);
3322         h->short_ref[i]= NULL;
3323     }
3324     h->short_ref_count=0;
3325 }
3326
3327 /* forget old pics after a seek */
3328 static void flush_dpb(AVCodecContext *avctx){
3329     H264Context *h= avctx->priv_data;
3330     int i;
3331     for(i=0; i<16; i++) {
3332         if(h->delayed_pic[i])
3333             h->delayed_pic[i]->reference= 0;
3334         h->delayed_pic[i]= NULL;
3335     }
3336     if(h->delayed_output_pic)
3337         h->delayed_output_pic->reference= 0;
3338     h->delayed_output_pic= NULL;
3339     idr(h);
3340     if(h->s.current_picture_ptr)
3341         h->s.current_picture_ptr->reference= 0;
3342 }
3343
3344 /**
3345  * Find a Picture in the short term reference list by frame number.
3346  * @param frame_num frame number to search for
3347  * @param idx the index into h->short_ref where returned picture is found
3348  *            undefined if no picture found.
3349  * @return pointer to the found picture, or NULL if no pic with the provided
3350  *                 frame number is found
3351  */
3352 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3353     MpegEncContext * const s = &h->s;
3354     int i;
3355
3356     for(i=0; i<h->short_ref_count; i++){
3357         Picture *pic= h->short_ref[i];
3358         if(s->avctx->debug&FF_DEBUG_MMCO)
3359             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3360         if(pic->frame_num == frame_num) {
3361             *idx = i;
3362             return pic;
3363         }
3364     }
3365     return NULL;
3366 }
3367
3368 /**
3369  * Remove a picture from the short term reference list by its index in
3370  * that list.  This does no checking on the provided index; it is assumed
3371  * to be valid. Other list entries are shifted down.
3372  * @param i index into h->short_ref of picture to remove.
3373  */
3374 static void remove_short_at_index(H264Context *h, int i){
3375     assert(i > 0 && i < h->short_ref_count);
3376     h->short_ref[i]= NULL;
3377     if (--h->short_ref_count)
3378         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3379 }
3380
3381 /**
3382  *
3383  * @return the removed picture or NULL if an error occurs
3384  */
3385 static Picture * remove_short(H264Context *h, int frame_num){
3386     MpegEncContext * const s = &h->s;
3387     Picture *pic;
3388     int i;
3389
3390     if(s->avctx->debug&FF_DEBUG_MMCO)
3391         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3392
3393     pic = find_short(h, frame_num, &i);
3394     if (pic)
3395         remove_short_at_index(h, i);
3396
3397     return pic;
3398 }
3399
3400 /**
3401  * Remove a picture from the long term reference list by its index in
3402  * that list.  This does no checking on the provided index; it is assumed
3403  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3404  * @param i index into h->long_ref of picture to remove.
3405  */
3406 static void remove_long_at_index(H264Context *h, int i){
3407     h->long_ref[i]= NULL;
3408     h->long_ref_count--;
3409 }
3410
3411 /**
3412  *
3413  * @return the removed picture or NULL if an error occurs
3414  */
3415 static Picture * remove_long(H264Context *h, int i){
3416     Picture *pic;
3417
3418     pic= h->long_ref[i];
3419     if (pic)
3420         remove_long_at_index(h, i);
3421
3422     return pic;
3423 }
3424
3425 /**
3426  * print short term list
3427  */
3428 static void print_short_term(H264Context *h) {
3429     uint32_t i;
3430     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3431         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3432         for(i=0; i<h->short_ref_count; i++){
3433             Picture *pic= h->short_ref[i];
3434             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3435         }
3436     }
3437 }
3438
3439 /**
3440  * print long term list
3441  */
3442 static void print_long_term(H264Context *h) {
3443     uint32_t i;
3444     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3445         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3446         for(i = 0; i < 16; i++){
3447             Picture *pic= h->long_ref[i];
3448             if (pic) {
3449                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3450             }
3451         }
3452     }
3453 }
3454
3455 /**
3456  * Executes the reference picture marking (memory management control operations).
3457  */
3458 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3459     MpegEncContext * const s = &h->s;
3460     int i, j;
3461     int current_is_long=0;
3462     Picture *pic;
3463
3464     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3465         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3466
3467     for(i=0; i<mmco_count; i++){
3468         if(s->avctx->debug&FF_DEBUG_MMCO)
3469             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3470
3471         switch(mmco[i].opcode){
3472         case MMCO_SHORT2UNUSED:
3473             pic= remove_short(h, mmco[i].short_pic_num);
3474             if(pic)
3475                 unreference_pic(h, pic, 0);
3476             else if(s->avctx->debug&FF_DEBUG_MMCO)
3477                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3478             break;
3479         case MMCO_SHORT2LONG:
3480             pic= remove_long(h, mmco[i].long_arg);
3481             if(pic) unreference_pic(h, pic, 0);
3482
3483             h->long_ref[ mmco[i].long_arg ]= remove_short(h, mmco[i].short_pic_num);
3484             if (h->long_ref[ mmco[i].long_arg ]){
3485                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3486                 h->long_ref_count++;
3487             }
3488             break;
3489         case MMCO_LONG2UNUSED:
3490             pic= remove_long(h, mmco[i].long_arg);
3491             if(pic)
3492                 unreference_pic(h, pic, 0);
3493             else if(s->avctx->debug&FF_DEBUG_MMCO)
3494                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3495             break;
3496         case MMCO_LONG:
3497             pic= remove_long(h, mmco[i].long_arg);
3498             if(pic) unreference_pic(h, pic, 0);
3499
3500             h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3501             h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3502             h->long_ref_count++;
3503
3504             current_is_long=1;
3505             break;
3506         case MMCO_SET_MAX_LONG:
3507             assert(mmco[i].long_arg <= 16);
3508             // just remove the long term which index is greater than new max
3509             for(j = mmco[i].long_arg; j<16; j++){
3510                 pic = remove_long(h, j);
3511                 if (pic) unreference_pic(h, pic, 0);
3512             }
3513             break;
3514         case MMCO_RESET:
3515             while(h->short_ref_count){
3516                 pic= remove_short(h, h->short_ref[0]->frame_num);
3517                 if(pic) unreference_pic(h, pic, 0);
3518             }
3519             for(j = 0; j < 16; j++) {
3520                 pic= remove_long(h, j);
3521                 if(pic) unreference_pic(h, pic, 0);
3522             }
3523             break;
3524         default: assert(0);
3525         }
3526     }
3527
3528     if(!current_is_long){
3529         pic= remove_short(h, s->current_picture_ptr->frame_num);
3530         if(pic){
3531             unreference_pic(h, pic, 0);
3532             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3533         }
3534
3535         if(h->short_ref_count)
3536             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3537
3538         h->short_ref[0]= s->current_picture_ptr;
3539         h->short_ref[0]->long_ref=0;
3540         h->short_ref_count++;
3541     }
3542
3543     print_short_term(h);
3544     print_long_term(h);
3545     return 0;
3546 }
3547
3548 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3549     MpegEncContext * const s = &h->s;
3550     int i;
3551
3552     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3553         s->broken_link= get_bits1(gb) -1;
3554         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3555         if(h->mmco[0].long_arg == -1)
3556             h->mmco_index= 0;
3557         else{
3558             h->mmco[0].opcode= MMCO_LONG;
3559             h->mmco_index= 1;
3560         }
3561     }else{
3562         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3563             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3564                 MMCOOpcode opcode= get_ue_golomb(gb);
3565
3566                 h->mmco[i].opcode= opcode;
3567                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3568                     h->mmco[i].short_pic_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3569 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3570                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3571                         return -1;
3572                     }*/
3573                 }
3574                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3575                     unsigned int long_arg= get_ue_golomb(gb);
3576                     if(/*h->mmco[i].long_arg >= h->long_ref_count || h->long_ref[ h->mmco[i].long_arg ] == NULL*/ long_arg >= 16){
3577                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3578                         return -1;
3579                     }
3580                     h->mmco[i].long_arg= long_arg;
3581                 }
3582
3583                 if(opcode > (unsigned)MMCO_LONG){
3584                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3585                     return -1;
3586                 }
3587                 if(opcode == MMCO_END)
3588                     break;
3589             }
3590             h->mmco_index= i;
3591         }else{
3592             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3593
3594             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3595                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3596                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3597                 h->mmco_index= 1;
3598             }else
3599                 h->mmco_index= 0;
3600         }
3601     }
3602
3603     return 0;
3604 }
3605
3606 static int init_poc(H264Context *h){
3607     MpegEncContext * const s = &h->s;
3608     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3609     int field_poc[2];
3610
3611     if(h->nal_unit_type == NAL_IDR_SLICE){
3612         h->frame_num_offset= 0;
3613     }else{
3614         if(h->frame_num < h->prev_frame_num)
3615             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3616         else
3617             h->frame_num_offset= h->prev_frame_num_offset;
3618     }
3619
3620     if(h->sps.poc_type==0){
3621         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3622
3623         if(h->nal_unit_type == NAL_IDR_SLICE){
3624              h->prev_poc_msb=
3625              h->prev_poc_lsb= 0;
3626         }
3627
3628         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3629             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3630         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3631             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3632         else
3633             h->poc_msb = h->prev_poc_msb;
3634 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3635         field_poc[0] =
3636         field_poc[1] = h->poc_msb + h->poc_lsb;
3637         if(s->picture_structure == PICT_FRAME)
3638             field_poc[1] += h->delta_poc_bottom;
3639     }else if(h->sps.poc_type==1){
3640         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3641         int i;
3642
3643         if(h->sps.poc_cycle_length != 0)
3644             abs_frame_num = h->frame_num_offset + h->frame_num;
3645         else
3646             abs_frame_num = 0;
3647
3648         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3649             abs_frame_num--;
3650
3651         expected_delta_per_poc_cycle = 0;
3652         for(i=0; i < h->sps.poc_cycle_length; i++)
3653             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3654
3655         if(abs_frame_num > 0){
3656             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3657             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3658
3659             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3660             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3661                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3662         } else
3663             expectedpoc = 0;
3664
3665         if(h->nal_ref_idc == 0)
3666             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3667
3668         field_poc[0] = expectedpoc + h->delta_poc[0];
3669         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3670
3671         if(s->picture_structure == PICT_FRAME)
3672             field_poc[1] += h->delta_poc[1];
3673     }else{
3674         int poc;
3675         if(h->nal_unit_type == NAL_IDR_SLICE){
3676             poc= 0;
3677         }else{
3678             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3679             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3680         }
3681         field_poc[0]= poc;
3682         field_poc[1]= poc;
3683     }
3684
3685     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3686         s->current_picture_ptr->field_poc[0]= field_poc[0];
3687         s->current_picture_ptr->poc = field_poc[0];
3688     }
3689     if(s->picture_structure != PICT_TOP_FIELD) {
3690         s->current_picture_ptr->field_poc[1]= field_poc[1];
3691         s->current_picture_ptr->poc = field_poc[1];
3692     }
3693     if(!FIELD_PICTURE || !s->first_field)
3694         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3695
3696     return 0;
3697 }
3698
3699
3700 /**
3701  * initialize scan tables
3702  */
3703 static void init_scan_tables(H264Context *h){
3704     MpegEncContext * const s = &h->s;
3705     int i;
3706     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3707         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3708         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3709     }else{
3710         for(i=0; i<16; i++){
3711 #define T(x) (x>>2) | ((x<<2) & 0xF)
3712             h->zigzag_scan[i] = T(zigzag_scan[i]);
3713             h-> field_scan[i] = T( field_scan[i]);
3714 #undef T
3715         }
3716     }
3717     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3718         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3719         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3720         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3721         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3722     }else{
3723         for(i=0; i<64; i++){
3724 #define T(x) (x>>3) | ((x&7)<<3)
3725             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3726             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3727             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3728             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3729 #undef T
3730         }
3731     }
3732     if(h->sps.transform_bypass){ //FIXME same ugly
3733         h->zigzag_scan_q0          = zigzag_scan;
3734         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3735         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3736         h->field_scan_q0           = field_scan;
3737         h->field_scan8x8_q0        = field_scan8x8;
3738         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3739     }else{
3740         h->zigzag_scan_q0          = h->zigzag_scan;
3741         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3742         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3743         h->field_scan_q0           = h->field_scan;
3744         h->field_scan8x8_q0        = h->field_scan8x8;
3745         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3746     }
3747 }
3748
3749 /**
3750  * Replicates H264 "master" context to thread contexts.
3751  */
3752 static void clone_slice(H264Context *dst, H264Context *src)
3753 {
3754     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3755     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3756     dst->s.current_picture      = src->s.current_picture;
3757     dst->s.linesize             = src->s.linesize;
3758     dst->s.uvlinesize           = src->s.uvlinesize;
3759
3760     dst->prev_poc_msb           = src->prev_poc_msb;
3761     dst->prev_poc_lsb           = src->prev_poc_lsb;
3762     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3763     dst->prev_frame_num         = src->prev_frame_num;
3764     dst->short_ref_count        = src->short_ref_count;
3765
3766     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3767     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3768     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3769     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3770
3771     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3772     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3773 }
3774
3775 /**
3776  * decodes a slice header.
3777  * this will allso call MPV_common_init() and frame_start() as needed
3778  *
3779  * @param h h264context
3780  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3781  *
3782  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3783  */
3784 static int decode_slice_header(H264Context *h, H264Context *h0){
3785     MpegEncContext * const s = &h->s;
3786     unsigned int first_mb_in_slice;
3787     unsigned int pps_id;
3788     int num_ref_idx_active_override_flag;
3789     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3790     unsigned int slice_type, tmp, i;
3791     int default_ref_list_done = 0;
3792
3793     s->dropable= h->nal_ref_idc == 0;
3794
3795     first_mb_in_slice= get_ue_golomb(&s->gb);
3796
3797     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3798         h0->current_slice = 0;
3799         s->current_picture_ptr= NULL;
3800     }
3801
3802     slice_type= get_ue_golomb(&s->gb);
3803     if(slice_type > 9){
3804         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3805         return -1;
3806     }
3807     if(slice_type > 4){
3808         slice_type -= 5;
3809         h->slice_type_fixed=1;
3810     }else
3811         h->slice_type_fixed=0;
3812
3813     slice_type= slice_type_map[ slice_type ];
3814     if (slice_type == I_TYPE
3815         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3816         default_ref_list_done = 1;
3817     }
3818     h->slice_type= slice_type;
3819
3820     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3821
3822     pps_id= get_ue_golomb(&s->gb);
3823     if(pps_id>=MAX_PPS_COUNT){
3824         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3825         return -1;
3826     }
3827     if(!h0->pps_buffers[pps_id]) {
3828         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3829         return -1;
3830     }
3831     h->pps= *h0->pps_buffers[pps_id];
3832
3833     if(!h0->sps_buffers[h->pps.sps_id]) {
3834         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3835         return -1;
3836     }
3837     h->sps = *h0->sps_buffers[h->pps.sps_id];
3838
3839     if(h == h0 && h->dequant_coeff_pps != pps_id){
3840         h->dequant_coeff_pps = pps_id;
3841         init_dequant_tables(h);
3842     }
3843
3844     s->mb_width= h->sps.mb_width;
3845     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3846
3847     h->b_stride=  s->mb_width*4;
3848     h->b8_stride= s->mb_width*2;
3849
3850     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3851     if(h->sps.frame_mbs_only_flag)
3852         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3853     else
3854         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3855
3856     if (s->context_initialized
3857         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3858         if(h != h0)
3859             return -1;   // width / height changed during parallelized decoding
3860         free_tables(h);
3861         MPV_common_end(s);
3862     }
3863     if (!s->context_initialized) {
3864         if(h != h0)
3865             return -1;  // we cant (re-)initialize context during parallel decoding
3866         if (MPV_common_init(s) < 0)
3867             return -1;
3868
3869         init_scan_tables(h);
3870         alloc_tables(h);
3871
3872         for(i = 1; i < s->avctx->thread_count; i++) {
3873             H264Context *c;
3874             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3875             memcpy(c, h, sizeof(MpegEncContext));
3876             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3877             c->sps = h->sps;
3878             c->pps = h->pps;
3879             init_scan_tables(c);
3880             clone_tables(c, h);
3881         }
3882
3883         for(i = 0; i < s->avctx->thread_count; i++)
3884             if(context_init(h->thread_context[i]) < 0)
3885                 return -1;
3886
3887         s->avctx->width = s->width;
3888         s->avctx->height = s->height;
3889         s->avctx->sample_aspect_ratio= h->sps.sar;
3890         if(!s->avctx->sample_aspect_ratio.den)
3891             s->avctx->sample_aspect_ratio.den = 1;
3892
3893         if(h->sps.timing_info_present_flag){
3894             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3895             if(h->x264_build > 0 && h->x264_build < 44)
3896                 s->avctx->time_base.den *= 2;
3897             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3898                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3899         }
3900     }
3901
3902     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3903
3904     h->mb_mbaff = 0;
3905     h->mb_aff_frame = 0;
3906     if(h->sps.frame_mbs_only_flag){
3907         s->picture_structure= PICT_FRAME;
3908     }else{
3909         if(get_bits1(&s->gb)) { //field_pic_flag
3910             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3911             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
3912         } else {
3913             s->picture_structure= PICT_FRAME;
3914             h->mb_aff_frame = h->sps.mb_aff;
3915         }
3916     }
3917
3918     if(h0->current_slice == 0){
3919         if(frame_start(h) < 0)
3920             return -1;
3921     }
3922     if(h != h0)
3923         clone_slice(h, h0);
3924
3925     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3926
3927     assert(s->mb_num == s->mb_width * s->mb_height);
3928     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3929        first_mb_in_slice                    >= s->mb_num){
3930         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3931         return -1;
3932     }
3933     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3934     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3935     if (s->picture_structure == PICT_BOTTOM_FIELD)
3936         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3937     assert(s->mb_y < s->mb_height);
3938
3939     if(s->picture_structure==PICT_FRAME){
3940         h->curr_pic_num=   h->frame_num;
3941         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3942     }else{
3943         h->curr_pic_num= 2*h->frame_num + 1;
3944         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3945     }
3946
3947     if(h->nal_unit_type == NAL_IDR_SLICE){
3948         get_ue_golomb(&s->gb); /* idr_pic_id */
3949     }
3950
3951     if(h->sps.poc_type==0){
3952         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3953
3954         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3955             h->delta_poc_bottom= get_se_golomb(&s->gb);
3956         }
3957     }
3958
3959     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3960         h->delta_poc[0]= get_se_golomb(&s->gb);
3961
3962         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3963             h->delta_poc[1]= get_se_golomb(&s->gb);
3964     }
3965
3966     init_poc(h);
3967
3968     if(h->pps.redundant_pic_cnt_present){
3969         h->redundant_pic_count= get_ue_golomb(&s->gb);
3970     }
3971
3972     //set defaults, might be overriden a few line later
3973     h->ref_count[0]= h->pps.ref_count[0];
3974     h->ref_count[1]= h->pps.ref_count[1];
3975
3976     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3977         if(h->slice_type == B_TYPE){
3978             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3979             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
3980                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
3981         }
3982         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3983
3984         if(num_ref_idx_active_override_flag){
3985             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3986             if(h->slice_type==B_TYPE)
3987                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3988
3989             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3990                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3991                 h->ref_count[0]= h->ref_count[1]= 1;
3992                 return -1;
3993             }
3994         }
3995         if(h->slice_type == B_TYPE)
3996             h->list_count= 2;
3997         else
3998             h->list_count= 1;
3999     }else
4000         h->list_count= 0;
4001
4002     if(!default_ref_list_done){
4003         fill_default_ref_list(h);
4004     }
4005
4006     if(decode_ref_pic_list_reordering(h) < 0)
4007         return -1;
4008
4009     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4010        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4011         pred_weight_table(h);
4012     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4013         implicit_weight_table(h);
4014     else
4015         h->use_weight = 0;
4016
4017     if(h->nal_ref_idc)
4018         decode_ref_pic_marking(h0, &s->gb);
4019
4020     if(FRAME_MBAFF)
4021         fill_mbaff_ref_list(h);
4022
4023     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4024         tmp = get_ue_golomb(&s->gb);
4025         if(tmp > 2){
4026             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4027             return -1;
4028         }
4029         h->cabac_init_idc= tmp;
4030     }
4031
4032     h->last_qscale_diff = 0;
4033     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4034     if(tmp>51){
4035         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4036         return -1;
4037     }
4038     s->qscale= tmp;
4039     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4040     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4041     //FIXME qscale / qp ... stuff
4042     if(h->slice_type == SP_TYPE){
4043         get_bits1(&s->gb); /* sp_for_switch_flag */
4044     }
4045     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4046         get_se_golomb(&s->gb); /* slice_qs_delta */
4047     }
4048
4049     h->deblocking_filter = 1;
4050     h->slice_alpha_c0_offset = 0;
4051     h->slice_beta_offset = 0;
4052     if( h->pps.deblocking_filter_parameters_present ) {
4053         tmp= get_ue_golomb(&s->gb);
4054         if(tmp > 2){
4055             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4056             return -1;
4057         }
4058         h->deblocking_filter= tmp;
4059         if(h->deblocking_filter < 2)
4060             h->deblocking_filter^= 1; // 1<->0
4061
4062         if( h->deblocking_filter ) {
4063             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4064             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4065         }
4066     }
4067
4068     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4069        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4070        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4071        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4072         h->deblocking_filter= 0;
4073
4074     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4075         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4076             /* Cheat slightly for speed:
4077                Dont bother to deblock across slices */
4078             h->deblocking_filter = 2;
4079         } else {
4080             h0->max_contexts = 1;
4081             if(!h0->single_decode_warning) {
4082                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4083                 h0->single_decode_warning = 1;
4084             }
4085             if(h != h0)
4086                 return 1; // deblocking switched inside frame
4087         }
4088     }
4089
4090 #if 0 //FMO
4091     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4092         slice_group_change_cycle= get_bits(&s->gb, ?);
4093 #endif
4094
4095     h0->last_slice_type = slice_type;
4096     h->slice_num = ++h0->current_slice;
4097
4098     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4099     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4100
4101     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4102         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4103                h->slice_num,
4104                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4105                first_mb_in_slice,
4106                av_get_pict_type_char(h->slice_type),
4107                pps_id, h->frame_num,
4108                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4109                h->ref_count[0], h->ref_count[1],
4110                s->qscale,
4111                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4112                h->use_weight,
4113                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4114                );
4115     }
4116
4117     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
4118         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4119         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4120     }else{
4121         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4122         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4123     }
4124
4125     return 0;
4126 }
4127
4128 /**
4129  *
4130  */
4131 static inline int get_level_prefix(GetBitContext *gb){
4132     unsigned int buf;
4133     int log;
4134
4135     OPEN_READER(re, gb);
4136     UPDATE_CACHE(re, gb);
4137     buf=GET_CACHE(re, gb);
4138
4139     log= 32 - av_log2(buf);
4140 #ifdef TRACE
4141     print_bin(buf>>(32-log), log);
4142     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4143 #endif
4144
4145     LAST_SKIP_BITS(re, gb, log);
4146     CLOSE_READER(re, gb);
4147
4148     return log-1;
4149 }
4150
4151 static inline int get_dct8x8_allowed(H264Context *h){
4152     int i;
4153     for(i=0; i<4; i++){
4154         if(!IS_SUB_8X8(h->sub_mb_type[i])
4155            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4156             return 0;
4157     }
4158     return 1;
4159 }
4160
4161 /**
4162  * decodes a residual block.
4163  * @param n block index
4164  * @param scantable scantable
4165  * @param max_coeff number of coefficients in the block
4166  * @return <0 if an error occured
4167  */
4168 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4169     MpegEncContext * const s = &h->s;
4170     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4171     int level[16];
4172     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4173
4174     //FIXME put trailing_onex into the context
4175
4176     if(n == CHROMA_DC_BLOCK_INDEX){
4177         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4178         total_coeff= coeff_token>>2;
4179     }else{
4180         if(n == LUMA_DC_BLOCK_INDEX){
4181             total_coeff= pred_non_zero_count(h, 0);
4182             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4183             total_coeff= coeff_token>>2;
4184         }else{
4185             total_coeff= pred_non_zero_count(h, n);
4186             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4187             total_coeff= coeff_token>>2;
4188             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4189         }
4190     }
4191
4192     //FIXME set last_non_zero?
4193
4194     if(total_coeff==0)
4195         return 0;
4196     if(total_coeff > (unsigned)max_coeff) {
4197         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4198         return -1;
4199     }
4200
4201     trailing_ones= coeff_token&3;
4202     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4203     assert(total_coeff<=16);
4204
4205     for(i=0; i<trailing_ones; i++){
4206         level[i]= 1 - 2*get_bits1(gb);
4207     }
4208
4209     if(i<total_coeff) {
4210         int level_code, mask;
4211         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4212         int prefix= get_level_prefix(gb);
4213
4214         //first coefficient has suffix_length equal to 0 or 1
4215         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4216             if(suffix_length)
4217                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4218             else
4219                 level_code= (prefix<<suffix_length); //part
4220         }else if(prefix==14){
4221             if(suffix_length)
4222                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4223             else
4224                 level_code= prefix + get_bits(gb, 4); //part
4225         }else if(prefix==15){
4226             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4227             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4228         }else{
4229             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4230             return -1;
4231         }
4232
4233         if(trailing_ones < 3) level_code += 2;
4234
4235         suffix_length = 1;
4236         if(level_code > 5)
4237             suffix_length++;
4238         mask= -(level_code&1);
4239         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4240         i++;
4241
4242         //remaining coefficients have suffix_length > 0
4243         for(;i<total_coeff;i++) {
4244             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4245             prefix = get_level_prefix(gb);
4246             if(prefix<15){
4247                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4248             }else if(prefix==15){
4249                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4250             }else{
4251                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4252                 return -1;
4253             }
4254             mask= -(level_code&1);
4255             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4256             if(level_code > suffix_limit[suffix_length])
4257                 suffix_length++;
4258         }
4259     }
4260
4261     if(total_coeff == max_coeff)
4262         zeros_left=0;
4263     else{
4264         if(n == CHROMA_DC_BLOCK_INDEX)
4265             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4266         else
4267             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4268     }
4269
4270     coeff_num = zeros_left + total_coeff - 1;
4271     j = scantable[coeff_num];
4272     if(n > 24){
4273         block[j] = level[0];
4274         for(i=1;i<total_coeff;i++) {
4275             if(zeros_left <= 0)
4276                 run_before = 0;
4277             else if(zeros_left < 7){
4278                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4279             }else{
4280                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4281             }
4282             zeros_left -= run_before;
4283             coeff_num -= 1 + run_before;
4284             j= scantable[ coeff_num ];
4285
4286             block[j]= level[i];
4287         }
4288     }else{
4289         block[j] = (level[0] * qmul[j] + 32)>>6;
4290         for(i=1;i<total_coeff;i++) {
4291             if(zeros_left <= 0)
4292                 run_before = 0;
4293             else if(zeros_left < 7){
4294                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4295             }else{
4296                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4297             }
4298             zeros_left -= run_before;
4299             coeff_num -= 1 + run_before;
4300             j= scantable[ coeff_num ];
4301
4302             block[j]= (level[i] * qmul[j] + 32)>>6;
4303         }
4304     }
4305
4306     if(zeros_left<0){
4307         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4308         return -1;
4309     }
4310
4311     return 0;
4312 }
4313
4314 static void predict_field_decoding_flag(H264Context *h){
4315     /* take the field flag from the left neighbour if it is in this slice, else from the one above, else 0 */
4316     MpegEncContext * const s = &h->s;
4317     const int left_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
4318     const int top_xy  = left_xy + 1 - s->mb_stride;
4319     int neighbour_type = 0;
4320     if     (h->slice_table[left_xy] == h->slice_num) neighbour_type = s->current_picture.mb_type[left_xy];
4321     else if(h->slice_table[top_xy ] == h->slice_num) neighbour_type = s->current_picture.mb_type[top_xy ];
4322     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(neighbour_type) ? 1 : 0;
4323 }
4324
4325 /**
4326  * Handles a skipped macroblock (P_SKIP or B_SKIP): clears its coefficient
4327  * counts, predicts its motion and records its type, qscale and slice.
4328  */
4329 static void decode_mb_skip(H264Context *h){
4330     MpegEncContext * const s = &h->s;
4331     const int xy = s->mb_x + s->mb_y*s->mb_stride;
4332     int mb_type;
4333
4334     /* zero the stored and the cached non-zero coefficient counts (must precede fill_caches) */
4335     memset(h->non_zero_count[xy], 0, 16);
4336     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4337
4338     mb_type = MB_FIELD ? MB_TYPE_INTERLACED : 0;
4339
4340     if( h->slice_type != B_TYPE ){
4341         int pred_mx, pred_my;
4342
4343         mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4344         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4345         pred_pskip_motion(h, &pred_mx, &pred_my);
4346         /* reference 0 and the predicted vector for the whole macroblock */
4347         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4348         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(pred_mx,pred_my), 4);
4349     }else{
4350         // just for fill_caches. pred_direct_motion will set the real mb_type
4351         mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4352         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4353         pred_direct_motion(h, &mb_type);
4354         mb_type |= MB_TYPE_SKIP;
4355     }
4356
4357     write_back_motion(h, mb_type);
4358
4359     /* record the result in the per macroblock tables */
4360     h->slice_table[ xy ]                = h->slice_num;
4361     s->current_picture.mb_type[xy]      = mb_type;
4362     s->current_picture.qscale_table[xy] = s->qscale;
4363     h->prev_mb_skipped                  = 1;
4364 }
4365
4366 /**
4367  * Decodes one macroblock of a CAVLC coded slice: the skip run, mb_type, the prediction data (intra modes, or reference indices and motion vectors), the coded block pattern, dquant and the residual coefficients.
4368  * @returns 0 if ok, -1 if an error is noticed
4369  */
4370 static int decode_mb_cavlc(H264Context *h){
4371     MpegEncContext * const s = &h->s;
4372     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4373     int partition_count;
4374     unsigned int mb_type, cbp;
4375     int dct8x8_allowed= h->pps.transform_8x8_mode;
4376
4377     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4378
4379     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4380     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4381                 down the code */
4382     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4383         if(s->mb_skip_run==-1) // -1: no skip run has been read yet
4384             s->mb_skip_run= get_ue_golomb(&s->gb);
4385
4386         if (s->mb_skip_run--) { // nonzero run: this macroblock is skipped (the run is decremented either way)
4387             if(FRAME_MBAFF && (s->mb_y&1) == 0){ // macroblocks in even rows only
4388                 if(s->mb_skip_run==0) // last skip of the run: the flag is read from the bitstream
4389                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4390                 else
4391                     predict_field_decoding_flag(h);
4392             }
4393             decode_mb_skip(h);
4394             return 0;
4395         }
4396     }
4397     if(FRAME_MBAFF){
4398         if( (s->mb_y&1) == 0 )
4399             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4400     }else
4401         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4402
4403     h->prev_mb_skipped= 0;
4404     /* mb_type: codes beyond the inter table of a B (23 entries) or P (5 entries) slice are intra types */
4405     mb_type= get_ue_golomb(&s->gb);
4406     if(h->slice_type == B_TYPE){
4407         if(mb_type < 23){
4408             partition_count= b_mb_type_info[mb_type].partition_count;
4409             mb_type=         b_mb_type_info[mb_type].type;
4410         }else{
4411             mb_type -= 23;
4412             goto decode_intra_mb;
4413         }
4414     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4415         if(mb_type < 5){
4416             partition_count= p_mb_type_info[mb_type].partition_count;
4417             mb_type=         p_mb_type_info[mb_type].type;
4418         }else{
4419             mb_type -= 5;
4420             goto decode_intra_mb;
4421         }
4422     }else{
4423        assert(h->slice_type == I_TYPE);
4424 decode_intra_mb:
4425         if(mb_type > 25){ // the intra table below is indexed with 0..25
4426             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4427             return -1;
4428         }
4429         partition_count=0;
4430         cbp= i_mb_type_info[mb_type].cbp;
4431         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4432         mb_type= i_mb_type_info[mb_type].type;
4433     }
4434
4435     if(MB_FIELD)
4436         mb_type |= MB_TYPE_INTERLACED;
4437
4438     h->slice_table[ mb_xy ]= h->slice_num;
4439
4440     if(IS_INTRA_PCM(mb_type)){
4441         unsigned int x, y;
4442
4443         // We assume these blocks are very rare so we do not optimize it.
4444         align_get_bits(&s->gb);
4445
4446         // The pixels are stored in the same order as levels in h->mb array.
4447         for(y=0; y<16; y++){
4448             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4449             for(x=0; x<16; x++){
4450                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4451                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4452             }
4453         }
4454         for(y=0; y<8; y++){
4455             const int index= 256 + 4*(y&3) + 32*(y>>2);
4456             for(x=0; x<8; x++){
4457                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4458                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4459             }
4460         }
4461         for(y=0; y<8; y++){
4462             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4463             for(x=0; x<8; x++){
4464                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4465                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4466             }
4467         }
4468
4469         // In deblocking, the quantizer is 0
4470         s->current_picture.qscale_table[mb_xy]= 0;
4471         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4472         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4473         // All coeffs are present
4474         memset(h->non_zero_count[mb_xy], 16, 16);
4475
4476         s->current_picture.mb_type[mb_xy]= mb_type;
4477         return 0;
4478     }
4479     /* with MB_MBAFF set the reference counts are doubled until the end of this function; NOTE(review): the "return -1" paths below leave them doubled */
4480     if(MB_MBAFF){
4481         h->ref_count[0] <<= 1;
4482         h->ref_count[1] <<= 1;
4483     }
4484
4485     fill_caches(h, mb_type, 0);
4486
4487     //mb_pred
4488     if(IS_INTRA(mb_type)){
4489             int pred_mode;
4490 //            init_top_left_availability(h);
4491             if(IS_INTRA4x4(mb_type)){
4492                 int i;
4493                 int di = 1;
4494                 if(dct8x8_allowed && get_bits1(&s->gb)){ // 8x8 transform: one prediction mode per 8x8 block
4495                     mb_type |= MB_TYPE_8x8DCT;
4496                     di = 4;
4497                 }
4498
4499 //                fill_intra4x4_pred_table(h);
4500                 for(i=0; i<16; i+=di){
4501                     int mode= pred_intra_mode(h, i);
4502
4503                     if(!get_bits1(&s->gb)){ // bit clear: a 3 bit remainder replaces the predicted mode
4504                         const int rem_mode= get_bits(&s->gb, 3);
4505                         mode = rem_mode + (rem_mode >= mode); // remainders at or above the prediction skip over it
4506                     }
4507
4508                     if(di==4)
4509                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4510                     else
4511                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4512                 }
4513                 write_back_intra_pred_mode(h);
4514                 if( check_intra4x4_pred_mode(h) < 0)
4515                     return -1;
4516             }else{
4517                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4518                 if(h->intra16x16_pred_mode < 0)
4519                     return -1;
4520             }
4521             /* chroma prediction mode, validated with the same check as the 16x16 luma mode */
4522             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4523             if(pred_mode < 0)
4524                 return -1;
4525             h->chroma_pred_mode= pred_mode;
4526     }else if(partition_count==4){ // four sub macroblocks, each with its own sub_mb_type
4527         int i, j, sub_partition_count[4], list, ref[2][4];
4528
4529         if(h->slice_type == B_TYPE){
4530             for(i=0; i<4; i++){
4531                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4532                 if(h->sub_mb_type[i] >=13){
4533                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4534                     return -1;
4535                 }
4536                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4537                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4538             }
4539             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4540                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4541                 pred_direct_motion(h, &mb_type);
4542                 h->ref_cache[0][scan8[4]] =
4543                 h->ref_cache[1][scan8[4]] =
4544                 h->ref_cache[0][scan8[12]] =
4545                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4546             }
4547         }else{
4548             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4549             for(i=0; i<4; i++){
4550                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4551                 if(h->sub_mb_type[i] >=4){
4552                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4553                     return -1;
4554                 }
4555                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4556                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4557             }
4558         }
4559         /* first pass: the reference indices of all sub macroblocks, list by list (-1 where the list is not used) */
4560         for(list=0; list<h->list_count; list++){
4561             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4562             for(i=0; i<4; i++){
4563                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4564                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4565                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4566                     if(tmp>=ref_count){
4567                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4568                         return -1;
4569                     }
4570                     ref[list][i]= tmp;
4571                 }else{
4572                  //FIXME
4573                     ref[list][i] = -1;
4574                 }
4575             }
4576         }
4577
4578         if(dct8x8_allowed)
4579             dct8x8_allowed = get_dct8x8_allowed(h);
4580         /* second pass: store the references in the cache and decode one motion vector (prediction + coded difference) per sub partition */
4581         for(list=0; list<h->list_count; list++){
4582             for(i=0; i<4; i++){
4583                 if(IS_DIRECT(h->sub_mb_type[i])) {
4584                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4585                     continue;
4586                 }
4587                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4588                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4589
4590                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4591                     const int sub_mb_type= h->sub_mb_type[i];
4592                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4593                     for(j=0; j<sub_partition_count[i]; j++){
4594                         int mx, my;
4595                         const int index= 4*i + block_width*j;
4596                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4597                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4598                         mx += get_se_golomb(&s->gb);
4599                         my += get_se_golomb(&s->gb);
4600                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4601
4602                         if(IS_SUB_8X8(sub_mb_type)){
4603                             mv_cache[ 1 ][0]=
4604                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4605                             mv_cache[ 1 ][1]=
4606                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4607                         }else if(IS_SUB_8X4(sub_mb_type)){
4608                             mv_cache[ 1 ][0]= mx;
4609                             mv_cache[ 1 ][1]= my;
4610                         }else if(IS_SUB_4X8(sub_mb_type)){
4611                             mv_cache[ 8 ][0]= mx;
4612                             mv_cache[ 8 ][1]= my;
4613                         }
4614                         mv_cache[ 0 ][0]= mx;
4615                         mv_cache[ 0 ][1]= my;
4616                     }
4617                 }else{
4618                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4619                     p[0] = p[1]=
4620                     p[8] = p[9]= 0;
4621                 }
4622             }
4623         }
4624     }else if(IS_DIRECT(mb_type)){
4625         pred_direct_motion(h, &mb_type);
4626         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4627     }else{
4628         int list, mx, my, i;
4629          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4630         if(IS_16X16(mb_type)){
4631             for(list=0; list<h->list_count; list++){
4632                     unsigned int val;
4633                     if(IS_DIR(mb_type, 0, list)){
4634                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4635                         if(val >= h->ref_count[list]){
4636                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4637                             return -1;
4638                         }
4639                     }else
4640                         val= LIST_NOT_USED&0xFF;
4641                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4642             }
4643             for(list=0; list<h->list_count; list++){
4644                 unsigned int val;
4645                 if(IS_DIR(mb_type, 0, list)){
4646                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4647                     mx += get_se_golomb(&s->gb);
4648                     my += get_se_golomb(&s->gb);
4649                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4650
4651                     val= pack16to32(mx,my);
4652                 }else
4653                     val=0;
4654                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4655             }
4656         }
4657         else if(IS_16X8(mb_type)){
4658             for(list=0; list<h->list_count; list++){
4659                     for(i=0; i<2; i++){
4660                         unsigned int val;
4661                         if(IS_DIR(mb_type, i, list)){
4662                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4663                             if(val >= h->ref_count[list]){
4664                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4665                                 return -1;
4666                             }
4667                         }else
4668                             val= LIST_NOT_USED&0xFF;
4669                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4670                     }
4671             }
4672             for(list=0; list<h->list_count; list++){
4673                 for(i=0; i<2; i++){
4674                     unsigned int val;
4675                     if(IS_DIR(mb_type, i, list)){
4676                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4677                         mx += get_se_golomb(&s->gb);
4678                         my += get_se_golomb(&s->gb);
4679                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4680
4681                         val= pack16to32(mx,my);
4682                     }else
4683                         val=0;
4684                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4685                 }
4686             }
4687         }else{
4688             assert(IS_8X16(mb_type));
4689             for(list=0; list<h->list_count; list++){
4690                     for(i=0; i<2; i++){
4691                         unsigned int val;
4692                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4693                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4694                             if(val >= h->ref_count[list]){
4695                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4696                                 return -1;
4697                             }
4698                         }else
4699                             val= LIST_NOT_USED&0xFF;
4700                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4701                     }
4702             }
4703             for(list=0; list<h->list_count; list++){
4704                 for(i=0; i<2; i++){
4705                     unsigned int val;
4706                     if(IS_DIR(mb_type, i, list)){
4707                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4708                         mx += get_se_golomb(&s->gb);
4709                         my += get_se_golomb(&s->gb);
4710                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4711
4712                         val= pack16to32(mx,my);
4713                     }else
4714                         val=0;
4715                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4716                 }
4717             }
4718         }
4719     }
4720
4721     if(IS_INTER(mb_type))
4722         write_back_motion(h, mb_type);
4723     /* coded block pattern: read here for every type except intra 16x16, whose cbp was taken from i_mb_type_info above */
4724     if(!IS_INTRA16x16(mb_type)){
4725         cbp= get_ue_golomb(&s->gb);
4726         if(cbp > 47){
4727             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4728             return -1;
4729         }
4730
4731         if(IS_INTRA4x4(mb_type))
4732             cbp= golomb_to_intra4x4_cbp[cbp];
4733         else
4734             cbp= golomb_to_inter_cbp[cbp];
4735     }
4736     h->cbp = cbp;
4737
4738     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4739         if(get_bits1(&s->gb))
4740             mb_type |= MB_TYPE_8x8DCT;
4741     }
4742     s->current_picture.mb_type[mb_xy]= mb_type;
4743
4744     if(cbp || IS_INTRA16x16(mb_type)){ // dquant and residual data follow
4745         int i8x8, i4x4, chroma_idx;
4746         int dquant;
4747         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4748         const uint8_t *scan, *scan8x8, *dc_scan;
4749
4750 //        fill_non_zero_count_cache(h);
4751
4752         if(IS_INTERLACED(mb_type)){
4753             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4754             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4755             dc_scan= luma_dc_field_scan;
4756         }else{
4757             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4758             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4759             dc_scan= luma_dc_zigzag_scan;
4760         }
4761
4762         dquant= get_se_golomb(&s->gb);
4763
4764         if( dquant > 25 || dquant < -26 ){
4765             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4766             return -1;
4767         }
4768
4769         s->qscale += dquant;
4770         if(((unsigned)s->qscale) > 51){ // wrap qscale around into 0..51
4771             if(s->qscale<0) s->qscale+= 52;
4772             else            s->qscale-= 52;
4773         }
4774
4775         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4776         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4777         if(IS_INTRA16x16(mb_type)){
4778             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4779                 return -1; //FIXME continue if partitioned and other return -1 too
4780             }
4781
4782             assert((cbp&15) == 0 || (cbp&15) == 15);
4783
4784             if(cbp&15){
4785                 for(i8x8=0; i8x8<4; i8x8++){
4786                     for(i4x4=0; i4x4<4; i4x4++){
4787                         const int index= i4x4 + 4*i8x8;
4788                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4789                             return -1;
4790                         }
4791                     }
4792                 }
4793             }else{
4794                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4795             }
4796         }else{
4797             for(i8x8=0; i8x8<4; i8x8++){
4798                 if(cbp & (1<<i8x8)){
4799                     if(IS_8x8DCT(mb_type)){
4800                         DCTELEM *buf = &h->mb[64*i8x8];
4801                         uint8_t *nnz;
4802                         for(i4x4=0; i4x4<4; i4x4++){
4803                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4804                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4805                                 return -1;
4806                         }
4807                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4808                         nnz[0] += nnz[1] + nnz[8] + nnz[9]; // the first entry accumulates the counts of all four 4x4 blocks
4809                     }else{
4810                         for(i4x4=0; i4x4<4; i4x4++){
4811                             const int index= i4x4 + 4*i8x8;
4812
4813                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4814                                 return -1;
4815                             }
4816                         }
4817                     }
4818                 }else{
4819                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4820                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4821                 }
4822             }
4823         }
4824         /* chroma: DC of both planes when cbp bit 4 or 5 is set, AC only when bit 5 is set */
4825         if(cbp&0x30){
4826             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4827                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4828                     return -1;
4829                 }
4830         }
4831
4832         if(cbp&0x20){
4833             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4834                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4835                 for(i4x4=0; i4x4<4; i4x4++){
4836                     const int index= 16 + 4*chroma_idx + i4x4;
4837                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4838                         return -1;
4839                     }
4840                 }
4841             }
4842         }else{
4843             uint8_t * const nnz= &h->non_zero_count_cache[0];
4844             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4845             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4846         }
4847     }else{
4848         uint8_t * const nnz= &h->non_zero_count_cache[0];
4849         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4850         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4851         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4852     }
4853     s->current_picture.qscale_table[mb_xy]= s->qscale;
4854     write_back_non_zero_count(h);
4855
4856     if(MB_MBAFF){ // undo the doubling applied before fill_caches
4857         h->ref_count[0] >>= 1;
4858         h->ref_count[1] >>= 1;
4859     }
4860
4861     return 0;
4862 }
4863
4864 /**
4865  * Decodes the field decoding flag with CABAC. The context index is the
4866  * number of neighbours (left of, and two rows above, the even row of the
4867  * current macroblock) that are in this slice and have an interlaced mb_type.
4868  */
4869 static int decode_cabac_field_decoding_flag(H264Context *h) {
4870     MpegEncContext * const s = &h->s;
4871     const int even_y   = s->mb_y & ~1;
4872     const int left_xy  = s->mb_x - 1 +  even_y   *s->mb_stride;
4873     const int above_xy = s->mb_x     + (even_y-2)*s->mb_stride;
4874     const int left_is_field  = h->slice_table[left_xy ] == h->slice_num
4875                             && IS_INTERLACED( s->current_picture.mb_type[left_xy ] );
4876     const int above_is_field = h->slice_table[above_xy] == h->slice_num
4877                             && IS_INTERLACED( s->current_picture.mb_type[above_xy] );
4878     const unsigned int ctx = left_is_field + above_is_field;
4879
4880     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4881 }
4882
4883 /**
4884  * Decodes an intra mb_type with CABAC.
4885  * @param ctx_base    index in h->cabac_state of the first context to use
4886  * @param intra_slice if nonzero the first bin's context depends on the neighbours
4887  * @returns 0 for I4x4, 25 for PCM, otherwise an I16x16 type number (>= 1)
4888  */
4889 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4890     uint8_t *state= &h->cabac_state[ctx_base];
4891     int first_ctx = 0;
4892     int mb_type   = 1; /* I16x16 */
4893
4894     if(intra_slice){
4895         MpegEncContext * const s = &h->s;
4896         const int left_xy = h->left_mb_xy[0];
4897         const int top_xy  = h->top_mb_xy;
4898         /* one step per neighbour that is in this slice and not I4x4 */
4899         first_ctx += (h->slice_table[left_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[left_xy] ));
4900         first_ctx += (h->slice_table[top_xy ] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[top_xy ] ));
4901     }
4902     if( !get_cabac_noinline( &h->cabac, &state[first_ctx] ) )
4903         return 0;   /* I4x4 */
4904     if(intra_slice) state += 2; /* the first bin had three contexts to choose from instead of one */
4905     if( get_cabac_terminate( &h->cabac ) )
4906         return 25;  /* PCM */
4907
4908     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4909     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4910         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4911     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4912     mb_type +=     get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4913     return mb_type;
4914 }
4915
4916 /**
4917  * Decodes the mb_type syntax element with CABAC.
4918  * @returns the mb_type number for the current slice type (intra types are
4919  *          offset by 5 in P slices and by 23 in B slices), or -1 if the
4920  *          slice is neither I, P nor B
4921  */
4922 static int decode_cabac_mb_type( H264Context *h ) {
4923     MpegEncContext * const s = &h->s;
4924     uint8_t * const b_state = &h->cabac_state[27]; /* contexts used in the B slice branch */
4925     int left_xy, top_xy, ctx, bits, i;
4926
4927     if( h->slice_type == I_TYPE )
4928         return decode_cabac_intra_mb_type(h, 3, 1);
4929
4930     if( h->slice_type == P_TYPE ) {
4931         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) )
4932             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4933         if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) )
4934             return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] ); /* P_L0_D8x16, P_L0_D16x8 */
4935         return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );     /* P_L0_D16x16, P_8x8 */
4936     }
4937
4938     if( h->slice_type != B_TYPE )
4939         return -1; /* TODO SI/SP frames? */
4940
4941     /* the first bin's context counts the neighbours in this slice that are not direct */
4942     left_xy = h->left_mb_xy[0];
4943     top_xy  = h->top_mb_xy;
4944     ctx  = (h->slice_table[left_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[left_xy] ));
4945     ctx += (h->slice_table[top_xy ] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[top_xy ] ));
4946
4947     if( !get_cabac_noinline( &h->cabac, &b_state[ctx] ) )
4948         return 0; /* B_Direct_16x16 */
4949     if( !get_cabac_noinline( &h->cabac, &b_state[3] ) )
4950         return 1 + get_cabac_noinline( &h->cabac, &b_state[5] ); /* B_L[01]_16x16 */
4951
4952     /* four more bins, most significant first */
4953     bits = get_cabac_noinline( &h->cabac, &b_state[4] );
4954     for( i = 0; i < 3; i++ )
4955         bits = (bits << 1) | get_cabac_noinline( &h->cabac, &b_state[5] );
4956
4957     if( bits < 8 )
4958         return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4959     switch( bits ) {
4960     case 13:
4961         return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4962     case 14:
4963         return 11; /* B_L1_L0_8x16 */
4964     case 15:
4965         return 22; /* B_8x8 */
4966     }
4967
4968     /* every other value is extended by one more bin */
4969     bits = ( bits<<1 ) | get_cabac_noinline( &h->cabac, &b_state[5] );
4970     return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4971 }
4972
4973 /** Decodes the CABAC coded skip flag of the macroblock at (mb_x, mb_y). */
4974 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4975     MpegEncContext * const s = &h->s;
4976     const int stride = s->mb_stride;
4977     int left_xy, top_xy, ctx;
4978
4979     if(!FRAME_MBAFF){
4980         const int cur_xy = mb_x + mb_y*stride;
4981         left_xy = cur_xy - 1;
4982         top_xy  = cur_xy - (stride << FIELD_PICTURE);
4983     }else{ //FIXME merge with the stuff in fill_caches?
4984         const int pair_xy   = mb_x + (mb_y&~1)*stride; /* mb_y rounded down to an even row */
4985         const int is_bottom = mb_y&1;
4986         left_xy = pair_xy - 1;
4987         if( is_bottom
4988             && h->slice_table[left_xy] == h->slice_num
4989             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[left_xy] ) )
4990             left_xy += stride;
4991
4992         if( !MB_FIELD )
4993             top_xy = mb_x + (mb_y-1)*stride;
4994         else{
4995             top_xy = pair_xy - stride;
4996             if( !is_bottom
4997                 && h->slice_table[top_xy] == h->slice_num
4998                 && IS_INTERLACED( s->current_picture.mb_type[top_xy] ) )
4999                 top_xy -= stride;
5000         }
5001     }
5002
5003     /* one context step per neighbour that is in this slice and not skipped; B slices use contexts 13 higher */
5004     ctx  = (h->slice_table[left_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[left_xy] ));
5005     ctx += (h->slice_table[top_xy ] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[top_xy ] ));
5006     return get_cabac_noinline( &h->cabac, &h->cabac_state[11 + ctx + (h->slice_type == B_TYPE ? 13 : 0)] );
5007 }
5008
5009 /**
5010  * Decodes an intra 4x4 prediction mode with CABAC.
5011  * @param pred_mode the predicted mode
5012  * @returns pred_mode if the first bin is set, otherwise the 3 bit remainder
5013  *          (least significant bit first), incremented if it is >= pred_mode
5014  */
5015 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5016     int rem_mode = 0, bit;
5017
5018     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5019         return pred_mode;
5020     for( bit = 0; bit < 3; bit++ )
5021         rem_mode += get_cabac( &h->cabac, &h->cabac_state[69] ) << bit;
5022     return rem_mode + (rem_mode >= pred_mode);
5023 }
5024
5025 /**
5026  * Decodes the chroma prediction mode with CABAC.
5027  * @returns the mode, 0..3
5028  */
5029 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5030     const int left_xy = h->left_mb_xy[0];
5031     const int top_xy  = h->top_mb_xy;
5032     int mode, ctx;
5033
5034     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5035     ctx  = (h->slice_table[left_xy] == h->slice_num && h->chroma_pred_mode_table[left_xy] != 0);
5036     ctx += (h->slice_table[top_xy ] == h->slice_num && h->chroma_pred_mode_table[top_xy ] != 0);
5037
5038     /* the first bin uses the neighbour dependent context */
5039     if( !get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) )
5040         return 0;
5041
5042     /* later bins share context 64+3; stop at the first zero bin or at mode 3 */
5043     for( mode = 1; mode < 3; mode++ )
5044         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) )
5045             break;
5046     return mode;
5047 }
5048
5049 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5050     int cbp_b, cbp_a, ctx, cbp = 0;
5051
5052     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5053     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5054
5055     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5056     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5057     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5058     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5059     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5060     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5061     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5062     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5063     return cbp;
5064 }
5065 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5066     int ctx;
5067     int cbp_a, cbp_b;
5068
5069     cbp_a = (h->left_cbp>>4)&0x03;
5070     cbp_b = (h-> top_cbp>>4)&0x03;
5071
5072     ctx = 0;
5073     if( cbp_a > 0 ) ctx++;
5074     if( cbp_b > 0 ) ctx += 2;
5075     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5076         return 0;
5077
5078     ctx = 4;
5079     if( cbp_a == 2 ) ctx++;
5080     if( cbp_b == 2 ) ctx += 2;
5081     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5082 }
5083 static int decode_cabac_mb_dqp( H264Context *h) {
5084     int   ctx = 0;
5085     int   val = 0;
5086
5087     if( h->last_qscale_diff != 0 )
5088         ctx++;
5089
5090     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5091         if( ctx < 2 )
5092             ctx = 2;
5093         else
5094             ctx = 3;
5095         val++;
5096         if(val > 102) //prevent infinite loop
5097             return INT_MIN;
5098     }
5099
5100     if( val&0x01 )
5101         return (val + 1)/2;
5102     else
5103         return -(val + 1)/2;
5104 }
5105 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5106     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5107         return 0;   /* 8x8 */
5108     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5109         return 1;   /* 8x4 */
5110     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5111         return 2;   /* 4x8 */
5112     return 3;       /* 4x4 */
5113 }
5114 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5115     int type;
5116     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5117         return 0;   /* B_Direct_8x8 */
5118     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5119         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5120     type = 3;
5121     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5122         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5123             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5124         type += 4;
5125     }
5126     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5127     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5128     return type;
5129 }
5130
5131 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5132     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5133 }
5134
5135 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5136     int refa = h->ref_cache[list][scan8[n] - 1];
5137     int refb = h->ref_cache[list][scan8[n] - 8];
5138     int ref  = 0;
5139     int ctx  = 0;
5140
5141     if( h->slice_type == B_TYPE) {
5142         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5143             ctx++;
5144         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5145             ctx += 2;
5146     } else {
5147         if( refa > 0 )
5148             ctx++;
5149         if( refb > 0 )
5150             ctx += 2;
5151     }
5152
5153     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5154         ref++;
5155         if( ctx < 4 )
5156             ctx = 4;
5157         else
5158             ctx = 5;
5159         if(ref >= 32 /*h->ref_list[list]*/){
5160             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5161             return 0; //FIXME we should return -1 and check the return everywhere
5162         }
5163     }
5164     return ref;
5165 }
5166
5167 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5168     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5169                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5170     int ctxbase = (l == 0) ? 40 : 47;
5171     int ctx, mvd;
5172
5173     if( amvd < 3 )
5174         ctx = 0;
5175     else if( amvd > 32 )
5176         ctx = 2;
5177     else
5178         ctx = 1;
5179
5180     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5181         return 0;
5182
5183     mvd= 1;
5184     ctx= 3;
5185     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5186         mvd++;
5187         if( ctx < 6 )
5188             ctx++;
5189     }
5190
5191     if( mvd >= 9 ) {
5192         int k = 3;
5193         while( get_cabac_bypass( &h->cabac ) ) {
5194             mvd += 1 << k;
5195             k++;
5196             if(k>24){
5197                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5198                 return INT_MIN;
5199             }
5200         }
5201         while( k-- ) {
5202             if( get_cabac_bypass( &h->cabac ) )
5203                 mvd += 1 << k;
5204         }
5205     }
5206     return get_cabac_bypass_sign( &h->cabac, -mvd );
5207 }
5208
5209 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5210     int nza, nzb;
5211     int ctx = 0;
5212
5213     if( cat == 0 ) {
5214         nza = h->left_cbp&0x100;
5215         nzb = h-> top_cbp&0x100;
5216     } else if( cat == 1 || cat == 2 ) {
5217         nza = h->non_zero_count_cache[scan8[idx] - 1];
5218         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5219     } else if( cat == 3 ) {
5220         nza = (h->left_cbp>>(6+idx))&0x01;
5221         nzb = (h-> top_cbp>>(6+idx))&0x01;
5222     } else {
5223         assert(cat == 4);
5224         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5225         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5226     }
5227
5228     if( nza > 0 )
5229         ctx++;
5230
5231     if( nzb > 0 )
5232         ctx += 2;
5233
5234     return ctx + 4 * cat;
5235 }
5236
5237 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5238     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5239     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5240     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5241     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5242 };
5243
5244 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5245     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5246     static const int significant_coeff_flag_offset[2][6] = {
5247       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5248       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5249     };
5250     static const int last_coeff_flag_offset[2][6] = {
5251       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5252       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5253     };
5254     static const int coeff_abs_level_m1_offset[6] = {
5255         227+0, 227+10, 227+20, 227+30, 227+39, 426
5256     };
5257     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5258       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5259         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5260         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5261        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5262       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5263         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5264         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5265         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5266     };
5267
5268     int index[64];
5269
5270     int av_unused last;
5271     int coeff_count = 0;
5272
5273     int abslevel1 = 1;
5274     int abslevelgt1 = 0;
5275
5276     uint8_t *significant_coeff_ctx_base;
5277     uint8_t *last_coeff_ctx_base;
5278     uint8_t *abs_level_m1_ctx_base;
5279
5280 #ifndef ARCH_X86
5281 #define CABAC_ON_STACK
5282 #endif
5283 #ifdef CABAC_ON_STACK
5284 #define CC &cc
5285     CABACContext cc;
5286     cc.range     = h->cabac.range;
5287     cc.low       = h->cabac.low;
5288     cc.bytestream= h->cabac.bytestream;
5289 #else
5290 #define CC &h->cabac
5291 #endif
5292
5293
5294     /* cat: 0-> DC 16x16  n = 0
5295      *      1-> AC 16x16  n = luma4x4idx
5296      *      2-> Luma4x4   n = luma4x4idx
5297      *      3-> DC Chroma n = iCbCr
5298      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5299      *      5-> Luma8x8   n = 4 * luma8x8idx
5300      */
5301
5302     /* read coded block flag */
5303     if( cat != 5 ) {
5304         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5305             if( cat == 1 || cat == 2 )
5306                 h->non_zero_count_cache[scan8[n]] = 0;
5307             else if( cat == 4 )
5308                 h->non_zero_count_cache[scan8[16+n]] = 0;
5309 #ifdef CABAC_ON_STACK
5310             h->cabac.range     = cc.range     ;
5311             h->cabac.low       = cc.low       ;
5312             h->cabac.bytestream= cc.bytestream;
5313 #endif
5314             return;
5315         }
5316     }
5317
5318     significant_coeff_ctx_base = h->cabac_state
5319         + significant_coeff_flag_offset[MB_FIELD][cat];
5320     last_coeff_ctx_base = h->cabac_state
5321         + last_coeff_flag_offset[MB_FIELD][cat];
5322     abs_level_m1_ctx_base = h->cabac_state
5323         + coeff_abs_level_m1_offset[cat];
5324
5325     if( cat == 5 ) {
5326 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5327         for(last= 0; last < coefs; last++) { \
5328             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5329             if( get_cabac( CC, sig_ctx )) { \
5330                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5331                 index[coeff_count++] = last; \
5332                 if( get_cabac( CC, last_ctx ) ) { \
5333                     last= max_coeff; \
5334                     break; \
5335                 } \
5336             } \
5337         }\
5338         if( last == max_coeff -1 ) {\
5339             index[coeff_count++] = last;\
5340         }
5341         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5342 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5343         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5344     } else {
5345         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5346 #else
5347         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5348     } else {
5349         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5350 #endif
5351     }
5352     assert(coeff_count > 0);
5353
5354     if( cat == 0 )
5355         h->cbp_table[mb_xy] |= 0x100;
5356     else if( cat == 1 || cat == 2 )
5357         h->non_zero_count_cache[scan8[n]] = coeff_count;
5358     else if( cat == 3 )
5359         h->cbp_table[mb_xy] |= 0x40 << n;
5360     else if( cat == 4 )
5361         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5362     else {
5363         assert( cat == 5 );
5364         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5365     }
5366
5367     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5368         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5369         int j= scantable[index[coeff_count]];
5370
5371         if( get_cabac( CC, ctx ) == 0 ) {
5372             if( !qmul ) {
5373                 block[j] = get_cabac_bypass_sign( CC, -1);
5374             }else{
5375                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5376             }
5377
5378             abslevel1++;
5379         } else {
5380             int coeff_abs = 2;
5381             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5382             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5383                 coeff_abs++;
5384             }
5385
5386             if( coeff_abs >= 15 ) {
5387                 int j = 0;
5388                 while( get_cabac_bypass( CC ) ) {
5389                     j++;
5390                 }
5391
5392                 coeff_abs=1;
5393                 while( j-- ) {
5394                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5395                 }
5396                 coeff_abs+= 14;
5397             }
5398
5399             if( !qmul ) {
5400                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5401                 else                                block[j] =  coeff_abs;
5402             }else{
5403                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5404                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5405             }
5406
5407             abslevelgt1++;
5408         }
5409     }
5410 #ifdef CABAC_ON_STACK
5411             h->cabac.range     = cc.range     ;
5412             h->cabac.low       = cc.low       ;
5413             h->cabac.bytestream= cc.bytestream;
5414 #endif
5415
5416 }
5417
5418 static inline void compute_mb_neighbors(H264Context *h)
5419 {
5420     MpegEncContext * const s = &h->s;
5421     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5422     h->top_mb_xy     = mb_xy - s->mb_stride;
5423     h->left_mb_xy[0] = mb_xy - 1;
5424     if(FRAME_MBAFF){
5425         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5426         const int top_pair_xy      = pair_xy     - s->mb_stride;
5427         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5428         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5429         const int curr_mb_frame_flag = !MB_FIELD;
5430         const int bottom = (s->mb_y & 1);
5431         if (bottom
5432                 ? !curr_mb_frame_flag // bottom macroblock
5433                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5434                 ) {
5435             h->top_mb_xy -= s->mb_stride;
5436         }
5437         if (left_mb_frame_flag != curr_mb_frame_flag) {
5438             h->left_mb_xy[0] = pair_xy - 1;
5439         }
5440     } else if (FIELD_PICTURE) {
5441         h->top_mb_xy -= s->mb_stride;
5442     }
5443     return;
5444 }
5445
5446 /**
5447  * decodes a macroblock
5448  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5449  */
5450 static int decode_mb_cabac(H264Context *h) {
5451     MpegEncContext * const s = &h->s;
5452     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5453     int mb_type, partition_count, cbp = 0;
5454     int dct8x8_allowed= h->pps.transform_8x8_mode;
5455
5456     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5457
5458     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5459     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5460         int skip;
5461         /* a skipped mb needs the aff flag from the following mb */
5462         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5463             predict_field_decoding_flag(h);
5464         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5465             skip = h->next_mb_skipped;
5466         else
5467             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5468         /* read skip flags */
5469         if( skip ) {
5470             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5471                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5472                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5473                 if(h->next_mb_skipped)
5474                     predict_field_decoding_flag(h);
5475                 else
5476                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5477             }
5478
5479             decode_mb_skip(h);
5480
5481             h->cbp_table[mb_xy] = 0;
5482             h->chroma_pred_mode_table[mb_xy] = 0;
5483             h->last_qscale_diff = 0;
5484
5485             return 0;
5486
5487         }
5488     }
5489     if(FRAME_MBAFF){
5490         if( (s->mb_y&1) == 0 )
5491             h->mb_mbaff =
5492             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5493     }else
5494         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5495
5496     h->prev_mb_skipped = 0;
5497
5498     compute_mb_neighbors(h);
5499     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5500         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5501         return -1;
5502     }
5503
5504     if( h->slice_type == B_TYPE ) {
5505         if( mb_type < 23 ){
5506             partition_count= b_mb_type_info[mb_type].partition_count;
5507             mb_type=         b_mb_type_info[mb_type].type;
5508         }else{
5509             mb_type -= 23;
5510             goto decode_intra_mb;
5511         }
5512     } else if( h->slice_type == P_TYPE ) {
5513         if( mb_type < 5) {
5514             partition_count= p_mb_type_info[mb_type].partition_count;
5515             mb_type=         p_mb_type_info[mb_type].type;
5516         } else {
5517             mb_type -= 5;
5518             goto decode_intra_mb;
5519         }
5520     } else {
5521        assert(h->slice_type == I_TYPE);
5522 decode_intra_mb:
5523         partition_count = 0;
5524         cbp= i_mb_type_info[mb_type].cbp;
5525         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5526         mb_type= i_mb_type_info[mb_type].type;
5527     }
5528     if(MB_FIELD)
5529         mb_type |= MB_TYPE_INTERLACED;
5530
5531     h->slice_table[ mb_xy ]= h->slice_num;
5532
5533     if(IS_INTRA_PCM(mb_type)) {
5534         const uint8_t *ptr;
5535         unsigned int x, y;
5536
5537         // We assume these blocks are very rare so we do not optimize it.
5538         // FIXME The two following lines get the bitstream position in the cabac
5539         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5540         ptr= h->cabac.bytestream;
5541         if(h->cabac.low&0x1) ptr--;
5542         if(CABAC_BITS==16){
5543             if(h->cabac.low&0x1FF) ptr--;
5544         }
5545
5546         // The pixels are stored in the same order as levels in h->mb array.
5547         for(y=0; y<16; y++){
5548             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5549             for(x=0; x<16; x++){
5550                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5551                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5552             }
5553         }
5554         for(y=0; y<8; y++){
5555             const int index= 256 + 4*(y&3) + 32*(y>>2);
5556             for(x=0; x<8; x++){
5557                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5558                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5559             }
5560         }
5561         for(y=0; y<8; y++){
5562             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5563             for(x=0; x<8; x++){
5564                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5565                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5566             }
5567         }
5568
5569         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5570
5571         // All blocks are present
5572         h->cbp_table[mb_xy] = 0x1ef;
5573         h->chroma_pred_mode_table[mb_xy] = 0;
5574         // In deblocking, the quantizer is 0
5575         s->current_picture.qscale_table[mb_xy]= 0;
5576         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5577         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5578         // All coeffs are present
5579         memset(h->non_zero_count[mb_xy], 16, 16);
5580         s->current_picture.mb_type[mb_xy]= mb_type;
5581         return 0;
5582     }
5583
5584     if(MB_MBAFF){
5585         h->ref_count[0] <<= 1;
5586         h->ref_count[1] <<= 1;
5587     }
5588
5589     fill_caches(h, mb_type, 0);
5590
5591     if( IS_INTRA( mb_type ) ) {
5592         int i, pred_mode;
5593         if( IS_INTRA4x4( mb_type ) ) {
5594             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5595                 mb_type |= MB_TYPE_8x8DCT;
5596                 for( i = 0; i < 16; i+=4 ) {
5597                     int pred = pred_intra_mode( h, i );
5598                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5599                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5600                 }
5601             } else {
5602                 for( i = 0; i < 16; i++ ) {
5603                     int pred = pred_intra_mode( h, i );
5604                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5605
5606                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5607                 }
5608             }
5609             write_back_intra_pred_mode(h);
5610             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5611         } else {
5612             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5613             if( h->intra16x16_pred_mode < 0 ) return -1;
5614         }
5615         h->chroma_pred_mode_table[mb_xy] =
5616         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5617
5618         pred_mode= check_intra_pred_mode( h, pred_mode );
5619         if( pred_mode < 0 ) return -1;
5620         h->chroma_pred_mode= pred_mode;
5621     } else if( partition_count == 4 ) {
5622         int i, j, sub_partition_count[4], list, ref[2][4];
5623
5624         if( h->slice_type == B_TYPE ) {
5625             for( i = 0; i < 4; i++ ) {
5626                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5627                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5628                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5629             }
5630             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5631                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5632                 pred_direct_motion(h, &mb_type);
5633                 h->ref_cache[0][scan8[4]] =
5634                 h->ref_cache[1][scan8[4]] =
5635                 h->ref_cache[0][scan8[12]] =
5636                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5637                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5638                     for( i = 0; i < 4; i++ )
5639                         if( IS_DIRECT(h->sub_mb_type[i]) )
5640                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5641                 }
5642             }
5643         } else {
5644             for( i = 0; i < 4; i++ ) {
5645                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5646                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5647                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5648             }
5649         }
5650
5651         for( list = 0; list < h->list_count; list++ ) {
5652                 for( i = 0; i < 4; i++ ) {
5653                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5654                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5655                         if( h->ref_count[list] > 1 )
5656                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5657                         else
5658                             ref[list][i] = 0;
5659                     } else {
5660                         ref[list][i] = -1;
5661                     }
5662                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5663                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5664                 }
5665         }
5666
5667         if(dct8x8_allowed)
5668             dct8x8_allowed = get_dct8x8_allowed(h);
5669
5670         for(list=0; list<h->list_count; list++){
5671             for(i=0; i<4; i++){
5672                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5673                 if(IS_DIRECT(h->sub_mb_type[i])){
5674                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5675                     continue;
5676                 }
5677
5678                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5679                     const int sub_mb_type= h->sub_mb_type[i];
5680                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5681                     for(j=0; j<sub_partition_count[i]; j++){
5682                         int mpx, mpy;
5683                         int mx, my;
5684                         const int index= 4*i + block_width*j;
5685                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5686                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5687                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5688
5689                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5690                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5691                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5692
5693                         if(IS_SUB_8X8(sub_mb_type)){
5694                             mv_cache[ 1 ][0]=
5695                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5696                             mv_cache[ 1 ][1]=
5697                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5698
5699                             mvd_cache[ 1 ][0]=
5700                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5701                             mvd_cache[ 1 ][1]=
5702                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5703                         }else if(IS_SUB_8X4(sub_mb_type)){
5704                             mv_cache[ 1 ][0]= mx;
5705                             mv_cache[ 1 ][1]= my;
5706
5707                             mvd_cache[ 1 ][0]= mx - mpx;
5708                             mvd_cache[ 1 ][1]= my - mpy;
5709                         }else if(IS_SUB_4X8(sub_mb_type)){
5710                             mv_cache[ 8 ][0]= mx;
5711                             mv_cache[ 8 ][1]= my;
5712
5713                             mvd_cache[ 8 ][0]= mx - mpx;
5714                             mvd_cache[ 8 ][1]= my - mpy;
5715                         }
5716                         mv_cache[ 0 ][0]= mx;
5717                         mv_cache[ 0 ][1]= my;
5718
5719                         mvd_cache[ 0 ][0]= mx - mpx;
5720                         mvd_cache[ 0 ][1]= my - mpy;
5721                     }
5722                 }else{
5723                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5724                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5725                     p[0] = p[1] = p[8] = p[9] = 0;
5726                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5727                 }
5728             }
5729         }
5730     } else if( IS_DIRECT(mb_type) ) {
5731         pred_direct_motion(h, &mb_type);
5732         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5733         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5734         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5735     } else {
5736         int list, mx, my, i, mpx, mpy;
5737         if(IS_16X16(mb_type)){
5738             for(list=0; list<h->list_count; list++){
5739                 if(IS_DIR(mb_type, 0, list)){
5740                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5741                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5742                 }else
5743                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5744             }
5745             for(list=0; list<h->list_count; list++){
5746                 if(IS_DIR(mb_type, 0, list)){
5747                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5748
5749                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5750                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5751                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5752
5753                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5754                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5755                 }else
5756                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5757             }
5758         }
5759         else if(IS_16X8(mb_type)){
5760             for(list=0; list<h->list_count; list++){
5761                     for(i=0; i<2; i++){
5762                         if(IS_DIR(mb_type, i, list)){
5763                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5764                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5765                         }else
5766                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5767                     }
5768             }
5769             for(list=0; list<h->list_count; list++){
5770                 for(i=0; i<2; i++){
5771                     if(IS_DIR(mb_type, i, list)){
5772                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5773                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5774                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5775                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5776
5777                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5778                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5779                     }else{
5780                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5781                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5782                     }
5783                 }
5784             }
5785         }else{
5786             assert(IS_8X16(mb_type));
5787             for(list=0; list<h->list_count; list++){
5788                     for(i=0; i<2; i++){
5789                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5790                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5791                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5792                         }else
5793                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5794                     }
5795             }
5796             for(list=0; list<h->list_count; list++){
5797                 for(i=0; i<2; i++){
5798                     if(IS_DIR(mb_type, i, list)){
5799                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5800                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5801                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5802
5803                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5804                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5805                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5806                     }else{
5807                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5808                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5809                     }
5810                 }
5811             }
5812         }
5813     }
5814
5815    if( IS_INTER( mb_type ) ) {
5816         h->chroma_pred_mode_table[mb_xy] = 0;
5817         write_back_motion( h, mb_type );
5818    }
5819
5820     if( !IS_INTRA16x16( mb_type ) ) {
5821         cbp  = decode_cabac_mb_cbp_luma( h );
5822         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5823     }
5824
5825     h->cbp_table[mb_xy] = h->cbp = cbp;
5826
5827     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5828         if( decode_cabac_mb_transform_size( h ) )
5829             mb_type |= MB_TYPE_8x8DCT;
5830     }
5831     s->current_picture.mb_type[mb_xy]= mb_type;
5832
5833     if( cbp || IS_INTRA16x16( mb_type ) ) {
5834         const uint8_t *scan, *scan8x8, *dc_scan;
5835         const uint32_t *qmul;
5836         int dqp;
5837
5838         if(IS_INTERLACED(mb_type)){
5839             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5840             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5841             dc_scan= luma_dc_field_scan;
5842         }else{
5843             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5844             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5845             dc_scan= luma_dc_zigzag_scan;
5846         }
5847
5848         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5849         if( dqp == INT_MIN ){
5850             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5851             return -1;
5852         }
5853         s->qscale += dqp;
5854         if(((unsigned)s->qscale) > 51){
5855             if(s->qscale<0) s->qscale+= 52;
5856             else            s->qscale-= 52;
5857         }
5858         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5859         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5860
5861         if( IS_INTRA16x16( mb_type ) ) {
5862             int i;
5863             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5864             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5865
5866             if( cbp&15 ) {
5867                 qmul = h->dequant4_coeff[0][s->qscale];
5868                 for( i = 0; i < 16; i++ ) {
5869                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5870                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5871                 }
5872             } else {
5873                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5874             }
5875         } else {
5876             int i8x8, i4x4;
5877             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5878                 if( cbp & (1<<i8x8) ) {
5879                     if( IS_8x8DCT(mb_type) ) {
5880                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5881                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5882                     } else {
5883                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5884                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5885                             const int index = 4*i8x8 + i4x4;
5886                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5887 //START_TIMER
5888                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5889 //STOP_TIMER("decode_residual")
5890                         }
5891                     }
5892                 } else {
5893                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5894                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5895                 }
5896             }
5897         }
5898
5899         if( cbp&0x30 ){
5900             int c;
5901             for( c = 0; c < 2; c++ ) {
5902                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5903                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5904             }
5905         }
5906
5907         if( cbp&0x20 ) {
5908             int c, i;
5909             for( c = 0; c < 2; c++ ) {
5910                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5911                 for( i = 0; i < 4; i++ ) {
5912                     const int index = 16 + 4 * c + i;
5913                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5914                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5915                 }
5916             }
5917         } else {
5918             uint8_t * const nnz= &h->non_zero_count_cache[0];
5919             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5920             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5921         }
5922     } else {
5923         uint8_t * const nnz= &h->non_zero_count_cache[0];
5924         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5925         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5926         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5927         h->last_qscale_diff = 0;
5928     }
5929
5930     s->current_picture.qscale_table[mb_xy]= s->qscale;
5931     write_back_non_zero_count(h);
5932
5933     if(MB_MBAFF){
5934         h->ref_count[0] >>= 1;
5935         h->ref_count[1] >>= 1;
5936     }
5937
5938     return 0;
5939 }
5940
5941
5942 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5943     int i, d;
5944     const int index_a = qp + h->slice_alpha_c0_offset;
5945     const int alpha = (alpha_table+52)[index_a];
5946     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5947
5948     if( bS[0] < 4 ) {
5949         int8_t tc[4];
5950         for(i=0; i<4; i++)
5951             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5952         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5953     } else {
5954         /* 16px edge length, because bS=4 is triggered by being at
5955          * the edge of an intra MB, so all 4 bS are the same */
5956             for( d = 0; d < 16; d++ ) {
5957                 const int p0 = pix[-1];
5958                 const int p1 = pix[-2];
5959                 const int p2 = pix[-3];
5960
5961                 const int q0 = pix[0];
5962                 const int q1 = pix[1];
5963                 const int q2 = pix[2];
5964
5965                 if( FFABS( p0 - q0 ) < alpha &&
5966                     FFABS( p1 - p0 ) < beta &&
5967                     FFABS( q1 - q0 ) < beta ) {
5968
5969                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5970                         if( FFABS( p2 - p0 ) < beta)
5971                         {
5972                             const int p3 = pix[-4];
5973                             /* p0', p1', p2' */
5974                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5975                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5976                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5977                         } else {
5978                             /* p0' */
5979                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5980                         }
5981                         if( FFABS( q2 - q0 ) < beta)
5982                         {
5983                             const int q3 = pix[3];
5984                             /* q0', q1', q2' */
5985                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5986                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5987                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5988                         } else {
5989                             /* q0' */
5990                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5991                         }
5992                     }else{
5993                         /* p0', q0' */
5994                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5995                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5996                     }
5997                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5998                 }
5999                 pix += stride;
6000             }
6001     }
6002 }
6003 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6004     int i;
6005     const int index_a = qp + h->slice_alpha_c0_offset;
6006     const int alpha = (alpha_table+52)[index_a];
6007     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6008
6009     if( bS[0] < 4 ) {
6010         int8_t tc[4];
6011         for(i=0; i<4; i++)
6012             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6013         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6014     } else {
6015         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6016     }
6017 }
6018
6019 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6020     int i;
6021     for( i = 0; i < 16; i++, pix += stride) {
6022         int index_a;
6023         int alpha;
6024         int beta;
6025
6026         int qp_index;
6027         int bS_index = (i >> 1);
6028         if (!MB_FIELD) {
6029             bS_index &= ~1;
6030             bS_index |= (i & 1);
6031         }
6032
6033         if( bS[bS_index] == 0 ) {
6034             continue;
6035         }
6036
6037         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6038         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6039         alpha = (alpha_table+52)[index_a];
6040         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6041
6042         if( bS[bS_index] < 4 ) {
6043             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6044             const int p0 = pix[-1];
6045             const int p1 = pix[-2];
6046             const int p2 = pix[-3];
6047             const int q0 = pix[0];
6048             const int q1 = pix[1];
6049             const int q2 = pix[2];
6050
6051             if( FFABS( p0 - q0 ) < alpha &&
6052                 FFABS( p1 - p0 ) < beta &&
6053                 FFABS( q1 - q0 ) < beta ) {
6054                 int tc = tc0;
6055                 int i_delta;
6056
6057                 if( FFABS( p2 - p0 ) < beta ) {
6058                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6059                     tc++;
6060                 }
6061                 if( FFABS( q2 - q0 ) < beta ) {
6062                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6063                     tc++;
6064                 }
6065
6066                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6067                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6068                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6069                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6070             }
6071         }else{
6072             const int p0 = pix[-1];
6073             const int p1 = pix[-2];
6074             const int p2 = pix[-3];
6075
6076             const int q0 = pix[0];
6077             const int q1 = pix[1];
6078             const int q2 = pix[2];
6079
6080             if( FFABS( p0 - q0 ) < alpha &&
6081                 FFABS( p1 - p0 ) < beta &&
6082                 FFABS( q1 - q0 ) < beta ) {
6083
6084                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6085                     if( FFABS( p2 - p0 ) < beta)
6086                     {
6087                         const int p3 = pix[-4];
6088                         /* p0', p1', p2' */
6089                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6090                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6091                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6092                     } else {
6093                         /* p0' */
6094                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6095                     }
6096                     if( FFABS( q2 - q0 ) < beta)
6097                     {
6098                         const int q3 = pix[3];
6099                         /* q0', q1', q2' */
6100                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6101                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6102                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6103                     } else {
6104                         /* q0' */
6105                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6106                     }
6107                 }else{
6108                     /* p0', q0' */
6109                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6110                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6111                 }
6112                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6113             }
6114         }
6115     }
6116 }
6117 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6118     int i;
6119     for( i = 0; i < 8; i++, pix += stride) {
6120         int index_a;
6121         int alpha;
6122         int beta;
6123
6124         int qp_index;
6125         int bS_index = i;
6126
6127         if( bS[bS_index] == 0 ) {
6128             continue;
6129         }
6130
6131         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6132         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6133         alpha = (alpha_table+52)[index_a];
6134         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6135
6136         if( bS[bS_index] < 4 ) {
6137             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6138             const int p0 = pix[-1];
6139             const int p1 = pix[-2];
6140             const int q0 = pix[0];
6141             const int q1 = pix[1];
6142
6143             if( FFABS( p0 - q0 ) < alpha &&
6144                 FFABS( p1 - p0 ) < beta &&
6145                 FFABS( q1 - q0 ) < beta ) {
6146                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6147
6148                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6149                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6150                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6151             }
6152         }else{
6153             const int p0 = pix[-1];
6154             const int p1 = pix[-2];
6155             const int q0 = pix[0];
6156             const int q1 = pix[1];
6157
6158             if( FFABS( p0 - q0 ) < alpha &&
6159                 FFABS( p1 - p0 ) < beta &&
6160                 FFABS( q1 - q0 ) < beta ) {
6161
6162                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6163                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6164                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6165             }
6166         }
6167     }
6168 }
6169
6170 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6171     int i, d;
6172     const int index_a = qp + h->slice_alpha_c0_offset;
6173     const int alpha = (alpha_table+52)[index_a];
6174     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6175     const int pix_next  = stride;
6176
6177     if( bS[0] < 4 ) {
6178         int8_t tc[4];
6179         for(i=0; i<4; i++)
6180             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6181         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6182     } else {
6183         /* 16px edge length, see filter_mb_edgev */
6184             for( d = 0; d < 16; d++ ) {
6185                 const int p0 = pix[-1*pix_next];
6186                 const int p1 = pix[-2*pix_next];
6187                 const int p2 = pix[-3*pix_next];
6188                 const int q0 = pix[0];
6189                 const int q1 = pix[1*pix_next];
6190                 const int q2 = pix[2*pix_next];
6191
6192                 if( FFABS( p0 - q0 ) < alpha &&
6193                     FFABS( p1 - p0 ) < beta &&
6194                     FFABS( q1 - q0 ) < beta ) {
6195
6196                     const int p3 = pix[-4*pix_next];
6197                     const int q3 = pix[ 3*pix_next];
6198
6199                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6200                         if( FFABS( p2 - p0 ) < beta) {
6201                             /* p0', p1', p2' */
6202                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6203                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6204                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6205                         } else {
6206                             /* p0' */
6207                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6208                         }
6209                         if( FFABS( q2 - q0 ) < beta) {
6210                             /* q0', q1', q2' */
6211                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6212                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6213                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6214                         } else {
6215                             /* q0' */
6216                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6217                         }
6218                     }else{
6219                         /* p0', q0' */
6220                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6221                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6222                     }
6223                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6224                 }
6225                 pix++;
6226             }
6227     }
6228 }
6229
6230 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6231     int i;
6232     const int index_a = qp + h->slice_alpha_c0_offset;
6233     const int alpha = (alpha_table+52)[index_a];
6234     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6235
6236     if( bS[0] < 4 ) {
6237         int8_t tc[4];
6238         for(i=0; i<4; i++)
6239             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6240         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6241     } else {
6242         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6243     }
6244 }
6245
6246 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6247     MpegEncContext * const s = &h->s;
6248     int mb_xy, mb_type;
6249     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6250
6251     mb_xy = mb_x + mb_y*s->mb_stride;
6252
6253     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6254        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6255                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6256         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6257         return;
6258     }
6259     assert(!FRAME_MBAFF);
6260
6261     mb_type = s->current_picture.mb_type[mb_xy];
6262     qp = s->current_picture.qscale_table[mb_xy];
6263     qp0 = s->current_picture.qscale_table[mb_xy-1];
6264     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6265     qpc = get_chroma_qp( h, 0, qp );
6266     qpc0 = get_chroma_qp( h, 0, qp0 );
6267     qpc1 = get_chroma_qp( h, 0, qp1 );
6268     qp0 = (qp + qp0 + 1) >> 1;
6269     qp1 = (qp + qp1 + 1) >> 1;
6270     qpc0 = (qpc + qpc0 + 1) >> 1;
6271     qpc1 = (qpc + qpc1 + 1) >> 1;
6272     qp_thresh = 15 - h->slice_alpha_c0_offset;
6273     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6274        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6275         return;
6276
6277     if( IS_INTRA(mb_type) ) {
6278         int16_t bS4[4] = {4,4,4,4};
6279         int16_t bS3[4] = {3,3,3,3};
6280         if( IS_8x8DCT(mb_type) ) {
6281             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6282             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6283             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6284             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6285         } else {
6286             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6287             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6288             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6289             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6290             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6291             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6292             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6293             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6294         }
6295         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6296         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6297         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6298         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6299         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6300         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6301         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6302         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6303         return;
6304     } else {
6305         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6306         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6307         int edges;
6308         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6309             edges = 4;
6310             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6311         } else {
6312             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6313                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6314             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6315                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6316                              ? 3 : 0;
6317             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6318             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6319             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6320                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6321         }
6322         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6323             bSv[0][0] = 0x0004000400040004ULL;
6324         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6325             bSv[1][0] = 0x0004000400040004ULL;
6326
6327 #define FILTER(hv,dir,edge)\
6328         if(bSv[dir][edge]) {\
6329             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6330             if(!(edge&1)) {\
6331                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6332                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6333             }\
6334         }
6335         if( edges == 1 ) {
6336             FILTER(v,0,0);
6337             FILTER(h,1,0);
6338         } else if( IS_8x8DCT(mb_type) ) {
6339             FILTER(v,0,0);
6340             FILTER(v,0,2);
6341             FILTER(h,1,0);
6342             FILTER(h,1,2);
6343         } else {
6344             FILTER(v,0,0);
6345             FILTER(v,0,1);
6346             FILTER(v,0,2);
6347             FILTER(v,0,3);
6348             FILTER(h,1,0);
6349             FILTER(h,1,1);
6350             FILTER(h,1,2);
6351             FILTER(h,1,3);
6352         }
6353 #undef FILTER
6354     }
6355 }
6356
6357 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6358     MpegEncContext * const s = &h->s;
6359     const int mb_xy= mb_x + mb_y*s->mb_stride;
6360     const int mb_type = s->current_picture.mb_type[mb_xy];
6361     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6362     int first_vertical_edge_done = 0;
6363     int dir;
6364     /* FIXME: A given frame may occupy more than one position in
6365      * the reference list. So ref2frm should be populated with
6366      * frame numbers, not indices. */
6367     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6368                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6369
6370     //for sufficiently low qp, filtering wouldn't do anything
6371     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6372     if(!FRAME_MBAFF){
6373         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6374         int qp = s->current_picture.qscale_table[mb_xy];
6375         if(qp <= qp_thresh
6376            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6377            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6378             return;
6379         }
6380     }
6381
6382     if (FRAME_MBAFF
6383             // left mb is in picture
6384             && h->slice_table[mb_xy-1] != 255
6385             // and current and left pair do not have the same interlaced type
6386             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6387             // and left mb is in the same slice if deblocking_filter == 2
6388             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6389         /* First vertical edge is different in MBAFF frames
6390          * There are 8 different bS to compute and 2 different Qp
6391          */
6392         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6393         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6394         int16_t bS[8];
6395         int qp[2];
6396         int bqp[2];
6397         int rqp[2];
6398         int mb_qp, mbn0_qp, mbn1_qp;
6399         int i;
6400         first_vertical_edge_done = 1;
6401
6402         if( IS_INTRA(mb_type) )
6403             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6404         else {
6405             for( i = 0; i < 8; i++ ) {
6406                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6407
6408                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6409                     bS[i] = 4;
6410                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6411                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6412                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6413                     bS[i] = 2;
6414                 else
6415                     bS[i] = 1;
6416             }
6417         }
6418
6419         mb_qp = s->current_picture.qscale_table[mb_xy];
6420         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6421         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6422         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6423         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6424                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6425         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6426                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6427         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6428         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6429                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6430         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6431                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6432
6433         /* Filter edge */
6434         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6435         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6436         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6437         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6438         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6439     }
6440     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6441     for( dir = 0; dir < 2; dir++ )
6442     {
6443         int edge;
6444         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6445         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6446         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6447
6448         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6449                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6450         // how often to recheck mv-based bS when iterating between edges
6451         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6452                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6453         // how often to recheck mv-based bS when iterating along each edge
6454         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6455
6456         if (first_vertical_edge_done) {
6457             start = 1;
6458             first_vertical_edge_done = 0;
6459         }
6460
6461         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6462             start = 1;
6463
6464         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6465             && !IS_INTERLACED(mb_type)
6466             && IS_INTERLACED(mbm_type)
6467             ) {
6468             // This is a special case in the norm where the filtering must
6469             // be done twice (one each of the field) even if we are in a
6470             // frame macroblock.
6471             //
6472             static const int nnz_idx[4] = {4,5,6,3};
6473             unsigned int tmp_linesize   = 2 *   linesize;
6474             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6475             int mbn_xy = mb_xy - 2 * s->mb_stride;
6476             int qp;
6477             int i, j;
6478             int16_t bS[4];
6479
6480             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6481                 if( IS_INTRA(mb_type) ||
6482                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6483                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6484                 } else {
6485                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6486                     for( i = 0; i < 4; i++ ) {
6487                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6488                             mbn_nnz[nnz_idx[i]] != 0 )
6489                             bS[i] = 2;
6490                         else
6491                             bS[i] = 1;
6492                     }
6493                 }
6494                 // Do not use s->qscale as luma quantizer because it has not the same
6495                 // value in IPCM macroblocks.
6496                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6497                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6498                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6499                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6500                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6501                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6502                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6503                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6504             }
6505
6506             start = 1;
6507         }
6508
6509         /* Calculate bS */
6510         for( edge = start; edge < edges; edge++ ) {
6511             /* mbn_xy: neighbor macroblock */
6512             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6513             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6514             int16_t bS[4];
6515             int qp;
6516
6517             if( (edge&1) && IS_8x8DCT(mb_type) )
6518                 continue;
6519
6520             if( IS_INTRA(mb_type) ||
6521                 IS_INTRA(mbn_type) ) {
6522                 int value;
6523                 if (edge == 0) {
6524                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6525                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6526                     ) {
6527                         value = 4;
6528                     } else {
6529                         value = 3;
6530                     }
6531                 } else {
6532                     value = 3;
6533                 }
6534                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6535             } else {
6536                 int i, l;
6537                 int mv_done;
6538
6539                 if( edge & mask_edge ) {
6540                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6541                     mv_done = 1;
6542                 }
6543                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6544                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6545                     mv_done = 1;
6546                 }
6547                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6548                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6549                     int bn_idx= b_idx - (dir ? 8:1);
6550                     int v = 0;
6551                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6552                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6553                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6554                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6555                     }
6556                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6557                     mv_done = 1;
6558                 }
6559                 else
6560                     mv_done = 0;
6561
6562                 for( i = 0; i < 4; i++ ) {
6563                     int x = dir == 0 ? edge : i;
6564                     int y = dir == 0 ? i    : edge;
6565                     int b_idx= 8 + 4 + x + 8*y;
6566                     int bn_idx= b_idx - (dir ? 8:1);
6567
6568                     if( h->non_zero_count_cache[b_idx] != 0 ||
6569                         h->non_zero_count_cache[bn_idx] != 0 ) {
6570                         bS[i] = 2;
6571                     }
6572                     else if(!mv_done)
6573                     {
6574                         bS[i] = 0;
6575                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6576                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6577                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6578                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6579                                 bS[i] = 1;
6580                                 break;
6581                             }
6582                         }
6583                     }
6584                 }
6585
6586                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6587                     continue;
6588             }
6589
6590             /* Filter edge */
6591             // Do not use s->qscale as luma quantizer because it has not the same
6592             // value in IPCM macroblocks.
6593             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6594             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6595             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6596             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6597             if( dir == 0 ) {
6598                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6599                 if( (edge&1) == 0 ) {
6600                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6601                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6602                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6603                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6604                 }
6605             } else {
6606                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6607                 if( (edge&1) == 0 ) {
6608                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6609                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6610                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6611                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6612                 }
6613             }
6614         }
6615     }
6616 }
6617
6618 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6619     MpegEncContext * const s = &h->s;
6620     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6621
6622     s->mb_skip_run= -1;
6623
6624     if( h->pps.cabac ) {
6625         int i;
6626
6627         /* realign */
6628         align_get_bits( &s->gb );
6629
6630         /* init cabac */
6631         ff_init_cabac_states( &h->cabac);
6632         ff_init_cabac_decoder( &h->cabac,
6633                                s->gb.buffer + get_bits_count(&s->gb)/8,
6634                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6635         /* calculate pre-state */
6636         for( i= 0; i < 460; i++ ) {
6637             int pre;
6638             if( h->slice_type == I_TYPE )
6639                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6640             else
6641                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6642
6643             if( pre <= 63 )
6644                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6645             else
6646                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6647         }
6648
6649         for(;;){
6650 //START_TIMER
6651             int ret = decode_mb_cabac(h);
6652             int eos;
6653 //STOP_TIMER("decode_mb_cabac")
6654
6655             if(ret>=0) hl_decode_mb(h);
6656
6657             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6658                 s->mb_y++;
6659
6660                 if(ret>=0) ret = decode_mb_cabac(h);
6661
6662                 if(ret>=0) hl_decode_mb(h);
6663                 s->mb_y--;
6664             }
6665             eos = get_cabac_terminate( &h->cabac );
6666
6667             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6668                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6669                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6670                 return -1;
6671             }
6672
6673             if( ++s->mb_x >= s->mb_width ) {
6674                 s->mb_x = 0;
6675                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6676                 ++s->mb_y;
6677                 if(FIELD_OR_MBAFF_PICTURE) {
6678                     ++s->mb_y;
6679                 }
6680             }
6681
6682             if( eos || s->mb_y >= s->mb_height ) {
6683                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6684                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6685                 return 0;
6686             }
6687         }
6688
6689     } else {
6690         for(;;){
6691             int ret = decode_mb_cavlc(h);
6692
6693             if(ret>=0) hl_decode_mb(h);
6694
6695             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6696                 s->mb_y++;
6697                 ret = decode_mb_cavlc(h);
6698
6699                 if(ret>=0) hl_decode_mb(h);
6700                 s->mb_y--;
6701             }
6702
6703             if(ret<0){
6704                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6705                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6706
6707                 return -1;
6708             }
6709
6710             if(++s->mb_x >= s->mb_width){
6711                 s->mb_x=0;
6712                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6713                 ++s->mb_y;
6714                 if(FIELD_OR_MBAFF_PICTURE) {
6715                     ++s->mb_y;
6716                 }
6717                 if(s->mb_y >= s->mb_height){
6718                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6719
6720                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6721                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6722
6723                         return 0;
6724                     }else{
6725                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6726
6727                         return -1;
6728                     }
6729                 }
6730             }
6731
6732             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6733                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6734                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6735                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6736
6737                     return 0;
6738                 }else{
6739                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6740
6741                     return -1;
6742                 }
6743             }
6744         }
6745     }
6746
6747 #if 0
6748     for(;s->mb_y < s->mb_height; s->mb_y++){
6749         for(;s->mb_x < s->mb_width; s->mb_x++){
6750             int ret= decode_mb(h);
6751
6752             hl_decode_mb(h);
6753
6754             if(ret<0){
6755                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6756                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6757
6758                 return -1;
6759             }
6760
6761             if(++s->mb_x >= s->mb_width){
6762                 s->mb_x=0;
6763                 if(++s->mb_y >= s->mb_height){
6764                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6765                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6766
6767                         return 0;
6768                     }else{
6769                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6770
6771                         return -1;
6772                     }
6773                 }
6774             }
6775
6776             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6777                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6778                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6779
6780                     return 0;
6781                 }else{
6782                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6783
6784                     return -1;
6785                 }
6786             }
6787         }
6788         s->mb_x=0;
6789         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6790     }
6791 #endif
6792     return -1; //not reached
6793 }
6794
6795 static int decode_unregistered_user_data(H264Context *h, int size){
6796     MpegEncContext * const s = &h->s;
6797     uint8_t user_data[16+256];
6798     int e, build, i;
6799
6800     if(size<16)
6801         return -1;
6802
6803     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6804         user_data[i]= get_bits(&s->gb, 8);
6805     }
6806
6807     user_data[i]= 0;
6808     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6809     if(e==1 && build>=0)
6810         h->x264_build= build;
6811
6812     if(s->avctx->debug & FF_DEBUG_BUGS)
6813         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6814
6815     for(; i<size; i++)
6816         skip_bits(&s->gb, 8);
6817
6818     return 0;
6819 }
6820
6821 static int decode_sei(H264Context *h){
6822     MpegEncContext * const s = &h->s;
6823
6824     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6825         int size, type;
6826
6827         type=0;
6828         do{
6829             type+= show_bits(&s->gb, 8);
6830         }while(get_bits(&s->gb, 8) == 255);
6831
6832         size=0;
6833         do{
6834             size+= show_bits(&s->gb, 8);
6835         }while(get_bits(&s->gb, 8) == 255);
6836
6837         switch(type){
6838         case 5:
6839             if(decode_unregistered_user_data(h, size) < 0)
6840                 return -1;
6841             break;
6842         default:
6843             skip_bits(&s->gb, 8*size);
6844         }
6845
6846         //FIXME check bits here
6847         align_get_bits(&s->gb);
6848     }
6849
6850     return 0;
6851 }
6852
6853 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6854     MpegEncContext * const s = &h->s;
6855     int cpb_count, i;
6856     cpb_count = get_ue_golomb(&s->gb) + 1;
6857     get_bits(&s->gb, 4); /* bit_rate_scale */
6858     get_bits(&s->gb, 4); /* cpb_size_scale */
6859     for(i=0; i<cpb_count; i++){
6860         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6861         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6862         get_bits1(&s->gb);     /* cbr_flag */
6863     }
6864     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6865     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6866     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6867     get_bits(&s->gb, 5); /* time_offset_length */
6868 }
6869
6870 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6871     MpegEncContext * const s = &h->s;
6872     int aspect_ratio_info_present_flag;
6873     unsigned int aspect_ratio_idc;
6874     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6875
6876     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6877
6878     if( aspect_ratio_info_present_flag ) {
6879         aspect_ratio_idc= get_bits(&s->gb, 8);
6880         if( aspect_ratio_idc == EXTENDED_SAR ) {
6881             sps->sar.num= get_bits(&s->gb, 16);
6882             sps->sar.den= get_bits(&s->gb, 16);
6883         }else if(aspect_ratio_idc < 14){
6884             sps->sar=  pixel_aspect[aspect_ratio_idc];
6885         }else{
6886             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6887             return -1;
6888         }
6889     }else{
6890         sps->sar.num=
6891         sps->sar.den= 0;
6892     }
6893 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6894
6895     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6896         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6897     }
6898
6899     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6900         get_bits(&s->gb, 3);    /* video_format */
6901         get_bits1(&s->gb);      /* video_full_range_flag */
6902         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6903             get_bits(&s->gb, 8); /* colour_primaries */
6904             get_bits(&s->gb, 8); /* transfer_characteristics */
6905             get_bits(&s->gb, 8); /* matrix_coefficients */
6906         }
6907     }
6908
6909     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6910         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6911         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6912     }
6913
6914     sps->timing_info_present_flag = get_bits1(&s->gb);
6915     if(sps->timing_info_present_flag){
6916         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6917         sps->time_scale = get_bits_long(&s->gb, 32);
6918         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6919     }
6920
6921     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6922     if(nal_hrd_parameters_present_flag)
6923         decode_hrd_parameters(h, sps);
6924     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6925     if(vcl_hrd_parameters_present_flag)
6926         decode_hrd_parameters(h, sps);
6927     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6928         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6929     get_bits1(&s->gb);         /* pic_struct_present_flag */
6930
6931     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6932     if(sps->bitstream_restriction_flag){
6933         unsigned int num_reorder_frames;
6934         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6935         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6936         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6937         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6938         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6939         num_reorder_frames= get_ue_golomb(&s->gb);
6940         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6941
6942         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6943             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6944             return -1;
6945         }
6946
6947         sps->num_reorder_frames= num_reorder_frames;
6948     }
6949
6950     return 0;
6951 }
6952
6953 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6954                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6955     MpegEncContext * const s = &h->s;
6956     int i, last = 8, next = 8;
6957     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6958     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6959         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6960     else
6961     for(i=0;i<size;i++){
6962         if(next)
6963             next = (last + get_se_golomb(&s->gb)) & 0xff;
6964         if(!i && !next){ /* matrix not written, we use the preset one */
6965             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6966             break;
6967         }
6968         last = factors[scan[i]] = next ? next : last;
6969     }
6970 }
6971
6972 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6973                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6974     MpegEncContext * const s = &h->s;
6975     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6976     const uint8_t *fallback[4] = {
6977         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6978         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6979         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6980         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6981     };
6982     if(get_bits1(&s->gb)){
6983         sps->scaling_matrix_present |= is_sps;
6984         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6985         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6986         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6987         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6988         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6989         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6990         if(is_sps || pps->transform_8x8_mode){
6991             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6992             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6993         }
6994     } else if(fallback_sps) {
6995         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
6996         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
6997     }
6998 }
6999
7000 /**
7001  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7002  */
7003 static void *
7004 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7005                     const size_t size, const char *name)
7006 {
7007     if(id>=max) {
7008         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7009         return NULL;
7010     }
7011
7012     if(!vec[id]) {
7013         vec[id] = av_mallocz(size);
7014         if(vec[id] == NULL)
7015             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7016     }
7017     return vec[id];
7018 }
7019
7020 static inline int decode_seq_parameter_set(H264Context *h){
7021     MpegEncContext * const s = &h->s;
7022     int profile_idc, level_idc;
7023     unsigned int sps_id, tmp, mb_width, mb_height;
7024     int i;
7025     SPS *sps;
7026
7027     profile_idc= get_bits(&s->gb, 8);
7028     get_bits1(&s->gb);   //constraint_set0_flag
7029     get_bits1(&s->gb);   //constraint_set1_flag
7030     get_bits1(&s->gb);   //constraint_set2_flag
7031     get_bits1(&s->gb);   //constraint_set3_flag
7032     get_bits(&s->gb, 4); // reserved
7033     level_idc= get_bits(&s->gb, 8);
7034     sps_id= get_ue_golomb(&s->gb);
7035
7036     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7037     if(sps == NULL)
7038         return -1;
7039
7040     sps->profile_idc= profile_idc;
7041     sps->level_idc= level_idc;
7042
7043     if(sps->profile_idc >= 100){ //high profile
7044         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7045             get_bits1(&s->gb);  //residual_color_transform_flag
7046         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7047         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7048         sps->transform_bypass = get_bits1(&s->gb);
7049         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7050     }else
7051         sps->scaling_matrix_present = 0;
7052
7053     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7054     sps->poc_type= get_ue_golomb(&s->gb);
7055
7056     if(sps->poc_type == 0){ //FIXME #define
7057         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7058     } else if(sps->poc_type == 1){//FIXME #define
7059         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7060         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7061         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7062         tmp= get_ue_golomb(&s->gb);
7063
7064         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7065             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7066             return -1;
7067         }
7068         sps->poc_cycle_length= tmp;
7069
7070         for(i=0; i<sps->poc_cycle_length; i++)
7071             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7072     }else if(sps->poc_type != 2){
7073         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7074         return -1;
7075     }
7076
7077     tmp= get_ue_golomb(&s->gb);
7078     if(tmp > MAX_PICTURE_COUNT-2){
7079         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7080     }
7081     sps->ref_frame_count= tmp;
7082     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7083     mb_width= get_ue_golomb(&s->gb) + 1;
7084     mb_height= get_ue_golomb(&s->gb) + 1;
7085     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7086        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7087         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7088         return -1;
7089     }
7090     sps->mb_width = mb_width;
7091     sps->mb_height= mb_height;
7092
7093     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7094     if(!sps->frame_mbs_only_flag)
7095         sps->mb_aff= get_bits1(&s->gb);
7096     else
7097         sps->mb_aff= 0;
7098
7099     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7100
7101 #ifndef ALLOW_INTERLACE
7102     if(sps->mb_aff)
7103         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7104 #endif
7105     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7106         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7107
7108     sps->crop= get_bits1(&s->gb);
7109     if(sps->crop){
7110         sps->crop_left  = get_ue_golomb(&s->gb);
7111         sps->crop_right = get_ue_golomb(&s->gb);
7112         sps->crop_top   = get_ue_golomb(&s->gb);
7113         sps->crop_bottom= get_ue_golomb(&s->gb);
7114         if(sps->crop_left || sps->crop_top){
7115             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7116         }
7117     }else{
7118         sps->crop_left  =
7119         sps->crop_right =
7120         sps->crop_top   =
7121         sps->crop_bottom= 0;
7122     }
7123
7124     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7125     if( sps->vui_parameters_present_flag )
7126         decode_vui_parameters(h, sps);
7127
7128     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7129         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7130                sps_id, sps->profile_idc, sps->level_idc,
7131                sps->poc_type,
7132                sps->ref_frame_count,
7133                sps->mb_width, sps->mb_height,
7134                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7135                sps->direct_8x8_inference_flag ? "8B8" : "",
7136                sps->crop_left, sps->crop_right,
7137                sps->crop_top, sps->crop_bottom,
7138                sps->vui_parameters_present_flag ? "VUI" : ""
7139                );
7140     }
7141     return 0;
7142 }
7143
7144 static void
7145 build_qp_table(PPS *pps, int t, int index)
7146 {
7147     int i;
7148     for(i = 0; i < 255; i++)
7149         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7150 }
7151
7152 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7153     MpegEncContext * const s = &h->s;
7154     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7155     PPS *pps;
7156
7157     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7158     if(pps == NULL)
7159         return -1;
7160
7161     tmp= get_ue_golomb(&s->gb);
7162     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7163         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7164         return -1;
7165     }
7166     pps->sps_id= tmp;
7167
7168     pps->cabac= get_bits1(&s->gb);
7169     pps->pic_order_present= get_bits1(&s->gb);
7170     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7171     if(pps->slice_group_count > 1 ){
7172         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7173         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7174         switch(pps->mb_slice_group_map_type){
7175         case 0:
7176 #if 0
7177 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7178 |    run_length[ i ]                                |1  |ue(v)   |
7179 #endif
7180             break;
7181         case 2:
7182 #if 0
7183 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7184 |{                                                  |   |        |
7185 |    top_left_mb[ i ]                               |1  |ue(v)   |
7186 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7187 |   }                                               |   |        |
7188 #endif
7189             break;
7190         case 3:
7191         case 4:
7192         case 5:
7193 #if 0
7194 |   slice_group_change_direction_flag               |1  |u(1)    |
7195 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7196 #endif
7197             break;
7198         case 6:
7199 #if 0
7200 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7201 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7202 |)                                                  |   |        |
7203 |    slice_group_id[ i ]                            |1  |u(v)    |
7204 #endif
7205             break;
7206         }
7207     }
7208     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7209     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7210     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7211         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7212         pps->ref_count[0]= pps->ref_count[1]= 1;
7213         return -1;
7214     }
7215
7216     pps->weighted_pred= get_bits1(&s->gb);
7217     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7218     pps->init_qp= get_se_golomb(&s->gb) + 26;
7219     pps->init_qs= get_se_golomb(&s->gb) + 26;
7220     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7221     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7222     pps->constrained_intra_pred= get_bits1(&s->gb);
7223     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7224
7225     pps->transform_8x8_mode= 0;
7226     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7227     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7228     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7229
7230     if(get_bits_count(&s->gb) < bit_length){
7231         pps->transform_8x8_mode= get_bits1(&s->gb);
7232         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7233         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7234     } else {
7235         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7236     }
7237
7238     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7239     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7240         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7241         h->pps.chroma_qp_diff= 1;
7242     } else
7243         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7244
7245     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7246         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7247                pps_id, pps->sps_id,
7248                pps->cabac ? "CABAC" : "CAVLC",
7249                pps->slice_group_count,
7250                pps->ref_count[0], pps->ref_count[1],
7251                pps->weighted_pred ? "weighted" : "",
7252                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7253                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7254                pps->constrained_intra_pred ? "CONSTR" : "",
7255                pps->redundant_pic_cnt_present ? "REDU" : "",
7256                pps->transform_8x8_mode ? "8x8DCT" : ""
7257                );
7258     }
7259
7260     return 0;
7261 }
7262
7263 /**
7264  * Call decode_slice() for each context.
7265  *
7266  * @param h h264 master context
7267  * @param context_count number of contexts to execute
7268  */
7269 static void execute_decode_slices(H264Context *h, int context_count){
7270     MpegEncContext * const s = &h->s;
7271     AVCodecContext * const avctx= s->avctx;
7272     H264Context *hx;
7273     int i;
7274
7275     if(context_count == 1) {
7276         decode_slice(avctx, h);
7277     } else {
7278         for(i = 1; i < context_count; i++) {
7279             hx = h->thread_context[i];
7280             hx->s.error_resilience = avctx->error_resilience;
7281             hx->s.error_count = 0;
7282         }
7283
7284         avctx->execute(avctx, (void *)decode_slice,
7285                        (void **)h->thread_context, NULL, context_count);
7286
7287         /* pull back stuff from slices to master context */
7288         hx = h->thread_context[context_count - 1];
7289         s->mb_x = hx->s.mb_x;
7290         s->mb_y = hx->s.mb_y;
7291         for(i = 1; i < context_count; i++)
7292             h->s.error_count += h->thread_context[i]->s.error_count;
7293     }
7294 }
7295
7296
7297 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7298     MpegEncContext * const s = &h->s;
7299     AVCodecContext * const avctx= s->avctx;
7300     int buf_index=0;
7301     H264Context *hx; ///< thread context
7302     int context_count = 0;
7303
7304     h->max_contexts = avctx->thread_count;
7305 #if 0
7306     int i;
7307     for(i=0; i<50; i++){
7308         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7309     }
7310 #endif
7311     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7312         h->current_slice = 0;
7313         s->current_picture_ptr= NULL;
7314     }
7315
7316     for(;;){
7317         int consumed;
7318         int dst_length;
7319         int bit_length;
7320         uint8_t *ptr;
7321         int i, nalsize = 0;
7322         int err;
7323
7324         if(h->is_avc) {
7325             if(buf_index >= buf_size) break;
7326             nalsize = 0;
7327             for(i = 0; i < h->nal_length_size; i++)
7328                 nalsize = (nalsize << 8) | buf[buf_index++];
7329             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7330                 if(nalsize == 1){
7331                     buf_index++;
7332                     continue;
7333                 }else{
7334                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7335                     break;
7336                 }
7337             }
7338         } else {
7339             // start code prefix search
7340             for(; buf_index + 3 < buf_size; buf_index++){
7341                 // This should always succeed in the first iteration.
7342                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7343                     break;
7344             }
7345
7346             if(buf_index+3 >= buf_size) break;
7347
7348             buf_index+=3;
7349         }
7350
7351         hx = h->thread_context[context_count];
7352
7353         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7354         if (ptr==NULL || dst_length < 0){
7355             return -1;
7356         }
7357         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7358             dst_length--;
7359         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7360
7361         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7362             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7363         }
7364
7365         if (h->is_avc && (nalsize != consumed))
7366             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7367
7368         buf_index += consumed;
7369
7370         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7371            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7372             continue;
7373
7374       again:
7375         err = 0;
7376         switch(hx->nal_unit_type){
7377         case NAL_IDR_SLICE:
7378             if (h->nal_unit_type != NAL_IDR_SLICE) {
7379                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7380                 return -1;
7381             }
7382             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7383         case NAL_SLICE:
7384             init_get_bits(&hx->s.gb, ptr, bit_length);
7385             hx->intra_gb_ptr=
7386             hx->inter_gb_ptr= &hx->s.gb;
7387             hx->s.data_partitioning = 0;
7388
7389             if((err = decode_slice_header(hx, h)))
7390                break;
7391
7392             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7393             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7394                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7395                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7396                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7397                && avctx->skip_frame < AVDISCARD_ALL)
7398                 context_count++;
7399             break;
7400         case NAL_DPA:
7401             init_get_bits(&hx->s.gb, ptr, bit_length);
7402             hx->intra_gb_ptr=
7403             hx->inter_gb_ptr= NULL;
7404             hx->s.data_partitioning = 1;
7405
7406             err = decode_slice_header(hx, h);
7407             break;
7408         case NAL_DPB:
7409             init_get_bits(&hx->intra_gb, ptr, bit_length);
7410             hx->intra_gb_ptr= &hx->intra_gb;
7411             break;
7412         case NAL_DPC:
7413             init_get_bits(&hx->inter_gb, ptr, bit_length);
7414             hx->inter_gb_ptr= &hx->inter_gb;
7415
7416             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7417                && s->context_initialized
7418                && s->hurry_up < 5
7419                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7420                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7421                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7422                && avctx->skip_frame < AVDISCARD_ALL)
7423                 context_count++;
7424             break;
7425         case NAL_SEI:
7426             init_get_bits(&s->gb, ptr, bit_length);
7427             decode_sei(h);
7428             break;
7429         case NAL_SPS:
7430             init_get_bits(&s->gb, ptr, bit_length);
7431             decode_seq_parameter_set(h);
7432
7433             if(s->flags& CODEC_FLAG_LOW_DELAY)
7434                 s->low_delay=1;
7435
7436             if(avctx->has_b_frames < 2)
7437                 avctx->has_b_frames= !s->low_delay;
7438             break;
7439         case NAL_PPS:
7440             init_get_bits(&s->gb, ptr, bit_length);
7441
7442             decode_picture_parameter_set(h, bit_length);
7443
7444             break;
7445         case NAL_AUD:
7446         case NAL_END_SEQUENCE:
7447         case NAL_END_STREAM:
7448         case NAL_FILLER_DATA:
7449         case NAL_SPS_EXT:
7450         case NAL_AUXILIARY_SLICE:
7451             break;
7452         default:
7453             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7454         }
7455
7456         if(context_count == h->max_contexts) {
7457             execute_decode_slices(h, context_count);
7458             context_count = 0;
7459         }
7460
7461         if (err < 0)
7462             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7463         else if(err == 1) {
7464             /* Slice could not be decoded in parallel mode, copy down
7465              * NAL unit stuff to context 0 and restart. Note that
7466              * rbsp_buffer is not transfered, but since we no longer
7467              * run in parallel mode this should not be an issue. */
7468             h->nal_unit_type = hx->nal_unit_type;
7469             h->nal_ref_idc   = hx->nal_ref_idc;
7470             hx = h;
7471             goto again;
7472         }
7473     }
7474     if(context_count)
7475         execute_decode_slices(h, context_count);
7476     return buf_index;
7477 }
7478
7479 /**
7480  * returns the number of bytes consumed for building the current frame
7481  */
7482 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7483     if(s->flags&CODEC_FLAG_TRUNCATED){
7484         pos -= s->parse_context.last_index;
7485         if(pos<0) pos=0; // FIXME remove (unneeded?)
7486
7487         return pos;
7488     }else{
7489         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7490         if(pos+10>buf_size) pos=buf_size; // oops ;)
7491
7492         return pos;
7493     }
7494 }
7495
7496 static int decode_frame(AVCodecContext *avctx,
7497                              void *data, int *data_size,
7498                              uint8_t *buf, int buf_size)
7499 {
7500     H264Context *h = avctx->priv_data;
7501     MpegEncContext *s = &h->s;
7502     AVFrame *pict = data;
7503     int buf_index;
7504
7505     s->flags= avctx->flags;
7506     s->flags2= avctx->flags2;
7507
7508    /* no supplementary picture */
7509     if (buf_size == 0) {
7510         Picture *out;
7511         int i, out_idx;
7512
7513 //FIXME factorize this with the output code below
7514         out = h->delayed_pic[0];
7515         out_idx = 0;
7516         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7517             if(h->delayed_pic[i]->poc < out->poc){
7518                 out = h->delayed_pic[i];
7519                 out_idx = i;
7520             }
7521
7522         for(i=out_idx; h->delayed_pic[i]; i++)
7523             h->delayed_pic[i] = h->delayed_pic[i+1];
7524
7525         if(out){
7526             *data_size = sizeof(AVFrame);
7527             *pict= *(AVFrame*)out;
7528         }
7529
7530         return 0;
7531     }
7532
7533     if(s->flags&CODEC_FLAG_TRUNCATED){
7534         int next= ff_h264_find_frame_end(h, buf, buf_size);
7535
7536         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7537             return buf_size;
7538 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7539     }
7540
7541     if(h->is_avc && !h->got_avcC) {
7542         int i, cnt, nalsize;
7543         unsigned char *p = avctx->extradata;
7544         if(avctx->extradata_size < 7) {
7545             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7546             return -1;
7547         }
7548         if(*p != 1) {
7549             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7550             return -1;
7551         }
7552         /* sps and pps in the avcC always have length coded with 2 bytes,
7553            so put a fake nal_length_size = 2 while parsing them */
7554         h->nal_length_size = 2;
7555         // Decode sps from avcC
7556         cnt = *(p+5) & 0x1f; // Number of sps
7557         p += 6;
7558         for (i = 0; i < cnt; i++) {
7559             nalsize = AV_RB16(p) + 2;
7560             if(decode_nal_units(h, p, nalsize) < 0) {
7561                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7562                 return -1;
7563             }
7564             p += nalsize;
7565         }
7566         // Decode pps from avcC
7567         cnt = *(p++); // Number of pps
7568         for (i = 0; i < cnt; i++) {
7569             nalsize = AV_RB16(p) + 2;
7570             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7571                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7572                 return -1;
7573             }
7574             p += nalsize;
7575         }
7576         // Now store right nal length size, that will be use to parse all other nals
7577         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7578         // Do not reparse avcC
7579         h->got_avcC = 1;
7580     }
7581
7582     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7583         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7584             return -1;
7585     }
7586
7587     buf_index=decode_nal_units(h, buf, buf_size);
7588     if(buf_index < 0)
7589         return -1;
7590
7591     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7592         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7593         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7594         return -1;
7595     }
7596
7597     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7598         Picture *out = s->current_picture_ptr;
7599         Picture *cur = s->current_picture_ptr;
7600         Picture *prev = h->delayed_output_pic;
7601         int i, pics, cross_idr, out_of_order, out_idx;
7602
7603         s->mb_y= 0;
7604
7605         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7606         s->current_picture_ptr->pict_type= s->pict_type;
7607
7608         h->prev_frame_num_offset= h->frame_num_offset;
7609         h->prev_frame_num= h->frame_num;
7610         if(s->current_picture_ptr->reference & s->picture_structure){
7611             h->prev_poc_msb= h->poc_msb;
7612             h->prev_poc_lsb= h->poc_lsb;
7613             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7614         }
7615
7616         ff_er_frame_end(s);
7617
7618         MPV_frame_end(s);
7619
7620     //FIXME do something with unavailable reference frames
7621
7622 #if 0 //decode order
7623         *data_size = sizeof(AVFrame);
7624 #else
7625         /* Sort B-frames into display order */
7626
7627         if(h->sps.bitstream_restriction_flag
7628            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7629             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7630             s->low_delay = 0;
7631         }
7632
7633         pics = 0;
7634         while(h->delayed_pic[pics]) pics++;
7635
7636         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7637
7638         h->delayed_pic[pics++] = cur;
7639         if(cur->reference == 0)
7640             cur->reference = DELAYED_PIC_REF;
7641
7642         cross_idr = 0;
7643         for(i=0; h->delayed_pic[i]; i++)
7644             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7645                 cross_idr = 1;
7646
7647         out = h->delayed_pic[0];
7648         out_idx = 0;
7649         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7650             if(h->delayed_pic[i]->poc < out->poc){
7651                 out = h->delayed_pic[i];
7652                 out_idx = i;
7653             }
7654
7655         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7656         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7657             { }
7658         else if(prev && pics <= s->avctx->has_b_frames)
7659             out = prev;
7660         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7661            || (s->low_delay &&
7662             ((!cross_idr && prev && out->poc > prev->poc + 2)
7663              || cur->pict_type == B_TYPE)))
7664         {
7665             s->low_delay = 0;
7666             s->avctx->has_b_frames++;
7667             out = prev;
7668         }
7669         else if(out_of_order)
7670             out = prev;
7671
7672         if(out_of_order || pics > s->avctx->has_b_frames){
7673             for(i=out_idx; h->delayed_pic[i]; i++)
7674                 h->delayed_pic[i] = h->delayed_pic[i+1];
7675         }
7676
7677         if(prev == out)
7678             *data_size = 0;
7679         else
7680             *data_size = sizeof(AVFrame);
7681         if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7682             prev->reference = 0;
7683         h->delayed_output_pic = out;
7684 #endif
7685
7686         if(out)
7687             *pict= *(AVFrame*)out;
7688         else
7689             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7690     }
7691
7692     assert(pict->data[0] || !*data_size);
7693     ff_print_debug_info(s, pict);
7694 //printf("out %d\n", (int)pict->data[0]);
7695 #if 0 //?
7696
7697     /* Return the Picture timestamp as the frame number */
7698     /* we substract 1 because it is added on utils.c    */
7699     avctx->frame_number = s->picture_number - 1;
7700 #endif
7701     return get_consumed_bytes(s, buf_index, buf_size);
7702 }
#if 0
/**
 * Disabled code: fill h->mb_avail[] for the current macroblock.
 *
 * Entries 0..2 describe the top-left, top and top-right neighbours and
 * entry 3 the left neighbour; a neighbour counts as available when it lies
 * inside the picture and h->slice_table[] holds h->slice_num for it.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(s->mb_y){
        h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
    }else{
        /* first macroblock row: nothing above is available */
        h->mb_avail[0]=
        h->mb_avail[1]=
        h->mb_avail[2]= 0;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7722
#if 0 //selftest
/*
 * Disabled self test. When enabled it exercises, in this order:
 *  - unsigned exp-golomb write/read round trips for the values 0..COUNT-1,
 *  - signed exp-golomb round trips for the values -COUNT/2..COUNT/2-1,
 *  - a 4x4 forward DCT followed by the inverse transform on random data,
 *    printing the mean squared and the maximum error,
 *  - encode_nal()/decode_nal() round trips on random data with an
 *    increasing number of zero bytes.
 * Returns 0 on success and -1 when a NAL round trip fails; golomb
 * mismatches are only printed.
 *
 * NOTE(review): main() declares no context named `s` (only an int `s` local
 * to the golomb loops), so the `s->dsp.h264_idct_add` call below looks
 * bit-rotted; this block probably does not compile as-is.
 */
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the mismatch message */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        /* accumulate squared and maximum reconstruction error */
        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from non-zero random bytes ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7896
7897
7898 static int decode_end(AVCodecContext *avctx)
7899 {
7900     H264Context *h = avctx->priv_data;
7901     MpegEncContext *s = &h->s;
7902
7903     av_freep(&h->rbsp_buffer[0]);
7904     av_freep(&h->rbsp_buffer[1]);
7905     free_tables(h); //FIXME cleanup init stuff perhaps
7906     MPV_common_end(s);
7907
7908 //    memset(h, 0, sizeof(H264Context));
7909
7910     return 0;
7911 }
7912
7913
/**
 * Codec descriptor for the H.264 decoder.
 *
 * All members except .flush are initialised positionally, so their meaning
 * depends on the member order of AVCodec, which is declared outside this
 * file section.
 */
AVCodec h264_decoder = {
    "h264",
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context),
    decode_init,
    NULL, // presumably the encode callback, unused by a decoder - confirm against AVCodec
    decode_end,
    decode_frame,
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
    .flush= flush_dpb,
};
7926
7927 #include "svq3.c"