git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 /**
  42  * Value of Picture.reference when Picture is not a reference picture, but
  43  * is held for delayed output.
  44  */
  45 #define DELAYED_PIC_REF 4
  46
     /*
      * File-scope VLC tables.
      * NOTE(review): judging by the names these are the coefficient-token,
      * total-zeros and run tables used for residual decoding; they are not
      * initialised in this section of the file -- confirm where that happens.
      */
  47 static VLC coeff_token_vlc[4];
  48 static VLC chroma_dc_coeff_token_vlc;
  49
  50 static VLC total_zeros_vlc[15];
  51 static VLC chroma_dc_total_zeros_vlc[3];
  52
  53 static VLC run_vlc[6];
  54 static VLC run7_vlc;
  55
     /* Forward declarations; the definitions are not part of this section of the file. */
  56 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  57 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  58 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  59 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60
  /**
   * Packs the low 16 bits of two values into one 32-bit word.
   *
   * The halves are ordered so that, once the word is stored to memory, a
   * occupies the lower-addressed 16 bits and b the higher-addressed ones on
   * both little- and big-endian builds (selected by WORDS_BIGENDIAN).
   *
   * The arithmetic is done on uint32_t: the callers in this file pass int8_t
   * reference indices, which may be negative, and left-shifting a negative
   * int is undefined behaviour in C.
   *
   * @param a value for the lower-addressed half (only its low 16 bits are used)
   * @param b value for the higher-addressed half (only its low 16 bits are used)
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  68
  /** Remainder lookup table: ff_rem6[i] == i % 6 for i = 0..51. */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5, /*  0.. 5 */
      0, 1, 2, 3, 4, 5, /*  6..11 */
      0, 1, 2, 3, 4, 5, /* 12..17 */
      0, 1, 2, 3, 4, 5, /* 18..23 */
      0, 1, 2, 3, 4, 5, /* 24..29 */
      0, 1, 2, 3, 4, 5, /* 30..35 */
      0, 1, 2, 3, 4, 5, /* 36..41 */
      0, 1, 2, 3, 4, 5, /* 42..47 */
      0, 1, 2, 3,       /* 48..51 */
  };
  72
  /** Quotient lookup table: ff_div6[i] == i / 6 for i = 0..51. */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0, /*  0.. 5 */
      1, 1, 1, 1, 1, 1, /*  6..11 */
      2, 2, 2, 2, 2, 2, /* 12..17 */
      3, 3, 3, 3, 3, 3, /* 18..23 */
      4, 4, 4, 4, 4, 4, /* 24..29 */
      5, 5, 5, 5, 5, 5, /* 30..35 */
      6, 6, 6, 6, 6, 6, /* 36..41 */
      7, 7, 7, 7, 7, 7, /* 42..47 */
      8, 8, 8, 8,       /* 48..51 */
  };
  76
  77
  /**
   * fill a rectangle with a constant value.
   *
   * All stores are fully unrolled; the function relies on w, h and size being
   * compile-time constants at the (inlined) call site.
   *
   * @param vp pointer to the top left element; must be aligned to
   *           FFMIN(w*size, STRIDE_ALIGN) bytes (checked by assert)
   * @param w width of the rectangle in elements, should be a constant;
   *          at most 4, and w*size must be 2, 4, 8 or 16
   * @param h height of the rectangle, should be a constant; 1, 2 or 4
   *          (a 16 byte wide rectangle has no h==1 case)
   * @param stride distance between two rows, in elements
   * @param val the value to store; with size==1 it is replicated into every
   *            byte by multiplication, so it is presumably expected to fit
   *            into 8 bits -- confirm against the callers
   * @param size the size of val (1 or 4), should be a constant
   */
  static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
      uint8_t *p= (uint8_t*)vp;
      assert(size==1 || size==4);
      assert(w<=4);

      /* from here on w and stride are measured in bytes */
      w      *= size;
      stride *= size;

      /* uintptr_t instead of long: long is narrower than a pointer on LLP64 targets */
      assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
      assert((stride&(w-1))==0);
      if(w==2){
          const uint16_t v= size==4 ? val : val*0x0101;
          *(uint16_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint16_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint16_t*)(p + 2*stride)= v;
          *(uint16_t*)(p + 3*stride)= v;
      }else if(w==4){
          const uint32_t v= size==4 ? val : val*0x01010101;
          *(uint32_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint32_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint32_t*)(p + 2*stride)= v;
          *(uint32_t*)(p + 3*stride)= v;
      }else if(w==8){
      //gcc can't optimize 64bit math on x86_32
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint64_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 2*stride)= v;
          *(uint64_t*)(p + 3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          if(h==1) return;
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
  #endif
      }else if(w==16){
          /* note: the first two rows are always written, there is no h==1 exit */
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0+0*stride)= v;
          *(uint64_t*)(p + 8+0*stride)= v;
          *(uint64_t*)(p + 0+1*stride)= v;
          *(uint64_t*)(p + 8+1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 0+2*stride)= v;
          *(uint64_t*)(p + 8+2*stride)= v;
          *(uint64_t*)(p + 0+3*stride)= v;
          *(uint64_t*)(p + 8+3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)= val;
          *(uint32_t*)(p + 4+0*stride)= val;
          *(uint32_t*)(p + 8+0*stride)= val;
          *(uint32_t*)(p +12+0*stride)= val;
          *(uint32_t*)(p + 0+1*stride)= val;
          *(uint32_t*)(p + 4+1*stride)= val;
          *(uint32_t*)(p + 8+1*stride)= val;
          *(uint32_t*)(p +12+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)= val;
          *(uint32_t*)(p + 4+2*stride)= val;
          *(uint32_t*)(p + 8+2*stride)= val;
          *(uint32_t*)(p +12+2*stride)= val;
          *(uint32_t*)(p + 0+3*stride)= val;
          *(uint32_t*)(p + 4+3*stride)= val;
          *(uint32_t*)(p + 8+3*stride)= val;
          *(uint32_t*)(p +12+3*stride)= val;
  #endif
      }else
          assert(0);
      /* every branch returns early for the smaller heights, so only h==4 gets here */
      assert(h==4);
  }
 165
 166 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     /*
      * Loads the neighbour information of the macroblock at (s->mb_x, s->mb_y)
      * into the caches in h: sample availability masks, intra4x4 prediction
      * modes, non-zero coefficient counts, coded block patterns, motion
      * vectors, reference indices, mvd values and direct flags.
      *
      * h           decoder context; neighbour data is read from h and from
      *             s->current_picture and written to the *_cache members of h
      * mb_type     type flags of the current macroblock
      * for_deblock when non-zero, top/left neighbours are accepted from any
      *             slice whose slice_table entry is below 255 instead of only
      *             from the current slice, and the diagonal neighbours are
      *             ignored (NOTE(review): presumably set by the loop filter
      *             callers -- confirm)
      */
 167     MpegEncContext * const s = &h->s;
 168     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 169     int topleft_xy, top_xy, topright_xy, left_xy[2];
 170     int topleft_type, top_type, topright_type, left_type[2];
 171     int left_block[8];
 172     int i;
 173
 174     //FIXME deblocking could skip the intra and nnz parts.
         /* With for_deblock set and no MBAFF, nothing is loaded when this is
          * slice number 1 or when the macroblock one row up has the same
          * slice_table entry as the current one. */
 175     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 176         return;
 177
 178     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 179
         /* Default neighbour positions; the row distance to the top neighbour
          * is mb_stride shifted left by FIELD_PICTURE. */
 180     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 181     topleft_xy = top_xy - 1;
 182     topright_xy= top_xy + 1;
 183     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[] selects which per-macroblock entries of the left
          * neighbour(s) are copied below: [0..3] feed cache column 3 (rows 1..4),
          * [4..7] feed cache column 0 (presumably the chroma entries -- confirm). */
 184     left_block[0]= 0;
 185     left_block[1]= 1;
 186     left_block[2]= 2;
 187     left_block[3]= 3;
 188     left_block[4]= 7;
 189     left_block[5]= 10;
 190     left_block[6]= 8;
 191     left_block[7]= 11;
         /* MBAFF: neighbours are looked up relative to the macroblock pair; the
          * positions and left_block[] are adjusted depending on whether the
          * current macroblock and the neighbouring pairs are frame or field
          * coded (IS_INTERLACED) and on whether this is the bottom macroblock
          * of its pair (odd mb_y). */
 192     if(FRAME_MBAFF){
 193         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 194         const int top_pair_xy      = pair_xy     - s->mb_stride;
 195         const int topleft_pair_xy  = top_pair_xy - 1;
 196         const int topright_pair_xy = top_pair_xy + 1;
 197         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 198         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 199         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 200         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 201         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 202         const int bottom = (s->mb_y & 1);
 203         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 207                 ) {
 208             top_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 213                 ) {
 214             topleft_xy -= s->mb_stride;
 215         }
 216         if (bottom
 217                 ? !curr_mb_frame_flag // bottom macroblock
 218                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 219                 ) {
 220             topright_xy -= s->mb_stride;
 221         }
             /* left pair coded differently from the current macroblock: both
              * left_xy entries start at the top macroblock of the left pair */
 222         if (left_mb_frame_flag != curr_mb_frame_flag) {
 223             left_xy[1] = left_xy[0] = pair_xy - 1;
 224             if (curr_mb_frame_flag) {
 225                 if (bottom) {
 226                     left_block[0]= 2;
 227                     left_block[1]= 2;
 228                     left_block[2]= 3;
 229                     left_block[3]= 3;
 230                     left_block[4]= 8;
 231                     left_block[5]= 11;
 232                     left_block[6]= 8;
 233                     left_block[7]= 11;
 234                 } else {
 235                     left_block[0]= 0;
 236                     left_block[1]= 0;
 237                     left_block[2]= 1;
 238                     left_block[3]= 1;
 239                     left_block[4]= 7;
 240                     left_block[5]= 10;
 241                     left_block[6]= 7;
 242                     left_block[7]= 10;
 243                 }
 244             } else {
                     /* current macroblock is field coded, left pair is frame
                      * coded: the second left neighbour is the bottom macroblock
                      * of the left pair */
 245                 left_xy[1] += s->mb_stride;
 246                 //left_block[0]= 0;
 247                 left_block[1]= 2;
 248                 left_block[2]= 0;
 249                 left_block[3]= 2;
 250                 //left_block[4]= 7;
 251                 left_block[5]= 10;
 252                 left_block[6]= 7;
 253                 left_block[7]= 10;
 254             }
 255         }
 256     }
 257
         /* remember the chosen top/left neighbour positions in the context */
 258     h->top_mb_xy = top_xy;
 259     h->left_mb_xy[0] = left_xy[0];
 260     h->left_mb_xy[1] = left_xy[1];
 261     if(for_deblock){
             /* the diagonal neighbours are treated as absent; top/left count as
              * present whenever their slice_table entry is below 255 */
 262         topleft_type = 0;
 263         topright_type = 0;
 264         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 265         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 266         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 267
             /* MBAFF, non-intra: reload the cache entries of the current
              * macroblock itself from the data stored per picture */
 268         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 269             int list;
                 /* the 16 bits read at non_zero_count[mb_xy][14] hold one flag per
                  * block i, expanded here into a 0/1 cache entry each */
 270             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 271             for(i=0; i<16; i++)
 272                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 273             for(list=0; list<h->list_count; list++){
 274                 if(USES_LIST(mb_type,list)){
 275                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 276                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 277                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                         /* 4 rows of 4 motion vectors; a cache row is 8 entries wide */
 278                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 279                         dst[0] = src[0];
 280                         dst[1] = src[1];
 281                         dst[2] = src[2];
 282                         dst[3] = src[3];
 283                     }
                         /* every stored reference index covers a 2x2 group of cache entries */
 284                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 285                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 286                     ref += h->b8_stride;
 287                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 288                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 289                 }else{
 290                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 291                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 292                 }
 293             }
 294         }
 295     }else{
             /* a neighbour is only used if it belongs to the current slice;
              * otherwise its type is 0 */
 296         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 297         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 298         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 299         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 300         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 301     }
 302
         /* Intra macroblocks: build the sample availability bit masks. Bits are
          * cleared for a neighbour that is missing (type 0) or, with
          * constrained_intra_pred set, is not intra coded. */
 303     if(IS_INTRA(mb_type)){
 304         h->topleft_samples_available=
 305         h->top_samples_available=
 306         h->left_samples_available= 0xFFFF;
 307         h->topright_samples_available= 0xEEEA;
 308
 309         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 310             h->topleft_samples_available= 0xB3FF;
 311             h->top_samples_available= 0x33FF;
 312             h->topright_samples_available= 0x26EA;
 313         }
 314         for(i=0; i<2; i++){
 315             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 316                 h->topleft_samples_available&= 0xDF5F;
 317                 h->left_samples_available&= 0x5F5F;
 318             }
 319         }
 320
 321         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 322             h->topleft_samples_available&= 0x7FFF;
 323
 324         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 325             h->topright_samples_available&= 0xFBFF;
 326
             /* Intra 4x4: load the neighbours' prediction modes into cache row 0
              * and cache column 3. Neighbours that are not intra4x4 get -1 when
              * missing (or inter with constrained_intra_pred) and 2 otherwise
              * (NOTE(review): 2 is presumably the DC mode -- confirm). */
 327         if(IS_INTRA4x4(mb_type)){
 328             if(IS_INTRA4x4(top_type)){
 329                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 330                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 331                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 332                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 333             }else{
 334                 int pred;
 335                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 336                     pred= -1;
 337                 else{
 338                     pred= 2;
 339                 }
 340                 h->intra4x4_pred_mode_cache[4+8*0]=
 341                 h->intra4x4_pred_mode_cache[5+8*0]=
 342                 h->intra4x4_pred_mode_cache[6+8*0]=
 343                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 344             }
 345             for(i=0; i<2; i++){
 346                 if(IS_INTRA4x4(left_type[i])){
 347                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 348                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 349                 }else{
 350                     int pred;
 351                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 352                         pred= -1;
 353                     else{
 354                         pred= 2;
 355                     }
 356                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 357                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 358                 }
 359             }
 360         }
 361     }
 362
 363
 364 /*
 365 0 . T T. T T T T
 366 1 L . .L . . . .
 367 2 L . .L . . . .
 368 3 . T TL . . . .
 369 4 L . .L . . . .
 370 5 L . .. . . . .
 371 */
 372 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* Non-zero counts of the top neighbour, written to the cache positions
          * marked T in the diagram above. */
 373     if(top_type){
 374         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 375         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 376         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 377         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 378
 379         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 380         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 381
 382         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 383         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 384
 385     }else{
             /* no top neighbour: 0 for non-intra macroblocks when pps.cabac is
              * set, 64 otherwise */
 386         h->non_zero_count_cache[4+8*0]=
 387         h->non_zero_count_cache[5+8*0]=
 388         h->non_zero_count_cache[6+8*0]=
 389         h->non_zero_count_cache[7+8*0]=
 390
 391         h->non_zero_count_cache[1+8*0]=
 392         h->non_zero_count_cache[2+8*0]=
 393
 394         h->non_zero_count_cache[1+8*3]=
 395         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 396
 397     }
 398
         /* Non-zero counts of the two left neighbours (positions marked L),
          * using the same fallback value as for a missing top neighbour. */
 399     for (i=0; i<2; i++) {
 400         if(left_type[i]){
 401             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 402             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 403             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 404             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 405         }else{
 406             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 407             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 408             h->non_zero_count_cache[0+8*1 +   8*i]=
 409             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 410         }
 411     }
 412
         /* CABAC only: coded block patterns of the top and left neighbours. A
          * missing neighbour yields 0x1C0 for an intra macroblock and 0 otherwise. */
 413     if( h->pps.cabac ) {
 414         // top_cbp
 415         if(top_type) {
 416             h->top_cbp = h->cbp_table[top_xy];
 417         } else if(IS_INTRA(mb_type)) {
 418             h->top_cbp = 0x1C0;
 419         } else {
 420             h->top_cbp = 0;
 421         }
 422         // left_cbp
 423         if (left_type[0]) {
 424             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 425         } else if(IS_INTRA(mb_type)) {
 426             h->left_cbp = 0x1C0;
 427         } else {
 428             h->left_cbp = 0;
 429         }
             /* bits 1 and 3 of left_cbp are taken from the cbp bit of each left
              * neighbour that is selected through left_block[0] / left_block[2] */
 430         if (left_type[0]) {
 431             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 432         }
 433         if (left_type[1]) {
 434             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 435         }
 436     }
 437
 438 #if 1
         /* Inter and direct macroblocks: load the motion vectors and reference
          * indices of the neighbours, separately for every reference list. */
 439     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 440         int list;
 441         for(list=0; list<h->list_count; list++){
                 /* a list the macroblock does not use is skipped, unless the
                  * macroblock is direct or h->deblocking_filter is set */
 442             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 443                 /*if(!h->mv_cache_clean[list]){
 444                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 445                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 446                     h->mv_cache_clean[list]= 1;
 447                 }*/
 448                 continue;
 449             }
 450             h->mv_cache_clean[list]= 0;
 451
                 /* top neighbour: its last row of motion vectors and the two
                  * reference indices of its second b8 row go into the cache row
                  * above scan8[0] */
 452             if(USES_LIST(top_type, list)){
 453                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 454                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 455                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 456                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 457                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 458                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 459                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 460                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 461                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 462                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 463             }else{
                     /* zero vectors; the reference is LIST_NOT_USED if the
                      * neighbour exists, PART_NOT_AVAILABLE if it does not */
 464                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 465                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 466                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 467                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 468                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 469             }
 470
                 /* left neighbours: each fills two rows of the cache column to the
                  * left of scan8[0], read from the neighbour's last column */
 471             for(i=0; i<2; i++){
 472                 int cache_idx = scan8[0] - 1 + i*2*8;
 473                 if(USES_LIST(left_type[i], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 475                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 476                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 477                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 478                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 479                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 480                 }else{
 481                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 482                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 483                     h->ref_cache[list][cache_idx  ]=
 484                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 485                 }
 486             }
 487
                 /* outside MBAFF, the rest of this iteration is skipped for
                  * for_deblock calls and for direct macroblocks without spatial
                  * direct prediction */
 488             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 489                 continue;
 490
                 /* top-left neighbour: its bottom-right motion vector and reference */
 491             if(USES_LIST(topleft_type, list)){
 492                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 493                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 494                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 495                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 496             }else{
 497                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 498                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 499             }
 500
                 /* top-right neighbour: its bottom-left motion vector and reference */
 501             if(USES_LIST(topright_type, list)){
 502                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 503                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 504                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 505                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 506             }else{
 507                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 508                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 509             }
 510
                 /* outside MBAFF, skip and direct macroblocks need nothing further */
 511             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 512                 continue;
 513
                 /* mark these five cache positions as not available, with zero vectors */
 514             h->ref_cache[list][scan8[5 ]+1] =
 515             h->ref_cache[list][scan8[7 ]+1] =
 516             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 517             h->ref_cache[list][scan8[4 ]] =
 518             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 519             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 520             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 521             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 522             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 523             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 524
                 /* CABAC only: the mvd values of the top and left neighbours are
                  * loaded the same way from h->mvd_table (zero when the neighbour
                  * does not use this list) */
 525             if( h->pps.cabac ) {
 526                 /* XXX beurk, Load mvd */
 527                 if(USES_LIST(top_type, list)){
 528                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 529                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 530                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 531                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 532                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 533                 }else{
 534                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 535                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 536                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 537                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 538                 }
 539                 if(USES_LIST(left_type[0], list)){
 540                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 541                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 542                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 543                 }else{
 544                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 545                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 546                 }
 547                 if(USES_LIST(left_type[1], list)){
 548                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 549                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 550                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 551                 }else{
 552                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 553                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 554                 }
 555                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 556                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 557                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 558                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 559                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 560
                     /* B slices: clear the direct flags of the current macroblock
                      * and load those of the top and left neighbours (1 for a
                      * direct neighbour, the stored direct_table entry for an 8x8
                      * partitioned one, 0 otherwise) */
 561                 if(h->slice_type == B_TYPE){
 562                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 563
 564                     if(IS_DIRECT(top_type)){
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 566                     }else if(IS_8X8(top_type)){
 567                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 568                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 569                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 570                     }else{
 571                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 572                     }
 573
 574                     if(IS_DIRECT(left_type[0]))
 575                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 576                     else if(IS_8X8(left_type[0]))
 577                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 578                     else
 579                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 580
 581                     if(IS_DIRECT(left_type[1]))
 582                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 583                     else if(IS_8X8(left_type[1]))
 584                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 585                     else
 586                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 587                 }
 588             }
 589
                 /* MBAFF: rescale the cached neighbour entries whose frame/field
                  * coding differs from MB_FIELD. With MB_FIELD set, non-interlaced
                  * neighbours get their reference index doubled and the vertical
                  * mv/mvd component halved; otherwise interlaced neighbours get
                  * the reference index halved and the vertical component doubled.
                  * Entries with a negative reference index are left alone. */
 590             if(FRAME_MBAFF){
 591 #define MAP_MVS\
 592                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 593                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 594                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 595                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 596                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 597                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 598                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 599                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 600                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 601                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 602                 if(MB_FIELD){
 603 #define MAP_F2F(idx, mb_type)\
 604                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 605                         h->ref_cache[list][idx] <<= 1;\
 606                         h->mv_cache[list][idx][1] /= 2;\
 607                         h->mvd_cache[list][idx][1] /= 2;\
 608                     }
 609                     MAP_MVS
 610 #undef MAP_F2F
 611                 }else{
 612 #define MAP_F2F(idx, mb_type)\
 613                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 614                         h->ref_cache[list][idx] >>= 1;\
 615                         h->mv_cache[list][idx][1] <<= 1;\
 616                         h->mvd_cache[list][idx][1] <<= 1;\
 617                     }
 618                     MAP_MVS
 619 #undef MAP_F2F
 620                 }
 621             }
 622         }
 623     }
 624 #endif
 625
         /* number (0..2) of the top and first left neighbours flagged IS_8x8DCT */
 626     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 627 }
 628
 629 static inline void write_back_intra_pred_mode(H264Context *h){
 630     MpegEncContext * const s = &h->s;
 631     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 632
 633     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 634     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 635     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 636     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 637     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 638     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 639     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 640 }
 641
 642 /**
 643  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 644  */
 645 static inline int check_intra4x4_pred_mode(H264Context *h){
 646     MpegEncContext * const s = &h->s;
 647     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 648     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 649     int i;
 650
 651     if(!(h->top_samples_available&0x8000)){
 652         for(i=0; i<4; i++){
 653             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 654             if(status<0){
 655                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 656                 return -1;
 657             } else if(status){
 658                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 659             }
 660         }
 661     }
 662
 663     if(!(h->left_samples_available&0x8000)){
 664         for(i=0; i<4; i++){
 665             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 666             if(status<0){
 667                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 668                 return -1;
 669             } else if(status){
 670                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 671             }
 672         }
 673     }
 674
 675     return 0;
 676 } //FIXME cleanup like next
 677
 678 /**
 679  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 680  */
 681 static inline int check_intra_pred_mode(H264Context *h, int mode){
 682     MpegEncContext * const s = &h->s;
 683     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 684     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 685
 686     if(mode > 6U) {
 687         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 688         return -1;
 689     }
 690
 691     if(!(h->top_samples_available&0x8000)){
 692         mode= top[ mode ];
 693         if(mode<0){
 694             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 695             return -1;
 696         }
 697     }
 698
 699     if(!(h->left_samples_available&0x8000)){
 700         mode= left[ mode ];
 701         if(mode<0){
 702             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 703             return -1;
 704         }
 705     }
 706
 707     return mode;
 708 }
 709
 710 /**
 711  * gets the predicted intra4x4 prediction mode.
 712  */
 713 static inline int pred_intra_mode(H264Context *h, int n){
 714     const int index8= scan8[n];
 715     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 716     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 717     const int min= FFMIN(left, top);
 718
 719     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 720
 721     if(min<0) return DC_PRED;
 722     else      return min;
 723 }
 724
 725 static inline void write_back_non_zero_count(H264Context *h){
 726     MpegEncContext * const s = &h->s;
 727     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 728
 729     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 730     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 731     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 732     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 733     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 734     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 735     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 736
 737     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 738     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 739     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 740
 741     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 742     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 743     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 744
 745     if(FRAME_MBAFF){
 746         // store all luma nnzs, for deblocking
 747         int v = 0, i;
 748         for(i=0; i<16; i++)
 749             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 750         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 751     }
 752 }
 753
 754 /**
 755  * gets the predicted number of non zero coefficients.
 756  * @param n block index
 757  */
 758 static inline int pred_non_zero_count(H264Context *h, int n){
 759     const int index8= scan8[n];
 760     const int left= h->non_zero_count_cache[index8 - 1];
 761     const int top = h->non_zero_count_cache[index8 - 8];
 762     int i= left + top;
 763
 764     if(i<64) i= (i+1)>>1;
 765
 766     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 767
 768     return i&31;
 769 }
 770
/**
 * Fetches the motion vector and reference index of the diagonal neighbour
 * that the motion vector predictors use as candidate C.
 * The top-right cache entry (i - 8 + part_width) is used if it is not marked
 * PART_NOT_AVAILABLE, otherwise the top-left entry (i - 8 - 1).
 * With FRAME_MBAFF, some neighbour combinations are instead read directly from
 * the current picture, with the vertical MV component and the reference index
 * rescaled, and returned through the scratch cache entry scan8[0]-2.
 * @param C receives a pointer to the selected motion vector
 * @param i cache index of the partition (pred_motion passes scan8[n])
 * @param list reference list to read from
 * @param part_width partition width in cache columns
 * @return the reference index belonging to *C
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* zero the scratch entry and point *C at it; the generic code at the
         * end of the function overrides *C again if no special case returns */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        /* non-field macroblock on an odd row, partition in the top cache row,
         * top-right neighbour available */
        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the motion vector at 4x4 block position (X4,Y4) of the
 * current picture into the scratch entry, applies MV_OP to its vertical
 * component and returns the matching reference index with REF_OP applied.
 * It returns LIST_NOT_USED if that macroblock neither uses the list nor is
 * 8x8 partitioned. The macro always leaves the function. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right unavailable, partition in cache column 4, and the left
         * neighbour of the macroblock is available: take the diagonal
         * candidate from the left macroblock if its interlacing differs */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* generic case: top-right neighbour if available, else top-left */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 827
 828 /**
 829  * gets the predicted MV.
 830  * @param n the block index
 831  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 832  * @param mx the x component of the predicted motion vector
 833  * @param my the y component of the predicted motion vector
 834  */
 835 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 836     const int index8= scan8[n];
 837     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 838     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 839     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 840     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 841     const int16_t * C;
 842     int diagonal_ref, match_count;
 843
 844     assert(part_width==1 || part_width==2 || part_width==4);
 845
 846 /* mv_cache
 847   B . . A T T T T
 848   U . . L . . , .
 849   U . . L . . . .
 850   U . . L . . , .
 851   . . . L . . . .
 852 */
 853
 854     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 855     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 856     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 857     if(match_count > 1){ //most common
 858         *mx= mid_pred(A[0], B[0], C[0]);
 859         *my= mid_pred(A[1], B[1], C[1]);
 860     }else if(match_count==1){
 861         if(left_ref==ref){
 862             *mx= A[0];
 863             *my= A[1];
 864         }else if(top_ref==ref){
 865             *mx= B[0];
 866             *my= B[1];
 867         }else{
 868             *mx= C[0];
 869             *my= C[1];
 870         }
 871     }else{
 872         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 873             *mx= A[0];
 874             *my= A[1];
 875         }else{
 876             *mx= mid_pred(A[0], B[0], C[0]);
 877             *my= mid_pred(A[1], B[1], C[1]);
 878         }
 879     }
 880
 881     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 882 }
 883
 884 /**
 885  * gets the directionally predicted 16x8 MV.
 886  * @param n the block index
 887  * @param mx the x component of the predicted motion vector
 888  * @param my the y component of the predicted motion vector
 889  */
 890 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 891     if(n==0){
 892         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 893         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 894
 895         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 896
 897         if(top_ref == ref){
 898             *mx= B[0];
 899             *my= B[1];
 900             return;
 901         }
 902     }else{
 903         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 904         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 905
 906         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 907
 908         if(left_ref == ref){
 909             *mx= A[0];
 910             *my= A[1];
 911             return;
 912         }
 913     }
 914
 915     //RARE
 916     pred_motion(h, n, 4, list, ref, mx, my);
 917 }
 918
 919 /**
 920  * gets the directionally predicted 8x16 MV.
 921  * @param n the block index
 922  * @param mx the x component of the predicted motion vector
 923  * @param my the y component of the predicted motion vector
 924  */
 925 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 926     if(n==0){
 927         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 928         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 929
 930         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 931
 932         if(left_ref == ref){
 933             *mx= A[0];
 934             *my= A[1];
 935             return;
 936         }
 937     }else{
 938         const int16_t * C;
 939         int diagonal_ref;
 940
 941         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 942
 943         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 944
 945         if(diagonal_ref == ref){
 946             *mx= C[0];
 947             *my= C[1];
 948             return;
 949         }
 950     }
 951
 952     //RARE
 953     pred_motion(h, n, 2, list, ref, mx, my);
 954 }
 955
 956 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 957     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 958     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 959
 960     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 961
 962     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 963        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 964        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 965
 966         *mx = *my = 0;
 967         return;
 968     }
 969
 970     pred_motion(h, 0, 4, 0, 0, mx, my);
 971
 972     return;
 973 }
 974
 975 static inline void direct_dist_scale_factor(H264Context * const h){
 976     const int poc = h->s.current_picture_ptr->poc;
 977     const int poc1 = h->ref_list[1][0].poc;
 978     int i;
 979     for(i=0; i<h->ref_count[0]; i++){
 980         int poc0 = h->ref_list[0][i].poc;
 981         int td = av_clip(poc1 - poc0, -128, 127);
 982         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 983             h->dist_scale_factor[i] = 256;
 984         }else{
 985             int tb = av_clip(poc - poc0, -128, 127);
 986             int tx = (16384 + (FFABS(td) >> 1)) / td;
 987             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 988         }
 989     }
 990     if(FRAME_MBAFF){
 991         for(i=0; i<h->ref_count[0]; i++){
 992             h->dist_scale_factor_field[2*i] =
 993             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 994         }
 995     }
 996 }
 997 static inline void direct_ref_list_init(H264Context * const h){
 998     MpegEncContext * const s = &h->s;
 999     Picture * const ref1 = &h->ref_list[1][0];
1000     Picture * const cur = s->current_picture_ptr;
1001     int list, i, j;
1002     if(cur->pict_type == I_TYPE)
1003         cur->ref_count[0] = 0;
1004     if(cur->pict_type != B_TYPE)
1005         cur->ref_count[1] = 0;
1006     for(list=0; list<2; list++){
1007         cur->ref_count[list] = h->ref_count[list];
1008         for(j=0; j<h->ref_count[list]; j++)
1009             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1010     }
1011     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1012         return;
1013     for(list=0; list<2; list++){
1014         for(i=0; i<ref1->ref_count[list]; i++){
1015             const int poc = ref1->ref_poc[list][i];
1016             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1017             for(j=0; j<h->ref_count[list]; j++)
1018                 if(h->ref_list[list][j].poc == poc){
1019                     h->map_col_to_list0[list][i] = j;
1020                     break;
1021                 }
1022         }
1023     }
1024     if(FRAME_MBAFF){
1025         for(list=0; list<2; list++){
1026             for(i=0; i<ref1->ref_count[list]; i++){
1027                 j = h->map_col_to_list0[list][i];
1028                 h->map_col_to_list0_field[list][2*i] = 2*j;
1029                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1030             }
1031         }
1032     }
1033 }
1034
/**
 * Derives the motion vectors and reference indices of a direct predicted
 * macroblock (or of its direct predicted 8x8 sub-blocks) and stores them in
 * h->mv_cache and h->ref_cache.
 * The co-located data is read from the first list 1 reference picture.
 * Spatial prediction is used if h->direct_spatial_mv_pred is set, temporal
 * prediction otherwise.
 * @param mb_type in: type of the current macroblock; out: type adjusted to the
 *                partitioning and lists actually used by the prediction
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    /* type, motion vectors and reference indices of the co-located macroblock */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    /* set if only some 8x8 sub-blocks of the macroblock are direct predicted */
    const int is_b8x8 = IS_8X8(*mb_type);
    unsigned int sub_mb_type;
    int i8, i4;

    /* choose the partitioning used for the prediction from the co-located type */
#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;
    if(MB_FIELD)
        *mb_type |= MB_TYPE_INTERLACED;

    tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* FIXME interlacing + spatial direct uses wrong colocated block positions */

        /* ref = min(neighbors) */
        /* negative (unused/unavailable) neighbour references are skipped; the
         * result is -1 if no neighbour has a non-negative reference */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            /* top-right not available (-2): fall back to top-left */
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no usable neighbour in either list: reference 0 with zero
             * vectors in both lists */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the list that ended up without a reference from the types */
        if(ref[1] < 0){
            *mb_type &= ~MB_TYPE_P0L1;
            sub_mb_type &= ~MB_TYPE_P0L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_P0L0;
            sub_mb_type &= ~MB_TYPE_P0L0;
        }

        if(IS_16X16(*mb_type)){
            int a=0, b=0;

            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
            /* if the co-located block uses reference 0 with a vector of at
             * most +-1, a list with reference 0 keeps the zero vector */
            /* NOTE(review): the x264_build test presumably works around
             * streams from old x264 versions -- confirm */
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    a= pack16to32(mv[0][0],mv[0][1]);
                if(ref[1] > 0)
                    b= pack16to32(mv[1][0],mv[1][1]);
            }else{
                a= pack16to32(mv[0][0],mv[0][1]);
                b= pack16to32(mv[1][0],mv[1][1]);
            }
            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                /* only direct predicted sub-blocks are filled in */
                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    if(IS_SUB_8X8(sub_mb_type)){
                        /* one co-located vector decides for the whole 8x8 block */
                        const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                            if(ref[1] == 0)
                                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        }
                    }else
                    /* each 4x4 block is decided by its own co-located vector */
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
        const int *dist_scale_factor = h->dist_scale_factor;

        if(FRAME_MBAFF){
            /* field macroblocks use the field variants of the tables */
            if(IS_INTERLACED(*mb_type)){
                map_col_to_list0[0] = h->map_col_to_list0_field[0];
                map_col_to_list0[1] = h->map_col_to_list0_field[1];
                dist_scale_factor = h->dist_scale_factor_field;
            }
            /* current and co-located macroblock differ in interlacing: the
             * co-located position and the vertical vector scale are adjusted */
            if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
                /* FIXME assumes direct_8x8_inference == 1 */
                const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
                int mb_types_col[2];
                int y_shift;

                *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
                         | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
                         | (*mb_type & MB_TYPE_INTERLACED);
                sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;

                if(IS_INTERLACED(*mb_type)){
                    /* frame to field scaling */
                    mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    /* for the bottom macroblock of the pair, move the
                     * co-located pointers up to the top macroblock */
                    if(s->mb_y&1){
                        l1ref0 -= 2*h->b8_stride;
                        l1ref1 -= 2*h->b8_stride;
                        l1mv0 -= 4*h->b_stride;
                        l1mv1 -= 4*h->b_stride;
                    }
                    y_shift = 0;

                    if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
                       && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x8;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }else{
                    /* field to frame scaling */
                    /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
                     * but in MBAFF, top and bottom POC are equal */
                    int dy = (s->mb_y&1) ? 1 : 2;
                    mb_types_col[0] =
                    mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                    l1ref0 += dy*h->b8_stride;
                    l1ref1 += dy*h->b8_stride;
                    l1mv0 += 2*dy*h->b_stride;
                    l1mv1 += 2*dy*h->b_stride;
                    y_shift = 2;

                    if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
                       && !is_b8x8)
                        *mb_type |= MB_TYPE_16x16;
                    else
                        *mb_type |= MB_TYPE_8x8;
                }

                for(i8=0; i8<4; i8++){
                    const int x8 = i8&1;
                    const int y8 = i8>>1;
                    int ref0, scale;
                    const int16_t (*l1mv)[2]= l1mv0;

                    if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                        continue;
                    h->sub_mb_type[i8] = sub_mb_type;

                    /* the list 1 reference is always index 0 */
                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    /* intra co-located block: reference 0 and zero vectors */
                    if(IS_INTRA(mb_types_col[y8])){
                        fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                        fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                        fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                        continue;
                    }

                    /* take the co-located list 0 reference if it is used,
                     * else the list 1 one, and map it to a list 0 index */
                    ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
                    if(ref0 >= 0)
                        ref0 = map_col_to_list0[0][ref0*2>>y_shift];
                    else{
                        ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
                        l1mv= l1mv1;
                    }
                    scale = dist_scale_factor[ref0];
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);

                    {
                        const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                        /* vertical component halved (y_shift 0) or doubled (y_shift 2) */
                        int my_col = (mv_col[1]<<y_shift)/2;
                        int mx = (scale * mv_col[0] + 128) >> 8;
                        int my = (scale * my_col + 128) >> 8;
                        /* list 0 gets the scaled vector, list 1 the scaled
                         * vector minus the co-located one */
                        fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                        fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
                    }
                }
                return;
            }
        }

        /* one-to-one mv scaling */

        if(IS_16X16(*mb_type)){
            int ref, mv0, mv1;

            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                ref=mv0=mv1=0;
            }else{
                /* co-located list 0 reference if used, else the list 1 one */
                const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
                                                : map_col_to_list0[1][l1ref1[0]];
                const int scale = dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                ref= ref0;
                mv0= pack16to32(mv_l0[0],mv_l0[1]);
                mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
            }
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
            fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
            fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, scale;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                /* intra co-located macroblock: reference 0 and zero vectors */
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = map_col_to_list0[0][ref0];
                else{
                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                scale = dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                if(IS_SUB_8X8(sub_mb_type)){
                    /* one co-located vector is scaled for the whole 8x8 block */
                    const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
                    int mx = (scale * mv_col[0] + 128) >> 8;
                    int my = (scale * mv_col[1] + 128) >> 8;
                    fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
                    fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
                }else
                /* each 4x4 block scales its own co-located vector */
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1342
1343 static inline void write_back_motion(H264Context *h, int mb_type){
1344     MpegEncContext * const s = &h->s;
1345     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1346     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1347     int list;
1348
1349     if(!USES_LIST(mb_type, 0))
1350         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1351
1352     for(list=0; list<h->list_count; list++){
1353         int y;
1354         if(!USES_LIST(mb_type, list))
1355             continue;
1356
1357         for(y=0; y<4; y++){
1358             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1359             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1360         }
1361         if( h->pps.cabac ) {
1362             if(IS_SKIP(mb_type))
1363                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1364             else
1365             for(y=0; y<4; y++){
1366                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1367                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1368             }
1369         }
1370
1371         {
1372             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1373             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1374             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1375             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1376             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1377         }
1378     }
1379
1380     if(h->slice_type == B_TYPE && h->pps.cabac){
1381         if(IS_8X8(mb_type)){
1382             uint8_t *direct_table = &h->direct_table[b8_xy];
1383             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1384             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1385             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1386         }
1387     }
1388 }
1389
1390 /**
1391  * Decodes a network abstraction layer unit.
1392  * @param consumed is the number of bytes used as input
1393  * @param length is the length of the array
1394  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1395  * @returns decoded bytes, might be src+1 if no escapes
1396  */
1397 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1398     int i, si, di;
1399     uint8_t *dst;
1400     int bufidx;
1401
1402 //    src[0]&0x80;                //forbidden bit
1403     h->nal_ref_idc= src[0]>>5;
1404     h->nal_unit_type= src[0]&0x1F;
1405
1406     src++; length--;
1407 #if 0
1408     for(i=0; i<length; i++)
1409         printf("%2X ", src[i]);
1410 #endif
1411     for(i=0; i+1<length; i+=2){
1412         if(src[i]) continue;
1413         if(i>0 && src[i-1]==0) i--;
1414         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1415             if(src[i+2]!=3){
1416                 /* startcode, so we must be past the end */
1417                 length=i;
1418             }
1419             break;
1420         }
1421     }
1422
1423     if(i>=length-1){ //no escaped 0
1424         *dst_length= length;
1425         *consumed= length+1; //+1 for the header
1426         return src;
1427     }
1428
1429     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1430     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1431     dst= h->rbsp_buffer[bufidx];
1432
1433     if (dst == NULL){
1434         return NULL;
1435     }
1436
1437 //printf("decoding esc\n");
1438     si=di=0;
1439     while(si<length){
1440         //remove escapes (very rare 1:2^22)
1441         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1442             if(src[si+2]==3){ //escape
1443                 dst[di++]= 0;
1444                 dst[di++]= 0;
1445                 si+=3;
1446                 continue;
1447             }else //next start code
1448                 break;
1449         }
1450
1451         dst[di++]= src[si++];
1452     }
1453
1454     *dst_length= di;
1455     *consumed= si + 1;//+1 for the header
1456 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1457     return dst;
1458 }
1459
1460 /**
1461  * identifies the exact end of the bitstream
1462  * @return the length of the trailing, or 0 if damaged
1463  */
1464 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1465     int v= *src;
1466     int r;
1467
1468     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1469
1470     for(r=1; r<9; r++){
1471         if(v&1) return r;
1472         v>>=1;
1473     }
1474     return 0;
1475 }
1476
1477 /**
1478  * idct tranforms the 16 dc values and dequantize them.
1479  * @param qp quantization parameter
1480  */
1481 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1482 #define stride 16
1483     int i;
1484     int temp[16]; //FIXME check if this is a good idea
1485     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1486     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1487
1488 //memset(block, 64, 2*256);
1489 //return;
1490     for(i=0; i<4; i++){
1491         const int offset= y_offset[i];
1492         const int z0= block[offset+stride*0] + block[offset+stride*4];
1493         const int z1= block[offset+stride*0] - block[offset+stride*4];
1494         const int z2= block[offset+stride*1] - block[offset+stride*5];
1495         const int z3= block[offset+stride*1] + block[offset+stride*5];
1496
1497         temp[4*i+0]= z0+z3;
1498         temp[4*i+1]= z1+z2;
1499         temp[4*i+2]= z1-z2;
1500         temp[4*i+3]= z0-z3;
1501     }
1502
1503     for(i=0; i<4; i++){
1504         const int offset= x_offset[i];
1505         const int z0= temp[4*0+i] + temp[4*2+i];
1506         const int z1= temp[4*0+i] - temp[4*2+i];
1507         const int z2= temp[4*1+i] - temp[4*3+i];
1508         const int z3= temp[4*1+i] + temp[4*3+i];
1509
1510         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1511         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1512         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1513         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1514     }
1515 }
1516
1517 #if 0
1518 /**
1519  * DCT-transforms the 16 DC values.
1520  * @param qp quantization parameter ??? FIXME
1521  */
1522 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1523 //    const int qmul= dequant_coeff[qp][0];
1524     int i;
1525     int temp[16]; //FIXME check if this is a good idea
1526     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1527     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1528
1529     for(i=0; i<4; i++){
1530         const int offset= y_offset[i];
1531         const int z0= block[offset+stride*0] + block[offset+stride*4];
1532         const int z1= block[offset+stride*0] - block[offset+stride*4];
1533         const int z2= block[offset+stride*1] - block[offset+stride*5];
1534         const int z3= block[offset+stride*1] + block[offset+stride*5];
1535
1536         temp[4*i+0]= z0+z3;
1537         temp[4*i+1]= z1+z2;
1538         temp[4*i+2]= z1-z2;
1539         temp[4*i+3]= z0-z3;
1540     }
1541
1542     for(i=0; i<4; i++){
1543         const int offset= x_offset[i];
1544         const int z0= temp[4*0+i] + temp[4*2+i];
1545         const int z1= temp[4*0+i] - temp[4*2+i];
1546         const int z2= temp[4*1+i] - temp[4*3+i];
1547         const int z3= temp[4*1+i] + temp[4*3+i];
1548
1549         block[stride*0 +offset]= (z0 + z3)>>1;
1550         block[stride*2 +offset]= (z1 + z2)>>1;
1551         block[stride*8 +offset]= (z1 - z2)>>1;
1552         block[stride*10+offset]= (z0 - z3)>>1;
1553     }
1554 }
1555 #endif
1556
1557 #undef xStride
1558 #undef stride
1559
1560 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1561     const int stride= 16*2;
1562     const int xStride= 16;
1563     int a,b,c,d,e;
1564
1565     a= block[stride*0 + xStride*0];
1566     b= block[stride*0 + xStride*1];
1567     c= block[stride*1 + xStride*0];
1568     d= block[stride*1 + xStride*1];
1569
1570     e= a-b;
1571     a= a+b;
1572     b= c-d;
1573     c= c+d;
1574
1575     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1576     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1577     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1578     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1579 }
1580
1581 #if 0
1582 static void chroma_dc_dct_c(DCTELEM *block){
1583     const int stride= 16*2;
1584     const int xStride= 16;
1585     int a,b,c,d,e;
1586
1587     a= block[stride*0 + xStride*0];
1588     b= block[stride*0 + xStride*1];
1589     c= block[stride*1 + xStride*0];
1590     d= block[stride*1 + xStride*1];
1591
1592     e= a-b;
1593     a= a+b;
1594     b= c-d;
1595     c= c+d;
1596
1597     block[stride*0 + xStride*0]= (a+c);
1598     block[stride*0 + xStride*1]= (e+b);
1599     block[stride*1 + xStride*0]= (a-c);
1600     block[stride*1 + xStride*1]= (e-b);
1601 }
1602 #endif
1603
1604 /**
1605  * gets the chroma qp.
1606  */
1607 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1608     return h->pps.chroma_qp_table[t][qscale & 0xff];
1609 }
1610
1611 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1612 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1613 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1614     int i;
1615     const int * const quant_table= quant_coeff[qscale];
1616     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1617     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1618     const unsigned int threshold2= (threshold1<<1);
1619     int last_non_zero;
1620
1621     if(separate_dc){
1622         if(qscale<=18){
1623             //avoid overflows
1624             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1625             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1626             const unsigned int dc_threshold2= (dc_threshold1<<1);
1627
1628             int level= block[0]*quant_coeff[qscale+18][0];
1629             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1630                 if(level>0){
1631                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1632                     block[0]= level;
1633                 }else{
1634                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1635                     block[0]= -level;
1636                 }
1637 //                last_non_zero = i;
1638             }else{
1639                 block[0]=0;
1640             }
1641         }else{
1642             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1643             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1644             const unsigned int dc_threshold2= (dc_threshold1<<1);
1645
1646             int level= block[0]*quant_table[0];
1647             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1648                 if(level>0){
1649                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1650                     block[0]= level;
1651                 }else{
1652                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1653                     block[0]= -level;
1654                 }
1655 //                last_non_zero = i;
1656             }else{
1657                 block[0]=0;
1658             }
1659         }
1660         last_non_zero= 0;
1661         i=1;
1662     }else{
1663         last_non_zero= -1;
1664         i=0;
1665     }
1666
1667     for(; i<16; i++){
1668         const int j= scantable[i];
1669         int level= block[j]*quant_table[j];
1670
1671 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1672 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1673         if(((unsigned)(level+threshold1))>threshold2){
1674             if(level>0){
1675                 level= (bias + level)>>QUANT_SHIFT;
1676                 block[j]= level;
1677             }else{
1678                 level= (bias - level)>>QUANT_SHIFT;
1679                 block[j]= -level;
1680             }
1681             last_non_zero = i;
1682         }else{
1683             block[j]=0;
1684         }
1685     }
1686
1687     return last_non_zero;
1688 }
1689
1690 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1691                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1692                            int src_x_offset, int src_y_offset,
1693                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1694     MpegEncContext * const s = &h->s;
1695     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1696     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1697     const int luma_xy= (mx&3) + ((my&3)<<2);
1698     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1699     uint8_t * src_cb, * src_cr;
1700     int extra_width= h->emu_edge_width;
1701     int extra_height= h->emu_edge_height;
1702     int emu=0;
1703     const int full_mx= mx>>2;
1704     const int full_my= my>>2;
1705     const int pic_width  = 16*s->mb_width;
1706     const int pic_height = 16*s->mb_height >> (MB_MBAFF || FIELD_PICTURE);
1707
1708     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1709         return;
1710
1711     if(mx&7) extra_width -= 3;
1712     if(my&7) extra_height -= 3;
1713
1714     if(   full_mx < 0-extra_width
1715        || full_my < 0-extra_height
1716        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1717        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1719             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1720         emu=1;
1721     }
1722
1723     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1724     if(!square){
1725         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1726     }
1727
1728     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1729
1730     if(MB_MBAFF || FIELD_PICTURE){
1731         // chroma offset when predicting from a field of opposite parity
1732         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
1733         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1734     }
1735     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1736     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1737
1738     if(emu){
1739         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1740             src_cb= s->edge_emu_buffer;
1741     }
1742     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1743
1744     if(emu){
1745         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1746             src_cr= s->edge_emu_buffer;
1747     }
1748     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1749 }
1750
1751 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            int list0, int list1){
1757     MpegEncContext * const s = &h->s;
1758     qpel_mc_func *qpix_op=  qpix_put;
1759     h264_chroma_mc_func chroma_op= chroma_put;
1760
1761     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1762     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1763     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1764     x_offset += 8*s->mb_x;
1765     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1766
1767     if(list0){
1768         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1769         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1770                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1771                            qpix_op, chroma_op);
1772
1773         qpix_op=  qpix_avg;
1774         chroma_op= chroma_avg;
1775     }
1776
1777     if(list1){
1778         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1779         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1780                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1781                            qpix_op, chroma_op);
1782     }
1783 }
1784
1785 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1786                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787                            int x_offset, int y_offset,
1788                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1789                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1790                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1791                            int list0, int list1){
1792     MpegEncContext * const s = &h->s;
1793
1794     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1795     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1796     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1797     x_offset += 8*s->mb_x;
1798     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1799
1800     if(list0 && list1){
1801         /* don't optimize for luma-only case, since B-frames usually
1802          * use implicit weights => chroma too. */
1803         uint8_t *tmp_cb = s->obmc_scratchpad;
1804         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1805         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1806         int refn0 = h->ref_cache[0][ scan8[n] ];
1807         int refn1 = h->ref_cache[1][ scan8[n] ];
1808
1809         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1810                     dest_y, dest_cb, dest_cr,
1811                     x_offset, y_offset, qpix_put, chroma_put);
1812         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1813                     tmp_y, tmp_cb, tmp_cr,
1814                     x_offset, y_offset, qpix_put, chroma_put);
1815
1816         if(h->use_weight == 2){
1817             int weight0 = h->implicit_weight[refn0][refn1];
1818             int weight1 = 64 - weight0;
1819             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1820             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1821             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1822         }else{
1823             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1824                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1825                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1826             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1827                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1828                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1829             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1830                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1831                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1832         }
1833     }else{
1834         int list = list1 ? 1 : 0;
1835         int refn = h->ref_cache[list][ scan8[n] ];
1836         Picture *ref= &h->ref_list[list][refn];
1837         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1838                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839                     qpix_put, chroma_put);
1840
1841         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1842                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1843         if(h->use_weight_chroma){
1844             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1845                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1846             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1847                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1848         }
1849     }
1850 }
1851
1852 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1853                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1854                            int x_offset, int y_offset,
1855                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1856                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1857                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1858                            int list0, int list1){
1859     if((h->use_weight==2 && list0 && list1
1860         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1861        || h->use_weight==1)
1862         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1863                          x_offset, y_offset, qpix_put, chroma_put,
1864                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1865     else
1866         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1867                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1868 }
1869
1870 static inline void prefetch_motion(H264Context *h, int list){
1871     /* fetch pixels for estimated mv 4 macroblocks ahead
1872      * optimized for 64byte cache lines */
1873     MpegEncContext * const s = &h->s;
1874     const int refn = h->ref_cache[list][scan8[0]];
1875     if(refn >= 0){
1876         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1877         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1878         uint8_t **src= h->ref_list[list][refn].data;
1879         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1880         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1881         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1882         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1883     }
1884 }
1885
1886 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1887                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1888                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1889                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1890     MpegEncContext * const s = &h->s;
1891     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1892     const int mb_type= s->current_picture.mb_type[mb_xy];
1893
1894     assert(IS_INTER(mb_type));
1895
1896     prefetch_motion(h, 0);
1897
1898     if(IS_16X16(mb_type)){
1899         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1900                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1901                 &weight_op[0], &weight_avg[0],
1902                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1903     }else if(IS_16X8(mb_type)){
1904         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1905                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1906                 &weight_op[1], &weight_avg[1],
1907                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1908         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1909                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1910                 &weight_op[1], &weight_avg[1],
1911                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1912     }else if(IS_8X16(mb_type)){
1913         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1914                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1915                 &weight_op[2], &weight_avg[2],
1916                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1917         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1918                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1919                 &weight_op[2], &weight_avg[2],
1920                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1921     }else{
1922         int i;
1923
1924         assert(IS_8X8(mb_type));
1925
1926         for(i=0; i<4; i++){
1927             const int sub_mb_type= h->sub_mb_type[i];
1928             const int n= 4*i;
1929             int x_offset= (i&1)<<2;
1930             int y_offset= (i&2)<<1;
1931
1932             if(IS_SUB_8X8(sub_mb_type)){
1933                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1934                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1935                     &weight_op[3], &weight_avg[3],
1936                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1937             }else if(IS_SUB_8X4(sub_mb_type)){
1938                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1939                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1940                     &weight_op[4], &weight_avg[4],
1941                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1942                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1943                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1944                     &weight_op[4], &weight_avg[4],
1945                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1946             }else if(IS_SUB_4X8(sub_mb_type)){
1947                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1948                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1949                     &weight_op[5], &weight_avg[5],
1950                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1951                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1952                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1953                     &weight_op[5], &weight_avg[5],
1954                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1955             }else{
1956                 int j;
1957                 assert(IS_SUB_4X4(sub_mb_type));
1958                 for(j=0; j<4; j++){
1959                     int sub_x_offset= x_offset + 2*(j&1);
1960                     int sub_y_offset= y_offset +   (j&2);
1961                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1962                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1963                         &weight_op[6], &weight_avg[6],
1964                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1965                 }
1966             }
1967         }
1968     }
1969
1970     prefetch_motion(h, 1);
1971 }
1972
1973 static void decode_init_vlc(void){
1974     static int done = 0;
1975
1976     if (!done) {
1977         int i;
1978         done = 1;
1979
1980         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1981                  &chroma_dc_coeff_token_len [0], 1, 1,
1982                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1983
1984         for(i=0; i<4; i++){
1985             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1986                      &coeff_token_len [i][0], 1, 1,
1987                      &coeff_token_bits[i][0], 1, 1, 1);
1988         }
1989
1990         for(i=0; i<3; i++){
1991             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1992                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1993                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1994         }
1995         for(i=0; i<15; i++){
1996             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1997                      &total_zeros_len [i][0], 1, 1,
1998                      &total_zeros_bits[i][0], 1, 1, 1);
1999         }
2000
2001         for(i=0; i<6; i++){
2002             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2003                      &run_len [i][0], 1, 1,
2004                      &run_bits[i][0], 1, 1, 1);
2005         }
2006         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2007                  &run_len [6][0], 1, 1,
2008                  &run_bits[6][0], 1, 1, 1);
2009     }
2010 }
2011
2012 static void free_tables(H264Context *h){
2013     int i;
2014     H264Context *hx;
2015     av_freep(&h->intra4x4_pred_mode);
2016     av_freep(&h->chroma_pred_mode_table);
2017     av_freep(&h->cbp_table);
2018     av_freep(&h->mvd_table[0]);
2019     av_freep(&h->mvd_table[1]);
2020     av_freep(&h->direct_table);
2021     av_freep(&h->non_zero_count);
2022     av_freep(&h->slice_table_base);
2023     h->slice_table= NULL;
2024
2025     av_freep(&h->mb2b_xy);
2026     av_freep(&h->mb2b8_xy);
2027
2028     for(i = 0; i < MAX_SPS_COUNT; i++)
2029         av_freep(h->sps_buffers + i);
2030
2031     for(i = 0; i < MAX_PPS_COUNT; i++)
2032         av_freep(h->pps_buffers + i);
2033
2034     for(i = 0; i < h->s.avctx->thread_count; i++) {
2035         hx = h->thread_context[i];
2036         if(!hx) continue;
2037         av_freep(&hx->top_borders[1]);
2038         av_freep(&hx->top_borders[0]);
2039         av_freep(&hx->s.obmc_scratchpad);
2040         av_freep(&hx->s.allocated_edge_emu_buffer);
2041     }
2042 }
2043
2044 static void init_dequant8_coeff_table(H264Context *h){
2045     int i,q,x;
2046     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2047     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2048     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2049
2050     for(i=0; i<2; i++ ){
2051         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2052             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2053             break;
2054         }
2055
2056         for(q=0; q<52; q++){
2057             int shift = ff_div6[q];
2058             int idx = ff_rem6[q];
2059             for(x=0; x<64; x++)
2060                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2061                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2062                     h->pps.scaling_matrix8[i][x]) << shift;
2063         }
2064     }
2065 }
2066
2067 static void init_dequant4_coeff_table(H264Context *h){
2068     int i,j,q,x;
2069     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2070     for(i=0; i<6; i++ ){
2071         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2072         for(j=0; j<i; j++){
2073             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2074                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2075                 break;
2076             }
2077         }
2078         if(j<i)
2079             continue;
2080
2081         for(q=0; q<52; q++){
2082             int shift = ff_div6[q] + 2;
2083             int idx = ff_rem6[q];
2084             for(x=0; x<16; x++)
2085                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2086                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2087                     h->pps.scaling_matrix4[i][x]) << shift;
2088         }
2089     }
2090 }
2091
2092 static void init_dequant_tables(H264Context *h){
2093     int i,x;
2094     init_dequant4_coeff_table(h);
2095     if(h->pps.transform_8x8_mode)
2096         init_dequant8_coeff_table(h);
2097     if(h->sps.transform_bypass){
2098         for(i=0; i<6; i++)
2099             for(x=0; x<16; x++)
2100                 h->dequant4_coeff[i][0][x] = 1<<6;
2101         if(h->pps.transform_8x8_mode)
2102             for(i=0; i<2; i++)
2103                 for(x=0; x<64; x++)
2104                     h->dequant8_coeff[i][0][x] = 1<<6;
2105     }
2106 }
2107
2108
2109 /**
2110  * allocates tables.
2111  * needs width/height
2112  */
2113 static int alloc_tables(H264Context *h){
2114     MpegEncContext * const s = &h->s;
2115     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2116     int x,y;
2117
2118     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2119
2120     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2121     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2122     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2123
2124     if( h->pps.cabac ) {
2125         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2126         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2127         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2128         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2129     }
2130
2131     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2132     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2133
2134     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2135     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136     for(y=0; y<s->mb_height; y++){
2137         for(x=0; x<s->mb_width; x++){
2138             const int mb_xy= x + y*s->mb_stride;
2139             const int b_xy = 4*x + 4*y*h->b_stride;
2140             const int b8_xy= 2*x + 2*y*h->b8_stride;
2141
2142             h->mb2b_xy [mb_xy]= b_xy;
2143             h->mb2b8_xy[mb_xy]= b8_xy;
2144         }
2145     }
2146
2147     s->obmc_scratchpad = NULL;
2148
2149     if(!h->dequant4_coeff[0])
2150         init_dequant_tables(h);
2151
2152     return 0;
2153 fail:
2154     free_tables(h);
2155     return -1;
2156 }
2157
2158 /**
2159  * Mimic alloc_tables(), but for every context thread.
2160  */
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2163     dst->non_zero_count           = src->non_zero_count;
2164     dst->slice_table              = src->slice_table;
2165     dst->cbp_table                = src->cbp_table;
2166     dst->mb2b_xy                  = src->mb2b_xy;
2167     dst->mb2b8_xy                 = src->mb2b8_xy;
2168     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2169     dst->mvd_table[0]             = src->mvd_table[0];
2170     dst->mvd_table[1]             = src->mvd_table[1];
2171     dst->direct_table             = src->direct_table;
2172
2173     dst->s.obmc_scratchpad = NULL;
2174     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2175 }
2176
2177 /**
2178  * Init context
2179  * Allocate buffers which are not shared amongst multiple threads.
2180  */
2181 static int context_init(H264Context *h){
2182     MpegEncContext * const s = &h->s;
2183
2184     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2185     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2186
2187     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2188     CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2189                    (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2190     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
2191     return 0;
2192 fail:
2193     return -1; // free_tables will clean up for us
2194 }
2195
2196 static void common_init(H264Context *h){
2197     MpegEncContext * const s = &h->s;
2198
2199     s->width = s->avctx->width;
2200     s->height = s->avctx->height;
2201     s->codec_id= s->avctx->codec->id;
2202
2203     ff_h264_pred_init(&h->hpc, s->codec_id);
2204
2205     h->dequant_coeff_pps= -1;
2206     s->unrestricted_mv=1;
2207     s->decode=1; //FIXME
2208
2209     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2210     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2211 }
2212
2213 static int decode_init(AVCodecContext *avctx){
2214     H264Context *h= avctx->priv_data;
2215     MpegEncContext * const s = &h->s;
2216
2217     MPV_decode_defaults(s);
2218
2219     s->avctx = avctx;
2220     common_init(h);
2221
2222     s->out_format = FMT_H264;
2223     s->workaround_bugs= avctx->workaround_bugs;
2224
2225     // set defaults
2226 //    s->decode_mb= ff_h263_decode_mb;
2227     s->quarter_sample = 1;
2228     s->low_delay= 1;
2229     avctx->pix_fmt= PIX_FMT_YUV420P;
2230
2231     decode_init_vlc();
2232
2233     if(avctx->extradata_size > 0 && avctx->extradata &&
2234        *(char *)avctx->extradata == 1){
2235         h->is_avc = 1;
2236         h->got_avcC = 0;
2237     } else {
2238         h->is_avc = 0;
2239     }
2240
2241     h->thread_context[0] = h;
2242     return 0;
2243 }
2244
2245 static int frame_start(H264Context *h){
2246     MpegEncContext * const s = &h->s;
2247     int i;
2248
2249     if(MPV_frame_start(s, s->avctx) < 0)
2250         return -1;
2251     ff_er_frame_start(s);
2252     /*
2253      * MPV_frame_start uses pict_type to derive key_frame.
2254      * This is incorrect for H.264; IDR markings must be used.
2255      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2256      * See decode_nal_units().
2257      */
2258     s->current_picture_ptr->key_frame= 0;
2259
2260     assert(s->linesize && s->uvlinesize);
2261
2262     for(i=0; i<16; i++){
2263         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2265     }
2266     for(i=0; i<4; i++){
2267         h->block_offset[16+i]=
2268         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269         h->block_offset[24+16+i]=
2270         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2271     }
2272
2273     /* can't be in alloc_tables because linesize isn't known there.
2274      * FIXME: redo bipred weight to not require extra buffer? */
2275     for(i = 0; i < s->avctx->thread_count; i++)
2276         if(!h->thread_context[i]->s.obmc_scratchpad)
2277             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2278
2279     /* some macroblocks will be accessed before they're available */
2280     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2282
2283 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2284     return 0;
2285 }
2286
2287 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2288     MpegEncContext * const s = &h->s;
2289     int i;
2290
2291     src_y  -=   linesize;
2292     src_cb -= uvlinesize;
2293     src_cr -= uvlinesize;
2294
2295     // There are two lines saved, the line above the the top macroblock of a pair,
2296     // and the line above the bottom macroblock
2297     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2298     for(i=1; i<17; i++){
2299         h->left_border[i]= src_y[15+i*  linesize];
2300     }
2301
2302     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2303     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2304
2305     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2306         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2307         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2308         for(i=1; i<9; i++){
2309             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2310             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2311         }
2312         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2313         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2314     }
2315 }
2316
/**
 * Exchanges the samples left of and above the macroblock at
 * src_y/src_cb/src_cr with the copies kept in h->left_border and
 * h->top_borders[0].
 * @param xchg passed to XCHG() for most exchanges: nonzero swaps the
 *             saved border with the picture samples, zero only writes the
 *             saved border into the picture. A few exchanges below always
 *             pass 1 and therefore always swap.
 * @param simple when nonzero the chroma planes are always processed,
 *               without looking at CODEC_FLAG_GRAY
 */
2317 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2318     MpegEncContext * const s = &h->s;
2319     int temp8, i;
2320     uint64_t temp64;
2321     int deblock_left;
2322     int deblock_top;
2323     int mb_xy;
2324
         // with deblocking_filter == 2 a neighbour only counts if slice_table
         // holds the same value for it as for the current macroblock;
         // otherwise any neighbour inside the picture counts
2325     if(h->deblocking_filter == 2) {
2326         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2327         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2328         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2329     } else {
2330         deblock_left = (s->mb_x > 0);
2331         deblock_top =  (s->mb_y > 0);
2332     }
2333
         // move to the sample diagonally above-left of the macroblock
2334     src_y  -=   linesize + 1;
2335     src_cb -= uvlinesize + 1;
2336     src_cr -= uvlinesize + 1;
2337
     // XCHG: t = a; a = b (only if xchg); b = t
2338 #define XCHG(a,b,t,xchg)\
2339 t= a;\
2340 if(xchg)\
2341     a= b;\
2342 b= t;
2343
2344     if(deblock_left){
             // i == 0 is the corner sample; it is skipped when the top is not used
2345         for(i = !deblock_top; i<17; i++){
2346             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2347         }
2348     }
2349
2350     if(deblock_top){
             // 16 samples above the macroblock, plus 8 above the next macroblock
             // to the right when there is one
2351         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2352         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2353         if(s->mb_x+1 < s->mb_width){
2354             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2355         }
2356     }
2357
2358     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2359         if(deblock_left){
2360             for(i = !deblock_top; i<9; i++){
2361                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2362                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2363             }
2364         }
2365         if(deblock_top){
2366             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2367             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2368         }
2369     }
2370 }
2371
2372 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2373     MpegEncContext * const s = &h->s;
2374     int i;
2375
2376     src_y  -= 2 *   linesize;
2377     src_cb -= 2 * uvlinesize;
2378     src_cr -= 2 * uvlinesize;
2379
2380     // There are two lines saved, the line above the the top macroblock of a pair,
2381     // and the line above the bottom macroblock
2382     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2383     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2384     for(i=2; i<34; i++){
2385         h->left_border[i]= src_y[15+i*  linesize];
2386     }
2387
2388     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2389     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2390     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2391     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2392
2393     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2394         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2395         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2396         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2397         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2398         for(i=2; i<18; i++){
2399             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2400             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2401         }
2402         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2403         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2404         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2405         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2406     }
2407 }
2408
/**
 * Macroblock pair version of xchg_mb_border(): exchanges the samples left
 * of and in the two lines above the pair at src_y/src_cb/src_cr with the
 * copies kept in h->left_border and h->top_borders[0..1].
 * @param xchg passed to XCHG() for most exchanges: nonzero swaps the
 *             saved border with the picture samples, zero only writes the
 *             saved border into the picture. A few exchanges below always
 *             pass 1 and therefore always swap.
 */
2409 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2410     MpegEncContext * const s = &h->s;
2411     int temp8, i;
2412     uint64_t temp64;
         // unlike xchg_mb_border() only the position in the picture is checked
2413     int deblock_left = (s->mb_x > 0);
2414     int deblock_top  = (s->mb_y > 1);
2415
2416     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2417
         // move two lines up and one sample left of the pair
2418     src_y  -= 2 *   linesize + 1;
2419     src_cb -= 2 * uvlinesize + 1;
2420     src_cr -= 2 * uvlinesize + 1;
2421
     // XCHG: t = a; a = b (only if xchg); b = t
2422 #define XCHG(a,b,t,xchg)\
2423 t= a;\
2424 if(xchg)\
2425     a= b;\
2426 b= t;
2427
2428     if(deblock_left){
             // the first two entries are the corner samples; they are skipped
             // when the top is not used
2429         for(i = (!deblock_top)<<1; i<34; i++){
2430             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2431         }
2432     }
2433
2434     if(deblock_top){
             // both saved lines: top_borders[0] against the upper line,
             // top_borders[1] against the line directly above the pair
2435         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2436         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2437         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2438         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2439         if(s->mb_x+1 < s->mb_width){
2440             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2441             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2442         }
2443     }
2444
2445     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2446         if(deblock_left){
2447             for(i = (!deblock_top) << 1; i<18; i++){
2448                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2449                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2450             }
2451         }
2452         if(deblock_top){
2453             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2454             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2455             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2456             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2457         }
2458     }
2459 }
2460
/**
 * Reconstructs the macroblock at (s->mb_x, s->mb_y) in s->current_picture:
 * intra prediction or motion compensation, then the residual is added,
 * and finally the macroblock (or macroblock pair) is deblocked when
 * h->deblocking_filter is set.
 * @param simple when nonzero, every check for MBAFF/field macroblocks,
 *               intra PCM, CODEC_FLAG_GRAY, s->encoding and non-H.264
 *               codec ids below is short-circuited, so those paths are
 *               never taken; see hl_decode_mb() for the dispatch condition
 */
2461 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2462     MpegEncContext * const s = &h->s;
2463     const int mb_x= s->mb_x;
2464     const int mb_y= s->mb_y;
2465     const int mb_xy= mb_x + mb_y*s->mb_stride;
2466     const int mb_type= s->current_picture.mb_type[mb_xy];
2467     uint8_t  *dest_y, *dest_cb, *dest_cr;
2468     int linesize, uvlinesize /*dct_offset*/;
2469     int i;
2470     int *block_offset = &h->block_offset[0];
2471     const unsigned int bottom = mb_y & 1;
2472     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2473     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2474     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2475
         // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
2476     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2477     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2478     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2479
2480     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2481     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2482
2483     if (!simple && MB_FIELD) {
             // field macroblock: use doubled line sizes and the second half of
             // block_offset[], which frame_start() fills for doubled line steps
2484         linesize   = h->mb_linesize   = s->linesize * 2;
2485         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2486         block_offset = &h->block_offset[24];
2487         if(mb_y&1){ //FIXME move out of this func?
                 // odd macroblock row: step back so that the macroblock starts
                 // one picture line below the start of the row above
2488             dest_y -= s->linesize*15;
2489             dest_cb-= s->uvlinesize*7;
2490             dest_cr-= s->uvlinesize*7;
2491         }
2492         if(FRAME_MBAFF) {
2493             int list;
                 // rewrite the cached reference indices of every used list to
                 // (16 + ref) ^ (mb_y & 1); note that + binds tighter than ^
2494             for(list=0; list<h->list_count; list++){
2495                 if(!USES_LIST(mb_type, list))
2496                     continue;
2497                 if(IS_16X16(mb_type)){
2498                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2499                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
2500                 }else{
2501                     for(i=0; i<16; i+=4){
2502                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2503                         int ref = h->ref_cache[list][scan8[i]];
2504                         if(ref >= 0)
2505                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
2506                     }
2507                 }
2508             }
2509         }
2510     } else {
2511         linesize   = h->mb_linesize   = s->linesize;
2512         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2513 //        dct_offset = s->linesize * 16;
2514     }
2515
         // pick the functions that add the luma residual: plain pixel addition
         // for transform bypass, otherwise the 8x8 or 4x4 idct
2516     if(transform_bypass){
2517         idct_dc_add =
2518         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2519     }else if(IS_8x8DCT(mb_type)){
2520         idct_dc_add = s->dsp.h264_idct8_dc_add;
2521         idct_add = s->dsp.h264_idct8_add;
2522     }else{
2523         idct_dc_add = s->dsp.h264_idct_dc_add;
2524         idct_add = s->dsp.h264_idct_add;
2525     }
2526
         // MBAFF with deblocking: swap in the saved pair borders before intra
         // prediction; done for the top macroblock, or for the bottom one when
         // the macroblock above it is not intra
2527     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2528        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2529         int mbt_y = mb_y&~1;
2530         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2531         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2532         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2533         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2534     }
2535
2536     if (!simple && IS_INTRA_PCM(mb_type)) {
2537         unsigned int x, y;
2538
2539         // The pixels are stored in h->mb array in the same order as levels,
2540         // copy them in output in the correct order.
2541         for(i=0; i<16; i++) {
2542             for (y=0; y<4; y++) {
2543                 for (x=0; x<4; x++) {
2544                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2545                 }
2546             }
2547         }
2548         for(i=16; i<16+4; i++) {
2549             for (y=0; y<4; y++) {
2550                 for (x=0; x<4; x++) {
2551                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2552                 }
2553             }
2554         }
2555         for(i=20; i<20+4; i++) {
2556             for (y=0; y<4; y++) {
2557                 for (x=0; x<4; x++) {
2558                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2559                 }
2560             }
2561         }
2562     } else {
2563         if(IS_INTRA(mb_type)){
                 // non-MBAFF with deblocking: swap in the saved borders for the
                 // duration of intra prediction (undone with xchg=0 further down)
2564             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2565                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2566
2567             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2568                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2569                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2570             }
2571
2572             if(IS_INTRA4x4(mb_type)){
2573                 if(simple || !s->encoding){
2574                     if(IS_8x8DCT(mb_type)){
                             // 8x8 blocks: predict, then add the residual block by block
2575                         for(i=0; i<16; i+=4){
2576                             uint8_t * const ptr= dest_y + block_offset[i];
2577                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2578                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2579                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2580                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2581                             if(nnz){
2582                                 if(nnz == 1 && h->mb[i*16])
2583                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2584                                 else
2585                                     idct_add(ptr, h->mb + i*16, linesize);
2586                             }
2587                         }
2588                     }else
2589                     for(i=0; i<16; i++){
2590                         uint8_t * const ptr= dest_y + block_offset[i];
2591                         uint8_t *topright;
2592                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2593                         int nnz, tr;
2594
                             // only these two prediction modes get a top-right pointer
2595                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2596                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2597                             assert(mb_y || linesize <= block_offset[i]);
2598                             if(!topright_avail){
                                     // not available: use the last sample of the row above
                                     // the block, replicated into all 4 bytes of tr
2599                                 tr= ptr[3 - linesize]*0x01010101;
2600                                 topright= (uint8_t*) &tr;
2601                             }else
2602                                 topright= ptr + 4 - linesize;
2603                         }else
2604                             topright= NULL;
2605
2606                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2607                         nnz = h->non_zero_count_cache[ scan8[i] ];
2608                         if(nnz){
2609                             if(is_h264){
2610                                 if(nnz == 1 && h->mb[i*16])
2611                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2612                                 else
2613                                     idct_add(ptr, h->mb + i*16, linesize);
2614                             }else
2615                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2616                         }
2617                     }
2618                 }
2619             }else{
                     // not intra 4x4: predict the whole 16x16 block; the luma DC
                     // coefficients are dequantized/transformed here, the residual
                     // itself is added in the !IS_INTRA4x4 section below
2620                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2621                 if(is_h264){
2622                     if(!transform_bypass)
2623                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2624                 }else
2625                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2626             }
2627             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2628                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2629         }else if(is_h264){
2630             hl_motion(h, dest_y, dest_cb, dest_cr,
2631                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2632                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2633                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2634         }
2635
2636
             // luma residual for everything except intra 4x4, which was handled
             // together with its prediction above
2637         if(!IS_INTRA4x4(mb_type)){
2638             if(is_h264){
2639                 if(IS_INTRA16x16(mb_type)){
2640                     for(i=0; i<16; i++){
2641                         if(h->non_zero_count_cache[ scan8[i] ])
2642                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2643                         else if(h->mb[i*16])
2644                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2645                     }
2646                 }else{
                         // step through the 4x4 block indices in units of 4 for the 8x8 transform
2647                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2648                     for(i=0; i<16; i+=di){
2649                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2650                         if(nnz){
2651                             if(nnz==1 && h->mb[i*16])
2652                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2653                             else
2654                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2655                         }
2656                     }
2657                 }
2658             }else{
2659                 for(i=0; i<16; i++){
2660                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2661                         uint8_t * const ptr= dest_y + block_offset[i];
2662                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2663                     }
2664                 }
2665             }
2666         }
2667
             // chroma residual; blocks 16..19 go to cb and 20..23 to cr,
             // selected through (i&4)>>2
2668         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2669             uint8_t *dest[2] = {dest_cb, dest_cr};
2670             if(transform_bypass){
2671                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2672             }else{
2673                 idct_add = s->dsp.h264_idct_add;
2674                 idct_dc_add = s->dsp.h264_idct_dc_add;
                     // dequant table index: 1/2 for intra, 4/5 otherwise (first/second chroma plane)
2675                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2676                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2677             }
2678             if(is_h264){
2679                 for(i=16; i<16+8; i++){
2680                     if(h->non_zero_count_cache[ scan8[i] ])
2681                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2682                     else if(h->mb[i*16])
2683                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2684                 }
2685             }else{
2686                 for(i=16; i<16+8; i++){
2687                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2688                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2689                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2690                     }
2691                 }
2692             }
2693         }
2694     }
2695     if(h->deblocking_filter) {
2696         if (!simple && FRAME_MBAFF) {
2697             //FIXME try deblocking one mb at a time?
2698             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 // NOTE: mb_y and mb_xy declared here shadow the outer ones and
                 // refer to the macroblock row above the current one
2699             const int mb_y = s->mb_y - 1;
2700             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2701             const int mb_xy= mb_x + mb_y*s->mb_stride;
2702             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2703             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                 // the pair is only filtered once its bottom macroblock has been reconstructed
2704             if (!bottom) return;
2705             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2706             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2707             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2708
2709             if(IS_INTRA(mb_type_top | mb_type_bottom))
2710                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2711
2712             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2713             // deblock a pair
2714             // top
                 // s->mb_y is temporarily moved to the top macroblock for fill_caches()
2715             s->mb_y--;
2716             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2717             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2718             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2719             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2720             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2721             // bottom
2722             s->mb_y++;
2723             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2724             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2725             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2726             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2727             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2728         } else {
2729             tprintf(h->s.avctx, "call filter_mb\n");
                 // save the unfiltered border samples before the filter modifies them
2730             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2731             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2732             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2733         }
2734     }
2735 }
2736
2737 /**
2738  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2739  */
2740 static void hl_decode_mb_simple(H264Context *h){
2741     hl_decode_mb_internal(h, 1);
2742 }
2743
2744 /**
2745  * Process a macroblock; this handles edge cases, such as interlacing.
2746  */
2747 static void av_noinline hl_decode_mb_complex(H264Context *h){
2748     hl_decode_mb_internal(h, 0);
2749 }
2750
2751 static void hl_decode_mb(H264Context *h){
2752     MpegEncContext * const s = &h->s;
2753     const int mb_x= s->mb_x;
2754     const int mb_y= s->mb_y;
2755     const int mb_xy= mb_x + mb_y*s->mb_stride;
2756     const int mb_type= s->current_picture.mb_type[mb_xy];
2757     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2758
2759     if(!s->decode)
2760         return;
2761
2762     if (is_complex)
2763         hl_decode_mb_complex(h);
2764     else hl_decode_mb_simple(h);
2765 }
2766
2767 static void pic_as_field(Picture *pic, const int bottom){
2768     int i;
2769     for (i = 0; i < 4; ++i) {
2770         if (bottom)
2771             pic->data[i] += pic->linesize[i];
2772         pic->linesize[i] *= 2;
2773     }
2774 }
2775
2776 static int split_field_copy(Picture *dest, Picture *src,
2777                             int parity, int id_add){
2778     int match = !!(src->reference & parity);
2779
2780     if (match) {
2781         *dest = *src;
2782         pic_as_field(dest, parity == PICT_BOTTOM_FIELD);
2783         dest->pic_id *= 2;
2784         dest->pic_id += id_add;
2785     }
2786
2787     return match;
2788 }
2789
2790 /**
2791  * Split one reference list into field parts, interleaving by parity
2792  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2793  * set to look at the actual start of data for that field.
2794  *
2795  * @param dest output list
2796  * @param dest_len maximum number of fields to put in dest
2797  * @param src the source reference list containing fields and/or field pairs
2798  *            (aka short_ref/long_ref, or
2799  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2800  * @param src_len number of Picture's in source (pairs and unmatched fields)
2801  * @param parity the parity of the picture being decoded/needing
2802  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2803  * @return number of fields placed in dest
2804  */
2805 static int split_field_half_ref_list(Picture *dest, int dest_len,
2806                                      Picture *src,  int src_len,  int parity){
2807     int same_parity   = 1;
2808     int same_i        = 0;
2809     int opp_i         = 0;
2810     int out_i;
2811     int field_output;
2812
2813     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2814         if (same_parity && same_i < src_len) {
2815             field_output = split_field_copy(dest + out_i, src + same_i,
2816                                             parity, 1);
2817             same_parity = !field_output;
2818             same_i++;
2819
2820         } else if (opp_i < src_len) {
2821             field_output = split_field_copy(dest + out_i, src + opp_i,
2822                                             PICT_FRAME - parity, 0);
2823             same_parity = field_output;
2824             opp_i++;
2825
2826         } else {
2827             break;
2828         }
2829     }
2830
2831     return out_i;
2832 }
2833
2834 /**
2835  * Split the reference frame list into a reference field list.
2836  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2837  * The input list contains both reference field pairs and
2838  * unmatched reference fields; it is ordered as spec describes
2839  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2840  * unmatched field pairs are also present. Conceptually this is equivalent
2841  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2842  *
2843  * @param dest output reference list where ordered fields are to be placed
2844  * @param dest_len max number of fields to place at dest
2845  * @param src source reference list, as described above
2846  * @param src_len number of pictures (pairs and unmatched fields) in src
2847  * @param parity parity of field being currently decoded
2848  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2849  * @param long_i index into src array that holds first long reference picture,
2850  *        or src_len if no long refs present.
2851  */
2852 static int split_field_ref_list(Picture *dest, int dest_len,
2853                                 Picture *src,  int src_len,
2854                                 int parity,    int long_i){
2855
2856     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2857     dest += i;
2858     dest_len -= i;
2859
2860     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2861                                    src_len - long_i, parity);
2862     return i;
2863 }
2864
2865 /**
2866  * fills the default_ref_list.
2867  */
2868 static int fill_default_ref_list(H264Context *h){
2869     MpegEncContext * const s = &h->s;
2870     int i;
2871     int smallest_poc_greater_than_current = -1;
2872     int structure_sel;
2873     Picture sorted_short_ref[32];
2874     Picture field_entry_list[2][32];
2875     Picture *frame_list[2];
2876
2877     if (FIELD_PICTURE) {
2878         structure_sel = PICT_FRAME;
2879         frame_list[0] = field_entry_list[0];
2880         frame_list[1] = field_entry_list[1];
2881     } else {
2882         structure_sel = 0;
2883         frame_list[0] = h->default_ref_list[0];
2884         frame_list[1] = h->default_ref_list[1];
2885     }
2886
2887     if(h->slice_type==B_TYPE){
2888         int list;
2889         int len[2];
2890         int short_len[2];
2891         int out_i;
2892         int limit= INT_MIN;
2893
2894         /* sort frame according to poc in B slice */
2895         for(out_i=0; out_i<h->short_ref_count; out_i++){
2896             int best_i=INT_MIN;
2897             int best_poc=INT_MAX;
2898
2899             for(i=0; i<h->short_ref_count; i++){
2900                 const int poc= h->short_ref[i]->poc;
2901                 if(poc > limit && poc < best_poc){
2902                     best_poc= poc;
2903                     best_i= i;
2904                 }
2905             }
2906
2907             assert(best_i != INT_MIN);
2908
2909             limit= best_poc;
2910             sorted_short_ref[out_i]= *h->short_ref[best_i];
2911             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2912             if (-1 == smallest_poc_greater_than_current) {
2913                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2914                     smallest_poc_greater_than_current = out_i;
2915                 }
2916             }
2917         }
2918
2919         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2920
2921         // find the largest poc
2922         for(list=0; list<2; list++){
2923             int index = 0;
2924             int j= -99;
2925             int step= list ? -1 : 1;
2926
2927             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2928                 int sel;
2929                 while(j<0 || j>= h->short_ref_count){
2930                     if(j != -99 && step == (list ? -1 : 1))
2931                         return -1;
2932                     step = -step;
2933                     j= smallest_poc_greater_than_current + (step>>1);
2934                 }
2935                 sel = sorted_short_ref[j].reference | structure_sel;
2936                 if(sel != PICT_FRAME) continue;
2937                 frame_list[list][index  ]= sorted_short_ref[j];
2938                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2939             }
2940             short_len[list] = index;
2941
2942             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2943                 int sel;
2944                 if(h->long_ref[i] == NULL) continue;
2945                 sel = h->long_ref[i]->reference | structure_sel;
2946                 if(sel != PICT_FRAME) continue;
2947
2948                 frame_list[ list ][index  ]= *h->long_ref[i];
2949                 frame_list[ list ][index++].pic_id= i;;
2950             }
2951             len[list] = index;
2952
2953             if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2954                 // swap the two first elements of L1 when
2955                 // L0 and L1 are identical
2956                 Picture temp= frame_list[1][0];
2957                 frame_list[1][0] = frame_list[1][1];
2958                 frame_list[1][1] = temp;
2959             }
2960
2961         }
2962
2963         for(list=0; list<2; list++){
2964             if (FIELD_PICTURE)
2965                 len[list] = split_field_ref_list(h->default_ref_list[list],
2966                                                  h->ref_count[list],
2967                                                  frame_list[list],
2968                                                  len[list],
2969                                                  s->picture_structure,
2970                                                  short_len[list]);
2971
2972             if(len[list] < h->ref_count[ list ])
2973                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2974         }
2975
2976
2977     }else{
2978         int index=0;
2979         int short_len;
2980         for(i=0; i<h->short_ref_count; i++){
2981             int sel;
2982             sel = h->short_ref[i]->reference | structure_sel;
2983             if(sel != PICT_FRAME) continue;
2984             frame_list[0][index  ]= *h->short_ref[i];
2985             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2986         }
2987         short_len = index;
2988         for(i = 0; i < 16; i++){
2989             int sel;
2990             if(h->long_ref[i] == NULL) continue;
2991             sel = h->long_ref[i]->reference | structure_sel;
2992             if(sel != PICT_FRAME) continue;
2993             frame_list[0][index  ]= *h->long_ref[i];
2994             frame_list[0][index++].pic_id= i;;
2995         }
2996
2997         if (FIELD_PICTURE)
2998             index = split_field_ref_list(h->default_ref_list[0],
2999                                          h->ref_count[0], frame_list[0],
3000                                          index, s->picture_structure,
3001                                          short_len);
3002
3003         if(index < h->ref_count[0])
3004             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3005     }
3006 #ifdef TRACE
3007     for (i=0; i<h->ref_count[0]; i++) {
3008         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3009     }
3010     if(h->slice_type==B_TYPE){
3011         for (i=0; i<h->ref_count[1]; i++) {
3012             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3013         }
3014     }
3015 #endif
3016     return 0;
3017 }
3018
3019 static void print_short_term(H264Context *h);
3020 static void print_long_term(H264Context *h);
3021
3022 /**
3023  * Extract structure information about the picture described by pic_num in
3024  * the current decoding context (frame or field). Note that pic_num is
3025  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
3026  * @param pic_num picture number for which to extract structure information
3027  * @param structure one of PICT_XXX describing structure of picture
3028  *                      with pic_num
3029  * @return frame number (short term) or long term index of picture
3030  *         described by pic_num
3031  */
3032 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3033     MpegEncContext * const s = &h->s;
3034
3035     *structure = s->picture_structure;
3036     if(FIELD_PICTURE){
3037         if (!(pic_num & 1))
3038             /* opposite field */
3039             *structure ^= PICT_FRAME;
3040         pic_num >>= 1;
3041     }
3042
3043     return pic_num;
3044 }
3045
3046 static int decode_ref_pic_list_reordering(H264Context *h){
3047     MpegEncContext * const s = &h->s;
3048     int list, index, pic_structure;
3049
3050     print_short_term(h);
3051     print_long_term(h);
3052     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3053
3054     for(list=0; list<h->list_count; list++){
3055         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3056
3057         if(get_bits1(&s->gb)){
3058             int pred= h->curr_pic_num;
3059
3060             for(index=0; ; index++){
3061                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3062                 unsigned int pic_id;
3063                 int i;
3064                 Picture *ref = NULL;
3065
3066                 if(reordering_of_pic_nums_idc==3)
3067                     break;
3068
3069                 if(index >= h->ref_count[list]){
3070                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3071                     return -1;
3072                 }
3073
3074                 if(reordering_of_pic_nums_idc<3){
3075                     if(reordering_of_pic_nums_idc<2){
3076                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3077                         int frame_num;
3078
3079                         if(abs_diff_pic_num >= h->max_pic_num){
3080                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3081                             return -1;
3082                         }
3083
3084                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3085                         else                                pred+= abs_diff_pic_num;
3086                         pred &= h->max_pic_num - 1;
3087
3088                         frame_num = pic_num_extract(h, pred, &pic_structure);
3089
3090                         for(i= h->short_ref_count-1; i>=0; i--){
3091                             ref = h->short_ref[i];
3092                             assert(ref->reference);
3093                             assert(!ref->long_ref);
3094                             if(ref->data[0] != NULL &&
3095                                    ref->frame_num == frame_num &&
3096                                    (ref->reference & pic_structure) &&
3097                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3098                                 break;
3099                         }
3100                         if(i>=0)
3101                             ref->pic_id= pred;
3102                     }else{
3103                         int long_idx;
3104                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3105
3106                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3107
3108                         if(long_idx>31){
3109                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3110                             return -1;
3111                         }
3112                         ref = h->long_ref[long_idx];
3113                         assert(!(ref && !ref->reference));
3114                         if(ref && (ref->reference & pic_structure)){
3115                             ref->pic_id= pic_id;
3116                             assert(ref->long_ref);
3117                             i=0;
3118                         }else{
3119                             i=-1;
3120                         }
3121                     }
3122
3123                     if (i < 0) {
3124                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3125                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3126                     } else {
3127                         for(i=index; i+1<h->ref_count[list]; i++){
3128                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3129                                 break;
3130                         }
3131                         for(; i > index; i--){
3132                             h->ref_list[list][i]= h->ref_list[list][i-1];
3133                         }
3134                         h->ref_list[list][index]= *ref;
3135                         if (FIELD_PICTURE){
3136                             int bot = pic_structure == PICT_BOTTOM_FIELD;
3137                             pic_as_field(&h->ref_list[list][index], bot);
3138                         }
3139                     }
3140                 }else{
3141                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3142                     return -1;
3143                 }
3144             }
3145         }
3146     }
3147     for(list=0; list<h->list_count; list++){
3148         for(index= 0; index < h->ref_count[list]; index++){
3149             if(!h->ref_list[list][index].data[0])
3150                 h->ref_list[list][index]= s->current_picture;
3151         }
3152     }
3153
3154     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3155         direct_dist_scale_factor(h);
3156     direct_ref_list_init(h);
3157     return 0;
3158 }
3159
3160 static void fill_mbaff_ref_list(H264Context *h){
3161     int list, i, j;
3162     for(list=0; list<2; list++){ //FIXME try list_count
3163         for(i=0; i<h->ref_count[list]; i++){
3164             Picture *frame = &h->ref_list[list][i];
3165             Picture *field = &h->ref_list[list][16+2*i];
3166             field[0] = *frame;
3167             for(j=0; j<3; j++)
3168                 field[0].linesize[j] <<= 1;
3169             field[1] = field[0];
3170             for(j=0; j<3; j++)
3171                 field[1].data[j] += frame->linesize[j];
3172
3173             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3174             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3175             for(j=0; j<2; j++){
3176                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3177                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3178             }
3179         }
3180     }
3181     for(j=0; j<h->ref_count[1]; j++){
3182         for(i=0; i<h->ref_count[0]; i++)
3183             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3184         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3185         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3186     }
3187 }
3188
3189 static int pred_weight_table(H264Context *h){
3190     MpegEncContext * const s = &h->s;
3191     int list, i;
3192     int luma_def, chroma_def;
3193
3194     h->use_weight= 0;
3195     h->use_weight_chroma= 0;
3196     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3197     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3198     luma_def = 1<<h->luma_log2_weight_denom;
3199     chroma_def = 1<<h->chroma_log2_weight_denom;
3200
3201     for(list=0; list<2; list++){
3202         for(i=0; i<h->ref_count[list]; i++){
3203             int luma_weight_flag, chroma_weight_flag;
3204
3205             luma_weight_flag= get_bits1(&s->gb);
3206             if(luma_weight_flag){
3207                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3208                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3209                 if(   h->luma_weight[list][i] != luma_def
3210                    || h->luma_offset[list][i] != 0)
3211                     h->use_weight= 1;
3212             }else{
3213                 h->luma_weight[list][i]= luma_def;
3214                 h->luma_offset[list][i]= 0;
3215             }
3216
3217             chroma_weight_flag= get_bits1(&s->gb);
3218             if(chroma_weight_flag){
3219                 int j;
3220                 for(j=0; j<2; j++){
3221                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3222                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3223                     if(   h->chroma_weight[list][i][j] != chroma_def
3224                        || h->chroma_offset[list][i][j] != 0)
3225                         h->use_weight_chroma= 1;
3226                 }
3227             }else{
3228                 int j;
3229                 for(j=0; j<2; j++){
3230                     h->chroma_weight[list][i][j]= chroma_def;
3231                     h->chroma_offset[list][i][j]= 0;
3232                 }
3233             }
3234         }
3235         if(h->slice_type != B_TYPE) break;
3236     }
3237     h->use_weight= h->use_weight || h->use_weight_chroma;
3238     return 0;
3239 }
3240
3241 static void implicit_weight_table(H264Context *h){
3242     MpegEncContext * const s = &h->s;
3243     int ref0, ref1;
3244     int cur_poc = s->current_picture_ptr->poc;
3245
3246     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3247        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3248         h->use_weight= 0;
3249         h->use_weight_chroma= 0;
3250         return;
3251     }
3252
3253     h->use_weight= 2;
3254     h->use_weight_chroma= 2;
3255     h->luma_log2_weight_denom= 5;
3256     h->chroma_log2_weight_denom= 5;
3257
3258     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3259         int poc0 = h->ref_list[0][ref0].poc;
3260         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3261             int poc1 = h->ref_list[1][ref1].poc;
3262             int td = av_clip(poc1 - poc0, -128, 127);
3263             if(td){
3264                 int tb = av_clip(cur_poc - poc0, -128, 127);
3265                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3266                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3267                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3268                     h->implicit_weight[ref0][ref1] = 32;
3269                 else
3270                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3271             }else
3272                 h->implicit_weight[ref0][ref1] = 32;
3273         }
3274     }
3275 }
3276
3277 /**
3278  * Mark a picture as no longer needed for reference. The refmask
3279  * argument allows unreferencing of individual fields or the whole frame.
3280  * If the picture becomes entirely unreferenced, but is being held for
3281  * display purposes, it is marked as such.
3282  * @param refmask mask of fields to unreference; the mask is bitwise
3283  *                anded with the reference marking of pic
3284  * @return non-zero if pic becomes entirely unreferenced (except possibly
3285  *         for display purposes) zero if one of the fields remains in
3286  *         reference
3287  */
3288 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3289     int i;
3290     if (pic->reference &= refmask) {
3291         return 0;
3292     } else {
3293         if(pic == h->delayed_output_pic)
3294             pic->reference=DELAYED_PIC_REF;
3295         else{
3296             for(i = 0; h->delayed_pic[i]; i++)
3297                 if(pic == h->delayed_pic[i]){
3298                     pic->reference=DELAYED_PIC_REF;
3299                     break;
3300                 }
3301         }
3302         return 1;
3303     }
3304 }
3305
3306 /**
3307  * instantaneous decoder refresh.
3308  */
3309 static void idr(H264Context *h){
3310     int i;
3311
3312     for(i=0; i<16; i++){
3313         if (h->long_ref[i] != NULL) {
3314             unreference_pic(h, h->long_ref[i], 0);
3315             h->long_ref[i]= NULL;
3316         }
3317     }
3318     h->long_ref_count=0;
3319
3320     for(i=0; i<h->short_ref_count; i++){
3321         unreference_pic(h, h->short_ref[i], 0);
3322         h->short_ref[i]= NULL;
3323     }
3324     h->short_ref_count=0;
3325 }
3326
3327 /* forget old pics after a seek */
3328 static void flush_dpb(AVCodecContext *avctx){
3329     H264Context *h= avctx->priv_data;
3330     int i;
3331     for(i=0; i<16; i++) {
3332         if(h->delayed_pic[i])
3333             h->delayed_pic[i]->reference= 0;
3334         h->delayed_pic[i]= NULL;
3335     }
3336     if(h->delayed_output_pic)
3337         h->delayed_output_pic->reference= 0;
3338     h->delayed_output_pic= NULL;
3339     idr(h);
3340     if(h->s.current_picture_ptr)
3341         h->s.current_picture_ptr->reference= 0;
3342 }
3343
3344 /**
3345  * Find a Picture in the short term reference list by frame number.
3346  * @param frame_num frame number to search for
3347  * @param idx the index into h->short_ref where returned picture is found
3348  *            undefined if no picture found.
3349  * @return pointer to the found picture, or NULL if no pic with the provided
3350  *                 frame number is found
3351  */
3352 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3353     MpegEncContext * const s = &h->s;
3354     int i;
3355
3356     for(i=0; i<h->short_ref_count; i++){
3357         Picture *pic= h->short_ref[i];
3358         if(s->avctx->debug&FF_DEBUG_MMCO)
3359             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3360         if(pic->frame_num == frame_num) {
3361             *idx = i;
3362             return pic;
3363         }
3364     }
3365     return NULL;
3366 }
3367
3368 /**
3369  * Remove a picture from the short term reference list by its index in
3370  * that list.  This does no checking on the provided index; it is assumed
3371  * to be valid. Other list entries are shifted down.
3372  * @param i index into h->short_ref of picture to remove.
3373  */
3374 static void remove_short_at_index(H264Context *h, int i){
3375     assert(i > 0 && i < h->short_ref_count);
3376     h->short_ref[i]= NULL;
3377     if (--h->short_ref_count)
3378         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3379 }
3380
3381 /**
3382  *
3383  * @return the removed picture or NULL if an error occurs
3384  */
3385 static Picture * remove_short(H264Context *h, int frame_num){
3386     MpegEncContext * const s = &h->s;
3387     Picture *pic;
3388     int i;
3389
3390     if(s->avctx->debug&FF_DEBUG_MMCO)
3391         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3392
3393     pic = find_short(h, frame_num, &i);
3394     if (pic)
3395         remove_short_at_index(h, i);
3396
3397     return pic;
3398 }
3399
3400 /**
3401  * Remove a picture from the long term reference list by its index in
3402  * that list.  This does no checking on the provided index; it is assumed
3403  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3404  * @param i index into h->long_ref of picture to remove.
3405  */
3406 static void remove_long_at_index(H264Context *h, int i){
3407     h->long_ref[i]= NULL;
3408     h->long_ref_count--;
3409 }
3410
3411 /**
3412  *
3413  * @return the removed picture or NULL if an error occurs
3414  */
3415 static Picture * remove_long(H264Context *h, int i){
3416     Picture *pic;
3417
3418     pic= h->long_ref[i];
3419     if (pic)
3420         remove_long_at_index(h, i);
3421
3422     return pic;
3423 }
3424
3425 /**
3426  * print short term list
3427  */
3428 static void print_short_term(H264Context *h) {
3429     uint32_t i;
3430     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3431         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3432         for(i=0; i<h->short_ref_count; i++){
3433             Picture *pic= h->short_ref[i];
3434             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3435         }
3436     }
3437 }
3438
3439 /**
3440  * print long term list
3441  */
3442 static void print_long_term(H264Context *h) {
3443     uint32_t i;
3444     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3445         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3446         for(i = 0; i < 16; i++){
3447             Picture *pic= h->long_ref[i];
3448             if (pic) {
3449                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3450             }
3451         }
3452     }
3453 }
3454
3455 /**
3456  * Executes the reference picture marking (memory management control operations).
3457  */
3458 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3459     MpegEncContext * const s = &h->s;
3460     int i, j;
3461     int current_is_long=0;
3462     Picture *pic;
3463
3464     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3465         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3466
3467     for(i=0; i<mmco_count; i++){
3468         if(s->avctx->debug&FF_DEBUG_MMCO)
3469             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3470
3471         switch(mmco[i].opcode){
3472         case MMCO_SHORT2UNUSED:
3473             pic= remove_short(h, mmco[i].short_pic_num);
3474             if(pic)
3475                 unreference_pic(h, pic, 0);
3476             else if(s->avctx->debug&FF_DEBUG_MMCO)
3477                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3478             break;
3479         case MMCO_SHORT2LONG:
3480             pic= remove_long(h, mmco[i].long_arg);
3481             if(pic) unreference_pic(h, pic, 0);
3482
3483             h->long_ref[ mmco[i].long_arg ]= remove_short(h, mmco[i].short_pic_num);
3484             if (h->long_ref[ mmco[i].long_arg ]){
3485                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3486                 h->long_ref_count++;
3487             }
3488             break;
3489         case MMCO_LONG2UNUSED:
3490             pic= remove_long(h, mmco[i].long_arg);
3491             if(pic)
3492                 unreference_pic(h, pic, 0);
3493             else if(s->avctx->debug&FF_DEBUG_MMCO)
3494                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3495             break;
3496         case MMCO_LONG:
3497             pic= remove_long(h, mmco[i].long_arg);
3498             if(pic) unreference_pic(h, pic, 0);
3499
3500             h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3501             h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3502             h->long_ref_count++;
3503
3504             current_is_long=1;
3505             break;
3506         case MMCO_SET_MAX_LONG:
3507             assert(mmco[i].long_arg <= 16);
3508             // just remove the long term which index is greater than new max
3509             for(j = mmco[i].long_arg; j<16; j++){
3510                 pic = remove_long(h, j);
3511                 if (pic) unreference_pic(h, pic, 0);
3512             }
3513             break;
3514         case MMCO_RESET:
3515             while(h->short_ref_count){
3516                 pic= remove_short(h, h->short_ref[0]->frame_num);
3517                 if(pic) unreference_pic(h, pic, 0);
3518             }
3519             for(j = 0; j < 16; j++) {
3520                 pic= remove_long(h, j);
3521                 if(pic) unreference_pic(h, pic, 0);
3522             }
3523             break;
3524         default: assert(0);
3525         }
3526     }
3527
3528     if(!current_is_long){
3529         pic= remove_short(h, s->current_picture_ptr->frame_num);
3530         if(pic){
3531             unreference_pic(h, pic, 0);
3532             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3533         }
3534
3535         if(h->short_ref_count)
3536             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3537
3538         h->short_ref[0]= s->current_picture_ptr;
3539         h->short_ref[0]->long_ref=0;
3540         h->short_ref_count++;
3541     }
3542
3543     print_short_term(h);
3544     print_long_term(h);
3545     return 0;
3546 }
3547
3548 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3549     MpegEncContext * const s = &h->s;
3550     int i;
3551
3552     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3553         s->broken_link= get_bits1(gb) -1;
3554         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3555         if(h->mmco[0].long_arg == -1)
3556             h->mmco_index= 0;
3557         else{
3558             h->mmco[0].opcode= MMCO_LONG;
3559             h->mmco_index= 1;
3560         }
3561     }else{
3562         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3563             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3564                 MMCOOpcode opcode= get_ue_golomb(gb);
3565
3566                 h->mmco[i].opcode= opcode;
3567                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3568                     h->mmco[i].short_pic_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3569 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3570                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3571                         return -1;
3572                     }*/
3573                 }
3574                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3575                     unsigned int long_arg= get_ue_golomb(gb);
3576                     if(/*h->mmco[i].long_arg >= h->long_ref_count || h->long_ref[ h->mmco[i].long_arg ] == NULL*/ long_arg >= 16){
3577                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3578                         return -1;
3579                     }
3580                     h->mmco[i].long_arg= long_arg;
3581                 }
3582
3583                 if(opcode > (unsigned)MMCO_LONG){
3584                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3585                     return -1;
3586                 }
3587                 if(opcode == MMCO_END)
3588                     break;
3589             }
3590             h->mmco_index= i;
3591         }else{
3592             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3593
3594             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3595                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3596                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3597                 h->mmco_index= 1;
3598             }else
3599                 h->mmco_index= 0;
3600         }
3601     }
3602
3603     return 0;
3604 }
3605
3606 static int init_poc(H264Context *h){
3607     MpegEncContext * const s = &h->s;
3608     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3609     int field_poc[2];
3610
3611     if(h->nal_unit_type == NAL_IDR_SLICE){
3612         h->frame_num_offset= 0;
3613     }else{
3614         if(h->frame_num < h->prev_frame_num)
3615             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3616         else
3617             h->frame_num_offset= h->prev_frame_num_offset;
3618     }
3619
3620     if(h->sps.poc_type==0){
3621         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3622
3623         if(h->nal_unit_type == NAL_IDR_SLICE){
3624              h->prev_poc_msb=
3625              h->prev_poc_lsb= 0;
3626         }
3627
3628         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3629             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3630         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3631             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3632         else
3633             h->poc_msb = h->prev_poc_msb;
3634 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3635         field_poc[0] =
3636         field_poc[1] = h->poc_msb + h->poc_lsb;
3637         if(s->picture_structure == PICT_FRAME)
3638             field_poc[1] += h->delta_poc_bottom;
3639     }else if(h->sps.poc_type==1){
3640         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3641         int i;
3642
3643         if(h->sps.poc_cycle_length != 0)
3644             abs_frame_num = h->frame_num_offset + h->frame_num;
3645         else
3646             abs_frame_num = 0;
3647
3648         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3649             abs_frame_num--;
3650
3651         expected_delta_per_poc_cycle = 0;
3652         for(i=0; i < h->sps.poc_cycle_length; i++)
3653             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3654
3655         if(abs_frame_num > 0){
3656             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3657             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3658
3659             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3660             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3661                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3662         } else
3663             expectedpoc = 0;
3664
3665         if(h->nal_ref_idc == 0)
3666             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3667
3668         field_poc[0] = expectedpoc + h->delta_poc[0];
3669         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3670
3671         if(s->picture_structure == PICT_FRAME)
3672             field_poc[1] += h->delta_poc[1];
3673     }else{
3674         int poc;
3675         if(h->nal_unit_type == NAL_IDR_SLICE){
3676             poc= 0;
3677         }else{
3678             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3679             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3680         }
3681         field_poc[0]= poc;
3682         field_poc[1]= poc;
3683     }
3684
3685     if(s->picture_structure != PICT_BOTTOM_FIELD)
3686         s->current_picture_ptr->field_poc[0]= field_poc[0];
3687     if(s->picture_structure != PICT_TOP_FIELD)
3688         s->current_picture_ptr->field_poc[1]= field_poc[1];
3689     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
3690         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3691
3692     return 0;
3693 }
3694
3695
3696 /**
3697  * initialize scan tables
3698  */
3699 static void init_scan_tables(H264Context *h){
3700     MpegEncContext * const s = &h->s;
3701     int i;
3702     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3703         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3704         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3705     }else{
3706         for(i=0; i<16; i++){
3707 #define T(x) (x>>2) | ((x<<2) & 0xF)
3708             h->zigzag_scan[i] = T(zigzag_scan[i]);
3709             h-> field_scan[i] = T( field_scan[i]);
3710 #undef T
3711         }
3712     }
3713     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3714         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3715         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3716         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3717         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3718     }else{
3719         for(i=0; i<64; i++){
3720 #define T(x) (x>>3) | ((x&7)<<3)
3721             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3722             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3723             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3724             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3725 #undef T
3726         }
3727     }
3728     if(h->sps.transform_bypass){ //FIXME same ugly
3729         h->zigzag_scan_q0          = zigzag_scan;
3730         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3731         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3732         h->field_scan_q0           = field_scan;
3733         h->field_scan8x8_q0        = field_scan8x8;
3734         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3735     }else{
3736         h->zigzag_scan_q0          = h->zigzag_scan;
3737         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3738         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3739         h->field_scan_q0           = h->field_scan;
3740         h->field_scan8x8_q0        = h->field_scan8x8;
3741         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3742     }
3743 }
3744
3745 /**
3746  * Replicates H264 "master" context to thread contexts.
3747  */
3748 static void clone_slice(H264Context *dst, H264Context *src)
3749 {
3750     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3751     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3752     dst->s.current_picture      = src->s.current_picture;
3753     dst->s.linesize             = src->s.linesize;
3754     dst->s.uvlinesize           = src->s.uvlinesize;
3755
3756     dst->prev_poc_msb           = src->prev_poc_msb;
3757     dst->prev_poc_lsb           = src->prev_poc_lsb;
3758     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3759     dst->prev_frame_num         = src->prev_frame_num;
3760     dst->short_ref_count        = src->short_ref_count;
3761
3762     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3763     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3764     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3765     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3766
3767     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3768     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3769 }
3770
3771 /**
3772  * decodes a slice header.
3773  * this will allso call MPV_common_init() and frame_start() as needed
3774  *
3775  * @param h h264context
3776  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3777  *
3778  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3779  */
3780 static int decode_slice_header(H264Context *h, H264Context *h0){
3781     MpegEncContext * const s = &h->s;
3782     unsigned int first_mb_in_slice;
3783     unsigned int pps_id;
3784     int num_ref_idx_active_override_flag;
3785     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3786     unsigned int slice_type, tmp, i;
3787     int default_ref_list_done = 0;
3788
3789     s->dropable= h->nal_ref_idc == 0;
3790
3791     first_mb_in_slice= get_ue_golomb(&s->gb);
3792
3793     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3794         h0->current_slice = 0;
3795         s->current_picture_ptr= NULL;
3796     }
3797
3798     slice_type= get_ue_golomb(&s->gb);
3799     if(slice_type > 9){
3800         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3801         return -1;
3802     }
3803     if(slice_type > 4){
3804         slice_type -= 5;
3805         h->slice_type_fixed=1;
3806     }else
3807         h->slice_type_fixed=0;
3808
3809     slice_type= slice_type_map[ slice_type ];
3810     if (slice_type == I_TYPE
3811         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3812         default_ref_list_done = 1;
3813     }
3814     h->slice_type= slice_type;
3815
3816     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3817
3818     pps_id= get_ue_golomb(&s->gb);
3819     if(pps_id>=MAX_PPS_COUNT){
3820         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3821         return -1;
3822     }
3823     if(!h0->pps_buffers[pps_id]) {
3824         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3825         return -1;
3826     }
3827     h->pps= *h0->pps_buffers[pps_id];
3828
3829     if(!h0->sps_buffers[h->pps.sps_id]) {
3830         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3831         return -1;
3832     }
3833     h->sps = *h0->sps_buffers[h->pps.sps_id];
3834
3835     if(h == h0 && h->dequant_coeff_pps != pps_id){
3836         h->dequant_coeff_pps = pps_id;
3837         init_dequant_tables(h);
3838     }
3839
3840     s->mb_width= h->sps.mb_width;
3841     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3842
3843     h->b_stride=  s->mb_width*4;
3844     h->b8_stride= s->mb_width*2;
3845
3846     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3847     if(h->sps.frame_mbs_only_flag)
3848         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3849     else
3850         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3851
3852     if (s->context_initialized
3853         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3854         if(h != h0)
3855             return -1;   // width / height changed during parallelized decoding
3856         free_tables(h);
3857         MPV_common_end(s);
3858     }
3859     if (!s->context_initialized) {
3860         if(h != h0)
3861             return -1;  // we cant (re-)initialize context during parallel decoding
3862         if (MPV_common_init(s) < 0)
3863             return -1;
3864
3865         init_scan_tables(h);
3866         alloc_tables(h);
3867
3868         for(i = 1; i < s->avctx->thread_count; i++) {
3869             H264Context *c;
3870             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3871             memcpy(c, h, sizeof(MpegEncContext));
3872             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3873             c->sps = h->sps;
3874             c->pps = h->pps;
3875             init_scan_tables(c);
3876             clone_tables(c, h);
3877         }
3878
3879         for(i = 0; i < s->avctx->thread_count; i++)
3880             if(context_init(h->thread_context[i]) < 0)
3881                 return -1;
3882
3883         s->avctx->width = s->width;
3884         s->avctx->height = s->height;
3885         s->avctx->sample_aspect_ratio= h->sps.sar;
3886         if(!s->avctx->sample_aspect_ratio.den)
3887             s->avctx->sample_aspect_ratio.den = 1;
3888
3889         if(h->sps.timing_info_present_flag){
3890             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3891             if(h->x264_build > 0 && h->x264_build < 44)
3892                 s->avctx->time_base.den *= 2;
3893             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3894                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3895         }
3896     }
3897
3898     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3899
3900     h->mb_mbaff = 0;
3901     h->mb_aff_frame = 0;
3902     if(h->sps.frame_mbs_only_flag){
3903         s->picture_structure= PICT_FRAME;
3904     }else{
3905         if(get_bits1(&s->gb)) { //field_pic_flag
3906             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3907             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
3908         } else {
3909             s->picture_structure= PICT_FRAME;
3910             h->mb_aff_frame = h->sps.mb_aff;
3911         }
3912     }
3913
3914     if(h0->current_slice == 0){
3915         if(frame_start(h) < 0)
3916             return -1;
3917     }
3918     if(h != h0)
3919         clone_slice(h, h0);
3920
3921     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3922
3923     assert(s->mb_num == s->mb_width * s->mb_height);
3924     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3925        first_mb_in_slice                    >= s->mb_num){
3926         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3927         return -1;
3928     }
3929     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3930     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3931     if (s->picture_structure == PICT_BOTTOM_FIELD)
3932         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3933     assert(s->mb_y < s->mb_height);
3934
3935     if(s->picture_structure==PICT_FRAME){
3936         h->curr_pic_num=   h->frame_num;
3937         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3938     }else{
3939         h->curr_pic_num= 2*h->frame_num + 1;
3940         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3941     }
3942
3943     if(h->nal_unit_type == NAL_IDR_SLICE){
3944         get_ue_golomb(&s->gb); /* idr_pic_id */
3945     }
3946
3947     if(h->sps.poc_type==0){
3948         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3949
3950         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3951             h->delta_poc_bottom= get_se_golomb(&s->gb);
3952         }
3953     }
3954
3955     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3956         h->delta_poc[0]= get_se_golomb(&s->gb);
3957
3958         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3959             h->delta_poc[1]= get_se_golomb(&s->gb);
3960     }
3961
3962     init_poc(h);
3963
3964     if(h->pps.redundant_pic_cnt_present){
3965         h->redundant_pic_count= get_ue_golomb(&s->gb);
3966     }
3967
3968     //set defaults, might be overriden a few line later
3969     h->ref_count[0]= h->pps.ref_count[0];
3970     h->ref_count[1]= h->pps.ref_count[1];
3971
3972     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3973         if(h->slice_type == B_TYPE){
3974             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3975             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
3976                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
3977         }
3978         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3979
3980         if(num_ref_idx_active_override_flag){
3981             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3982             if(h->slice_type==B_TYPE)
3983                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3984
3985             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3986                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3987                 h->ref_count[0]= h->ref_count[1]= 1;
3988                 return -1;
3989             }
3990         }
3991         if(h->slice_type == B_TYPE)
3992             h->list_count= 2;
3993         else
3994             h->list_count= 1;
3995     }else
3996         h->list_count= 0;
3997
3998     if(!default_ref_list_done){
3999         fill_default_ref_list(h);
4000     }
4001
4002     if(decode_ref_pic_list_reordering(h) < 0)
4003         return -1;
4004
4005     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4006        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4007         pred_weight_table(h);
4008     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4009         implicit_weight_table(h);
4010     else
4011         h->use_weight = 0;
4012
4013     if(h->nal_ref_idc)
4014         decode_ref_pic_marking(h0, &s->gb);
4015
4016     if(FRAME_MBAFF)
4017         fill_mbaff_ref_list(h);
4018
4019     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4020         tmp = get_ue_golomb(&s->gb);
4021         if(tmp > 2){
4022             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4023             return -1;
4024         }
4025         h->cabac_init_idc= tmp;
4026     }
4027
4028     h->last_qscale_diff = 0;
4029     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4030     if(tmp>51){
4031         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4032         return -1;
4033     }
4034     s->qscale= tmp;
4035     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4036     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4037     //FIXME qscale / qp ... stuff
4038     if(h->slice_type == SP_TYPE){
4039         get_bits1(&s->gb); /* sp_for_switch_flag */
4040     }
4041     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4042         get_se_golomb(&s->gb); /* slice_qs_delta */
4043     }
4044
4045     h->deblocking_filter = 1;
4046     h->slice_alpha_c0_offset = 0;
4047     h->slice_beta_offset = 0;
4048     if( h->pps.deblocking_filter_parameters_present ) {
4049         tmp= get_ue_golomb(&s->gb);
4050         if(tmp > 2){
4051             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4052             return -1;
4053         }
4054         h->deblocking_filter= tmp;
4055         if(h->deblocking_filter < 2)
4056             h->deblocking_filter^= 1; // 1<->0
4057
4058         if( h->deblocking_filter ) {
4059             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4060             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4061         }
4062     }
4063
4064     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4065        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4066        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4067        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4068         h->deblocking_filter= 0;
4069
4070     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4071         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4072             /* Cheat slightly for speed:
4073                Dont bother to deblock across slices */
4074             h->deblocking_filter = 2;
4075         } else {
4076             h0->max_contexts = 1;
4077             if(!h0->single_decode_warning) {
4078                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4079                 h0->single_decode_warning = 1;
4080             }
4081             if(h != h0)
4082                 return 1; // deblocking switched inside frame
4083         }
4084     }
4085
4086 #if 0 //FMO
4087     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4088         slice_group_change_cycle= get_bits(&s->gb, ?);
4089 #endif
4090
4091     h0->last_slice_type = slice_type;
4092     h->slice_num = ++h0->current_slice;
4093
4094     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4095     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4096
4097     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4098         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4099                h->slice_num,
4100                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4101                first_mb_in_slice,
4102                av_get_pict_type_char(h->slice_type),
4103                pps_id, h->frame_num,
4104                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4105                h->ref_count[0], h->ref_count[1],
4106                s->qscale,
4107                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4108                h->use_weight,
4109                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4110                );
4111     }
4112
4113     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
4114         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4115         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4116     }else{
4117         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4118         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4119     }
4120
4121     return 0;
4122 }
4123
4124 /**
4125  *
4126  */
4127 static inline int get_level_prefix(GetBitContext *gb){
4128     unsigned int buf;
4129     int log;
4130
4131     OPEN_READER(re, gb);
4132     UPDATE_CACHE(re, gb);
4133     buf=GET_CACHE(re, gb);
4134
4135     log= 32 - av_log2(buf);
4136 #ifdef TRACE
4137     print_bin(buf>>(32-log), log);
4138     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4139 #endif
4140
4141     LAST_SKIP_BITS(re, gb, log);
4142     CLOSE_READER(re, gb);
4143
4144     return log-1;
4145 }
4146
4147 static inline int get_dct8x8_allowed(H264Context *h){
4148     int i;
4149     for(i=0; i<4; i++){
4150         if(!IS_SUB_8X8(h->sub_mb_type[i])
4151            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4152             return 0;
4153     }
4154     return 1;
4155 }
4156
4157 /**
4158  * decodes a residual block.
4159  * @param n block index
4160  * @param scantable scantable
4161  * @param max_coeff number of coefficients in the block
4162  * @return <0 if an error occured
4163  */
4164 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4165     MpegEncContext * const s = &h->s;
4166     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4167     int level[16];
4168     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4169
4170     //FIXME put trailing_onex into the context
4171
4172     if(n == CHROMA_DC_BLOCK_INDEX){
4173         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4174         total_coeff= coeff_token>>2;
4175     }else{
4176         if(n == LUMA_DC_BLOCK_INDEX){
4177             total_coeff= pred_non_zero_count(h, 0);
4178             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4179             total_coeff= coeff_token>>2;
4180         }else{
4181             total_coeff= pred_non_zero_count(h, n);
4182             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4183             total_coeff= coeff_token>>2;
4184             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4185         }
4186     }
4187
4188     //FIXME set last_non_zero?
4189
4190     if(total_coeff==0)
4191         return 0;
4192     if(total_coeff > (unsigned)max_coeff) {
4193         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4194         return -1;
4195     }
4196
4197     trailing_ones= coeff_token&3;
4198     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4199     assert(total_coeff<=16);
4200
4201     for(i=0; i<trailing_ones; i++){
4202         level[i]= 1 - 2*get_bits1(gb);
4203     }
4204
4205     if(i<total_coeff) {
4206         int level_code, mask;
4207         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4208         int prefix= get_level_prefix(gb);
4209
4210         //first coefficient has suffix_length equal to 0 or 1
4211         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4212             if(suffix_length)
4213                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4214             else
4215                 level_code= (prefix<<suffix_length); //part
4216         }else if(prefix==14){
4217             if(suffix_length)
4218                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4219             else
4220                 level_code= prefix + get_bits(gb, 4); //part
4221         }else if(prefix==15){
4222             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4223             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4224         }else{
4225             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4226             return -1;
4227         }
4228
4229         if(trailing_ones < 3) level_code += 2;
4230
4231         suffix_length = 1;
4232         if(level_code > 5)
4233             suffix_length++;
4234         mask= -(level_code&1);
4235         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4236         i++;
4237
4238         //remaining coefficients have suffix_length > 0
4239         for(;i<total_coeff;i++) {
4240             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4241             prefix = get_level_prefix(gb);
4242             if(prefix<15){
4243                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4244             }else if(prefix==15){
4245                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4246             }else{
4247                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4248                 return -1;
4249             }
4250             mask= -(level_code&1);
4251             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4252             if(level_code > suffix_limit[suffix_length])
4253                 suffix_length++;
4254         }
4255     }
4256
4257     if(total_coeff == max_coeff)
4258         zeros_left=0;
4259     else{
4260         if(n == CHROMA_DC_BLOCK_INDEX)
4261             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4262         else
4263             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4264     }
4265
4266     coeff_num = zeros_left + total_coeff - 1;
4267     j = scantable[coeff_num];
4268     if(n > 24){
4269         block[j] = level[0];
4270         for(i=1;i<total_coeff;i++) {
4271             if(zeros_left <= 0)
4272                 run_before = 0;
4273             else if(zeros_left < 7){
4274                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4275             }else{
4276                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4277             }
4278             zeros_left -= run_before;
4279             coeff_num -= 1 + run_before;
4280             j= scantable[ coeff_num ];
4281
4282             block[j]= level[i];
4283         }
4284     }else{
4285         block[j] = (level[0] * qmul[j] + 32)>>6;
4286         for(i=1;i<total_coeff;i++) {
4287             if(zeros_left <= 0)
4288                 run_before = 0;
4289             else if(zeros_left < 7){
4290                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4291             }else{
4292                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4293             }
4294             zeros_left -= run_before;
4295             coeff_num -= 1 + run_before;
4296             j= scantable[ coeff_num ];
4297
4298             block[j]= (level[i] * qmul[j] + 32)>>6;
4299         }
4300     }
4301
4302     if(zeros_left<0){
4303         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4304         return -1;
4305     }
4306
4307     return 0;
4308 }
4309
4310 static void predict_field_decoding_flag(H264Context *h){
4311     MpegEncContext * const s = &h->s;
4312     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4313     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4314                 ? s->current_picture.mb_type[mb_xy-1]
4315                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4316                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4317                 : 0;
4318     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4319 }
4320
4321 /**
4322  * decodes a P_SKIP or B_SKIP macroblock
4323  */
4324 static void decode_mb_skip(H264Context *h){
4325     MpegEncContext * const s = &h->s;
4326     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4327     int mb_type=0;
4328
4329     memset(h->non_zero_count[mb_xy], 0, 16);
4330     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4331
4332     if(MB_FIELD)
4333         mb_type|= MB_TYPE_INTERLACED;
4334
4335     if( h->slice_type == B_TYPE )
4336     {
4337         // just for fill_caches. pred_direct_motion will set the real mb_type
4338         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4339
4340         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4341         pred_direct_motion(h, &mb_type);
4342         mb_type|= MB_TYPE_SKIP;
4343     }
4344     else
4345     {
4346         int mx, my;
4347         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4348
4349         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4350         pred_pskip_motion(h, &mx, &my);
4351         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4352         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4353     }
4354
4355     write_back_motion(h, mb_type);
4356     s->current_picture.mb_type[mb_xy]= mb_type;
4357     s->current_picture.qscale_table[mb_xy]= s->qscale;
4358     h->slice_table[ mb_xy ]= h->slice_num;
4359     h->prev_mb_skipped= 1;
4360 }
4361
4362 /**
4363  * decodes a macroblock
4364  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4365  */
4366 static int decode_mb_cavlc(H264Context *h){
4367     MpegEncContext * const s = &h->s;
4368     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4369     int partition_count;
4370     unsigned int mb_type, cbp;
4371     int dct8x8_allowed= h->pps.transform_8x8_mode;
4372
4373     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4374
4375     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4376     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4377                 down the code */
4378     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4379         if(s->mb_skip_run==-1)
4380             s->mb_skip_run= get_ue_golomb(&s->gb);
4381
4382         if (s->mb_skip_run--) {
4383             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4384                 if(s->mb_skip_run==0)
4385                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4386                 else
4387                     predict_field_decoding_flag(h);
4388             }
4389             decode_mb_skip(h);
4390             return 0;
4391         }
4392     }
4393     if(FRAME_MBAFF){
4394         if( (s->mb_y&1) == 0 )
4395             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4396     }else
4397         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4398
4399     h->prev_mb_skipped= 0;
4400
4401     mb_type= get_ue_golomb(&s->gb);
4402     if(h->slice_type == B_TYPE){
4403         if(mb_type < 23){
4404             partition_count= b_mb_type_info[mb_type].partition_count;
4405             mb_type=         b_mb_type_info[mb_type].type;
4406         }else{
4407             mb_type -= 23;
4408             goto decode_intra_mb;
4409         }
4410     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4411         if(mb_type < 5){
4412             partition_count= p_mb_type_info[mb_type].partition_count;
4413             mb_type=         p_mb_type_info[mb_type].type;
4414         }else{
4415             mb_type -= 5;
4416             goto decode_intra_mb;
4417         }
4418     }else{
4419        assert(h->slice_type == I_TYPE);
4420 decode_intra_mb:
4421         if(mb_type > 25){
4422             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4423             return -1;
4424         }
4425         partition_count=0;
4426         cbp= i_mb_type_info[mb_type].cbp;
4427         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4428         mb_type= i_mb_type_info[mb_type].type;
4429     }
4430
4431     if(MB_FIELD)
4432         mb_type |= MB_TYPE_INTERLACED;
4433
4434     h->slice_table[ mb_xy ]= h->slice_num;
4435
4436     if(IS_INTRA_PCM(mb_type)){
4437         unsigned int x, y;
4438
4439         // We assume these blocks are very rare so we do not optimize it.
4440         align_get_bits(&s->gb);
4441
4442         // The pixels are stored in the same order as levels in h->mb array.
4443         for(y=0; y<16; y++){
4444             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4445             for(x=0; x<16; x++){
4446                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4447                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4448             }
4449         }
4450         for(y=0; y<8; y++){
4451             const int index= 256 + 4*(y&3) + 32*(y>>2);
4452             for(x=0; x<8; x++){
4453                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4454                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4455             }
4456         }
4457         for(y=0; y<8; y++){
4458             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4459             for(x=0; x<8; x++){
4460                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4461                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4462             }
4463         }
4464
4465         // In deblocking, the quantizer is 0
4466         s->current_picture.qscale_table[mb_xy]= 0;
4467         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4468         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4469         // All coeffs are present
4470         memset(h->non_zero_count[mb_xy], 16, 16);
4471
4472         s->current_picture.mb_type[mb_xy]= mb_type;
4473         return 0;
4474     }
4475
4476     if(MB_MBAFF){
4477         h->ref_count[0] <<= 1;
4478         h->ref_count[1] <<= 1;
4479     }
4480
4481     fill_caches(h, mb_type, 0);
4482
4483     //mb_pred
4484     if(IS_INTRA(mb_type)){
4485             int pred_mode;
4486 //            init_top_left_availability(h);
4487             if(IS_INTRA4x4(mb_type)){
4488                 int i;
4489                 int di = 1;
4490                 if(dct8x8_allowed && get_bits1(&s->gb)){
4491                     mb_type |= MB_TYPE_8x8DCT;
4492                     di = 4;
4493                 }
4494
4495 //                fill_intra4x4_pred_table(h);
4496                 for(i=0; i<16; i+=di){
4497                     int mode= pred_intra_mode(h, i);
4498
4499                     if(!get_bits1(&s->gb)){
4500                         const int rem_mode= get_bits(&s->gb, 3);
4501                         mode = rem_mode + (rem_mode >= mode);
4502                     }
4503
4504                     if(di==4)
4505                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4506                     else
4507                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4508                 }
4509                 write_back_intra_pred_mode(h);
4510                 if( check_intra4x4_pred_mode(h) < 0)
4511                     return -1;
4512             }else{
4513                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4514                 if(h->intra16x16_pred_mode < 0)
4515                     return -1;
4516             }
4517
4518             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4519             if(pred_mode < 0)
4520                 return -1;
4521             h->chroma_pred_mode= pred_mode;
4522     }else if(partition_count==4){
4523         int i, j, sub_partition_count[4], list, ref[2][4];
4524
4525         if(h->slice_type == B_TYPE){
4526             for(i=0; i<4; i++){
4527                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4528                 if(h->sub_mb_type[i] >=13){
4529                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4530                     return -1;
4531                 }
4532                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4533                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4534             }
4535             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4536                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4537                 pred_direct_motion(h, &mb_type);
4538                 h->ref_cache[0][scan8[4]] =
4539                 h->ref_cache[1][scan8[4]] =
4540                 h->ref_cache[0][scan8[12]] =
4541                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4542             }
4543         }else{
4544             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4545             for(i=0; i<4; i++){
4546                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4547                 if(h->sub_mb_type[i] >=4){
4548                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4549                     return -1;
4550                 }
4551                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4552                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4553             }
4554         }
4555
4556         for(list=0; list<h->list_count; list++){
4557             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4558             for(i=0; i<4; i++){
4559                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4560                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4561                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4562                     if(tmp>=ref_count){
4563                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4564                         return -1;
4565                     }
4566                     ref[list][i]= tmp;
4567                 }else{
4568                  //FIXME
4569                     ref[list][i] = -1;
4570                 }
4571             }
4572         }
4573
4574         if(dct8x8_allowed)
4575             dct8x8_allowed = get_dct8x8_allowed(h);
4576
4577         for(list=0; list<h->list_count; list++){
4578             for(i=0; i<4; i++){
4579                 if(IS_DIRECT(h->sub_mb_type[i])) {
4580                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4581                     continue;
4582                 }
4583                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4584                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4585
4586                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4587                     const int sub_mb_type= h->sub_mb_type[i];
4588                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4589                     for(j=0; j<sub_partition_count[i]; j++){
4590                         int mx, my;
4591                         const int index= 4*i + block_width*j;
4592                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4593                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4594                         mx += get_se_golomb(&s->gb);
4595                         my += get_se_golomb(&s->gb);
4596                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4597
4598                         if(IS_SUB_8X8(sub_mb_type)){
4599                             mv_cache[ 1 ][0]=
4600                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4601                             mv_cache[ 1 ][1]=
4602                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4603                         }else if(IS_SUB_8X4(sub_mb_type)){
4604                             mv_cache[ 1 ][0]= mx;
4605                             mv_cache[ 1 ][1]= my;
4606                         }else if(IS_SUB_4X8(sub_mb_type)){
4607                             mv_cache[ 8 ][0]= mx;
4608                             mv_cache[ 8 ][1]= my;
4609                         }
4610                         mv_cache[ 0 ][0]= mx;
4611                         mv_cache[ 0 ][1]= my;
4612                     }
4613                 }else{
4614                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4615                     p[0] = p[1]=
4616                     p[8] = p[9]= 0;
4617                 }
4618             }
4619         }
4620     }else if(IS_DIRECT(mb_type)){
4621         pred_direct_motion(h, &mb_type);
4622         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4623     }else{
4624         int list, mx, my, i;
4625          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4626         if(IS_16X16(mb_type)){
4627             for(list=0; list<h->list_count; list++){
4628                     unsigned int val;
4629                     if(IS_DIR(mb_type, 0, list)){
4630                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4631                         if(val >= h->ref_count[list]){
4632                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4633                             return -1;
4634                         }
4635                     }else
4636                         val= LIST_NOT_USED&0xFF;
4637                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4638             }
4639             for(list=0; list<h->list_count; list++){
4640                 unsigned int val;
4641                 if(IS_DIR(mb_type, 0, list)){
4642                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4643                     mx += get_se_golomb(&s->gb);
4644                     my += get_se_golomb(&s->gb);
4645                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4646
4647                     val= pack16to32(mx,my);
4648                 }else
4649                     val=0;
4650                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4651             }
4652         }
4653         else if(IS_16X8(mb_type)){
4654             for(list=0; list<h->list_count; list++){
4655                     for(i=0; i<2; i++){
4656                         unsigned int val;
4657                         if(IS_DIR(mb_type, i, list)){
4658                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4659                             if(val >= h->ref_count[list]){
4660                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4661                                 return -1;
4662                             }
4663                         }else
4664                             val= LIST_NOT_USED&0xFF;
4665                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4666                     }
4667             }
4668             for(list=0; list<h->list_count; list++){
4669                 for(i=0; i<2; i++){
4670                     unsigned int val;
4671                     if(IS_DIR(mb_type, i, list)){
4672                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4673                         mx += get_se_golomb(&s->gb);
4674                         my += get_se_golomb(&s->gb);
4675                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4676
4677                         val= pack16to32(mx,my);
4678                     }else
4679                         val=0;
4680                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4681                 }
4682             }
4683         }else{
4684             assert(IS_8X16(mb_type));
4685             for(list=0; list<h->list_count; list++){
4686                     for(i=0; i<2; i++){
4687                         unsigned int val;
4688                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4689                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4690                             if(val >= h->ref_count[list]){
4691                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4692                                 return -1;
4693                             }
4694                         }else
4695                             val= LIST_NOT_USED&0xFF;
4696                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4697                     }
4698             }
4699             for(list=0; list<h->list_count; list++){
4700                 for(i=0; i<2; i++){
4701                     unsigned int val;
4702                     if(IS_DIR(mb_type, i, list)){
4703                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4704                         mx += get_se_golomb(&s->gb);
4705                         my += get_se_golomb(&s->gb);
4706                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4707
4708                         val= pack16to32(mx,my);
4709                     }else
4710                         val=0;
4711                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4712                 }
4713             }
4714         }
4715     }
4716
4717     if(IS_INTER(mb_type))
4718         write_back_motion(h, mb_type);
4719
4720     if(!IS_INTRA16x16(mb_type)){
4721         cbp= get_ue_golomb(&s->gb);
4722         if(cbp > 47){
4723             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4724             return -1;
4725         }
4726
4727         if(IS_INTRA4x4(mb_type))
4728             cbp= golomb_to_intra4x4_cbp[cbp];
4729         else
4730             cbp= golomb_to_inter_cbp[cbp];
4731     }
4732     h->cbp = cbp;
4733
4734     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4735         if(get_bits1(&s->gb))
4736             mb_type |= MB_TYPE_8x8DCT;
4737     }
4738     s->current_picture.mb_type[mb_xy]= mb_type;
4739
4740     if(cbp || IS_INTRA16x16(mb_type)){
4741         int i8x8, i4x4, chroma_idx;
4742         int dquant;
4743         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4744         const uint8_t *scan, *scan8x8, *dc_scan;
4745
4746 //        fill_non_zero_count_cache(h);
4747
4748         if(IS_INTERLACED(mb_type)){
4749             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4750             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4751             dc_scan= luma_dc_field_scan;
4752         }else{
4753             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4754             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4755             dc_scan= luma_dc_zigzag_scan;
4756         }
4757
4758         dquant= get_se_golomb(&s->gb);
4759
4760         if( dquant > 25 || dquant < -26 ){
4761             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4762             return -1;
4763         }
4764
4765         s->qscale += dquant;
4766         if(((unsigned)s->qscale) > 51){
4767             if(s->qscale<0) s->qscale+= 52;
4768             else            s->qscale-= 52;
4769         }
4770
4771         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4772         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4773         if(IS_INTRA16x16(mb_type)){
4774             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4775                 return -1; //FIXME continue if partitioned and other return -1 too
4776             }
4777
4778             assert((cbp&15) == 0 || (cbp&15) == 15);
4779
4780             if(cbp&15){
4781                 for(i8x8=0; i8x8<4; i8x8++){
4782                     for(i4x4=0; i4x4<4; i4x4++){
4783                         const int index= i4x4 + 4*i8x8;
4784                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4785                             return -1;
4786                         }
4787                     }
4788                 }
4789             }else{
4790                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4791             }
4792         }else{
4793             for(i8x8=0; i8x8<4; i8x8++){
4794                 if(cbp & (1<<i8x8)){
4795                     if(IS_8x8DCT(mb_type)){
4796                         DCTELEM *buf = &h->mb[64*i8x8];
4797                         uint8_t *nnz;
4798                         for(i4x4=0; i4x4<4; i4x4++){
4799                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4800                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4801                                 return -1;
4802                         }
4803                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4804                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4805                     }else{
4806                         for(i4x4=0; i4x4<4; i4x4++){
4807                             const int index= i4x4 + 4*i8x8;
4808
4809                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4810                                 return -1;
4811                             }
4812                         }
4813                     }
4814                 }else{
4815                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4816                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4817                 }
4818             }
4819         }
4820
4821         if(cbp&0x30){
4822             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4823                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4824                     return -1;
4825                 }
4826         }
4827
4828         if(cbp&0x20){
4829             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4830                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4831                 for(i4x4=0; i4x4<4; i4x4++){
4832                     const int index= 16 + 4*chroma_idx + i4x4;
4833                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4834                         return -1;
4835                     }
4836                 }
4837             }
4838         }else{
4839             uint8_t * const nnz= &h->non_zero_count_cache[0];
4840             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4841             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4842         }
4843     }else{
4844         uint8_t * const nnz= &h->non_zero_count_cache[0];
4845         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4846         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4847         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4848     }
4849     s->current_picture.qscale_table[mb_xy]= s->qscale;
4850     write_back_non_zero_count(h);
4851
4852     if(MB_MBAFF){
4853         h->ref_count[0] >>= 1;
4854         h->ref_count[1] >>= 1;
4855     }
4856
4857     return 0;
4858 }
4859
4860 static int decode_cabac_field_decoding_flag(H264Context *h) {
4861     MpegEncContext * const s = &h->s;
4862     const int mb_x = s->mb_x;
4863     const int mb_y = s->mb_y & ~1;
4864     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4865     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4866
4867     unsigned int ctx = 0;
4868
4869     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4870         ctx += 1;
4871     }
4872     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4873         ctx += 1;
4874     }
4875
4876     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4877 }
4878
4879 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4880     uint8_t *state= &h->cabac_state[ctx_base];
4881     int mb_type;
4882
4883     if(intra_slice){
4884         MpegEncContext * const s = &h->s;
4885         const int mba_xy = h->left_mb_xy[0];
4886         const int mbb_xy = h->top_mb_xy;
4887         int ctx=0;
4888         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4889             ctx++;
4890         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4891             ctx++;
4892         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4893             return 0;   /* I4x4 */
4894         state += 2;
4895     }else{
4896         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4897             return 0;   /* I4x4 */
4898     }
4899
4900     if( get_cabac_terminate( &h->cabac ) )
4901         return 25;  /* PCM */
4902
4903     mb_type = 1; /* I16x16 */
4904     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4905     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4906         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4907     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4908     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4909     return mb_type;
4910 }
4911
4912 static int decode_cabac_mb_type( H264Context *h ) {
4913     MpegEncContext * const s = &h->s;
4914
4915     if( h->slice_type == I_TYPE ) {
4916         return decode_cabac_intra_mb_type(h, 3, 1);
4917     } else if( h->slice_type == P_TYPE ) {
4918         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4919             /* P-type */
4920             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4921                 /* P_L0_D16x16, P_8x8 */
4922                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4923             } else {
4924                 /* P_L0_D8x16, P_L0_D16x8 */
4925                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4926             }
4927         } else {
4928             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4929         }
4930     } else if( h->slice_type == B_TYPE ) {
4931         const int mba_xy = h->left_mb_xy[0];
4932         const int mbb_xy = h->top_mb_xy;
4933         int ctx = 0;
4934         int bits;
4935
4936         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4937             ctx++;
4938         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4939             ctx++;
4940
4941         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4942             return 0; /* B_Direct_16x16 */
4943
4944         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4945             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4946         }
4947
4948         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4949         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4950         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4951         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4952         if( bits < 8 )
4953             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4954         else if( bits == 13 ) {
4955             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4956         } else if( bits == 14 )
4957             return 11; /* B_L1_L0_8x16 */
4958         else if( bits == 15 )
4959             return 22; /* B_8x8 */
4960
4961         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4962         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4963     } else {
4964         /* TODO SI/SP frames? */
4965         return -1;
4966     }
4967 }
4968
4969 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4970     MpegEncContext * const s = &h->s;
4971     int mba_xy, mbb_xy;
4972     int ctx = 0;
4973
4974     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4975         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4976         mba_xy = mb_xy - 1;
4977         if( (mb_y&1)
4978             && h->slice_table[mba_xy] == h->slice_num
4979             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4980             mba_xy += s->mb_stride;
4981         if( MB_FIELD ){
4982             mbb_xy = mb_xy - s->mb_stride;
4983             if( !(mb_y&1)
4984                 && h->slice_table[mbb_xy] == h->slice_num
4985                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4986                 mbb_xy -= s->mb_stride;
4987         }else
4988             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4989     }else{
4990         int mb_xy = mb_x + mb_y*s->mb_stride;
4991         mba_xy = mb_xy - 1;
4992         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4993     }
4994
4995     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4996         ctx++;
4997     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4998         ctx++;
4999
5000     if( h->slice_type == B_TYPE )
5001         ctx += 13;
5002     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5003 }
5004
5005 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5006     int mode = 0;
5007
5008     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5009         return pred_mode;
5010
5011     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5012     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5013     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5014
5015     if( mode >= pred_mode )
5016         return mode + 1;
5017     else
5018         return mode;
5019 }
5020
5021 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5022     const int mba_xy = h->left_mb_xy[0];
5023     const int mbb_xy = h->top_mb_xy;
5024
5025     int ctx = 0;
5026
5027     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5028     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5029         ctx++;
5030
5031     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5032         ctx++;
5033
5034     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5035         return 0;
5036
5037     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5038         return 1;
5039     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5040         return 2;
5041     else
5042         return 3;
5043 }
5044
5045 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5046     int cbp_b, cbp_a, ctx, cbp = 0;
5047
5048     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5049     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5050
5051     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5052     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5053     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5054     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5055     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5056     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5057     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5058     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5059     return cbp;
5060 }
5061 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5062     int ctx;
5063     int cbp_a, cbp_b;
5064
5065     cbp_a = (h->left_cbp>>4)&0x03;
5066     cbp_b = (h-> top_cbp>>4)&0x03;
5067
5068     ctx = 0;
5069     if( cbp_a > 0 ) ctx++;
5070     if( cbp_b > 0 ) ctx += 2;
5071     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5072         return 0;
5073
5074     ctx = 4;
5075     if( cbp_a == 2 ) ctx++;
5076     if( cbp_b == 2 ) ctx += 2;
5077     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5078 }
5079 static int decode_cabac_mb_dqp( H264Context *h) {
5080     int   ctx = 0;
5081     int   val = 0;
5082
5083     if( h->last_qscale_diff != 0 )
5084         ctx++;
5085
5086     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5087         if( ctx < 2 )
5088             ctx = 2;
5089         else
5090             ctx = 3;
5091         val++;
5092         if(val > 102) //prevent infinite loop
5093             return INT_MIN;
5094     }
5095
5096     if( val&0x01 )
5097         return (val + 1)/2;
5098     else
5099         return -(val + 1)/2;
5100 }
5101 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5102     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5103         return 0;   /* 8x8 */
5104     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5105         return 1;   /* 8x4 */
5106     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5107         return 2;   /* 4x8 */
5108     return 3;       /* 4x4 */
5109 }
5110 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5111     int type;
5112     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5113         return 0;   /* B_Direct_8x8 */
5114     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5115         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5116     type = 3;
5117     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5118         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5119             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5120         type += 4;
5121     }
5122     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5123     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5124     return type;
5125 }
5126
5127 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5128     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5129 }
5130
5131 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5132     int refa = h->ref_cache[list][scan8[n] - 1];
5133     int refb = h->ref_cache[list][scan8[n] - 8];
5134     int ref  = 0;
5135     int ctx  = 0;
5136
5137     if( h->slice_type == B_TYPE) {
5138         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5139             ctx++;
5140         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5141             ctx += 2;
5142     } else {
5143         if( refa > 0 )
5144             ctx++;
5145         if( refb > 0 )
5146             ctx += 2;
5147     }
5148
5149     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5150         ref++;
5151         if( ctx < 4 )
5152             ctx = 4;
5153         else
5154             ctx = 5;
5155         if(ref >= 32 /*h->ref_list[list]*/){
5156             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5157             return 0; //FIXME we should return -1 and check the return everywhere
5158         }
5159     }
5160     return ref;
5161 }
5162
5163 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5164     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5165                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5166     int ctxbase = (l == 0) ? 40 : 47;
5167     int ctx, mvd;
5168
5169     if( amvd < 3 )
5170         ctx = 0;
5171     else if( amvd > 32 )
5172         ctx = 2;
5173     else
5174         ctx = 1;
5175
5176     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5177         return 0;
5178
5179     mvd= 1;
5180     ctx= 3;
5181     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5182         mvd++;
5183         if( ctx < 6 )
5184             ctx++;
5185     }
5186
5187     if( mvd >= 9 ) {
5188         int k = 3;
5189         while( get_cabac_bypass( &h->cabac ) ) {
5190             mvd += 1 << k;
5191             k++;
5192             if(k>24){
5193                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5194                 return INT_MIN;
5195             }
5196         }
5197         while( k-- ) {
5198             if( get_cabac_bypass( &h->cabac ) )
5199                 mvd += 1 << k;
5200         }
5201     }
5202     return get_cabac_bypass_sign( &h->cabac, -mvd );
5203 }
5204
5205 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5206     int nza, nzb;
5207     int ctx = 0;
5208
5209     if( cat == 0 ) {
5210         nza = h->left_cbp&0x100;
5211         nzb = h-> top_cbp&0x100;
5212     } else if( cat == 1 || cat == 2 ) {
5213         nza = h->non_zero_count_cache[scan8[idx] - 1];
5214         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5215     } else if( cat == 3 ) {
5216         nza = (h->left_cbp>>(6+idx))&0x01;
5217         nzb = (h-> top_cbp>>(6+idx))&0x01;
5218     } else {
5219         assert(cat == 4);
5220         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5221         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5222     }
5223
5224     if( nza > 0 )
5225         ctx++;
5226
5227     if( nzb > 0 )
5228         ctx += 2;
5229
5230     return ctx + 4 * cat;
5231 }
5232
5233 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5234     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5235     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5236     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5237     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5238 };
5239
5240 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5241     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5242     static const int significant_coeff_flag_offset[2][6] = {
5243       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5244       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5245     };
5246     static const int last_coeff_flag_offset[2][6] = {
5247       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5248       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5249     };
5250     static const int coeff_abs_level_m1_offset[6] = {
5251         227+0, 227+10, 227+20, 227+30, 227+39, 426
5252     };
5253     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5254       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5255         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5256         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5257        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5258       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5259         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5260         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5261         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5262     };
5263
5264     int index[64];
5265
5266     int av_unused last;
5267     int coeff_count = 0;
5268
5269     int abslevel1 = 1;
5270     int abslevelgt1 = 0;
5271
5272     uint8_t *significant_coeff_ctx_base;
5273     uint8_t *last_coeff_ctx_base;
5274     uint8_t *abs_level_m1_ctx_base;
5275
5276 #ifndef ARCH_X86
5277 #define CABAC_ON_STACK
5278 #endif
5279 #ifdef CABAC_ON_STACK
5280 #define CC &cc
5281     CABACContext cc;
5282     cc.range     = h->cabac.range;
5283     cc.low       = h->cabac.low;
5284     cc.bytestream= h->cabac.bytestream;
5285 #else
5286 #define CC &h->cabac
5287 #endif
5288
5289
5290     /* cat: 0-> DC 16x16  n = 0
5291      *      1-> AC 16x16  n = luma4x4idx
5292      *      2-> Luma4x4   n = luma4x4idx
5293      *      3-> DC Chroma n = iCbCr
5294      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5295      *      5-> Luma8x8   n = 4 * luma8x8idx
5296      */
5297
5298     /* read coded block flag */
5299     if( cat != 5 ) {
5300         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5301             if( cat == 1 || cat == 2 )
5302                 h->non_zero_count_cache[scan8[n]] = 0;
5303             else if( cat == 4 )
5304                 h->non_zero_count_cache[scan8[16+n]] = 0;
5305 #ifdef CABAC_ON_STACK
5306             h->cabac.range     = cc.range     ;
5307             h->cabac.low       = cc.low       ;
5308             h->cabac.bytestream= cc.bytestream;
5309 #endif
5310             return;
5311         }
5312     }
5313
5314     significant_coeff_ctx_base = h->cabac_state
5315         + significant_coeff_flag_offset[MB_FIELD][cat];
5316     last_coeff_ctx_base = h->cabac_state
5317         + last_coeff_flag_offset[MB_FIELD][cat];
5318     abs_level_m1_ctx_base = h->cabac_state
5319         + coeff_abs_level_m1_offset[cat];
5320
5321     if( cat == 5 ) {
5322 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5323         for(last= 0; last < coefs; last++) { \
5324             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5325             if( get_cabac( CC, sig_ctx )) { \
5326                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5327                 index[coeff_count++] = last; \
5328                 if( get_cabac( CC, last_ctx ) ) { \
5329                     last= max_coeff; \
5330                     break; \
5331                 } \
5332             } \
5333         }\
5334         if( last == max_coeff -1 ) {\
5335             index[coeff_count++] = last;\
5336         }
5337         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5338 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5339         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5340     } else {
5341         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5342 #else
5343         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5344     } else {
5345         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5346 #endif
5347     }
5348     assert(coeff_count > 0);
5349
5350     if( cat == 0 )
5351         h->cbp_table[mb_xy] |= 0x100;
5352     else if( cat == 1 || cat == 2 )
5353         h->non_zero_count_cache[scan8[n]] = coeff_count;
5354     else if( cat == 3 )
5355         h->cbp_table[mb_xy] |= 0x40 << n;
5356     else if( cat == 4 )
5357         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5358     else {
5359         assert( cat == 5 );
5360         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5361     }
5362
5363     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5364         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5365         int j= scantable[index[coeff_count]];
5366
5367         if( get_cabac( CC, ctx ) == 0 ) {
5368             if( !qmul ) {
5369                 block[j] = get_cabac_bypass_sign( CC, -1);
5370             }else{
5371                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5372             }
5373
5374             abslevel1++;
5375         } else {
5376             int coeff_abs = 2;
5377             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5378             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5379                 coeff_abs++;
5380             }
5381
5382             if( coeff_abs >= 15 ) {
5383                 int j = 0;
5384                 while( get_cabac_bypass( CC ) ) {
5385                     j++;
5386                 }
5387
5388                 coeff_abs=1;
5389                 while( j-- ) {
5390                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5391                 }
5392                 coeff_abs+= 14;
5393             }
5394
5395             if( !qmul ) {
5396                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5397                 else                                block[j] =  coeff_abs;
5398             }else{
5399                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5400                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5401             }
5402
5403             abslevelgt1++;
5404         }
5405     }
5406 #ifdef CABAC_ON_STACK
5407             h->cabac.range     = cc.range     ;
5408             h->cabac.low       = cc.low       ;
5409             h->cabac.bytestream= cc.bytestream;
5410 #endif
5411
5412 }
5413
5414 static inline void compute_mb_neighbors(H264Context *h)
5415 {
5416     MpegEncContext * const s = &h->s;
5417     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5418     h->top_mb_xy     = mb_xy - s->mb_stride;
5419     h->left_mb_xy[0] = mb_xy - 1;
5420     if(FRAME_MBAFF){
5421         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5422         const int top_pair_xy      = pair_xy     - s->mb_stride;
5423         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5424         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5425         const int curr_mb_frame_flag = !MB_FIELD;
5426         const int bottom = (s->mb_y & 1);
5427         if (bottom
5428                 ? !curr_mb_frame_flag // bottom macroblock
5429                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5430                 ) {
5431             h->top_mb_xy -= s->mb_stride;
5432         }
5433         if (left_mb_frame_flag != curr_mb_frame_flag) {
5434             h->left_mb_xy[0] = pair_xy - 1;
5435         }
5436     } else if (FIELD_PICTURE) {
5437         h->top_mb_xy -= s->mb_stride;
5438     }
5439     return;
5440 }
5441
5442 /**
5443  * decodes a macroblock
5444  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5445  */
5446 static int decode_mb_cabac(H264Context *h) {
5447     MpegEncContext * const s = &h->s;
5448     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5449     int mb_type, partition_count, cbp = 0;
5450     int dct8x8_allowed= h->pps.transform_8x8_mode;
5451
5452     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5453
5454     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5455     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5456         int skip;
5457         /* a skipped mb needs the aff flag from the following mb */
5458         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5459             predict_field_decoding_flag(h);
5460         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5461             skip = h->next_mb_skipped;
5462         else
5463             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5464         /* read skip flags */
5465         if( skip ) {
5466             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5467                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5468                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5469                 if(h->next_mb_skipped)
5470                     predict_field_decoding_flag(h);
5471                 else
5472                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5473             }
5474
5475             decode_mb_skip(h);
5476
5477             h->cbp_table[mb_xy] = 0;
5478             h->chroma_pred_mode_table[mb_xy] = 0;
5479             h->last_qscale_diff = 0;
5480
5481             return 0;
5482
5483         }
5484     }
5485     if(FRAME_MBAFF){
5486         if( (s->mb_y&1) == 0 )
5487             h->mb_mbaff =
5488             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5489     }else
5490         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5491
5492     h->prev_mb_skipped = 0;
5493
5494     compute_mb_neighbors(h);
5495     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5496         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5497         return -1;
5498     }
5499
5500     if( h->slice_type == B_TYPE ) {
5501         if( mb_type < 23 ){
5502             partition_count= b_mb_type_info[mb_type].partition_count;
5503             mb_type=         b_mb_type_info[mb_type].type;
5504         }else{
5505             mb_type -= 23;
5506             goto decode_intra_mb;
5507         }
5508     } else if( h->slice_type == P_TYPE ) {
5509         if( mb_type < 5) {
5510             partition_count= p_mb_type_info[mb_type].partition_count;
5511             mb_type=         p_mb_type_info[mb_type].type;
5512         } else {
5513             mb_type -= 5;
5514             goto decode_intra_mb;
5515         }
5516     } else {
5517        assert(h->slice_type == I_TYPE);
5518 decode_intra_mb:
5519         partition_count = 0;
5520         cbp= i_mb_type_info[mb_type].cbp;
5521         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5522         mb_type= i_mb_type_info[mb_type].type;
5523     }
5524     if(MB_FIELD)
5525         mb_type |= MB_TYPE_INTERLACED;
5526
5527     h->slice_table[ mb_xy ]= h->slice_num;
5528
5529     if(IS_INTRA_PCM(mb_type)) {
5530         const uint8_t *ptr;
5531         unsigned int x, y;
5532
5533         // We assume these blocks are very rare so we do not optimize it.
5534         // FIXME The two following lines get the bitstream position in the cabac
5535         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5536         ptr= h->cabac.bytestream;
5537         if(h->cabac.low&0x1) ptr--;
5538         if(CABAC_BITS==16){
5539             if(h->cabac.low&0x1FF) ptr--;
5540         }
5541
5542         // The pixels are stored in the same order as levels in h->mb array.
5543         for(y=0; y<16; y++){
5544             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5545             for(x=0; x<16; x++){
5546                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5547                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5548             }
5549         }
5550         for(y=0; y<8; y++){
5551             const int index= 256 + 4*(y&3) + 32*(y>>2);
5552             for(x=0; x<8; x++){
5553                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5554                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5555             }
5556         }
5557         for(y=0; y<8; y++){
5558             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5559             for(x=0; x<8; x++){
5560                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5561                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5562             }
5563         }
5564
5565         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5566
5567         // All blocks are present
5568         h->cbp_table[mb_xy] = 0x1ef;
5569         h->chroma_pred_mode_table[mb_xy] = 0;
5570         // In deblocking, the quantizer is 0
5571         s->current_picture.qscale_table[mb_xy]= 0;
5572         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5573         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5574         // All coeffs are present
5575         memset(h->non_zero_count[mb_xy], 16, 16);
5576         s->current_picture.mb_type[mb_xy]= mb_type;
5577         return 0;
5578     }
5579
5580     if(MB_MBAFF){
5581         h->ref_count[0] <<= 1;
5582         h->ref_count[1] <<= 1;
5583     }
5584
5585     fill_caches(h, mb_type, 0);
5586
5587     if( IS_INTRA( mb_type ) ) {
5588         int i, pred_mode;
5589         if( IS_INTRA4x4( mb_type ) ) {
5590             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5591                 mb_type |= MB_TYPE_8x8DCT;
5592                 for( i = 0; i < 16; i+=4 ) {
5593                     int pred = pred_intra_mode( h, i );
5594                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5595                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5596                 }
5597             } else {
5598                 for( i = 0; i < 16; i++ ) {
5599                     int pred = pred_intra_mode( h, i );
5600                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5601
5602                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5603                 }
5604             }
5605             write_back_intra_pred_mode(h);
5606             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5607         } else {
5608             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5609             if( h->intra16x16_pred_mode < 0 ) return -1;
5610         }
5611         h->chroma_pred_mode_table[mb_xy] =
5612         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5613
5614         pred_mode= check_intra_pred_mode( h, pred_mode );
5615         if( pred_mode < 0 ) return -1;
5616         h->chroma_pred_mode= pred_mode;
5617     } else if( partition_count == 4 ) {
5618         int i, j, sub_partition_count[4], list, ref[2][4];
5619
5620         if( h->slice_type == B_TYPE ) {
5621             for( i = 0; i < 4; i++ ) {
5622                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5623                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5624                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5625             }
5626             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5627                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5628                 pred_direct_motion(h, &mb_type);
5629                 h->ref_cache[0][scan8[4]] =
5630                 h->ref_cache[1][scan8[4]] =
5631                 h->ref_cache[0][scan8[12]] =
5632                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5633                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5634                     for( i = 0; i < 4; i++ )
5635                         if( IS_DIRECT(h->sub_mb_type[i]) )
5636                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5637                 }
5638             }
5639         } else {
5640             for( i = 0; i < 4; i++ ) {
5641                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5642                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5643                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5644             }
5645         }
5646
5647         for( list = 0; list < h->list_count; list++ ) {
5648                 for( i = 0; i < 4; i++ ) {
5649                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5650                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5651                         if( h->ref_count[list] > 1 )
5652                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5653                         else
5654                             ref[list][i] = 0;
5655                     } else {
5656                         ref[list][i] = -1;
5657                     }
5658                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5659                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5660                 }
5661         }
5662
5663         if(dct8x8_allowed)
5664             dct8x8_allowed = get_dct8x8_allowed(h);
5665
5666         for(list=0; list<h->list_count; list++){
5667             for(i=0; i<4; i++){
5668                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5669                 if(IS_DIRECT(h->sub_mb_type[i])){
5670                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5671                     continue;
5672                 }
5673
5674                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5675                     const int sub_mb_type= h->sub_mb_type[i];
5676                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5677                     for(j=0; j<sub_partition_count[i]; j++){
5678                         int mpx, mpy;
5679                         int mx, my;
5680                         const int index= 4*i + block_width*j;
5681                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5682                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5683                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5684
5685                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5686                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5687                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5688
5689                         if(IS_SUB_8X8(sub_mb_type)){
5690                             mv_cache[ 1 ][0]=
5691                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5692                             mv_cache[ 1 ][1]=
5693                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5694
5695                             mvd_cache[ 1 ][0]=
5696                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5697                             mvd_cache[ 1 ][1]=
5698                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5699                         }else if(IS_SUB_8X4(sub_mb_type)){
5700                             mv_cache[ 1 ][0]= mx;
5701                             mv_cache[ 1 ][1]= my;
5702
5703                             mvd_cache[ 1 ][0]= mx - mpx;
5704                             mvd_cache[ 1 ][1]= my - mpy;
5705                         }else if(IS_SUB_4X8(sub_mb_type)){
5706                             mv_cache[ 8 ][0]= mx;
5707                             mv_cache[ 8 ][1]= my;
5708
5709                             mvd_cache[ 8 ][0]= mx - mpx;
5710                             mvd_cache[ 8 ][1]= my - mpy;
5711                         }
5712                         mv_cache[ 0 ][0]= mx;
5713                         mv_cache[ 0 ][1]= my;
5714
5715                         mvd_cache[ 0 ][0]= mx - mpx;
5716                         mvd_cache[ 0 ][1]= my - mpy;
5717                     }
5718                 }else{
5719                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5720                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5721                     p[0] = p[1] = p[8] = p[9] = 0;
5722                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5723                 }
5724             }
5725         }
5726     } else if( IS_DIRECT(mb_type) ) {
5727         pred_direct_motion(h, &mb_type);
5728         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5729         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5730         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5731     } else {
5732         int list, mx, my, i, mpx, mpy;
5733         if(IS_16X16(mb_type)){
5734             for(list=0; list<h->list_count; list++){
5735                 if(IS_DIR(mb_type, 0, list)){
5736                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5737                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5738                 }else
5739                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5740             }
5741             for(list=0; list<h->list_count; list++){
5742                 if(IS_DIR(mb_type, 0, list)){
5743                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5744
5745                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5746                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5747                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5748
5749                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5750                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5751                 }else
5752                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5753             }
5754         }
5755         else if(IS_16X8(mb_type)){
5756             for(list=0; list<h->list_count; list++){
5757                     for(i=0; i<2; i++){
5758                         if(IS_DIR(mb_type, i, list)){
5759                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5760                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5761                         }else
5762                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5763                     }
5764             }
5765             for(list=0; list<h->list_count; list++){
5766                 for(i=0; i<2; i++){
5767                     if(IS_DIR(mb_type, i, list)){
5768                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5769                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5770                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5771                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5772
5773                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5774                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5775                     }else{
5776                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5777                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5778                     }
5779                 }
5780             }
5781         }else{
5782             assert(IS_8X16(mb_type));
5783             for(list=0; list<h->list_count; list++){
5784                     for(i=0; i<2; i++){
5785                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5786                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5787                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5788                         }else
5789                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5790                     }
5791             }
5792             for(list=0; list<h->list_count; list++){
5793                 for(i=0; i<2; i++){
5794                     if(IS_DIR(mb_type, i, list)){
5795                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5796                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5797                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5798
5799                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5800                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5801                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5802                     }else{
5803                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5804                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5805                     }
5806                 }
5807             }
5808         }
5809     }
5810
5811    if( IS_INTER( mb_type ) ) {
5812         h->chroma_pred_mode_table[mb_xy] = 0;
5813         write_back_motion( h, mb_type );
5814    }
5815
5816     if( !IS_INTRA16x16( mb_type ) ) {
5817         cbp  = decode_cabac_mb_cbp_luma( h );
5818         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5819     }
5820
5821     h->cbp_table[mb_xy] = h->cbp = cbp;
5822
5823     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5824         if( decode_cabac_mb_transform_size( h ) )
5825             mb_type |= MB_TYPE_8x8DCT;
5826     }
5827     s->current_picture.mb_type[mb_xy]= mb_type;
5828
5829     if( cbp || IS_INTRA16x16( mb_type ) ) {
5830         const uint8_t *scan, *scan8x8, *dc_scan;
5831         const uint32_t *qmul;
5832         int dqp;
5833
5834         if(IS_INTERLACED(mb_type)){
5835             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5836             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5837             dc_scan= luma_dc_field_scan;
5838         }else{
5839             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5840             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5841             dc_scan= luma_dc_zigzag_scan;
5842         }
5843
5844         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5845         if( dqp == INT_MIN ){
5846             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5847             return -1;
5848         }
5849         s->qscale += dqp;
5850         if(((unsigned)s->qscale) > 51){
5851             if(s->qscale<0) s->qscale+= 52;
5852             else            s->qscale-= 52;
5853         }
5854         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5855         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5856
5857         if( IS_INTRA16x16( mb_type ) ) {
5858             int i;
5859             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5860             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5861
5862             if( cbp&15 ) {
5863                 qmul = h->dequant4_coeff[0][s->qscale];
5864                 for( i = 0; i < 16; i++ ) {
5865                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5866                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5867                 }
5868             } else {
5869                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5870             }
5871         } else {
5872             int i8x8, i4x4;
5873             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5874                 if( cbp & (1<<i8x8) ) {
5875                     if( IS_8x8DCT(mb_type) ) {
5876                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5877                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5878                     } else {
5879                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5880                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5881                             const int index = 4*i8x8 + i4x4;
5882                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5883 //START_TIMER
5884                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5885 //STOP_TIMER("decode_residual")
5886                         }
5887                     }
5888                 } else {
5889                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5890                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5891                 }
5892             }
5893         }
5894
5895         if( cbp&0x30 ){
5896             int c;
5897             for( c = 0; c < 2; c++ ) {
5898                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5899                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5900             }
5901         }
5902
5903         if( cbp&0x20 ) {
5904             int c, i;
5905             for( c = 0; c < 2; c++ ) {
5906                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5907                 for( i = 0; i < 4; i++ ) {
5908                     const int index = 16 + 4 * c + i;
5909                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5910                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5911                 }
5912             }
5913         } else {
5914             uint8_t * const nnz= &h->non_zero_count_cache[0];
5915             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5916             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5917         }
5918     } else {
5919         uint8_t * const nnz= &h->non_zero_count_cache[0];
5920         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5921         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5922         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5923         h->last_qscale_diff = 0;
5924     }
5925
5926     s->current_picture.qscale_table[mb_xy]= s->qscale;
5927     write_back_non_zero_count(h);
5928
5929     if(MB_MBAFF){
5930         h->ref_count[0] >>= 1;
5931         h->ref_count[1] >>= 1;
5932     }
5933
5934     return 0;
5935 }
5936
5937
5938 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5939     int i, d;
5940     const int index_a = qp + h->slice_alpha_c0_offset;
5941     const int alpha = (alpha_table+52)[index_a];
5942     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5943
5944     if( bS[0] < 4 ) {
5945         int8_t tc[4];
5946         for(i=0; i<4; i++)
5947             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5948         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5949     } else {
5950         /* 16px edge length, because bS=4 is triggered by being at
5951          * the edge of an intra MB, so all 4 bS are the same */
5952             for( d = 0; d < 16; d++ ) {
5953                 const int p0 = pix[-1];
5954                 const int p1 = pix[-2];
5955                 const int p2 = pix[-3];
5956
5957                 const int q0 = pix[0];
5958                 const int q1 = pix[1];
5959                 const int q2 = pix[2];
5960
5961                 if( FFABS( p0 - q0 ) < alpha &&
5962                     FFABS( p1 - p0 ) < beta &&
5963                     FFABS( q1 - q0 ) < beta ) {
5964
5965                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5966                         if( FFABS( p2 - p0 ) < beta)
5967                         {
5968                             const int p3 = pix[-4];
5969                             /* p0', p1', p2' */
5970                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5971                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5972                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5973                         } else {
5974                             /* p0' */
5975                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5976                         }
5977                         if( FFABS( q2 - q0 ) < beta)
5978                         {
5979                             const int q3 = pix[3];
5980                             /* q0', q1', q2' */
5981                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5982                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5983                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5984                         } else {
5985                             /* q0' */
5986                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5987                         }
5988                     }else{
5989                         /* p0', q0' */
5990                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5991                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5992                     }
5993                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5994                 }
5995                 pix += stride;
5996             }
5997     }
5998 }
5999 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6000     int i;
6001     const int index_a = qp + h->slice_alpha_c0_offset;
6002     const int alpha = (alpha_table+52)[index_a];
6003     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6004
6005     if( bS[0] < 4 ) {
6006         int8_t tc[4];
6007         for(i=0; i<4; i++)
6008             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6009         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6010     } else {
6011         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6012     }
6013 }
6014
6015 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6016     int i;
6017     for( i = 0; i < 16; i++, pix += stride) {
6018         int index_a;
6019         int alpha;
6020         int beta;
6021
6022         int qp_index;
6023         int bS_index = (i >> 1);
6024         if (!MB_FIELD) {
6025             bS_index &= ~1;
6026             bS_index |= (i & 1);
6027         }
6028
6029         if( bS[bS_index] == 0 ) {
6030             continue;
6031         }
6032
6033         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6034         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6035         alpha = (alpha_table+52)[index_a];
6036         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6037
6038         if( bS[bS_index] < 4 ) {
6039             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6040             const int p0 = pix[-1];
6041             const int p1 = pix[-2];
6042             const int p2 = pix[-3];
6043             const int q0 = pix[0];
6044             const int q1 = pix[1];
6045             const int q2 = pix[2];
6046
6047             if( FFABS( p0 - q0 ) < alpha &&
6048                 FFABS( p1 - p0 ) < beta &&
6049                 FFABS( q1 - q0 ) < beta ) {
6050                 int tc = tc0;
6051                 int i_delta;
6052
6053                 if( FFABS( p2 - p0 ) < beta ) {
6054                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6055                     tc++;
6056                 }
6057                 if( FFABS( q2 - q0 ) < beta ) {
6058                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6059                     tc++;
6060                 }
6061
6062                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6063                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6064                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6065                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6066             }
6067         }else{
6068             const int p0 = pix[-1];
6069             const int p1 = pix[-2];
6070             const int p2 = pix[-3];
6071
6072             const int q0 = pix[0];
6073             const int q1 = pix[1];
6074             const int q2 = pix[2];
6075
6076             if( FFABS( p0 - q0 ) < alpha &&
6077                 FFABS( p1 - p0 ) < beta &&
6078                 FFABS( q1 - q0 ) < beta ) {
6079
6080                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6081                     if( FFABS( p2 - p0 ) < beta)
6082                     {
6083                         const int p3 = pix[-4];
6084                         /* p0', p1', p2' */
6085                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6086                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6087                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6088                     } else {
6089                         /* p0' */
6090                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6091                     }
6092                     if( FFABS( q2 - q0 ) < beta)
6093                     {
6094                         const int q3 = pix[3];
6095                         /* q0', q1', q2' */
6096                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6097                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6098                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6099                     } else {
6100                         /* q0' */
6101                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6102                     }
6103                 }else{
6104                     /* p0', q0' */
6105                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6106                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6107                 }
6108                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6109             }
6110         }
6111     }
6112 }
6113 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6114     int i;
6115     for( i = 0; i < 8; i++, pix += stride) {
6116         int index_a;
6117         int alpha;
6118         int beta;
6119
6120         int qp_index;
6121         int bS_index = i;
6122
6123         if( bS[bS_index] == 0 ) {
6124             continue;
6125         }
6126
6127         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6128         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6129         alpha = (alpha_table+52)[index_a];
6130         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6131
6132         if( bS[bS_index] < 4 ) {
6133             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6134             const int p0 = pix[-1];
6135             const int p1 = pix[-2];
6136             const int q0 = pix[0];
6137             const int q1 = pix[1];
6138
6139             if( FFABS( p0 - q0 ) < alpha &&
6140                 FFABS( p1 - p0 ) < beta &&
6141                 FFABS( q1 - q0 ) < beta ) {
6142                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6143
6144                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6145                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6146                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6147             }
6148         }else{
6149             const int p0 = pix[-1];
6150             const int p1 = pix[-2];
6151             const int q0 = pix[0];
6152             const int q1 = pix[1];
6153
6154             if( FFABS( p0 - q0 ) < alpha &&
6155                 FFABS( p1 - p0 ) < beta &&
6156                 FFABS( q1 - q0 ) < beta ) {
6157
6158                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6159                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6160                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6161             }
6162         }
6163     }
6164 }
6165
6166 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6167     int i, d;
6168     const int index_a = qp + h->slice_alpha_c0_offset;
6169     const int alpha = (alpha_table+52)[index_a];
6170     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6171     const int pix_next  = stride;
6172
6173     if( bS[0] < 4 ) {
6174         int8_t tc[4];
6175         for(i=0; i<4; i++)
6176             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6177         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6178     } else {
6179         /* 16px edge length, see filter_mb_edgev */
6180             for( d = 0; d < 16; d++ ) {
6181                 const int p0 = pix[-1*pix_next];
6182                 const int p1 = pix[-2*pix_next];
6183                 const int p2 = pix[-3*pix_next];
6184                 const int q0 = pix[0];
6185                 const int q1 = pix[1*pix_next];
6186                 const int q2 = pix[2*pix_next];
6187
6188                 if( FFABS( p0 - q0 ) < alpha &&
6189                     FFABS( p1 - p0 ) < beta &&
6190                     FFABS( q1 - q0 ) < beta ) {
6191
6192                     const int p3 = pix[-4*pix_next];
6193                     const int q3 = pix[ 3*pix_next];
6194
6195                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6196                         if( FFABS( p2 - p0 ) < beta) {
6197                             /* p0', p1', p2' */
6198                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6199                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6200                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6201                         } else {
6202                             /* p0' */
6203                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6204                         }
6205                         if( FFABS( q2 - q0 ) < beta) {
6206                             /* q0', q1', q2' */
6207                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6208                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6209                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6210                         } else {
6211                             /* q0' */
6212                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6213                         }
6214                     }else{
6215                         /* p0', q0' */
6216                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6217                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6218                     }
6219                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6220                 }
6221                 pix++;
6222             }
6223     }
6224 }
6225
6226 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6227     int i;
6228     const int index_a = qp + h->slice_alpha_c0_offset;
6229     const int alpha = (alpha_table+52)[index_a];
6230     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6231
6232     if( bS[0] < 4 ) {
6233         int8_t tc[4];
6234         for(i=0; i<4; i++)
6235             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6236         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6237     } else {
6238         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6239     }
6240 }
6241
6242 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6243     MpegEncContext * const s = &h->s;
6244     int mb_xy, mb_type;
6245     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6246
6247     mb_xy = mb_x + mb_y*s->mb_stride;
6248
6249     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6250        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6251                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6252         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6253         return;
6254     }
6255     assert(!FRAME_MBAFF);
6256
6257     mb_type = s->current_picture.mb_type[mb_xy];
6258     qp = s->current_picture.qscale_table[mb_xy];
6259     qp0 = s->current_picture.qscale_table[mb_xy-1];
6260     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6261     qpc = get_chroma_qp( h, 0, qp );
6262     qpc0 = get_chroma_qp( h, 0, qp0 );
6263     qpc1 = get_chroma_qp( h, 0, qp1 );
6264     qp0 = (qp + qp0 + 1) >> 1;
6265     qp1 = (qp + qp1 + 1) >> 1;
6266     qpc0 = (qpc + qpc0 + 1) >> 1;
6267     qpc1 = (qpc + qpc1 + 1) >> 1;
6268     qp_thresh = 15 - h->slice_alpha_c0_offset;
6269     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6270        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6271         return;
6272
6273     if( IS_INTRA(mb_type) ) {
6274         int16_t bS4[4] = {4,4,4,4};
6275         int16_t bS3[4] = {3,3,3,3};
6276         if( IS_8x8DCT(mb_type) ) {
6277             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6278             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6279             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6280             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6281         } else {
6282             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6283             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6284             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6285             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6286             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6287             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6288             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6289             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6290         }
6291         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6292         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6293         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6294         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6295         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6296         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6297         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6298         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6299         return;
6300     } else {
6301         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6302         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6303         int edges;
6304         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6305             edges = 4;
6306             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6307         } else {
6308             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6309                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6310             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6311                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6312                              ? 3 : 0;
6313             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6314             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6315             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6316                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6317         }
6318         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6319             bSv[0][0] = 0x0004000400040004ULL;
6320         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6321             bSv[1][0] = 0x0004000400040004ULL;
6322
6323 #define FILTER(hv,dir,edge)\
6324         if(bSv[dir][edge]) {\
6325             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6326             if(!(edge&1)) {\
6327                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6328                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6329             }\
6330         }
6331         if( edges == 1 ) {
6332             FILTER(v,0,0);
6333             FILTER(h,1,0);
6334         } else if( IS_8x8DCT(mb_type) ) {
6335             FILTER(v,0,0);
6336             FILTER(v,0,2);
6337             FILTER(h,1,0);
6338             FILTER(h,1,2);
6339         } else {
6340             FILTER(v,0,0);
6341             FILTER(v,0,1);
6342             FILTER(v,0,2);
6343             FILTER(v,0,3);
6344             FILTER(h,1,0);
6345             FILTER(h,1,1);
6346             FILTER(h,1,2);
6347             FILTER(h,1,3);
6348         }
6349 #undef FILTER
6350     }
6351 }
6352
6353 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6354     MpegEncContext * const s = &h->s;
6355     const int mb_xy= mb_x + mb_y*s->mb_stride;
6356     const int mb_type = s->current_picture.mb_type[mb_xy];
6357     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6358     int first_vertical_edge_done = 0;
6359     int dir;
6360     /* FIXME: A given frame may occupy more than one position in
6361      * the reference list. So ref2frm should be populated with
6362      * frame numbers, not indices. */
6363     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6364                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6365
6366     //for sufficiently low qp, filtering wouldn't do anything
6367     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6368     if(!FRAME_MBAFF){
6369         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6370         int qp = s->current_picture.qscale_table[mb_xy];
6371         if(qp <= qp_thresh
6372            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6373            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6374             return;
6375         }
6376     }
6377
6378     if (FRAME_MBAFF
6379             // left mb is in picture
6380             && h->slice_table[mb_xy-1] != 255
6381             // and current and left pair do not have the same interlaced type
6382             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6383             // and left mb is in the same slice if deblocking_filter == 2
6384             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6385         /* First vertical edge is different in MBAFF frames
6386          * There are 8 different bS to compute and 2 different Qp
6387          */
6388         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6389         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6390         int16_t bS[8];
6391         int qp[2];
6392         int bqp[2];
6393         int rqp[2];
6394         int mb_qp, mbn0_qp, mbn1_qp;
6395         int i;
6396         first_vertical_edge_done = 1;
6397
6398         if( IS_INTRA(mb_type) )
6399             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6400         else {
6401             for( i = 0; i < 8; i++ ) {
6402                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6403
6404                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6405                     bS[i] = 4;
6406                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6407                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6408                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6409                     bS[i] = 2;
6410                 else
6411                     bS[i] = 1;
6412             }
6413         }
6414
6415         mb_qp = s->current_picture.qscale_table[mb_xy];
6416         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6417         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6418         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6419         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6420                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6421         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6422                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6423         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6424         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6425                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6426         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6427                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6428
6429         /* Filter edge */
6430         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6431         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6432         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6433         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6434         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6435     }
6436     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6437     for( dir = 0; dir < 2; dir++ )
6438     {
6439         int edge;
6440         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6441         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6442         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6443
6444         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6445                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6446         // how often to recheck mv-based bS when iterating between edges
6447         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6448                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6449         // how often to recheck mv-based bS when iterating along each edge
6450         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6451
6452         if (first_vertical_edge_done) {
6453             start = 1;
6454             first_vertical_edge_done = 0;
6455         }
6456
6457         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6458             start = 1;
6459
6460         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6461             && !IS_INTERLACED(mb_type)
6462             && IS_INTERLACED(mbm_type)
6463             ) {
6464             // This is a special case in the norm where the filtering must
6465             // be done twice (one each of the field) even if we are in a
6466             // frame macroblock.
6467             //
6468             static const int nnz_idx[4] = {4,5,6,3};
6469             unsigned int tmp_linesize   = 2 *   linesize;
6470             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6471             int mbn_xy = mb_xy - 2 * s->mb_stride;
6472             int qp;
6473             int i, j;
6474             int16_t bS[4];
6475
6476             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6477                 if( IS_INTRA(mb_type) ||
6478                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6479                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6480                 } else {
6481                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6482                     for( i = 0; i < 4; i++ ) {
6483                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6484                             mbn_nnz[nnz_idx[i]] != 0 )
6485                             bS[i] = 2;
6486                         else
6487                             bS[i] = 1;
6488                     }
6489                 }
6490                 // Do not use s->qscale as luma quantizer because it has not the same
6491                 // value in IPCM macroblocks.
6492                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6493                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6494                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6495                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6496                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6497                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6498                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6499                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6500             }
6501
6502             start = 1;
6503         }
6504
6505         /* Calculate bS */
6506         for( edge = start; edge < edges; edge++ ) {
6507             /* mbn_xy: neighbor macroblock */
6508             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6509             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6510             int16_t bS[4];
6511             int qp;
6512
6513             if( (edge&1) && IS_8x8DCT(mb_type) )
6514                 continue;
6515
6516             if( IS_INTRA(mb_type) ||
6517                 IS_INTRA(mbn_type) ) {
6518                 int value;
6519                 if (edge == 0) {
6520                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6521                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6522                     ) {
6523                         value = 4;
6524                     } else {
6525                         value = 3;
6526                     }
6527                 } else {
6528                     value = 3;
6529                 }
6530                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6531             } else {
6532                 int i, l;
6533                 int mv_done;
6534
6535                 if( edge & mask_edge ) {
6536                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6537                     mv_done = 1;
6538                 }
6539                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6540                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6541                     mv_done = 1;
6542                 }
6543                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6544                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6545                     int bn_idx= b_idx - (dir ? 8:1);
6546                     int v = 0;
6547                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6548                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6549                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6550                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6551                     }
6552                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6553                     mv_done = 1;
6554                 }
6555                 else
6556                     mv_done = 0;
6557
6558                 for( i = 0; i < 4; i++ ) {
6559                     int x = dir == 0 ? edge : i;
6560                     int y = dir == 0 ? i    : edge;
6561                     int b_idx= 8 + 4 + x + 8*y;
6562                     int bn_idx= b_idx - (dir ? 8:1);
6563
6564                     if( h->non_zero_count_cache[b_idx] != 0 ||
6565                         h->non_zero_count_cache[bn_idx] != 0 ) {
6566                         bS[i] = 2;
6567                     }
6568                     else if(!mv_done)
6569                     {
6570                         bS[i] = 0;
6571                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6572                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6573                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6574                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6575                                 bS[i] = 1;
6576                                 break;
6577                             }
6578                         }
6579                     }
6580                 }
6581
6582                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6583                     continue;
6584             }
6585
6586             /* Filter edge */
6587             // Do not use s->qscale as luma quantizer because it has not the same
6588             // value in IPCM macroblocks.
6589             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6590             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6591             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6592             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6593             if( dir == 0 ) {
6594                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6595                 if( (edge&1) == 0 ) {
6596                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6597                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6598                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6599                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6600                 }
6601             } else {
6602                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6603                 if( (edge&1) == 0 ) {
6604                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6605                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6606                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6607                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6608                 }
6609             }
6610         }
6611     }
6612 }
6613
6614 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6615     MpegEncContext * const s = &h->s;
6616     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6617
6618     s->mb_skip_run= -1;
6619
6620     if( h->pps.cabac ) {
6621         int i;
6622
6623         /* realign */
6624         align_get_bits( &s->gb );
6625
6626         /* init cabac */
6627         ff_init_cabac_states( &h->cabac);
6628         ff_init_cabac_decoder( &h->cabac,
6629                                s->gb.buffer + get_bits_count(&s->gb)/8,
6630                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6631         /* calculate pre-state */
6632         for( i= 0; i < 460; i++ ) {
6633             int pre;
6634             if( h->slice_type == I_TYPE )
6635                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6636             else
6637                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6638
6639             if( pre <= 63 )
6640                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6641             else
6642                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6643         }
6644
6645         for(;;){
6646 //START_TIMER
6647             int ret = decode_mb_cabac(h);
6648             int eos;
6649 //STOP_TIMER("decode_mb_cabac")
6650
6651             if(ret>=0) hl_decode_mb(h);
6652
6653             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6654                 s->mb_y++;
6655
6656                 if(ret>=0) ret = decode_mb_cabac(h);
6657
6658                 if(ret>=0) hl_decode_mb(h);
6659                 s->mb_y--;
6660             }
6661             eos = get_cabac_terminate( &h->cabac );
6662
6663             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6664                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6665                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6666                 return -1;
6667             }
6668
6669             if( ++s->mb_x >= s->mb_width ) {
6670                 s->mb_x = 0;
6671                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6672                 ++s->mb_y;
6673                 if(FIELD_OR_MBAFF_PICTURE) {
6674                     ++s->mb_y;
6675                 }
6676             }
6677
6678             if( eos || s->mb_y >= s->mb_height ) {
6679                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6680                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6681                 return 0;
6682             }
6683         }
6684
6685     } else {
6686         for(;;){
6687             int ret = decode_mb_cavlc(h);
6688
6689             if(ret>=0) hl_decode_mb(h);
6690
6691             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6692                 s->mb_y++;
6693                 ret = decode_mb_cavlc(h);
6694
6695                 if(ret>=0) hl_decode_mb(h);
6696                 s->mb_y--;
6697             }
6698
6699             if(ret<0){
6700                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6701                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6702
6703                 return -1;
6704             }
6705
6706             if(++s->mb_x >= s->mb_width){
6707                 s->mb_x=0;
6708                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6709                 ++s->mb_y;
6710                 if(FIELD_OR_MBAFF_PICTURE) {
6711                     ++s->mb_y;
6712                 }
6713                 if(s->mb_y >= s->mb_height){
6714                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6715
6716                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6717                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6718
6719                         return 0;
6720                     }else{
6721                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6722
6723                         return -1;
6724                     }
6725                 }
6726             }
6727
6728             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6729                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6730                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6731                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6732
6733                     return 0;
6734                 }else{
6735                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6736
6737                     return -1;
6738                 }
6739             }
6740         }
6741     }
6742
6743 #if 0
6744     for(;s->mb_y < s->mb_height; s->mb_y++){
6745         for(;s->mb_x < s->mb_width; s->mb_x++){
6746             int ret= decode_mb(h);
6747
6748             hl_decode_mb(h);
6749
6750             if(ret<0){
6751                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6752                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6753
6754                 return -1;
6755             }
6756
6757             if(++s->mb_x >= s->mb_width){
6758                 s->mb_x=0;
6759                 if(++s->mb_y >= s->mb_height){
6760                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6761                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6762
6763                         return 0;
6764                     }else{
6765                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6766
6767                         return -1;
6768                     }
6769                 }
6770             }
6771
6772             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6773                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6774                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6775
6776                     return 0;
6777                 }else{
6778                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6779
6780                     return -1;
6781                 }
6782             }
6783         }
6784         s->mb_x=0;
6785         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6786     }
6787 #endif
6788     return -1; //not reached
6789 }
6790
6791 static int decode_unregistered_user_data(H264Context *h, int size){
6792     MpegEncContext * const s = &h->s;
6793     uint8_t user_data[16+256];
6794     int e, build, i;
6795
6796     if(size<16)
6797         return -1;
6798
6799     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6800         user_data[i]= get_bits(&s->gb, 8);
6801     }
6802
6803     user_data[i]= 0;
6804     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6805     if(e==1 && build>=0)
6806         h->x264_build= build;
6807
6808     if(s->avctx->debug & FF_DEBUG_BUGS)
6809         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6810
6811     for(; i<size; i++)
6812         skip_bits(&s->gb, 8);
6813
6814     return 0;
6815 }
6816
6817 static int decode_sei(H264Context *h){
6818     MpegEncContext * const s = &h->s;
6819
6820     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6821         int size, type;
6822
6823         type=0;
6824         do{
6825             type+= show_bits(&s->gb, 8);
6826         }while(get_bits(&s->gb, 8) == 255);
6827
6828         size=0;
6829         do{
6830             size+= show_bits(&s->gb, 8);
6831         }while(get_bits(&s->gb, 8) == 255);
6832
6833         switch(type){
6834         case 5:
6835             if(decode_unregistered_user_data(h, size) < 0)
6836                 return -1;
6837             break;
6838         default:
6839             skip_bits(&s->gb, 8*size);
6840         }
6841
6842         //FIXME check bits here
6843         align_get_bits(&s->gb);
6844     }
6845
6846     return 0;
6847 }
6848
6849 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6850     MpegEncContext * const s = &h->s;
6851     int cpb_count, i;
6852     cpb_count = get_ue_golomb(&s->gb) + 1;
6853     get_bits(&s->gb, 4); /* bit_rate_scale */
6854     get_bits(&s->gb, 4); /* cpb_size_scale */
6855     for(i=0; i<cpb_count; i++){
6856         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6857         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6858         get_bits1(&s->gb);     /* cbr_flag */
6859     }
6860     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6861     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6862     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6863     get_bits(&s->gb, 5); /* time_offset_length */
6864 }
6865
6866 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6867     MpegEncContext * const s = &h->s;
6868     int aspect_ratio_info_present_flag;
6869     unsigned int aspect_ratio_idc;
6870     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6871
6872     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6873
6874     if( aspect_ratio_info_present_flag ) {
6875         aspect_ratio_idc= get_bits(&s->gb, 8);
6876         if( aspect_ratio_idc == EXTENDED_SAR ) {
6877             sps->sar.num= get_bits(&s->gb, 16);
6878             sps->sar.den= get_bits(&s->gb, 16);
6879         }else if(aspect_ratio_idc < 14){
6880             sps->sar=  pixel_aspect[aspect_ratio_idc];
6881         }else{
6882             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6883             return -1;
6884         }
6885     }else{
6886         sps->sar.num=
6887         sps->sar.den= 0;
6888     }
6889 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6890
6891     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6892         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6893     }
6894
6895     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6896         get_bits(&s->gb, 3);    /* video_format */
6897         get_bits1(&s->gb);      /* video_full_range_flag */
6898         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6899             get_bits(&s->gb, 8); /* colour_primaries */
6900             get_bits(&s->gb, 8); /* transfer_characteristics */
6901             get_bits(&s->gb, 8); /* matrix_coefficients */
6902         }
6903     }
6904
6905     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6906         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6907         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6908     }
6909
6910     sps->timing_info_present_flag = get_bits1(&s->gb);
6911     if(sps->timing_info_present_flag){
6912         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6913         sps->time_scale = get_bits_long(&s->gb, 32);
6914         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6915     }
6916
6917     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6918     if(nal_hrd_parameters_present_flag)
6919         decode_hrd_parameters(h, sps);
6920     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6921     if(vcl_hrd_parameters_present_flag)
6922         decode_hrd_parameters(h, sps);
6923     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6924         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6925     get_bits1(&s->gb);         /* pic_struct_present_flag */
6926
6927     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6928     if(sps->bitstream_restriction_flag){
6929         unsigned int num_reorder_frames;
6930         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6931         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6932         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6933         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6934         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6935         num_reorder_frames= get_ue_golomb(&s->gb);
6936         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6937
6938         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6939             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6940             return -1;
6941         }
6942
6943         sps->num_reorder_frames= num_reorder_frames;
6944     }
6945
6946     return 0;
6947 }
6948
6949 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6950                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6951     MpegEncContext * const s = &h->s;
6952     int i, last = 8, next = 8;
6953     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6954     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6955         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6956     else
6957     for(i=0;i<size;i++){
6958         if(next)
6959             next = (last + get_se_golomb(&s->gb)) & 0xff;
6960         if(!i && !next){ /* matrix not written, we use the preset one */
6961             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6962             break;
6963         }
6964         last = factors[scan[i]] = next ? next : last;
6965     }
6966 }
6967
6968 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6969                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6970     MpegEncContext * const s = &h->s;
6971     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6972     const uint8_t *fallback[4] = {
6973         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6974         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6975         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6976         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6977     };
6978     if(get_bits1(&s->gb)){
6979         sps->scaling_matrix_present |= is_sps;
6980         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6981         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6982         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6983         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6984         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6985         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6986         if(is_sps || pps->transform_8x8_mode){
6987             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6988             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6989         }
6990     } else if(fallback_sps) {
6991         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
6992         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
6993     }
6994 }
6995
6996 /**
6997  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
6998  */
6999 static void *
7000 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7001                     const size_t size, const char *name)
7002 {
7003     if(id>=max) {
7004         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7005         return NULL;
7006     }
7007
7008     if(!vec[id]) {
7009         vec[id] = av_mallocz(size);
7010         if(vec[id] == NULL)
7011             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7012     }
7013     return vec[id];
7014 }
7015
7016 static inline int decode_seq_parameter_set(H264Context *h){
7017     MpegEncContext * const s = &h->s;
7018     int profile_idc, level_idc;
7019     unsigned int sps_id, tmp, mb_width, mb_height;
7020     int i;
7021     SPS *sps;
7022
7023     profile_idc= get_bits(&s->gb, 8);
7024     get_bits1(&s->gb);   //constraint_set0_flag
7025     get_bits1(&s->gb);   //constraint_set1_flag
7026     get_bits1(&s->gb);   //constraint_set2_flag
7027     get_bits1(&s->gb);   //constraint_set3_flag
7028     get_bits(&s->gb, 4); // reserved
7029     level_idc= get_bits(&s->gb, 8);
7030     sps_id= get_ue_golomb(&s->gb);
7031
7032     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7033     if(sps == NULL)
7034         return -1;
7035
7036     sps->profile_idc= profile_idc;
7037     sps->level_idc= level_idc;
7038
7039     if(sps->profile_idc >= 100){ //high profile
7040         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7041             get_bits1(&s->gb);  //residual_color_transform_flag
7042         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7043         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7044         sps->transform_bypass = get_bits1(&s->gb);
7045         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7046     }else
7047         sps->scaling_matrix_present = 0;
7048
7049     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7050     sps->poc_type= get_ue_golomb(&s->gb);
7051
7052     if(sps->poc_type == 0){ //FIXME #define
7053         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7054     } else if(sps->poc_type == 1){//FIXME #define
7055         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7056         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7057         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7058         tmp= get_ue_golomb(&s->gb);
7059
7060         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7061             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7062             return -1;
7063         }
7064         sps->poc_cycle_length= tmp;
7065
7066         for(i=0; i<sps->poc_cycle_length; i++)
7067             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7068     }else if(sps->poc_type != 2){
7069         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7070         return -1;
7071     }
7072
7073     tmp= get_ue_golomb(&s->gb);
7074     if(tmp > MAX_PICTURE_COUNT-2){
7075         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7076     }
7077     sps->ref_frame_count= tmp;
7078     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7079     mb_width= get_ue_golomb(&s->gb) + 1;
7080     mb_height= get_ue_golomb(&s->gb) + 1;
7081     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7082        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7083         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7084         return -1;
7085     }
7086     sps->mb_width = mb_width;
7087     sps->mb_height= mb_height;
7088
7089     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7090     if(!sps->frame_mbs_only_flag)
7091         sps->mb_aff= get_bits1(&s->gb);
7092     else
7093         sps->mb_aff= 0;
7094
7095     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7096
7097 #ifndef ALLOW_INTERLACE
7098     if(sps->mb_aff)
7099         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7100 #endif
7101     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7102         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7103
7104     sps->crop= get_bits1(&s->gb);
7105     if(sps->crop){
7106         sps->crop_left  = get_ue_golomb(&s->gb);
7107         sps->crop_right = get_ue_golomb(&s->gb);
7108         sps->crop_top   = get_ue_golomb(&s->gb);
7109         sps->crop_bottom= get_ue_golomb(&s->gb);
7110         if(sps->crop_left || sps->crop_top){
7111             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7112         }
7113     }else{
7114         sps->crop_left  =
7115         sps->crop_right =
7116         sps->crop_top   =
7117         sps->crop_bottom= 0;
7118     }
7119
7120     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7121     if( sps->vui_parameters_present_flag )
7122         decode_vui_parameters(h, sps);
7123
7124     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7125         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7126                sps_id, sps->profile_idc, sps->level_idc,
7127                sps->poc_type,
7128                sps->ref_frame_count,
7129                sps->mb_width, sps->mb_height,
7130                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7131                sps->direct_8x8_inference_flag ? "8B8" : "",
7132                sps->crop_left, sps->crop_right,
7133                sps->crop_top, sps->crop_bottom,
7134                sps->vui_parameters_present_flag ? "VUI" : ""
7135                );
7136     }
7137     return 0;
7138 }
7139
7140 static void
7141 build_qp_table(PPS *pps, int t, int index)
7142 {
7143     int i;
7144     for(i = 0; i < 255; i++)
7145         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7146 }
7147
7148 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7149     MpegEncContext * const s = &h->s;
7150     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7151     PPS *pps;
7152
7153     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7154     if(pps == NULL)
7155         return -1;
7156
7157     tmp= get_ue_golomb(&s->gb);
7158     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7159         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7160         return -1;
7161     }
7162     pps->sps_id= tmp;
7163
7164     pps->cabac= get_bits1(&s->gb);
7165     pps->pic_order_present= get_bits1(&s->gb);
7166     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7167     if(pps->slice_group_count > 1 ){
7168         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7169         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7170         switch(pps->mb_slice_group_map_type){
7171         case 0:
7172 #if 0
7173 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7174 |    run_length[ i ]                                |1  |ue(v)   |
7175 #endif
7176             break;
7177         case 2:
7178 #if 0
7179 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7180 |{                                                  |   |        |
7181 |    top_left_mb[ i ]                               |1  |ue(v)   |
7182 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7183 |   }                                               |   |        |
7184 #endif
7185             break;
7186         case 3:
7187         case 4:
7188         case 5:
7189 #if 0
7190 |   slice_group_change_direction_flag               |1  |u(1)    |
7191 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7192 #endif
7193             break;
7194         case 6:
7195 #if 0
7196 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7197 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7198 |)                                                  |   |        |
7199 |    slice_group_id[ i ]                            |1  |u(v)    |
7200 #endif
7201             break;
7202         }
7203     }
7204     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7205     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7206     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7207         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7208         pps->ref_count[0]= pps->ref_count[1]= 1;
7209         return -1;
7210     }
7211
7212     pps->weighted_pred= get_bits1(&s->gb);
7213     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7214     pps->init_qp= get_se_golomb(&s->gb) + 26;
7215     pps->init_qs= get_se_golomb(&s->gb) + 26;
7216     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7217     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7218     pps->constrained_intra_pred= get_bits1(&s->gb);
7219     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7220
7221     pps->transform_8x8_mode= 0;
7222     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7223     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7224     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7225
7226     if(get_bits_count(&s->gb) < bit_length){
7227         pps->transform_8x8_mode= get_bits1(&s->gb);
7228         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7229         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7230     } else {
7231         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7232     }
7233
7234     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7235     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7236         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7237         h->pps.chroma_qp_diff= 1;
7238     } else
7239         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7240
7241     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7242         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7243                pps_id, pps->sps_id,
7244                pps->cabac ? "CABAC" : "CAVLC",
7245                pps->slice_group_count,
7246                pps->ref_count[0], pps->ref_count[1],
7247                pps->weighted_pred ? "weighted" : "",
7248                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7249                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7250                pps->constrained_intra_pred ? "CONSTR" : "",
7251                pps->redundant_pic_cnt_present ? "REDU" : "",
7252                pps->transform_8x8_mode ? "8x8DCT" : ""
7253                );
7254     }
7255
7256     return 0;
7257 }
7258
7259 /**
7260  * Call decode_slice() for each context.
7261  *
7262  * @param h h264 master context
7263  * @param context_count number of contexts to execute
7264  */
7265 static void execute_decode_slices(H264Context *h, int context_count){
7266     MpegEncContext * const s = &h->s;
7267     AVCodecContext * const avctx= s->avctx;
7268     H264Context *hx;
7269     int i;
7270
7271     if(context_count == 1) {
7272         decode_slice(avctx, h);
7273     } else {
7274         for(i = 1; i < context_count; i++) {
7275             hx = h->thread_context[i];
7276             hx->s.error_resilience = avctx->error_resilience;
7277             hx->s.error_count = 0;
7278         }
7279
7280         avctx->execute(avctx, (void *)decode_slice,
7281                        (void **)h->thread_context, NULL, context_count);
7282
7283         /* pull back stuff from slices to master context */
7284         hx = h->thread_context[context_count - 1];
7285         s->mb_x = hx->s.mb_x;
7286         s->mb_y = hx->s.mb_y;
7287         for(i = 1; i < context_count; i++)
7288             h->s.error_count += h->thread_context[i]->s.error_count;
7289     }
7290 }
7291
7292
7293 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7294     MpegEncContext * const s = &h->s;
7295     AVCodecContext * const avctx= s->avctx;
7296     int buf_index=0;
7297     H264Context *hx; ///< thread context
7298     int context_count = 0;
7299
7300     h->max_contexts = avctx->thread_count;
7301 #if 0
7302     int i;
7303     for(i=0; i<50; i++){
7304         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7305     }
7306 #endif
7307     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7308         h->current_slice = 0;
7309         s->current_picture_ptr= NULL;
7310     }
7311
7312     for(;;){
7313         int consumed;
7314         int dst_length;
7315         int bit_length;
7316         uint8_t *ptr;
7317         int i, nalsize = 0;
7318         int err;
7319
7320         if(h->is_avc) {
7321             if(buf_index >= buf_size) break;
7322             nalsize = 0;
7323             for(i = 0; i < h->nal_length_size; i++)
7324                 nalsize = (nalsize << 8) | buf[buf_index++];
7325             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7326                 if(nalsize == 1){
7327                     buf_index++;
7328                     continue;
7329                 }else{
7330                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7331                     break;
7332                 }
7333             }
7334         } else {
7335             // start code prefix search
7336             for(; buf_index + 3 < buf_size; buf_index++){
7337                 // This should always succeed in the first iteration.
7338                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7339                     break;
7340             }
7341
7342             if(buf_index+3 >= buf_size) break;
7343
7344             buf_index+=3;
7345         }
7346
7347         hx = h->thread_context[context_count];
7348
7349         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7350         if (ptr==NULL || dst_length < 0){
7351             return -1;
7352         }
7353         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7354             dst_length--;
7355         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7356
7357         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7358             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7359         }
7360
7361         if (h->is_avc && (nalsize != consumed))
7362             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7363
7364         buf_index += consumed;
7365
7366         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7367            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7368             continue;
7369
7370       again:
7371         err = 0;
7372         switch(hx->nal_unit_type){
7373         case NAL_IDR_SLICE:
7374             if (h->nal_unit_type != NAL_IDR_SLICE) {
7375                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7376                 return -1;
7377             }
7378             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7379         case NAL_SLICE:
7380             init_get_bits(&hx->s.gb, ptr, bit_length);
7381             hx->intra_gb_ptr=
7382             hx->inter_gb_ptr= &hx->s.gb;
7383             hx->s.data_partitioning = 0;
7384
7385             if((err = decode_slice_header(hx, h)))
7386                break;
7387
7388             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7389             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7390                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7391                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7392                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7393                && avctx->skip_frame < AVDISCARD_ALL)
7394                 context_count++;
7395             break;
7396         case NAL_DPA:
7397             init_get_bits(&hx->s.gb, ptr, bit_length);
7398             hx->intra_gb_ptr=
7399             hx->inter_gb_ptr= NULL;
7400             hx->s.data_partitioning = 1;
7401
7402             err = decode_slice_header(hx, h);
7403             break;
7404         case NAL_DPB:
7405             init_get_bits(&hx->intra_gb, ptr, bit_length);
7406             hx->intra_gb_ptr= &hx->intra_gb;
7407             break;
7408         case NAL_DPC:
7409             init_get_bits(&hx->inter_gb, ptr, bit_length);
7410             hx->inter_gb_ptr= &hx->inter_gb;
7411
7412             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7413                && s->context_initialized
7414                && s->hurry_up < 5
7415                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7416                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7417                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7418                && avctx->skip_frame < AVDISCARD_ALL)
7419                 context_count++;
7420             break;
7421         case NAL_SEI:
7422             init_get_bits(&s->gb, ptr, bit_length);
7423             decode_sei(h);
7424             break;
7425         case NAL_SPS:
7426             init_get_bits(&s->gb, ptr, bit_length);
7427             decode_seq_parameter_set(h);
7428
7429             if(s->flags& CODEC_FLAG_LOW_DELAY)
7430                 s->low_delay=1;
7431
7432             if(avctx->has_b_frames < 2)
7433                 avctx->has_b_frames= !s->low_delay;
7434             break;
7435         case NAL_PPS:
7436             init_get_bits(&s->gb, ptr, bit_length);
7437
7438             decode_picture_parameter_set(h, bit_length);
7439
7440             break;
7441         case NAL_AUD:
7442         case NAL_END_SEQUENCE:
7443         case NAL_END_STREAM:
7444         case NAL_FILLER_DATA:
7445         case NAL_SPS_EXT:
7446         case NAL_AUXILIARY_SLICE:
7447             break;
7448         default:
7449             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7450         }
7451
7452         if(context_count == h->max_contexts) {
7453             execute_decode_slices(h, context_count);
7454             context_count = 0;
7455         }
7456
7457         if (err < 0)
7458             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7459         else if(err == 1) {
7460             /* Slice could not be decoded in parallel mode, copy down
7461              * NAL unit stuff to context 0 and restart. Note that
7462              * rbsp_buffer is not transfered, but since we no longer
7463              * run in parallel mode this should not be an issue. */
7464             h->nal_unit_type = hx->nal_unit_type;
7465             h->nal_ref_idc   = hx->nal_ref_idc;
7466             hx = h;
7467             goto again;
7468         }
7469     }
7470     if(context_count)
7471         execute_decode_slices(h, context_count);
7472     return buf_index;
7473 }
7474
7475 /**
7476  * returns the number of bytes consumed for building the current frame
7477  */
7478 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7479     if(s->flags&CODEC_FLAG_TRUNCATED){
7480         pos -= s->parse_context.last_index;
7481         if(pos<0) pos=0; // FIXME remove (unneeded?)
7482
7483         return pos;
7484     }else{
7485         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7486         if(pos+10>buf_size) pos=buf_size; // oops ;)
7487
7488         return pos;
7489     }
7490 }
7491
7492 static int decode_frame(AVCodecContext *avctx,
7493                              void *data, int *data_size,
7494                              uint8_t *buf, int buf_size)
7495 {
7496     H264Context *h = avctx->priv_data;
7497     MpegEncContext *s = &h->s;
7498     AVFrame *pict = data;
7499     int buf_index;
7500
7501     s->flags= avctx->flags;
7502     s->flags2= avctx->flags2;
7503
7504    /* no supplementary picture */
7505     if (buf_size == 0) {
7506         Picture *out;
7507         int i, out_idx;
7508
7509 //FIXME factorize this with the output code below
7510         out = h->delayed_pic[0];
7511         out_idx = 0;
7512         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7513             if(h->delayed_pic[i]->poc < out->poc){
7514                 out = h->delayed_pic[i];
7515                 out_idx = i;
7516             }
7517
7518         for(i=out_idx; h->delayed_pic[i]; i++)
7519             h->delayed_pic[i] = h->delayed_pic[i+1];
7520
7521         if(out){
7522             *data_size = sizeof(AVFrame);
7523             *pict= *(AVFrame*)out;
7524         }
7525
7526         return 0;
7527     }
7528
7529     if(s->flags&CODEC_FLAG_TRUNCATED){
7530         int next= ff_h264_find_frame_end(h, buf, buf_size);
7531
7532         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7533             return buf_size;
7534 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7535     }
7536
7537     if(h->is_avc && !h->got_avcC) {
7538         int i, cnt, nalsize;
7539         unsigned char *p = avctx->extradata;
7540         if(avctx->extradata_size < 7) {
7541             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7542             return -1;
7543         }
7544         if(*p != 1) {
7545             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7546             return -1;
7547         }
7548         /* sps and pps in the avcC always have length coded with 2 bytes,
7549            so put a fake nal_length_size = 2 while parsing them */
7550         h->nal_length_size = 2;
7551         // Decode sps from avcC
7552         cnt = *(p+5) & 0x1f; // Number of sps
7553         p += 6;
7554         for (i = 0; i < cnt; i++) {
7555             nalsize = AV_RB16(p) + 2;
7556             if(decode_nal_units(h, p, nalsize) < 0) {
7557                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7558                 return -1;
7559             }
7560             p += nalsize;
7561         }
7562         // Decode pps from avcC
7563         cnt = *(p++); // Number of pps
7564         for (i = 0; i < cnt; i++) {
7565             nalsize = AV_RB16(p) + 2;
7566             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7567                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7568                 return -1;
7569             }
7570             p += nalsize;
7571         }
7572         // Now store right nal length size, that will be use to parse all other nals
7573         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7574         // Do not reparse avcC
7575         h->got_avcC = 1;
7576     }
7577
7578     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7579         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7580             return -1;
7581     }
7582
7583     buf_index=decode_nal_units(h, buf, buf_size);
7584     if(buf_index < 0)
7585         return -1;
7586
7587     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7588         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7589         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7590         return -1;
7591     }
7592
7593     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7594         Picture *out = s->current_picture_ptr;
7595         Picture *cur = s->current_picture_ptr;
7596         Picture *prev = h->delayed_output_pic;
7597         int i, pics, cross_idr, out_of_order, out_idx;
7598
7599         s->mb_y= 0;
7600
7601         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7602         s->current_picture_ptr->pict_type= s->pict_type;
7603
7604         h->prev_frame_num_offset= h->frame_num_offset;
7605         h->prev_frame_num= h->frame_num;
7606         if(s->current_picture_ptr->reference & s->picture_structure){
7607             h->prev_poc_msb= h->poc_msb;
7608             h->prev_poc_lsb= h->poc_lsb;
7609             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7610         }
7611
7612         ff_er_frame_end(s);
7613
7614         MPV_frame_end(s);
7615
7616     //FIXME do something with unavailable reference frames
7617
7618 #if 0 //decode order
7619         *data_size = sizeof(AVFrame);
7620 #else
7621         /* Sort B-frames into display order */
7622
7623         if(h->sps.bitstream_restriction_flag
7624            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7625             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7626             s->low_delay = 0;
7627         }
7628
7629         pics = 0;
7630         while(h->delayed_pic[pics]) pics++;
7631
7632         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7633
7634         h->delayed_pic[pics++] = cur;
7635         if(cur->reference == 0)
7636             cur->reference = DELAYED_PIC_REF;
7637
7638         cross_idr = 0;
7639         for(i=0; h->delayed_pic[i]; i++)
7640             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7641                 cross_idr = 1;
7642
7643         out = h->delayed_pic[0];
7644         out_idx = 0;
7645         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7646             if(h->delayed_pic[i]->poc < out->poc){
7647                 out = h->delayed_pic[i];
7648                 out_idx = i;
7649             }
7650
7651         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7652         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7653             { }
7654         else if(prev && pics <= s->avctx->has_b_frames)
7655             out = prev;
7656         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7657            || (s->low_delay &&
7658             ((!cross_idr && prev && out->poc > prev->poc + 2)
7659              || cur->pict_type == B_TYPE)))
7660         {
7661             s->low_delay = 0;
7662             s->avctx->has_b_frames++;
7663             out = prev;
7664         }
7665         else if(out_of_order)
7666             out = prev;
7667
7668         if(out_of_order || pics > s->avctx->has_b_frames){
7669             for(i=out_idx; h->delayed_pic[i]; i++)
7670                 h->delayed_pic[i] = h->delayed_pic[i+1];
7671         }
7672
7673         if(prev == out)
7674             *data_size = 0;
7675         else
7676             *data_size = sizeof(AVFrame);
7677         if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7678             prev->reference = 0;
7679         h->delayed_output_pic = out;
7680 #endif
7681
7682         if(out)
7683             *pict= *(AVFrame*)out;
7684         else
7685             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7686     }
7687
7688     assert(pict->data[0] || !*data_size);
7689     ff_print_debug_info(s, pict);
7690 //printf("out %d\n", (int)pict->data[0]);
7691 #if 0 //?
7692
7693     /* Return the Picture timestamp as the frame number */
7694     /* we substract 1 because it is added on utils.c    */
7695     avctx->frame_number = s->picture_number - 1;
7696 #endif
7697     return get_consumed_bytes(s, buf_index, buf_size);
7698 }
#if 0
/**
 * Set h->mb_avail[] for the current macroblock (disabled code).
 * A neighbour counts as available if it lies inside the picture and its
 * slice_table entry equals h->slice_num.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7718
#if 0 //selftest
/*
 * Disabled self test: exp golomb coding round trips, the 4x4 (I)DCT and the
 * NAL escaping layer. Kept out of the build by the surrounding #if 0.
 */
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    /* write 0..COUNT-1 as unsigned exp golomb codes and read them back */
    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    /* same round trip for signed codes, values -COUNT/2 .. COUNT/2-1 */
    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written as the expected one */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the local DSPContext; there is no context pointer "s" here */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        /* after the inverse transform ref should match src */
        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* random non-zero payload ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... with exactly "zeros" bytes cleared at random positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        /* decoding must give back the payload and consume the whole NAL */
        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7892
7893
7894 static int decode_end(AVCodecContext *avctx)
7895 {
7896     H264Context *h = avctx->priv_data;
7897     MpegEncContext *s = &h->s;
7898
7899     av_freep(&h->rbsp_buffer[0]);
7900     av_freep(&h->rbsp_buffer[1]);
7901     free_tables(h); //FIXME cleanup init stuff perhaps
7902     MPV_common_end(s);
7903
7904 //    memset(h, 0, sizeof(H264Context));
7905
7906     return 0;
7907 }
7908
7909
/**
 * Codec description of the H.264 decoder.
 * The leading members are initialized by position, so their meaning follows
 * the order of the AVCodec declaration, which is not part of this file.
 */
AVCodec h264_decoder = {
    "h264",
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context),
    decode_init,
    NULL, /* NOTE(review): presumably the encode callback slot, confirm against AVCodec */
    decode_end,
    decode_frame,
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
    .flush= flush_dpb,
};
7922
7923 #include "svq3.c"