git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  41 /**
  42  * Value of Picture.reference when Picture is not a reference picture, but
  43  * is held for delayed output.
  44  */
  45 #define DELAYED_PIC_REF 4
  46
  47 static VLC coeff_token_vlc[4];
  48 static VLC chroma_dc_coeff_token_vlc;
  49
  50 static VLC total_zeros_vlc[15];
  51 static VLC chroma_dc_total_zeros_vlc[3];
  52
  53 static VLC run_vlc[6];
  54 static VLC run7_vlc;
  55
  56 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  57 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  58 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  59 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60
  61 static av_always_inline uint32_t pack16to32(int a, int b){
  62 #ifdef WORDS_BIGENDIAN
  63    return (b&0xFFFF) + (a<<16);
  64 #else
  65    return (a&0xFFFF) + (b<<16);
  66 #endif
  67 }
  68
  69 const uint8_t ff_rem6[52]={
  70 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  71 };
  72
  73 const uint8_t ff_div6[52]={
  74 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  75 };
  76
  77
  78 /**
  79  * fill a rectangle.
  80  * @param h height of the rectangle, should be a constant
  81  * @param w width of the rectangle, should be a constant
  82  * @param size the size of val (1 or 4), should be a constant
  83  */
  84 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
  85     uint8_t *p= (uint8_t*)vp;
  86     assert(size==1 || size==4);
  87     assert(w<=4);
  88
  89     w      *= size;
  90     stride *= size;
  91
  92     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
  93     assert((stride&(w-1))==0);
  94     if(w==2){
  95         const uint16_t v= size==4 ? val : val*0x0101;
  96         *(uint16_t*)(p + 0*stride)= v;
  97         if(h==1) return;
  98         *(uint16_t*)(p + 1*stride)= v;
  99         if(h==2) return;
 100         *(uint16_t*)(p + 2*stride)= v;
 101         *(uint16_t*)(p + 3*stride)= v;
 102     }else if(w==4){
 103         const uint32_t v= size==4 ? val : val*0x01010101;
 104         *(uint32_t*)(p + 0*stride)= v;
 105         if(h==1) return;
 106         *(uint32_t*)(p + 1*stride)= v;
 107         if(h==2) return;
 108         *(uint32_t*)(p + 2*stride)= v;
 109         *(uint32_t*)(p + 3*stride)= v;
 110     }else if(w==8){
 111     //gcc can't optimize 64bit math on x86_32
 112 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 113         const uint64_t v= val*0x0100000001ULL;
 114         *(uint64_t*)(p + 0*stride)= v;
 115         if(h==1) return;
 116         *(uint64_t*)(p + 1*stride)= v;
 117         if(h==2) return;
 118         *(uint64_t*)(p + 2*stride)= v;
 119         *(uint64_t*)(p + 3*stride)= v;
 120     }else if(w==16){
 121         const uint64_t v= val*0x0100000001ULL;
 122         *(uint64_t*)(p + 0+0*stride)= v;
 123         *(uint64_t*)(p + 8+0*stride)= v;
 124         *(uint64_t*)(p + 0+1*stride)= v;
 125         *(uint64_t*)(p + 8+1*stride)= v;
 126         if(h==2) return;
 127         *(uint64_t*)(p + 0+2*stride)= v;
 128         *(uint64_t*)(p + 8+2*stride)= v;
 129         *(uint64_t*)(p + 0+3*stride)= v;
 130         *(uint64_t*)(p + 8+3*stride)= v;
 131 #else
 132         *(uint32_t*)(p + 0+0*stride)= val;
 133         *(uint32_t*)(p + 4+0*stride)= val;
 134         if(h==1) return;
 135         *(uint32_t*)(p + 0+1*stride)= val;
 136         *(uint32_t*)(p + 4+1*stride)= val;
 137         if(h==2) return;
 138         *(uint32_t*)(p + 0+2*stride)= val;
 139         *(uint32_t*)(p + 4+2*stride)= val;
 140         *(uint32_t*)(p + 0+3*stride)= val;
 141         *(uint32_t*)(p + 4+3*stride)= val;
 142     }else if(w==16){
 143         *(uint32_t*)(p + 0+0*stride)= val;
 144         *(uint32_t*)(p + 4+0*stride)= val;
 145         *(uint32_t*)(p + 8+0*stride)= val;
 146         *(uint32_t*)(p +12+0*stride)= val;
 147         *(uint32_t*)(p + 0+1*stride)= val;
 148         *(uint32_t*)(p + 4+1*stride)= val;
 149         *(uint32_t*)(p + 8+1*stride)= val;
 150         *(uint32_t*)(p +12+1*stride)= val;
 151         if(h==2) return;
 152         *(uint32_t*)(p + 0+2*stride)= val;
 153         *(uint32_t*)(p + 4+2*stride)= val;
 154         *(uint32_t*)(p + 8+2*stride)= val;
 155         *(uint32_t*)(p +12+2*stride)= val;
 156         *(uint32_t*)(p + 0+3*stride)= val;
 157         *(uint32_t*)(p + 4+3*stride)= val;
 158         *(uint32_t*)(p + 8+3*stride)= val;
 159         *(uint32_t*)(p +12+3*stride)= val;
 160 #endif
 161     }else
 162         assert(0);
 163     assert(h==4);
 164 }
 165
 166 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 167     MpegEncContext * const s = &h->s;
 168     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 169     int topleft_xy, top_xy, topright_xy, left_xy[2];
 170     int topleft_type, top_type, topright_type, left_type[2];
 171     int left_block[8];
 172     int i;
 173
 174     //FIXME deblocking could skip the intra and nnz parts.
 175     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 176         return;
 177
 178     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 179
 180     top_xy     = mb_xy  - s->mb_stride;
 181     topleft_xy = top_xy - 1;
 182     topright_xy= top_xy + 1;
 183     left_xy[1] = left_xy[0] = mb_xy-1;
 184     left_block[0]= 0;
 185     left_block[1]= 1;
 186     left_block[2]= 2;
 187     left_block[3]= 3;
 188     left_block[4]= 7;
 189     left_block[5]= 10;
 190     left_block[6]= 8;
 191     left_block[7]= 11;
 192     if(FRAME_MBAFF){
 193         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 194         const int top_pair_xy      = pair_xy     - s->mb_stride;
 195         const int topleft_pair_xy  = top_pair_xy - 1;
 196         const int topright_pair_xy = top_pair_xy + 1;
 197         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 198         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 199         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 200         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 201         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 202         const int bottom = (s->mb_y & 1);
 203         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 207                 ) {
 208             top_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 213                 ) {
 214             topleft_xy -= s->mb_stride;
 215         }
 216         if (bottom
 217                 ? !curr_mb_frame_flag // bottom macroblock
 218                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 219                 ) {
 220             topright_xy -= s->mb_stride;
 221         }
 222         if (left_mb_frame_flag != curr_mb_frame_flag) {
 223             left_xy[1] = left_xy[0] = pair_xy - 1;
 224             if (curr_mb_frame_flag) {
 225                 if (bottom) {
 226                     left_block[0]= 2;
 227                     left_block[1]= 2;
 228                     left_block[2]= 3;
 229                     left_block[3]= 3;
 230                     left_block[4]= 8;
 231                     left_block[5]= 11;
 232                     left_block[6]= 8;
 233                     left_block[7]= 11;
 234                 } else {
 235                     left_block[0]= 0;
 236                     left_block[1]= 0;
 237                     left_block[2]= 1;
 238                     left_block[3]= 1;
 239                     left_block[4]= 7;
 240                     left_block[5]= 10;
 241                     left_block[6]= 7;
 242                     left_block[7]= 10;
 243                 }
 244             } else {
 245                 left_xy[1] += s->mb_stride;
 246                 //left_block[0]= 0;
 247                 left_block[1]= 2;
 248                 left_block[2]= 0;
 249                 left_block[3]= 2;
 250                 //left_block[4]= 7;
 251                 left_block[5]= 10;
 252                 left_block[6]= 7;
 253                 left_block[7]= 10;
 254             }
 255         }
 256     }
 257
 258     h->top_mb_xy = top_xy;
 259     h->left_mb_xy[0] = left_xy[0];
 260     h->left_mb_xy[1] = left_xy[1];
 261     if(for_deblock){
 262         topleft_type = 0;
 263         topright_type = 0;
 264         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 265         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 266         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 267
 268         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 269             int list;
 270             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 271             for(i=0; i<16; i++)
 272                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 273             for(list=0; list<h->list_count; list++){
 274                 if(USES_LIST(mb_type,list)){
 275                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 276                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 277                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 278                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 279                         dst[0] = src[0];
 280                         dst[1] = src[1];
 281                         dst[2] = src[2];
 282                         dst[3] = src[3];
 283                     }
 284                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 285                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 286                     ref += h->b8_stride;
 287                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 288                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 289                 }else{
 290                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 291                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 292                 }
 293             }
 294         }
 295     }else{
 296         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 297         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 298         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 299         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 300         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 301     }
 302
 303     if(IS_INTRA(mb_type)){
 304         h->topleft_samples_available=
 305         h->top_samples_available=
 306         h->left_samples_available= 0xFFFF;
 307         h->topright_samples_available= 0xEEEA;
 308
 309         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 310             h->topleft_samples_available= 0xB3FF;
 311             h->top_samples_available= 0x33FF;
 312             h->topright_samples_available= 0x26EA;
 313         }
 314         for(i=0; i<2; i++){
 315             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 316                 h->topleft_samples_available&= 0xDF5F;
 317                 h->left_samples_available&= 0x5F5F;
 318             }
 319         }
 320
 321         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 322             h->topleft_samples_available&= 0x7FFF;
 323
 324         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 325             h->topright_samples_available&= 0xFBFF;
 326
 327         if(IS_INTRA4x4(mb_type)){
 328             if(IS_INTRA4x4(top_type)){
 329                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 330                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 331                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 332                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 333             }else{
 334                 int pred;
 335                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 336                     pred= -1;
 337                 else{
 338                     pred= 2;
 339                 }
 340                 h->intra4x4_pred_mode_cache[4+8*0]=
 341                 h->intra4x4_pred_mode_cache[5+8*0]=
 342                 h->intra4x4_pred_mode_cache[6+8*0]=
 343                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 344             }
 345             for(i=0; i<2; i++){
 346                 if(IS_INTRA4x4(left_type[i])){
 347                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 348                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 349                 }else{
 350                     int pred;
 351                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 352                         pred= -1;
 353                     else{
 354                         pred= 2;
 355                     }
 356                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 357                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 358                 }
 359             }
 360         }
 361     }
 362
 363
 364 /*
 365 0 . T T. T T T T
 366 1 L . .L . . . .
 367 2 L . .L . . . .
 368 3 . T TL . . . .
 369 4 L . .L . . . .
 370 5 L . .. . . . .
 371 */
 372 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 373     if(top_type){
 374         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 375         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 376         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 377         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 378
 379         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 380         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 381
 382         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 383         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 384
 385     }else{
 386         h->non_zero_count_cache[4+8*0]=
 387         h->non_zero_count_cache[5+8*0]=
 388         h->non_zero_count_cache[6+8*0]=
 389         h->non_zero_count_cache[7+8*0]=
 390
 391         h->non_zero_count_cache[1+8*0]=
 392         h->non_zero_count_cache[2+8*0]=
 393
 394         h->non_zero_count_cache[1+8*3]=
 395         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 396
 397     }
 398
 399     for (i=0; i<2; i++) {
 400         if(left_type[i]){
 401             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 402             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 403             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 404             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 405         }else{
 406             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 407             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 408             h->non_zero_count_cache[0+8*1 +   8*i]=
 409             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 410         }
 411     }
 412
 413     if( h->pps.cabac ) {
 414         // top_cbp
 415         if(top_type) {
 416             h->top_cbp = h->cbp_table[top_xy];
 417         } else if(IS_INTRA(mb_type)) {
 418             h->top_cbp = 0x1C0;
 419         } else {
 420             h->top_cbp = 0;
 421         }
 422         // left_cbp
 423         if (left_type[0]) {
 424             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 425         } else if(IS_INTRA(mb_type)) {
 426             h->left_cbp = 0x1C0;
 427         } else {
 428             h->left_cbp = 0;
 429         }
 430         if (left_type[0]) {
 431             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 432         }
 433         if (left_type[1]) {
 434             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 435         }
 436     }
 437
 438 #if 1
 439     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 440         int list;
 441         for(list=0; list<h->list_count; list++){
 442             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 443                 /*if(!h->mv_cache_clean[list]){
 444                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 445                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 446                     h->mv_cache_clean[list]= 1;
 447                 }*/
 448                 continue;
 449             }
 450             h->mv_cache_clean[list]= 0;
 451
 452             if(USES_LIST(top_type, list)){
 453                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 454                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 455                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 456                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 457                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 458                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 459                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 460                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 461                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 462                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 463             }else{
 464                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 465                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 466                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 467                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 468                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 469             }
 470
 471             for(i=0; i<2; i++){
 472                 int cache_idx = scan8[0] - 1 + i*2*8;
 473                 if(USES_LIST(left_type[i], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 475                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 476                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 477                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 478                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 479                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 480                 }else{
 481                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 482                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 483                     h->ref_cache[list][cache_idx  ]=
 484                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 485                 }
 486             }
 487
 488             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 489                 continue;
 490
 491             if(USES_LIST(topleft_type, list)){
 492                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 493                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 494                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 495                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 496             }else{
 497                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 498                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 499             }
 500
 501             if(USES_LIST(topright_type, list)){
 502                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 503                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 504                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 505                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 506             }else{
 507                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 508                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 509             }
 510
 511             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 512                 continue;
 513
 514             h->ref_cache[list][scan8[5 ]+1] =
 515             h->ref_cache[list][scan8[7 ]+1] =
 516             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 517             h->ref_cache[list][scan8[4 ]] =
 518             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 519             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 520             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 521             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 522             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 523             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 524
 525             if( h->pps.cabac ) {
 526                 /* XXX beurk, Load mvd */
 527                 if(USES_LIST(top_type, list)){
 528                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 529                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 530                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 531                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 532                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 533                 }else{
 534                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 535                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 536                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 537                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 538                 }
 539                 if(USES_LIST(left_type[0], list)){
 540                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 541                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 542                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 543                 }else{
 544                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 545                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 546                 }
 547                 if(USES_LIST(left_type[1], list)){
 548                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 549                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 550                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 551                 }else{
 552                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 553                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 554                 }
 555                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 556                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 557                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 558                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 559                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 560
 561                 if(h->slice_type == B_TYPE){
 562                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 563
 564                     if(IS_DIRECT(top_type)){
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 566                     }else if(IS_8X8(top_type)){
 567                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 568                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 569                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 570                     }else{
 571                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 572                     }
 573
 574                     if(IS_DIRECT(left_type[0]))
 575                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 576                     else if(IS_8X8(left_type[0]))
 577                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 578                     else
 579                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 580
 581                     if(IS_DIRECT(left_type[1]))
 582                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 583                     else if(IS_8X8(left_type[1]))
 584                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 585                     else
 586                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 587                 }
 588             }
 589
 590             if(FRAME_MBAFF){
 591 #define MAP_MVS\
 592                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 593                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 594                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 595                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 596                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 597                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 598                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 599                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 600                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 601                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 602                 if(MB_FIELD){
 603 #define MAP_F2F(idx, mb_type)\
 604                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 605                         h->ref_cache[list][idx] <<= 1;\
 606                         h->mv_cache[list][idx][1] /= 2;\
 607                         h->mvd_cache[list][idx][1] /= 2;\
 608                     }
 609                     MAP_MVS
 610 #undef MAP_F2F
 611                 }else{
 612 #define MAP_F2F(idx, mb_type)\
 613                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 614                         h->ref_cache[list][idx] >>= 1;\
 615                         h->mv_cache[list][idx][1] <<= 1;\
 616                         h->mvd_cache[list][idx][1] <<= 1;\
 617                     }
 618                     MAP_MVS
 619 #undef MAP_F2F
 620                 }
 621             }
 622         }
 623     }
 624 #endif
 625
 626     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 627 }
 628
 629 static inline void write_back_intra_pred_mode(H264Context *h){
 630     MpegEncContext * const s = &h->s;
 631     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 632
 633     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 634     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 635     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 636     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 637     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 638     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 639     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 640 }
 641
 642 /**
 643  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 644  */
 645 static inline int check_intra4x4_pred_mode(H264Context *h){
 646     MpegEncContext * const s = &h->s;
 647     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 648     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 649     int i;
 650
 651     if(!(h->top_samples_available&0x8000)){
 652         for(i=0; i<4; i++){
 653             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 654             if(status<0){
 655                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 656                 return -1;
 657             } else if(status){
 658                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 659             }
 660         }
 661     }
 662
 663     if(!(h->left_samples_available&0x8000)){
 664         for(i=0; i<4; i++){
 665             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 666             if(status<0){
 667                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 668                 return -1;
 669             } else if(status){
 670                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 671             }
 672         }
 673     }
 674
 675     return 0;
 676 } //FIXME cleanup like next
 677
 678 /**
 679  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 680  */
 681 static inline int check_intra_pred_mode(H264Context *h, int mode){
 682     MpegEncContext * const s = &h->s;
 683     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 684     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 685
 686     if(mode > 6U) {
 687         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 688         return -1;
 689     }
 690
 691     if(!(h->top_samples_available&0x8000)){
 692         mode= top[ mode ];
 693         if(mode<0){
 694             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 695             return -1;
 696         }
 697     }
 698
 699     if(!(h->left_samples_available&0x8000)){
 700         mode= left[ mode ];
 701         if(mode<0){
 702             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 703             return -1;
 704         }
 705     }
 706
 707     return mode;
 708 }
 709
 710 /**
 711  * gets the predicted intra4x4 prediction mode.
 712  */
 713 static inline int pred_intra_mode(H264Context *h, int n){
 714     const int index8= scan8[n];
 715     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 716     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 717     const int min= FFMIN(left, top);
 718
 719     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 720
 721     if(min<0) return DC_PRED;
 722     else      return min;
 723 }
 724
 725 static inline void write_back_non_zero_count(H264Context *h){
 726     MpegEncContext * const s = &h->s;
 727     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 728
 729     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 730     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 731     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 732     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 733     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 734     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 735     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 736
 737     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 738     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 739     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 740
 741     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 742     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 743     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 744
 745     if(FRAME_MBAFF){
 746         // store all luma nnzs, for deblocking
 747         int v = 0, i;
 748         for(i=0; i<16; i++)
 749             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 750         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 751     }
 752 }
 753
 754 /**
 755  * gets the predicted number of non zero coefficients.
 756  * @param n block index
 757  */
 758 static inline int pred_non_zero_count(H264Context *h, int n){
 759     const int index8= scan8[n];
 760     const int left= h->non_zero_count_cache[index8 - 1];
 761     const int top = h->non_zero_count_cache[index8 - 8];
 762     int i= left + top;
 763
 764     if(i<64) i= (i+1)>>1;
 765
 766     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 767
 768     return i&31;
 769 }
 770
/**
 * Selects the diagonal neighbor used for motion vector prediction.
 * Without the MBAFF special cases this is the cache entry at
 * i - 8 + part_width, or the entry at i - 8 - 1 when the former is
 * PART_NOT_AVAILABLE.
 * @param C receives a pointer to the selected motion vector
 * @param i cache index of the partition
 * @param list reference list
 * @param part_width partition width in cache entries
 * @return the reference value that belongs to *C
 */
 771 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 772     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 773     MpegEncContext *s = &h->s;
 774
 775     /* there is no consistent mapping of mvs to neighboring locations that will
 776      * make mbaff happy, so we can't move all this logic to fill_caches */
 777     if(FRAME_MBAFF){
 778         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 779         const int16_t *mv;
         /* the cache slot at scan8[0]-2 is used as scratch storage for the mv
          * returned through *C by the special cases below */
 780         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 781         *C = h->mv_cache[list][scan8[0]-2];
 782
 783         if(!MB_FIELD
 784            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 785             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 786             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV reads mv and ref of the 4x4 block (X4,Y4) directly from
                  * the current picture, applies MV_OP to the vertical mv component and
                  * REF_OP to the ref index, and returns from the function. It declares
                  * locals, so every use has to sit in its own scope. */
 787 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 788                 const int x4 = X4, y4 = Y4;\
 789                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 790                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 791                     return LIST_NOT_USED;\
 792                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 793                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 794                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 795                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 796
 797                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 798             }
 799         }
 800         if(topright_ref == PART_NOT_AVAILABLE
 801            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 802            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 803             if(!MB_FIELD
 804                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 805                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 806             }
 807             if(MB_FIELD
 808                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 809                && i >= scan8[0]+8){
 810                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 811                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 812             }
 813         }
 814 #undef SET_DIAG_MV
 815     }
 816
     /* regular case: entry at i - 8 + part_width if available, else the entry at i - 8 - 1 */
 817     if(topright_ref != PART_NOT_AVAILABLE){
 818         *C= h->mv_cache[list][ i - 8 + part_width ];
 819         return topright_ref;
 820     }else{
 821         tprintf(s->avctx, "topright MV not available\n");
 822
 823         *C= h->mv_cache[list][ i - 8 - 1 ];
 824         return h->ref_cache[list][ i - 8 - 1 ];
 825     }
 826 }
 827
 828 /**
 829  * gets the predicted MV.
 830  * @param n the block index
 831  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 832  * @param mx the x component of the predicted motion vector
 833  * @param my the y component of the predicted motion vector
 834  */
 835 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 836     const int index8= scan8[n];
 837     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 838     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 839     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 840     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 841     const int16_t * C;
 842     int diagonal_ref, match_count;
 843
 844     assert(part_width==1 || part_width==2 || part_width==4);
 845
 846 /* mv_cache
 847   B . . A T T T T
 848   U . . L . . , .
 849   U . . L . . . .
 850   U . . L . . , .
 851   . . . L . . . .
 852 */
 853
 854     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 855     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 856     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 857     if(match_count > 1){ //most common
 858         *mx= mid_pred(A[0], B[0], C[0]);
 859         *my= mid_pred(A[1], B[1], C[1]);
 860     }else if(match_count==1){
 861         if(left_ref==ref){
 862             *mx= A[0];
 863             *my= A[1];
 864         }else if(top_ref==ref){
 865             *mx= B[0];
 866             *my= B[1];
 867         }else{
 868             *mx= C[0];
 869             *my= C[1];
 870         }
 871     }else{
 872         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 873             *mx= A[0];
 874             *my= A[1];
 875         }else{
 876             *mx= mid_pred(A[0], B[0], C[0]);
 877             *my= mid_pred(A[1], B[1], C[1]);
 878         }
 879     }
 880
 881     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 882 }
 883
 884 /**
 885  * gets the directionally predicted 16x8 MV.
 886  * @param n the block index
 887  * @param mx the x component of the predicted motion vector
 888  * @param my the y component of the predicted motion vector
 889  */
 890 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 891     if(n==0){
 892         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 893         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 894
 895         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 896
 897         if(top_ref == ref){
 898             *mx= B[0];
 899             *my= B[1];
 900             return;
 901         }
 902     }else{
 903         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 904         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 905
 906         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 907
 908         if(left_ref == ref){
 909             *mx= A[0];
 910             *my= A[1];
 911             return;
 912         }
 913     }
 914
 915     //RARE
 916     pred_motion(h, n, 4, list, ref, mx, my);
 917 }
 918
 919 /**
 920  * gets the directionally predicted 8x16 MV.
 921  * @param n the block index
 922  * @param mx the x component of the predicted motion vector
 923  * @param my the y component of the predicted motion vector
 924  */
 925 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 926     if(n==0){
 927         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 928         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 929
 930         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 931
 932         if(left_ref == ref){
 933             *mx= A[0];
 934             *my= A[1];
 935             return;
 936         }
 937     }else{
 938         const int16_t * C;
 939         int diagonal_ref;
 940
 941         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 942
 943         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 944
 945         if(diagonal_ref == ref){
 946             *mx= C[0];
 947             *my= C[1];
 948             return;
 949         }
 950     }
 951
 952     //RARE
 953     pred_motion(h, n, 2, list, ref, mx, my);
 954 }
 955
 956 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 957     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 958     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 959
 960     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 961
 962     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 963        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 964        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 965
 966         *mx = *my = 0;
 967         return;
 968     }
 969
 970     pred_motion(h, 0, 4, 0, 0, mx, my);
 971
 972     return;
 973 }
 974
 975 static inline void direct_dist_scale_factor(H264Context * const h){
 976     const int poc = h->s.current_picture_ptr->poc;
 977     const int poc1 = h->ref_list[1][0].poc;
 978     int i;
 979     for(i=0; i<h->ref_count[0]; i++){
 980         int poc0 = h->ref_list[0][i].poc;
 981         int td = av_clip(poc1 - poc0, -128, 127);
 982         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 983             h->dist_scale_factor[i] = 256;
 984         }else{
 985             int tb = av_clip(poc - poc0, -128, 127);
 986             int tx = (16384 + (FFABS(td) >> 1)) / td;
 987             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 988         }
 989     }
 990     if(FRAME_MBAFF){
 991         for(i=0; i<h->ref_count[0]; i++){
 992             h->dist_scale_factor_field[2*i] =
 993             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 994         }
 995     }
 996 }
 997 static inline void direct_ref_list_init(H264Context * const h){
 998     MpegEncContext * const s = &h->s;
 999     Picture * const ref1 = &h->ref_list[1][0];
1000     Picture * const cur = s->current_picture_ptr;
1001     int list, i, j;
1002     if(cur->pict_type == I_TYPE)
1003         cur->ref_count[0] = 0;
1004     if(cur->pict_type != B_TYPE)
1005         cur->ref_count[1] = 0;
1006     for(list=0; list<2; list++){
1007         cur->ref_count[list] = h->ref_count[list];
1008         for(j=0; j<h->ref_count[list]; j++)
1009             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1010     }
1011     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1012         return;
1013     for(list=0; list<2; list++){
1014         for(i=0; i<ref1->ref_count[list]; i++){
1015             const int poc = ref1->ref_poc[list][i];
1016             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1017             for(j=0; j<h->ref_count[list]; j++)
1018                 if(h->ref_list[list][j].poc == poc){
1019                     h->map_col_to_list0[list][i] = j;
1020                     break;
1021                 }
1022         }
1023     }
1024     if(FRAME_MBAFF){
1025         for(list=0; list<2; list++){
1026             for(i=0; i<ref1->ref_count[list]; i++){
1027                 j = h->map_col_to_list0[list][i];
1028                 h->map_col_to_list0_field[list][2*i] = 2*j;
1029                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1030             }
1031         }
1032     }
1033 }
1034
/**
 * Derives the references and motion vectors of a direct predicted
 * macroblock, or of its direct 8x8 sub blocks when *mb_type is 8x8, and
 * stores them in h->ref_cache and h->mv_cache.
 * The co-located data is taken from h->ref_list[1][0]. Depending on
 * h->direct_spatial_mv_pred either the spatial or the temporal method is
 * used. h->sub_mb_type[] of the processed sub blocks is overwritten.
 * @param mb_type in: type of the current macroblock, out: type updated with
 *                the partitioning and list usage chosen here
 */
1035 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1036     MpegEncContext * const s = &h->s;
1037     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1038     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1039     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
     /* type, mvs and refs of the macroblock at the same position in ref_list[1][0] */
1040     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1041     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1042     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1043     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1044     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1045     const int is_b8x8 = IS_8X8(*mb_type);
1046     unsigned int sub_mb_type;
1047     int i8, i4;
1048
     /* NOTE(review): this macro is not #undef'd within the function */
1049 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
     /* choose the partitioning from the co-located macroblock type */
1050     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1051         /* FIXME save sub mb types from previous frames (or derive from MVs)
1052          * so we know exactly what block size to use */
1053         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1054         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1055     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1056         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1057         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1058     }else{
1059         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1060         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1061     }
1062     if(!is_b8x8)
1063         *mb_type |= MB_TYPE_DIRECT2;
1064     if(MB_FIELD)
1065         *mb_type |= MB_TYPE_INTERLACED;
1066
1067     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1068
1069     if(h->direct_spatial_mv_pred){
1070         int ref[2];
1071         int mv[2][2];
1072         int list;
1073
1074         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1075
1076         /* ref = min(neighbors) */
1077         for(list=0; list<2; list++){
1078             int refa = h->ref_cache[list][scan8[0] - 1];
1079             int refb = h->ref_cache[list][scan8[0] - 8];
1080             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
             /* -2 is presumably PART_NOT_AVAILABLE (TODO confirm); fall back to the entry at scan8[0] - 8 - 1 */
1081             if(refc == -2)
1082                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
             /* smallest non negative value of refa, refb, refc; -1 if all are negative */
1083             ref[list] = refa;
1084             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1085                 ref[list] = refb;
1086             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1087                 ref[list] = refc;
1088             if(ref[list] < 0)
1089                 ref[list] = -1;
1090         }
1091
         /* no usable neighbor reference in either list: use ref 0 and zero mvs for both lists */
1092         if(ref[0] < 0 && ref[1] < 0){
1093             ref[0] = ref[1] = 0;
1094             mv[0][0] = mv[0][1] =
1095             mv[1][0] = mv[1][1] = 0;
1096         }else{
1097             for(list=0; list<2; list++){
1098                 if(ref[list] >= 0)
1099                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1100                 else
1101                     mv[list][0] = mv[list][1] = 0;
1102             }
1103         }
1104
         /* drop the list without a reference from the type flags */
1105         if(ref[1] < 0){
1106             *mb_type &= ~MB_TYPE_P0L1;
1107             sub_mb_type &= ~MB_TYPE_P0L1;
1108         }else if(ref[0] < 0){
1109             *mb_type &= ~MB_TYPE_P0L0;
1110             sub_mb_type &= ~MB_TYPE_P0L0;
1111         }
1112
1113         if(IS_16X16(*mb_type)){
1114             int a=0, b=0;
1115
1116             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1117             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
             /* if the co-located block has ref 0 and an mv within +-1, the predicted mv
              * is only kept for lists whose ref is > 0; a and b stay 0 otherwise */
1118             if(!IS_INTRA(mb_type_col)
1119                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1120                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1121                        && (h->x264_build>33 || !h->x264_build)))){
1122                 if(ref[0] > 0)
1123                     a= pack16to32(mv[0][0],mv[0][1]);
1124                 if(ref[1] > 0)
1125                     b= pack16to32(mv[1][0],mv[1][1]);
1126             }else{
1127                 a= pack16to32(mv[0][0],mv[0][1]);
1128                 b= pack16to32(mv[1][0],mv[1][1]);
1129             }
1130             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1131             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1132         }else{
1133             for(i8=0; i8<4; i8++){
1134                 const int x8 = i8&1;
1135                 const int y8 = i8>>1;
1136
                 /* in an 8x8 macroblock only the sub blocks flagged as direct are processed */
1137                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1138                     continue;
1139                 h->sub_mb_type[i8] = sub_mb_type;
1140
1141                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1142                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1143                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1144                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1145
1146                 /* col_zero_flag */
1147                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1148                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1149                                                   && (h->x264_build>33 || !h->x264_build)))){
1150                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1151                     if(IS_SUB_8X8(sub_mb_type)){
                         /* one co-located mv per 8x8 block, read at offset x8*3, y8*3 */
1152                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1153                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1154                             if(ref[0] == 0)
1155                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1156                             if(ref[1] == 0)
1157                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1158                         }
1159                     }else
1160                     for(i4=0; i4<4; i4++){
1161                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1162                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1163                             if(ref[0] == 0)
1164                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1165                             if(ref[1] == 0)
1166                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1167                         }
1168                     }
1169                 }
1170             }
1171         }
1172     }else{ /* direct temporal mv pred */
1173         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1174         const int *dist_scale_factor = h->dist_scale_factor;
1175
1176         if(FRAME_MBAFF){
             /* interlaced macroblocks use the field variants of the tables */
1177             if(IS_INTERLACED(*mb_type)){
1178                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1179                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1180                 dist_scale_factor = h->dist_scale_factor_field;
1181             }
             /* current and co-located macroblock differ in interlacing: handled here, returns when done */
1182             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1183                 /* FIXME assumes direct_8x8_inference == 1 */
1184                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1185                 int mb_types_col[2];
1186                 int y_shift;
1187
1188                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1189                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1190                          | (*mb_type & MB_TYPE_INTERLACED);
1191                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1192
1193                 if(IS_INTERLACED(*mb_type)){
1194                     /* frame to field scaling */
1195                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1196                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                     /* for the bottom macroblock of the pair, rewind the co-located pointers by one macroblock row */
1197                     if(s->mb_y&1){
1198                         l1ref0 -= 2*h->b8_stride;
1199                         l1ref1 -= 2*h->b8_stride;
1200                         l1mv0 -= 4*h->b_stride;
1201                         l1mv1 -= 4*h->b_stride;
1202                     }
1203                     y_shift = 0;
1204
1205                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1206                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1207                        && !is_b8x8)
1208                         *mb_type |= MB_TYPE_16x8;
1209                     else
1210                         *mb_type |= MB_TYPE_8x8;
1211                 }else{
1212                     /* field to frame scaling */
1213                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1214                      * but in MBAFF, top and bottom POC are equal */
1215                     int dy = (s->mb_y&1) ? 1 : 2;
1216                     mb_types_col[0] =
1217                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1218                     l1ref0 += dy*h->b8_stride;
1219                     l1ref1 += dy*h->b8_stride;
1220                     l1mv0 += 2*dy*h->b_stride;
1221                     l1mv1 += 2*dy*h->b_stride;
1222                     y_shift = 2;
1223
1224                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1225                        && !is_b8x8)
1226                         *mb_type |= MB_TYPE_16x16;
1227                     else
1228                         *mb_type |= MB_TYPE_8x8;
1229                 }
1230
1231                 for(i8=0; i8<4; i8++){
1232                     const int x8 = i8&1;
1233                     const int y8 = i8>>1;
1234                     int ref0, scale;
1235                     const int16_t (*l1mv)[2]= l1mv0;
1236
1237                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1238                         continue;
1239                     h->sub_mb_type[i8] = sub_mb_type;
1240
1241                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                     /* intra co-located macroblock: ref 0 and zero mvs for both lists */
1242                     if(IS_INTRA(mb_types_col[y8])){
1243                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1244                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1245                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1246                         continue;
1247                     }
1248
                     /* use the co-located list 0 ref if it is >= 0, else the list 1 ref and mv */
1249                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1250                     if(ref0 >= 0)
1251                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1252                     else{
1253                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1254                         l1mv= l1mv1;
1255                     }
1256                     scale = dist_scale_factor[ref0];
1257                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1258
1259                     {
1260                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                         /* vertical component halved (y_shift 0) or doubled (y_shift 2) before scaling */
1261                         int my_col = (mv_col[1]<<y_shift)/2;
1262                         int mx = (scale * mv_col[0] + 128) >> 8;
1263                         int my = (scale * my_col + 128) >> 8;
1264                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1265                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1266                     }
1267                 }
1268                 return;
1269             }
1270         }
1271
1272         /* one-to-one mv scaling */
1273
1274         if(IS_16X16(*mb_type)){
1275             int ref, mv0, mv1;
1276
1277             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1278             if(IS_INTRA(mb_type_col)){
1279                 ref=mv0=mv1=0;
1280             }else{
1281                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1282                                                 : map_col_to_list0[1][l1ref1[0]];
1283                 const int scale = dist_scale_factor[ref0];
1284                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1285                 int mv_l0[2];
                 /* list 0 mv = scaled co-located mv, list 1 mv = list 0 mv - co-located mv */
1286                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1287                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1288                 ref= ref0;
1289                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1290                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1291             }
1292             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1293             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1294             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1295         }else{
1296             for(i8=0; i8<4; i8++){
1297                 const int x8 = i8&1;
1298                 const int y8 = i8>>1;
1299                 int ref0, scale;
1300                 const int16_t (*l1mv)[2]= l1mv0;
1301
1302                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1303                     continue;
1304                 h->sub_mb_type[i8] = sub_mb_type;
1305                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1306                 if(IS_INTRA(mb_type_col)){
1307                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1308                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1309                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1310                     continue;
1311                 }
1312
1313                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1314                 if(ref0 >= 0)
1315                     ref0 = map_col_to_list0[0][ref0];
1316                 else{
1317                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1318                     l1mv= l1mv1;
1319                 }
1320                 scale = dist_scale_factor[ref0];
1321
1322                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1323                 if(IS_SUB_8X8(sub_mb_type)){
1324                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1325                     int mx = (scale * mv_col[0] + 128) >> 8;
1326                     int my = (scale * mv_col[1] + 128) >> 8;
1327                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1328                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1329                 }else
1330                 for(i4=0; i4<4; i4++){
1331                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1332                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1333                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1334                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1335                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1336                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1337                 }
1338             }
1339         }
1340     }
1341 }
1342
1343 static inline void write_back_motion(H264Context *h, int mb_type){
1344     MpegEncContext * const s = &h->s;
1345     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1346     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1347     int list;
1348
1349     if(!USES_LIST(mb_type, 0))
1350         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1351
1352     for(list=0; list<h->list_count; list++){
1353         int y;
1354         if(!USES_LIST(mb_type, list))
1355             continue;
1356
1357         for(y=0; y<4; y++){
1358             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1359             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1360         }
1361         if( h->pps.cabac ) {
1362             if(IS_SKIP(mb_type))
1363                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1364             else
1365             for(y=0; y<4; y++){
1366                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1367                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1368             }
1369         }
1370
1371         {
1372             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1373             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1374             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1375             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1376             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1377         }
1378     }
1379
1380     if(h->slice_type == B_TYPE && h->pps.cabac){
1381         if(IS_8X8(mb_type)){
1382             uint8_t *direct_table = &h->direct_table[b8_xy];
1383             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1384             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1385             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1386         }
1387     }
1388 }
1389
1390 /**
1391  * Decodes a network abstraction layer unit.
1392  * @param consumed is the number of bytes used as input
1393  * @param length is the length of the array
1394  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1395  * @returns decoded bytes, might be src+1 if no escapes
1396  */
1397 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1398     int i, si, di;
1399     uint8_t *dst;
1400     int bufidx;
1401
1402 //    src[0]&0x80;                //forbidden bit
1403     h->nal_ref_idc= src[0]>>5;
1404     h->nal_unit_type= src[0]&0x1F;
1405
1406     src++; length--;
1407 #if 0
1408     for(i=0; i<length; i++)
1409         printf("%2X ", src[i]);
1410 #endif
1411     for(i=0; i+1<length; i+=2){
1412         if(src[i]) continue;
1413         if(i>0 && src[i-1]==0) i--;
1414         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1415             if(src[i+2]!=3){
1416                 /* startcode, so we must be past the end */
1417                 length=i;
1418             }
1419             break;
1420         }
1421     }
1422
1423     if(i>=length-1){ //no escaped 0
1424         *dst_length= length;
1425         *consumed= length+1; //+1 for the header
1426         return src;
1427     }
1428
1429     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1430     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1431     dst= h->rbsp_buffer[bufidx];
1432
1433     if (dst == NULL){
1434         return NULL;
1435     }
1436
1437 //printf("decoding esc\n");
1438     si=di=0;
1439     while(si<length){
1440         //remove escapes (very rare 1:2^22)
1441         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1442             if(src[si+2]==3){ //escape
1443                 dst[di++]= 0;
1444                 dst[di++]= 0;
1445                 si+=3;
1446                 continue;
1447             }else //next start code
1448                 break;
1449         }
1450
1451         dst[di++]= src[si++];
1452     }
1453
1454     *dst_length= di;
1455     *consumed= si + 1;//+1 for the header
1456 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1457     return dst;
1458 }
1459
1460 /**
1461  * identifies the exact end of the bitstream
1462  * @return the length of the trailing, or 0 if damaged
1463  */
1464 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1465     int v= *src;
1466     int r;
1467
1468     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1469
1470     for(r=1; r<9; r++){
1471         if(v&1) return r;
1472         v>>=1;
1473     }
1474     return 0;
1475 }
1476
1477 /**
1478  * idct tranforms the 16 dc values and dequantize them.
1479  * @param qp quantization parameter
1480  */
1481 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1482 #define stride 16
1483     int i;
1484     int temp[16]; //FIXME check if this is a good idea
1485     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1486     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1487
1488 //memset(block, 64, 2*256);
1489 //return;
1490     for(i=0; i<4; i++){
1491         const int offset= y_offset[i];
1492         const int z0= block[offset+stride*0] + block[offset+stride*4];
1493         const int z1= block[offset+stride*0] - block[offset+stride*4];
1494         const int z2= block[offset+stride*1] - block[offset+stride*5];
1495         const int z3= block[offset+stride*1] + block[offset+stride*5];
1496
1497         temp[4*i+0]= z0+z3;
1498         temp[4*i+1]= z1+z2;
1499         temp[4*i+2]= z1-z2;
1500         temp[4*i+3]= z0-z3;
1501     }
1502
1503     for(i=0; i<4; i++){
1504         const int offset= x_offset[i];
1505         const int z0= temp[4*0+i] + temp[4*2+i];
1506         const int z1= temp[4*0+i] - temp[4*2+i];
1507         const int z2= temp[4*1+i] - temp[4*3+i];
1508         const int z3= temp[4*1+i] + temp[4*3+i];
1509
1510         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1511         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1512         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1513         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1514     }
1515 }
1516
#if 0
/**
 * DCT transforms the 16 dc values (currently compiled out).
 * Same butterfly structure as the dequantizing inverse above, but every
 * output is halved (>>1) instead of being scaled by a multiplier.
 * @param block coefficient buffer, transformed in place
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int dif04 = in[stride*0] - in[stride*4];
        const int dif15 = in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        temp[4*i+0]= sum04+sum15;
        temp[4*i+1]= dif04+dif15;
        temp[4*i+2]= dif04-dif15;
        temp[4*i+3]= sum04-sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int dif02 = temp[4*0+i] - temp[4*2+i];
        const int dif13 = temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1556
1557 #undef xStride
1558 #undef stride
1559
1560 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1561     const int stride= 16*2;
1562     const int xStride= 16;
1563     int a,b,c,d,e;
1564
1565     a= block[stride*0 + xStride*0];
1566     b= block[stride*0 + xStride*1];
1567     c= block[stride*1 + xStride*0];
1568     d= block[stride*1 + xStride*1];
1569
1570     e= a-b;
1571     a= a+b;
1572     b= c-d;
1573     c= c+d;
1574
1575     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1576     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1577     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1578     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1579 }
1580
#if 0
/**
 * Transforms the 2x2 chroma dc values in place, without any scaling
 * (currently compiled out).
 * @param block coefficient buffer; the values live at offsets 0, 16, 32, 48
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const p00= block;
    DCTELEM * const p01= block + col_step;
    DCTELEM * const p10= block + row_step;
    DCTELEM * const p11= block + row_step + col_step;
    const int top_sum = *p00 + *p01;
    const int top_dif = *p00 - *p01;
    const int bot_sum = *p10 + *p11;
    const int bot_dif = *p10 - *p11;

    *p00= (top_sum + bot_sum);
    *p01= (top_dif + bot_dif);
    *p10= (top_sum - bot_sum);
    *p11= (top_dif - bot_dif);
}
#endif
1603
1604 /**
1605  * gets the chroma qp.
1606  */
1607 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1608     return h->pps.chroma_qp_table[t][qscale & 0xff];
1609 }
1610
1611 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1612 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1613 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1614     int i;
1615     const int * const quant_table= quant_coeff[qscale];
1616     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1617     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1618     const unsigned int threshold2= (threshold1<<1);
1619     int last_non_zero;
1620
1621     if(separate_dc){
1622         if(qscale<=18){
1623             //avoid overflows
1624             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1625             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1626             const unsigned int dc_threshold2= (dc_threshold1<<1);
1627
1628             int level= block[0]*quant_coeff[qscale+18][0];
1629             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1630                 if(level>0){
1631                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1632                     block[0]= level;
1633                 }else{
1634                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1635                     block[0]= -level;
1636                 }
1637 //                last_non_zero = i;
1638             }else{
1639                 block[0]=0;
1640             }
1641         }else{
1642             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1643             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1644             const unsigned int dc_threshold2= (dc_threshold1<<1);
1645
1646             int level= block[0]*quant_table[0];
1647             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1648                 if(level>0){
1649                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1650                     block[0]= level;
1651                 }else{
1652                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1653                     block[0]= -level;
1654                 }
1655 //                last_non_zero = i;
1656             }else{
1657                 block[0]=0;
1658             }
1659         }
1660         last_non_zero= 0;
1661         i=1;
1662     }else{
1663         last_non_zero= -1;
1664         i=0;
1665     }
1666
1667     for(; i<16; i++){
1668         const int j= scantable[i];
1669         int level= block[j]*quant_table[j];
1670
1671 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1672 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1673         if(((unsigned)(level+threshold1))>threshold2){
1674             if(level>0){
1675                 level= (bias + level)>>QUANT_SHIFT;
1676                 block[j]= level;
1677             }else{
1678                 level= (bias - level)>>QUANT_SHIFT;
1679                 block[j]= -level;
1680             }
1681             last_non_zero = i;
1682         }else{
1683             block[j]=0;
1684         }
1685     }
1686
1687     return last_non_zero;
1688 }
1689
1690 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1691                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1692                            int src_x_offset, int src_y_offset,
1693                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1694     MpegEncContext * const s = &h->s;
1695     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1696     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1697     const int luma_xy= (mx&3) + ((my&3)<<2);
1698     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1699     uint8_t * src_cb, * src_cr;
1700     int extra_width= h->emu_edge_width;
1701     int extra_height= h->emu_edge_height;
1702     int emu=0;
1703     const int full_mx= mx>>2;
1704     const int full_my= my>>2;
1705     const int pic_width  = 16*s->mb_width;
1706     const int pic_height = 16*s->mb_height >> (MB_MBAFF || FIELD_PICTURE);
1707
1708     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1709         return;
1710
1711     if(mx&7) extra_width -= 3;
1712     if(my&7) extra_height -= 3;
1713
1714     if(   full_mx < 0-extra_width
1715        || full_my < 0-extra_height
1716        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1717        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1718         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1719             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1720         emu=1;
1721     }
1722
1723     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1724     if(!square){
1725         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1726     }
1727
1728     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1729
1730     if(MB_MBAFF || FIELD_PICTURE){
1731         // chroma offset when predicting from a field of opposite parity
1732         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
1733         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1734     }
1735     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1736     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1737
1738     if(emu){
1739         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1740             src_cb= s->edge_emu_buffer;
1741     }
1742     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1743
1744     if(emu){
1745         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1746             src_cr= s->edge_emu_buffer;
1747     }
1748     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1749 }
1750
1751 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            int list0, int list1){
1757     MpegEncContext * const s = &h->s;
1758     qpel_mc_func *qpix_op=  qpix_put;
1759     h264_chroma_mc_func chroma_op= chroma_put;
1760
1761     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1762     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1763     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1764     x_offset += 8*s->mb_x;
1765     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1766
1767     if(list0){
1768         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1769         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1770                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1771                            qpix_op, chroma_op);
1772
1773         qpix_op=  qpix_avg;
1774         chroma_op= chroma_avg;
1775     }
1776
1777     if(list1){
1778         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1779         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1780                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1781                            qpix_op, chroma_op);
1782     }
1783 }
1784
1785 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1786                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1787                            int x_offset, int y_offset,
1788                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1789                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1790                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1791                            int list0, int list1){
1792     MpegEncContext * const s = &h->s;
1793
1794     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1795     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1796     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1797     x_offset += 8*s->mb_x;
1798     y_offset += 8*(s->mb_y >> (MB_MBAFF || FIELD_PICTURE));
1799
1800     if(list0 && list1){
1801         /* don't optimize for luma-only case, since B-frames usually
1802          * use implicit weights => chroma too. */
1803         uint8_t *tmp_cb = s->obmc_scratchpad;
1804         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1805         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1806         int refn0 = h->ref_cache[0][ scan8[n] ];
1807         int refn1 = h->ref_cache[1][ scan8[n] ];
1808
1809         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1810                     dest_y, dest_cb, dest_cr,
1811                     x_offset, y_offset, qpix_put, chroma_put);
1812         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1813                     tmp_y, tmp_cb, tmp_cr,
1814                     x_offset, y_offset, qpix_put, chroma_put);
1815
1816         if(h->use_weight == 2){
1817             int weight0 = h->implicit_weight[refn0][refn1];
1818             int weight1 = 64 - weight0;
1819             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1820             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1821             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1822         }else{
1823             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1824                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1825                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1826             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1827                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1828                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1829             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1830                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1831                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1832         }
1833     }else{
1834         int list = list1 ? 1 : 0;
1835         int refn = h->ref_cache[list][ scan8[n] ];
1836         Picture *ref= &h->ref_list[list][refn];
1837         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1838                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1839                     qpix_put, chroma_put);
1840
1841         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1842                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1843         if(h->use_weight_chroma){
1844             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1845                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1846             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1847                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1848         }
1849     }
1850 }
1851
1852 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1853                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1854                            int x_offset, int y_offset,
1855                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1856                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1857                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1858                            int list0, int list1){
1859     if((h->use_weight==2 && list0 && list1
1860         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1861        || h->use_weight==1)
1862         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1863                          x_offset, y_offset, qpix_put, chroma_put,
1864                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1865     else
1866         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1867                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1868 }
1869
1870 static inline void prefetch_motion(H264Context *h, int list){
1871     /* fetch pixels for estimated mv 4 macroblocks ahead
1872      * optimized for 64byte cache lines */
1873     MpegEncContext * const s = &h->s;
1874     const int refn = h->ref_cache[list][scan8[0]];
1875     if(refn >= 0){
1876         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1877         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1878         uint8_t **src= h->ref_list[list][refn].data;
1879         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1880         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1881         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1882         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1883     }
1884 }
1885
1886 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1887                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1888                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1889                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1890     MpegEncContext * const s = &h->s;
1891     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1892     const int mb_type= s->current_picture.mb_type[mb_xy];
1893
1894     assert(IS_INTER(mb_type));
1895
1896     prefetch_motion(h, 0);
1897
1898     if(IS_16X16(mb_type)){
1899         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1900                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1901                 &weight_op[0], &weight_avg[0],
1902                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1903     }else if(IS_16X8(mb_type)){
1904         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1905                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1906                 &weight_op[1], &weight_avg[1],
1907                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1908         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1909                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1910                 &weight_op[1], &weight_avg[1],
1911                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1912     }else if(IS_8X16(mb_type)){
1913         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1914                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1915                 &weight_op[2], &weight_avg[2],
1916                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1917         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1918                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1919                 &weight_op[2], &weight_avg[2],
1920                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1921     }else{
1922         int i;
1923
1924         assert(IS_8X8(mb_type));
1925
1926         for(i=0; i<4; i++){
1927             const int sub_mb_type= h->sub_mb_type[i];
1928             const int n= 4*i;
1929             int x_offset= (i&1)<<2;
1930             int y_offset= (i&2)<<1;
1931
1932             if(IS_SUB_8X8(sub_mb_type)){
1933                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1934                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1935                     &weight_op[3], &weight_avg[3],
1936                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1937             }else if(IS_SUB_8X4(sub_mb_type)){
1938                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1939                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1940                     &weight_op[4], &weight_avg[4],
1941                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1942                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1943                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1944                     &weight_op[4], &weight_avg[4],
1945                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1946             }else if(IS_SUB_4X8(sub_mb_type)){
1947                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1948                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1949                     &weight_op[5], &weight_avg[5],
1950                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1951                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1952                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1953                     &weight_op[5], &weight_avg[5],
1954                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1955             }else{
1956                 int j;
1957                 assert(IS_SUB_4X4(sub_mb_type));
1958                 for(j=0; j<4; j++){
1959                     int sub_x_offset= x_offset + 2*(j&1);
1960                     int sub_y_offset= y_offset +   (j&2);
1961                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1962                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1963                         &weight_op[6], &weight_avg[6],
1964                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1965                 }
1966             }
1967         }
1968     }
1969
1970     prefetch_motion(h, 1);
1971 }
1972
1973 static void decode_init_vlc(void){
1974     static int done = 0;
1975
1976     if (!done) {
1977         int i;
1978         done = 1;
1979
1980         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1981                  &chroma_dc_coeff_token_len [0], 1, 1,
1982                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1983
1984         for(i=0; i<4; i++){
1985             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1986                      &coeff_token_len [i][0], 1, 1,
1987                      &coeff_token_bits[i][0], 1, 1, 1);
1988         }
1989
1990         for(i=0; i<3; i++){
1991             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1992                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1993                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1994         }
1995         for(i=0; i<15; i++){
1996             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1997                      &total_zeros_len [i][0], 1, 1,
1998                      &total_zeros_bits[i][0], 1, 1, 1);
1999         }
2000
2001         for(i=0; i<6; i++){
2002             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2003                      &run_len [i][0], 1, 1,
2004                      &run_bits[i][0], 1, 1, 1);
2005         }
2006         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2007                  &run_len [6][0], 1, 1,
2008                  &run_bits[6][0], 1, 1, 1);
2009     }
2010 }
2011
2012 static void free_tables(H264Context *h){
2013     int i;
2014     H264Context *hx;
2015     av_freep(&h->intra4x4_pred_mode);
2016     av_freep(&h->chroma_pred_mode_table);
2017     av_freep(&h->cbp_table);
2018     av_freep(&h->mvd_table[0]);
2019     av_freep(&h->mvd_table[1]);
2020     av_freep(&h->direct_table);
2021     av_freep(&h->non_zero_count);
2022     av_freep(&h->slice_table_base);
2023     h->slice_table= NULL;
2024
2025     av_freep(&h->mb2b_xy);
2026     av_freep(&h->mb2b8_xy);
2027
2028     for(i = 0; i < MAX_SPS_COUNT; i++)
2029         av_freep(h->sps_buffers + i);
2030
2031     for(i = 0; i < MAX_PPS_COUNT; i++)
2032         av_freep(h->pps_buffers + i);
2033
2034     for(i = 0; i < h->s.avctx->thread_count; i++) {
2035         hx = h->thread_context[i];
2036         if(!hx) continue;
2037         av_freep(&hx->top_borders[1]);
2038         av_freep(&hx->top_borders[0]);
2039         av_freep(&hx->s.obmc_scratchpad);
2040         av_freep(&hx->s.allocated_edge_emu_buffer);
2041     }
2042 }
2043
2044 static void init_dequant8_coeff_table(H264Context *h){
2045     int i,q,x;
2046     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2047     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2048     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2049
2050     for(i=0; i<2; i++ ){
2051         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2052             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2053             break;
2054         }
2055
2056         for(q=0; q<52; q++){
2057             int shift = ff_div6[q];
2058             int idx = ff_rem6[q];
2059             for(x=0; x<64; x++)
2060                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2061                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2062                     h->pps.scaling_matrix8[i][x]) << shift;
2063         }
2064     }
2065 }
2066
2067 static void init_dequant4_coeff_table(H264Context *h){
2068     int i,j,q,x;
2069     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2070     for(i=0; i<6; i++ ){
2071         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2072         for(j=0; j<i; j++){
2073             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2074                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2075                 break;
2076             }
2077         }
2078         if(j<i)
2079             continue;
2080
2081         for(q=0; q<52; q++){
2082             int shift = ff_div6[q] + 2;
2083             int idx = ff_rem6[q];
2084             for(x=0; x<16; x++)
2085                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2086                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2087                     h->pps.scaling_matrix4[i][x]) << shift;
2088         }
2089     }
2090 }
2091
2092 static void init_dequant_tables(H264Context *h){
2093     int i,x;
2094     init_dequant4_coeff_table(h);
2095     if(h->pps.transform_8x8_mode)
2096         init_dequant8_coeff_table(h);
2097     if(h->sps.transform_bypass){
2098         for(i=0; i<6; i++)
2099             for(x=0; x<16; x++)
2100                 h->dequant4_coeff[i][0][x] = 1<<6;
2101         if(h->pps.transform_8x8_mode)
2102             for(i=0; i<2; i++)
2103                 for(x=0; x<64; x++)
2104                     h->dequant8_coeff[i][0][x] = 1<<6;
2105     }
2106 }
2107
2108
2109 /**
2110  * allocates tables.
2111  * needs width/height
2112  */
2113 static int alloc_tables(H264Context *h){
2114     MpegEncContext * const s = &h->s;
2115     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2116     int x,y;
2117
2118     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2119
2120     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2121     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2122     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2123
2124     if( h->pps.cabac ) {
2125         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2126         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2127         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2128         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2129     }
2130
2131     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2132     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2133
2134     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2135     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2136     for(y=0; y<s->mb_height; y++){
2137         for(x=0; x<s->mb_width; x++){
2138             const int mb_xy= x + y*s->mb_stride;
2139             const int b_xy = 4*x + 4*y*h->b_stride;
2140             const int b8_xy= 2*x + 2*y*h->b8_stride;
2141
2142             h->mb2b_xy [mb_xy]= b_xy;
2143             h->mb2b8_xy[mb_xy]= b8_xy;
2144         }
2145     }
2146
2147     s->obmc_scratchpad = NULL;
2148
2149     if(!h->dequant4_coeff[0])
2150         init_dequant_tables(h);
2151
2152     return 0;
2153 fail:
2154     free_tables(h);
2155     return -1;
2156 }
2157
2158 /**
2159  * Mimic alloc_tables(), but for every context thread.
2160  */
2161 static void clone_tables(H264Context *dst, H264Context *src){
2162     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2163     dst->non_zero_count           = src->non_zero_count;
2164     dst->slice_table              = src->slice_table;
2165     dst->cbp_table                = src->cbp_table;
2166     dst->mb2b_xy                  = src->mb2b_xy;
2167     dst->mb2b8_xy                 = src->mb2b8_xy;
2168     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2169     dst->mvd_table[0]             = src->mvd_table[0];
2170     dst->mvd_table[1]             = src->mvd_table[1];
2171     dst->direct_table             = src->direct_table;
2172
2173     dst->s.obmc_scratchpad = NULL;
2174     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2175 }
2176
2177 /**
2178  * Init context
2179  * Allocate buffers which are not shared amongst multiple threads.
2180  */
2181 static int context_init(H264Context *h){
2182     MpegEncContext * const s = &h->s;
2183
2184     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2185     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2186
2187     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2188     CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2189                    (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2190     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
2191     return 0;
2192 fail:
2193     return -1; // free_tables will clean up for us
2194 }
2195
2196 static void common_init(H264Context *h){
2197     MpegEncContext * const s = &h->s;
2198
2199     s->width = s->avctx->width;
2200     s->height = s->avctx->height;
2201     s->codec_id= s->avctx->codec->id;
2202
2203     ff_h264_pred_init(&h->hpc, s->codec_id);
2204
2205     h->dequant_coeff_pps= -1;
2206     s->unrestricted_mv=1;
2207     s->decode=1; //FIXME
2208
2209     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2210     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2211 }
2212
2213 static int decode_init(AVCodecContext *avctx){
2214     H264Context *h= avctx->priv_data;
2215     MpegEncContext * const s = &h->s;
2216
2217     MPV_decode_defaults(s);
2218
2219     s->avctx = avctx;
2220     common_init(h);
2221
2222     s->out_format = FMT_H264;
2223     s->workaround_bugs= avctx->workaround_bugs;
2224
2225     // set defaults
2226 //    s->decode_mb= ff_h263_decode_mb;
2227     s->quarter_sample = 1;
2228     s->low_delay= 1;
2229     avctx->pix_fmt= PIX_FMT_YUV420P;
2230
2231     decode_init_vlc();
2232
2233     if(avctx->extradata_size > 0 && avctx->extradata &&
2234        *(char *)avctx->extradata == 1){
2235         h->is_avc = 1;
2236         h->got_avcC = 0;
2237     } else {
2238         h->is_avc = 0;
2239     }
2240
2241     h->thread_context[0] = h;
2242     return 0;
2243 }
2244
2245 static int frame_start(H264Context *h){
2246     MpegEncContext * const s = &h->s;
2247     int i;
2248
2249     if(MPV_frame_start(s, s->avctx) < 0)
2250         return -1;
2251     ff_er_frame_start(s);
2252     /*
2253      * MPV_frame_start uses pict_type to derive key_frame.
2254      * This is incorrect for H.264; IDR markings must be used.
2255      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2256      * See decode_nal_units().
2257      */
2258     s->current_picture_ptr->key_frame= 0;
2259
2260     assert(s->linesize && s->uvlinesize);
2261
2262     for(i=0; i<16; i++){
2263         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2264         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2265     }
2266     for(i=0; i<4; i++){
2267         h->block_offset[16+i]=
2268         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2269         h->block_offset[24+16+i]=
2270         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2271     }
2272
2273     /* can't be in alloc_tables because linesize isn't known there.
2274      * FIXME: redo bipred weight to not require extra buffer? */
2275     for(i = 0; i < s->avctx->thread_count; i++)
2276         if(!h->thread_context[i]->s.obmc_scratchpad)
2277             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2278
2279     /* some macroblocks will be accessed before they're available */
2280     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2281         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2282
2283 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2284     return 0;
2285 }
2286
2287 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2288     MpegEncContext * const s = &h->s;
2289     int i;
2290
2291     src_y  -=   linesize;
2292     src_cb -= uvlinesize;
2293     src_cr -= uvlinesize;
2294
2295     // There are two lines saved, the line above the the top macroblock of a pair,
2296     // and the line above the bottom macroblock
2297     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2298     for(i=1; i<17; i++){
2299         h->left_border[i]= src_y[15+i*  linesize];
2300     }
2301
2302     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2303     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2304
2305     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2306         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2307         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2308         for(i=1; i<9; i++){
2309             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2310             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2311         }
2312         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2313         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2314     }
2315 }
2316
2317 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2318     MpegEncContext * const s = &h->s;
2319     int temp8, i;
2320     uint64_t temp64;
2321     int deblock_left;
2322     int deblock_top;
2323     int mb_xy;
2324
2325     if(h->deblocking_filter == 2) {
2326         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2327         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2328         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2329     } else {
2330         deblock_left = (s->mb_x > 0);
2331         deblock_top =  (s->mb_y > 0);
2332     }
2333
2334     src_y  -=   linesize + 1;
2335     src_cb -= uvlinesize + 1;
2336     src_cr -= uvlinesize + 1;
2337
2338 #define XCHG(a,b,t,xchg)\
2339 t= a;\
2340 if(xchg)\
2341     a= b;\
2342 b= t;
2343
2344     if(deblock_left){
2345         for(i = !deblock_top; i<17; i++){
2346             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2347         }
2348     }
2349
2350     if(deblock_top){
2351         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2352         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2353         if(s->mb_x+1 < s->mb_width){
2354             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2355         }
2356     }
2357
2358     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2359         if(deblock_left){
2360             for(i = !deblock_top; i<9; i++){
2361                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2362                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2363             }
2364         }
2365         if(deblock_top){
2366             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2367             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2368         }
2369     }
2370 }
2371
2372 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2373     MpegEncContext * const s = &h->s;
2374     int i;
2375
2376     src_y  -= 2 *   linesize;
2377     src_cb -= 2 * uvlinesize;
2378     src_cr -= 2 * uvlinesize;
2379
2380     // There are two lines saved, the line above the the top macroblock of a pair,
2381     // and the line above the bottom macroblock
2382     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2383     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2384     for(i=2; i<34; i++){
2385         h->left_border[i]= src_y[15+i*  linesize];
2386     }
2387
2388     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2389     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2390     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2391     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2392
2393     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2394         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2395         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2396         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2397         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2398         for(i=2; i<18; i++){
2399             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2400             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2401         }
2402         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2403         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2404         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2405         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2406     }
2407 }
2408
2409 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2410     MpegEncContext * const s = &h->s;
2411     int temp8, i;
2412     uint64_t temp64;
2413     int deblock_left = (s->mb_x > 0);
2414     int deblock_top  = (s->mb_y > 1);
2415
2416     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2417
2418     src_y  -= 2 *   linesize + 1;
2419     src_cb -= 2 * uvlinesize + 1;
2420     src_cr -= 2 * uvlinesize + 1;
2421
2422 #define XCHG(a,b,t,xchg)\
2423 t= a;\
2424 if(xchg)\
2425     a= b;\
2426 b= t;
2427
2428     if(deblock_left){
2429         for(i = (!deblock_top)<<1; i<34; i++){
2430             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2431         }
2432     }
2433
2434     if(deblock_top){
2435         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2436         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2437         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2438         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2439         if(s->mb_x+1 < s->mb_width){
2440             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2441             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2442         }
2443     }
2444
2445     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2446         if(deblock_left){
2447             for(i = (!deblock_top) << 1; i<18; i++){
2448                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2449                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2450             }
2451         }
2452         if(deblock_top){
2453             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2454             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2455             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2456             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2457         }
2458     }
2459 }
2460
2461 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2462     MpegEncContext * const s = &h->s;
2463     const int mb_x= s->mb_x;
2464     const int mb_y= s->mb_y;
2465     const int mb_xy= mb_x + mb_y*s->mb_stride;
2466     const int mb_type= s->current_picture.mb_type[mb_xy];
2467     uint8_t  *dest_y, *dest_cb, *dest_cr;
2468     int linesize, uvlinesize /*dct_offset*/;
2469     int i;
2470     int *block_offset = &h->block_offset[0];
2471     const unsigned int bottom = mb_y & 1;
2472     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2473     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2474     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2475
2476     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2477     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2478     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2479
2480     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2481     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2482
2483     if (!simple && MB_FIELD) {
2484         linesize   = h->mb_linesize   = s->linesize * 2;
2485         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2486         block_offset = &h->block_offset[24];
2487         if(mb_y&1){ //FIXME move out of this func?
2488             dest_y -= s->linesize*15;
2489             dest_cb-= s->uvlinesize*7;
2490             dest_cr-= s->uvlinesize*7;
2491         }
2492         if(FRAME_MBAFF) {
2493             int list;
2494             for(list=0; list<h->list_count; list++){
2495                 if(!USES_LIST(mb_type, list))
2496                     continue;
2497                 if(IS_16X16(mb_type)){
2498                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2499                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
2500                 }else{
2501                     for(i=0; i<16; i+=4){
2502                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2503                         int ref = h->ref_cache[list][scan8[i]];
2504                         if(ref >= 0)
2505                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
2506                     }
2507                 }
2508             }
2509         }
2510     } else {
2511         linesize   = h->mb_linesize   = s->linesize;
2512         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2513 //        dct_offset = s->linesize * 16;
2514     }
2515
2516     if(transform_bypass){
2517         idct_dc_add =
2518         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2519     }else if(IS_8x8DCT(mb_type)){
2520         idct_dc_add = s->dsp.h264_idct8_dc_add;
2521         idct_add = s->dsp.h264_idct8_add;
2522     }else{
2523         idct_dc_add = s->dsp.h264_idct_dc_add;
2524         idct_add = s->dsp.h264_idct_add;
2525     }
2526
2527     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2528        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2529         int mbt_y = mb_y&~1;
2530         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2531         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2532         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2533         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2534     }
2535
2536     if (!simple && IS_INTRA_PCM(mb_type)) {
2537         unsigned int x, y;
2538
2539         // The pixels are stored in h->mb array in the same order as levels,
2540         // copy them in output in the correct order.
2541         for(i=0; i<16; i++) {
2542             for (y=0; y<4; y++) {
2543                 for (x=0; x<4; x++) {
2544                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2545                 }
2546             }
2547         }
2548         for(i=16; i<16+4; i++) {
2549             for (y=0; y<4; y++) {
2550                 for (x=0; x<4; x++) {
2551                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2552                 }
2553             }
2554         }
2555         for(i=20; i<20+4; i++) {
2556             for (y=0; y<4; y++) {
2557                 for (x=0; x<4; x++) {
2558                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2559                 }
2560             }
2561         }
2562     } else {
2563         if(IS_INTRA(mb_type)){
2564             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2565                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2566
2567             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2568                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2569                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2570             }
2571
2572             if(IS_INTRA4x4(mb_type)){
2573                 if(simple || !s->encoding){
2574                     if(IS_8x8DCT(mb_type)){
2575                         for(i=0; i<16; i+=4){
2576                             uint8_t * const ptr= dest_y + block_offset[i];
2577                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2578                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2579                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2580                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2581                             if(nnz){
2582                                 if(nnz == 1 && h->mb[i*16])
2583                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2584                                 else
2585                                     idct_add(ptr, h->mb + i*16, linesize);
2586                             }
2587                         }
2588                     }else
2589                     for(i=0; i<16; i++){
2590                         uint8_t * const ptr= dest_y + block_offset[i];
2591                         uint8_t *topright;
2592                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2593                         int nnz, tr;
2594
2595                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2596                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2597                             assert(mb_y || linesize <= block_offset[i]);
2598                             if(!topright_avail){
2599                                 tr= ptr[3 - linesize]*0x01010101;
2600                                 topright= (uint8_t*) &tr;
2601                             }else
2602                                 topright= ptr + 4 - linesize;
2603                         }else
2604                             topright= NULL;
2605
2606                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2607                         nnz = h->non_zero_count_cache[ scan8[i] ];
2608                         if(nnz){
2609                             if(is_h264){
2610                                 if(nnz == 1 && h->mb[i*16])
2611                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2612                                 else
2613                                     idct_add(ptr, h->mb + i*16, linesize);
2614                             }else
2615                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2616                         }
2617                     }
2618                 }
2619             }else{
2620                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2621                 if(is_h264){
2622                     if(!transform_bypass)
2623                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2624                 }else
2625                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2626             }
2627             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2628                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2629         }else if(is_h264){
2630             hl_motion(h, dest_y, dest_cb, dest_cr,
2631                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2632                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2633                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2634         }
2635
2636
2637         if(!IS_INTRA4x4(mb_type)){
2638             if(is_h264){
2639                 if(IS_INTRA16x16(mb_type)){
2640                     for(i=0; i<16; i++){
2641                         if(h->non_zero_count_cache[ scan8[i] ])
2642                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2643                         else if(h->mb[i*16])
2644                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2645                     }
2646                 }else{
2647                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2648                     for(i=0; i<16; i+=di){
2649                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2650                         if(nnz){
2651                             if(nnz==1 && h->mb[i*16])
2652                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2653                             else
2654                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2655                         }
2656                     }
2657                 }
2658             }else{
2659                 for(i=0; i<16; i++){
2660                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2661                         uint8_t * const ptr= dest_y + block_offset[i];
2662                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2663                     }
2664                 }
2665             }
2666         }
2667
2668         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2669             uint8_t *dest[2] = {dest_cb, dest_cr};
2670             if(transform_bypass){
2671                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2672             }else{
2673                 idct_add = s->dsp.h264_idct_add;
2674                 idct_dc_add = s->dsp.h264_idct_dc_add;
2675                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2676                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2677             }
2678             if(is_h264){
2679                 for(i=16; i<16+8; i++){
2680                     if(h->non_zero_count_cache[ scan8[i] ])
2681                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2682                     else if(h->mb[i*16])
2683                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2684                 }
2685             }else{
2686                 for(i=16; i<16+8; i++){
2687                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2688                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2689                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2690                     }
2691                 }
2692             }
2693         }
2694     }
2695     if(h->deblocking_filter) {
2696         if (!simple && FRAME_MBAFF) {
2697             //FIXME try deblocking one mb at a time?
2698             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
2699             const int mb_y = s->mb_y - 1;
2700             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2701             const int mb_xy= mb_x + mb_y*s->mb_stride;
2702             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2703             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2704             if (!bottom) return;
2705             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2706             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2707             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2708
2709             if(IS_INTRA(mb_type_top | mb_type_bottom))
2710                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2711
2712             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2713             // deblock a pair
2714             // top
2715             s->mb_y--;
2716             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2717             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2718             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2719             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2720             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2721             // bottom
2722             s->mb_y++;
2723             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2724             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2725             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2726             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2727             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2728         } else {
2729             tprintf(h->s.avctx, "call filter_mb\n");
2730             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2731             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2732             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2733         }
2734     }
2735 }
2736
2737 /**
2738  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2739  */
2740 static void hl_decode_mb_simple(H264Context *h){
2741     hl_decode_mb_internal(h, 1);
2742 }
2743
2744 /**
2745  * Process a macroblock; this handles edge cases, such as interlacing.
2746  */
2747 static void av_noinline hl_decode_mb_complex(H264Context *h){
2748     hl_decode_mb_internal(h, 0);
2749 }
2750
2751 static void hl_decode_mb(H264Context *h){
2752     MpegEncContext * const s = &h->s;
2753     const int mb_x= s->mb_x;
2754     const int mb_y= s->mb_y;
2755     const int mb_xy= mb_x + mb_y*s->mb_stride;
2756     const int mb_type= s->current_picture.mb_type[mb_xy];
2757     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2758
2759     if(!s->decode)
2760         return;
2761
2762     if (is_complex)
2763         hl_decode_mb_complex(h);
2764     else hl_decode_mb_simple(h);
2765 }
2766
2767 /**
2768  * fills the default_ref_list.
2769  */
2770 static int fill_default_ref_list(H264Context *h){
2771     MpegEncContext * const s = &h->s;
2772     int i;
2773     int smallest_poc_greater_than_current = -1;
2774     Picture sorted_short_ref[32];
2775
2776     if(h->slice_type==B_TYPE){
2777         int out_i;
2778         int limit= INT_MIN;
2779
2780         /* sort frame according to poc in B slice */
2781         for(out_i=0; out_i<h->short_ref_count; out_i++){
2782             int best_i=INT_MIN;
2783             int best_poc=INT_MAX;
2784
2785             for(i=0; i<h->short_ref_count; i++){
2786                 const int poc= h->short_ref[i]->poc;
2787                 if(poc > limit && poc < best_poc){
2788                     best_poc= poc;
2789                     best_i= i;
2790                 }
2791             }
2792
2793             assert(best_i != INT_MIN);
2794
2795             limit= best_poc;
2796             sorted_short_ref[out_i]= *h->short_ref[best_i];
2797             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2798             if (-1 == smallest_poc_greater_than_current) {
2799                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2800                     smallest_poc_greater_than_current = out_i;
2801                 }
2802             }
2803         }
2804     }
2805
2806     if(s->picture_structure == PICT_FRAME){
2807         if(h->slice_type==B_TYPE){
2808             int list;
2809             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2810
2811             // find the largest poc
2812             for(list=0; list<2; list++){
2813                 int index = 0;
2814                 int j= -99;
2815                 int step= list ? -1 : 1;
2816
2817                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2818                     while(j<0 || j>= h->short_ref_count){
2819                         if(j != -99 && step == (list ? -1 : 1))
2820                             return -1;
2821                         step = -step;
2822                         j= smallest_poc_greater_than_current + (step>>1);
2823                     }
2824                     if(sorted_short_ref[j].reference != 3) continue;
2825                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
2826                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2827                 }
2828
2829                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2830                     if(h->long_ref[i] == NULL) continue;
2831                     if(h->long_ref[i]->reference != 3) continue;
2832
2833                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
2834                     h->default_ref_list[ list ][index++].pic_id= i;;
2835                 }
2836
2837                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2838                     // swap the two first elements of L1 when
2839                     // L0 and L1 are identical
2840                     Picture temp= h->default_ref_list[1][0];
2841                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
2842                     h->default_ref_list[1][1] = temp;
2843                 }
2844
2845                 if(index < h->ref_count[ list ])
2846                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
2847             }
2848         }else{
2849             int index=0;
2850             for(i=0; i<h->short_ref_count; i++){
2851                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
2852                 h->default_ref_list[0][index  ]= *h->short_ref[i];
2853                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2854             }
2855             for(i = 0; i < 16; i++){
2856                 if(h->long_ref[i] == NULL) continue;
2857                 if(h->long_ref[i]->reference != 3) continue;
2858                 h->default_ref_list[0][index  ]= *h->long_ref[i];
2859                 h->default_ref_list[0][index++].pic_id= i;;
2860             }
2861             if(index < h->ref_count[0])
2862                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2863         }
2864     }else{ //FIELD
2865         if(h->slice_type==B_TYPE){
2866         }else{
2867             //FIXME second field balh
2868         }
2869     }
2870 #ifdef TRACE
2871     for (i=0; i<h->ref_count[0]; i++) {
2872         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2873     }
2874     if(h->slice_type==B_TYPE){
2875         for (i=0; i<h->ref_count[1]; i++) {
2876             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
2877         }
2878     }
2879 #endif
2880     return 0;
2881 }
2882
2883 static void print_short_term(H264Context *h);
2884 static void print_long_term(H264Context *h);
2885
2886 static int decode_ref_pic_list_reordering(H264Context *h){
2887     MpegEncContext * const s = &h->s;
2888     int list, index;
2889
2890     print_short_term(h);
2891     print_long_term(h);
2892     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
2893
2894     for(list=0; list<h->list_count; list++){
2895         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2896
2897         if(get_bits1(&s->gb)){
2898             int pred= h->curr_pic_num;
2899
2900             for(index=0; ; index++){
2901                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2902                 unsigned int pic_id;
2903                 int i;
2904                 Picture *ref = NULL;
2905
2906                 if(reordering_of_pic_nums_idc==3)
2907                     break;
2908
2909                 if(index >= h->ref_count[list]){
2910                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2911                     return -1;
2912                 }
2913
2914                 if(reordering_of_pic_nums_idc<3){
2915                     if(reordering_of_pic_nums_idc<2){
2916                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2917
2918                         if(abs_diff_pic_num >= h->max_pic_num){
2919                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2920                             return -1;
2921                         }
2922
2923                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2924                         else                                pred+= abs_diff_pic_num;
2925                         pred &= h->max_pic_num - 1;
2926
2927                         for(i= h->short_ref_count-1; i>=0; i--){
2928                             ref = h->short_ref[i];
2929                             assert(ref->reference == 3);
2930                             assert(!ref->long_ref);
2931                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
2932                                 break;
2933                         }
2934                         if(i>=0)
2935                             ref->pic_id= ref->frame_num;
2936                     }else{
2937                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2938                         if(pic_id>31){
2939                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2940                             return -1;
2941                         }
2942                         ref = h->long_ref[pic_id];
2943                         if(ref){
2944                             ref->pic_id= pic_id;
2945                             assert(ref->reference == 3);
2946                             assert(ref->long_ref);
2947                             i=0;
2948                         }else{
2949                             i=-1;
2950                         }
2951                     }
2952
2953                     if (i < 0) {
2954                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2955                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2956                     } else {
2957                         for(i=index; i+1<h->ref_count[list]; i++){
2958                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2959                                 break;
2960                         }
2961                         for(; i > index; i--){
2962                             h->ref_list[list][i]= h->ref_list[list][i-1];
2963                         }
2964                         h->ref_list[list][index]= *ref;
2965                     }
2966                 }else{
2967                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2968                     return -1;
2969                 }
2970             }
2971         }
2972     }
2973     for(list=0; list<h->list_count; list++){
2974         for(index= 0; index < h->ref_count[list]; index++){
2975             if(!h->ref_list[list][index].data[0])
2976                 h->ref_list[list][index]= s->current_picture;
2977         }
2978     }
2979
2980     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
2981         direct_dist_scale_factor(h);
2982     direct_ref_list_init(h);
2983     return 0;
2984 }
2985
2986 static void fill_mbaff_ref_list(H264Context *h){
2987     int list, i, j;
2988     for(list=0; list<2; list++){ //FIXME try list_count
2989         for(i=0; i<h->ref_count[list]; i++){
2990             Picture *frame = &h->ref_list[list][i];
2991             Picture *field = &h->ref_list[list][16+2*i];
2992             field[0] = *frame;
2993             for(j=0; j<3; j++)
2994                 field[0].linesize[j] <<= 1;
2995             field[1] = field[0];
2996             for(j=0; j<3; j++)
2997                 field[1].data[j] += frame->linesize[j];
2998
2999             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3000             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3001             for(j=0; j<2; j++){
3002                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3003                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3004             }
3005         }
3006     }
3007     for(j=0; j<h->ref_count[1]; j++){
3008         for(i=0; i<h->ref_count[0]; i++)
3009             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3010         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3011         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3012     }
3013 }
3014
3015 static int pred_weight_table(H264Context *h){
3016     MpegEncContext * const s = &h->s;
3017     int list, i;
3018     int luma_def, chroma_def;
3019
3020     h->use_weight= 0;
3021     h->use_weight_chroma= 0;
3022     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3023     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3024     luma_def = 1<<h->luma_log2_weight_denom;
3025     chroma_def = 1<<h->chroma_log2_weight_denom;
3026
3027     for(list=0; list<2; list++){
3028         for(i=0; i<h->ref_count[list]; i++){
3029             int luma_weight_flag, chroma_weight_flag;
3030
3031             luma_weight_flag= get_bits1(&s->gb);
3032             if(luma_weight_flag){
3033                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3034                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3035                 if(   h->luma_weight[list][i] != luma_def
3036                    || h->luma_offset[list][i] != 0)
3037                     h->use_weight= 1;
3038             }else{
3039                 h->luma_weight[list][i]= luma_def;
3040                 h->luma_offset[list][i]= 0;
3041             }
3042
3043             chroma_weight_flag= get_bits1(&s->gb);
3044             if(chroma_weight_flag){
3045                 int j;
3046                 for(j=0; j<2; j++){
3047                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3048                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3049                     if(   h->chroma_weight[list][i][j] != chroma_def
3050                        || h->chroma_offset[list][i][j] != 0)
3051                         h->use_weight_chroma= 1;
3052                 }
3053             }else{
3054                 int j;
3055                 for(j=0; j<2; j++){
3056                     h->chroma_weight[list][i][j]= chroma_def;
3057                     h->chroma_offset[list][i][j]= 0;
3058                 }
3059             }
3060         }
3061         if(h->slice_type != B_TYPE) break;
3062     }
3063     h->use_weight= h->use_weight || h->use_weight_chroma;
3064     return 0;
3065 }
3066
3067 static void implicit_weight_table(H264Context *h){
3068     MpegEncContext * const s = &h->s;
3069     int ref0, ref1;
3070     int cur_poc = s->current_picture_ptr->poc;
3071
3072     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3073        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3074         h->use_weight= 0;
3075         h->use_weight_chroma= 0;
3076         return;
3077     }
3078
3079     h->use_weight= 2;
3080     h->use_weight_chroma= 2;
3081     h->luma_log2_weight_denom= 5;
3082     h->chroma_log2_weight_denom= 5;
3083
3084     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3085         int poc0 = h->ref_list[0][ref0].poc;
3086         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3087             int poc1 = h->ref_list[1][ref1].poc;
3088             int td = av_clip(poc1 - poc0, -128, 127);
3089             if(td){
3090                 int tb = av_clip(cur_poc - poc0, -128, 127);
3091                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3092                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3093                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3094                     h->implicit_weight[ref0][ref1] = 32;
3095                 else
3096                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3097             }else
3098                 h->implicit_weight[ref0][ref1] = 32;
3099         }
3100     }
3101 }
3102
3103 /**
3104  * Mark a picture as no longer needed for reference. The refmask
3105  * argument allows unreferencing of individual fields or the whole frame.
3106  * If the picture becomes entirely unreferenced, but is being held for
3107  * display purposes, it is marked as such.
3108  * @param refmask mask of fields to unreference; the mask is bitwise
3109  *                anded with the reference marking of pic
3110  * @return non-zero if pic becomes entirely unreferenced (except possibly
3111  *         for display purposes) zero if one of the fields remains in
3112  *         reference
3113  */
3114 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3115     int i;
3116     if (pic->reference &= refmask) {
3117         return 0;
3118     } else {
3119         if(pic == h->delayed_output_pic)
3120             pic->reference=DELAYED_PIC_REF;
3121         else{
3122             for(i = 0; h->delayed_pic[i]; i++)
3123                 if(pic == h->delayed_pic[i]){
3124                     pic->reference=DELAYED_PIC_REF;
3125                     break;
3126                 }
3127         }
3128         return 1;
3129     }
3130 }
3131
3132 /**
3133  * instantaneous decoder refresh.
3134  */
3135 static void idr(H264Context *h){
3136     int i;
3137
3138     for(i=0; i<16; i++){
3139         if (h->long_ref[i] != NULL) {
3140             unreference_pic(h, h->long_ref[i], 0);
3141             h->long_ref[i]= NULL;
3142         }
3143     }
3144     h->long_ref_count=0;
3145
3146     for(i=0; i<h->short_ref_count; i++){
3147         unreference_pic(h, h->short_ref[i], 0);
3148         h->short_ref[i]= NULL;
3149     }
3150     h->short_ref_count=0;
3151 }
3152
3153 /* forget old pics after a seek */
3154 static void flush_dpb(AVCodecContext *avctx){
3155     H264Context *h= avctx->priv_data;
3156     int i;
3157     for(i=0; i<16; i++) {
3158         if(h->delayed_pic[i])
3159             h->delayed_pic[i]->reference= 0;
3160         h->delayed_pic[i]= NULL;
3161     }
3162     if(h->delayed_output_pic)
3163         h->delayed_output_pic->reference= 0;
3164     h->delayed_output_pic= NULL;
3165     idr(h);
3166     if(h->s.current_picture_ptr)
3167         h->s.current_picture_ptr->reference= 0;
3168 }
3169
3170 /**
3171  * Find a Picture in the short term reference list by frame number.
3172  * @param frame_num frame number to search for
3173  * @param idx the index into h->short_ref where returned picture is found
3174  *            undefined if no picture found.
3175  * @return pointer to the found picture, or NULL if no pic with the provided
3176  *                 frame number is found
3177  */
3178 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3179     MpegEncContext * const s = &h->s;
3180     int i;
3181
3182     for(i=0; i<h->short_ref_count; i++){
3183         Picture *pic= h->short_ref[i];
3184         if(s->avctx->debug&FF_DEBUG_MMCO)
3185             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3186         if(pic->frame_num == frame_num) {
3187             *idx = i;
3188             return pic;
3189         }
3190     }
3191     return NULL;
3192 }
3193
3194 /**
3195  * Remove a picture from the short term reference list by its index in
3196  * that list.  This does no checking on the provided index; it is assumed
3197  * to be valid. Other list entries are shifted down.
3198  * @param i index into h->short_ref of picture to remove.
3199  */
3200 static void remove_short_at_index(H264Context *h, int i){
3201     assert(i > 0 && i < h->short_ref_count);
3202     h->short_ref[i]= NULL;
3203     if (--h->short_ref_count)
3204         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3205 }
3206
3207 /**
3208  *
3209  * @return the removed picture or NULL if an error occurs
3210  */
3211 static Picture * remove_short(H264Context *h, int frame_num){
3212     MpegEncContext * const s = &h->s;
3213     Picture *pic;
3214     int i;
3215
3216     if(s->avctx->debug&FF_DEBUG_MMCO)
3217         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3218
3219     pic = find_short(h, frame_num, &i);
3220     if (pic)
3221         remove_short_at_index(h, i);
3222
3223     return pic;
3224 }
3225
3226 /**
3227  *
3228  * @return the removed picture or NULL if an error occurs
3229  */
3230 static Picture * remove_long(H264Context *h, int i){
3231     Picture *pic;
3232
3233     pic= h->long_ref[i];
3234     h->long_ref[i]= NULL;
3235     if(pic) h->long_ref_count--;
3236
3237     return pic;
3238 }
3239
3240 /**
3241  * print short term list
3242  */
3243 static void print_short_term(H264Context *h) {
3244     uint32_t i;
3245     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3246         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3247         for(i=0; i<h->short_ref_count; i++){
3248             Picture *pic= h->short_ref[i];
3249             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3250         }
3251     }
3252 }
3253
3254 /**
3255  * print long term list
3256  */
3257 static void print_long_term(H264Context *h) {
3258     uint32_t i;
3259     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3260         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3261         for(i = 0; i < 16; i++){
3262             Picture *pic= h->long_ref[i];
3263             if (pic) {
3264                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3265             }
3266         }
3267     }
3268 }
3269
3270 /**
3271  * Executes the reference picture marking (memory management control operations).
3272  */
3273 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3274     MpegEncContext * const s = &h->s;
3275     int i, j;
3276     int current_is_long=0;
3277     Picture *pic;
3278
3279     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3280         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3281
3282     for(i=0; i<mmco_count; i++){
3283         if(s->avctx->debug&FF_DEBUG_MMCO)
3284             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3285
3286         switch(mmco[i].opcode){
3287         case MMCO_SHORT2UNUSED:
3288             pic= remove_short(h, mmco[i].short_pic_num);
3289             if(pic)
3290                 unreference_pic(h, pic, 0);
3291             else if(s->avctx->debug&FF_DEBUG_MMCO)
3292                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3293             break;
3294         case MMCO_SHORT2LONG:
3295             pic= remove_long(h, mmco[i].long_arg);
3296             if(pic) unreference_pic(h, pic, 0);
3297
3298             h->long_ref[ mmco[i].long_arg ]= remove_short(h, mmco[i].short_pic_num);
3299             if (h->long_ref[ mmco[i].long_arg ]){
3300                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3301                 h->long_ref_count++;
3302             }
3303             break;
3304         case MMCO_LONG2UNUSED:
3305             pic= remove_long(h, mmco[i].long_arg);
3306             if(pic)
3307                 unreference_pic(h, pic, 0);
3308             else if(s->avctx->debug&FF_DEBUG_MMCO)
3309                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3310             break;
3311         case MMCO_LONG:
3312             pic= remove_long(h, mmco[i].long_arg);
3313             if(pic) unreference_pic(h, pic, 0);
3314
3315             h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3316             h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3317             h->long_ref_count++;
3318
3319             current_is_long=1;
3320             break;
3321         case MMCO_SET_MAX_LONG:
3322             assert(mmco[i].long_arg <= 16);
3323             // just remove the long term which index is greater than new max
3324             for(j = mmco[i].long_arg; j<16; j++){
3325                 pic = remove_long(h, j);
3326                 if (pic) unreference_pic(h, pic, 0);
3327             }
3328             break;
3329         case MMCO_RESET:
3330             while(h->short_ref_count){
3331                 pic= remove_short(h, h->short_ref[0]->frame_num);
3332                 if(pic) unreference_pic(h, pic, 0);
3333             }
3334             for(j = 0; j < 16; j++) {
3335                 pic= remove_long(h, j);
3336                 if(pic) unreference_pic(h, pic, 0);
3337             }
3338             break;
3339         default: assert(0);
3340         }
3341     }
3342
3343     if(!current_is_long){
3344         pic= remove_short(h, s->current_picture_ptr->frame_num);
3345         if(pic){
3346             unreference_pic(h, pic, 0);
3347             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3348         }
3349
3350         if(h->short_ref_count)
3351             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3352
3353         h->short_ref[0]= s->current_picture_ptr;
3354         h->short_ref[0]->long_ref=0;
3355         h->short_ref_count++;
3356     }
3357
3358     print_short_term(h);
3359     print_long_term(h);
3360     return 0;
3361 }
3362
3363 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3364     MpegEncContext * const s = &h->s;
3365     int i;
3366
3367     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3368         s->broken_link= get_bits1(gb) -1;
3369         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3370         if(h->mmco[0].long_arg == -1)
3371             h->mmco_index= 0;
3372         else{
3373             h->mmco[0].opcode= MMCO_LONG;
3374             h->mmco_index= 1;
3375         }
3376     }else{
3377         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3378             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3379                 MMCOOpcode opcode= get_ue_golomb(gb);
3380
3381                 h->mmco[i].opcode= opcode;
3382                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3383                     h->mmco[i].short_pic_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3384 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3385                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3386                         return -1;
3387                     }*/
3388                 }
3389                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3390                     unsigned int long_arg= get_ue_golomb(gb);
3391                     if(/*h->mmco[i].long_arg >= h->long_ref_count || h->long_ref[ h->mmco[i].long_arg ] == NULL*/ long_arg >= 16){
3392                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3393                         return -1;
3394                     }
3395                     h->mmco[i].long_arg= long_arg;
3396                 }
3397
3398                 if(opcode > (unsigned)MMCO_LONG){
3399                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3400                     return -1;
3401                 }
3402                 if(opcode == MMCO_END)
3403                     break;
3404             }
3405             h->mmco_index= i;
3406         }else{
3407             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3408
3409             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3410                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3411                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3412                 h->mmco_index= 1;
3413             }else
3414                 h->mmco_index= 0;
3415         }
3416     }
3417
3418     return 0;
3419 }
3420
3421 static int init_poc(H264Context *h){
3422     MpegEncContext * const s = &h->s;
3423     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3424     int field_poc[2];
3425
3426     if(h->nal_unit_type == NAL_IDR_SLICE){
3427         h->frame_num_offset= 0;
3428     }else{
3429         if(h->frame_num < h->prev_frame_num)
3430             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3431         else
3432             h->frame_num_offset= h->prev_frame_num_offset;
3433     }
3434
3435     if(h->sps.poc_type==0){
3436         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3437
3438         if(h->nal_unit_type == NAL_IDR_SLICE){
3439              h->prev_poc_msb=
3440              h->prev_poc_lsb= 0;
3441         }
3442
3443         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3444             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3445         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3446             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3447         else
3448             h->poc_msb = h->prev_poc_msb;
3449 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3450         field_poc[0] =
3451         field_poc[1] = h->poc_msb + h->poc_lsb;
3452         if(s->picture_structure == PICT_FRAME)
3453             field_poc[1] += h->delta_poc_bottom;
3454     }else if(h->sps.poc_type==1){
3455         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3456         int i;
3457
3458         if(h->sps.poc_cycle_length != 0)
3459             abs_frame_num = h->frame_num_offset + h->frame_num;
3460         else
3461             abs_frame_num = 0;
3462
3463         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3464             abs_frame_num--;
3465
3466         expected_delta_per_poc_cycle = 0;
3467         for(i=0; i < h->sps.poc_cycle_length; i++)
3468             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3469
3470         if(abs_frame_num > 0){
3471             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3472             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3473
3474             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3475             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3476                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3477         } else
3478             expectedpoc = 0;
3479
3480         if(h->nal_ref_idc == 0)
3481             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3482
3483         field_poc[0] = expectedpoc + h->delta_poc[0];
3484         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3485
3486         if(s->picture_structure == PICT_FRAME)
3487             field_poc[1] += h->delta_poc[1];
3488     }else{
3489         int poc;
3490         if(h->nal_unit_type == NAL_IDR_SLICE){
3491             poc= 0;
3492         }else{
3493             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3494             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3495         }
3496         field_poc[0]= poc;
3497         field_poc[1]= poc;
3498     }
3499
3500     if(s->picture_structure != PICT_BOTTOM_FIELD)
3501         s->current_picture_ptr->field_poc[0]= field_poc[0];
3502     if(s->picture_structure != PICT_TOP_FIELD)
3503         s->current_picture_ptr->field_poc[1]= field_poc[1];
3504     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
3505         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3506
3507     return 0;
3508 }
3509
3510
3511 /**
3512  * initialize scan tables
3513  */
3514 static void init_scan_tables(H264Context *h){
3515     MpegEncContext * const s = &h->s;
3516     int i;
3517     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3518         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3519         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3520     }else{
3521         for(i=0; i<16; i++){
3522 #define T(x) (x>>2) | ((x<<2) & 0xF)
3523             h->zigzag_scan[i] = T(zigzag_scan[i]);
3524             h-> field_scan[i] = T( field_scan[i]);
3525 #undef T
3526         }
3527     }
3528     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3529         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3530         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3531         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3532         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3533     }else{
3534         for(i=0; i<64; i++){
3535 #define T(x) (x>>3) | ((x&7)<<3)
3536             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3537             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3538             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3539             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3540 #undef T
3541         }
3542     }
3543     if(h->sps.transform_bypass){ //FIXME same ugly
3544         h->zigzag_scan_q0          = zigzag_scan;
3545         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3546         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3547         h->field_scan_q0           = field_scan;
3548         h->field_scan8x8_q0        = field_scan8x8;
3549         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3550     }else{
3551         h->zigzag_scan_q0          = h->zigzag_scan;
3552         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3553         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3554         h->field_scan_q0           = h->field_scan;
3555         h->field_scan8x8_q0        = h->field_scan8x8;
3556         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3557     }
3558 }
3559
3560 /**
3561  * Replicates H264 "master" context to thread contexts.
3562  */
3563 static void clone_slice(H264Context *dst, H264Context *src)
3564 {
3565     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3566     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3567     dst->s.current_picture      = src->s.current_picture;
3568     dst->s.linesize             = src->s.linesize;
3569     dst->s.uvlinesize           = src->s.uvlinesize;
3570
3571     dst->prev_poc_msb           = src->prev_poc_msb;
3572     dst->prev_poc_lsb           = src->prev_poc_lsb;
3573     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3574     dst->prev_frame_num         = src->prev_frame_num;
3575     dst->short_ref_count        = src->short_ref_count;
3576
3577     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3578     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3579     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3580     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3581
3582     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3583     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3584 }
3585
3586 /**
3587  * decodes a slice header.
3588  * this will allso call MPV_common_init() and frame_start() as needed
3589  *
3590  * @param h h264context
3591  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3592  *
3593  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3594  */
3595 static int decode_slice_header(H264Context *h, H264Context *h0){
3596     MpegEncContext * const s = &h->s;
3597     unsigned int first_mb_in_slice;
3598     unsigned int pps_id;
3599     int num_ref_idx_active_override_flag;
3600     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3601     unsigned int slice_type, tmp, i;
3602     int default_ref_list_done = 0;
3603
3604     s->dropable= h->nal_ref_idc == 0;
3605
3606     first_mb_in_slice= get_ue_golomb(&s->gb);
3607
3608     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3609         h0->current_slice = 0;
3610         s->current_picture_ptr= NULL;
3611     }
3612
3613     slice_type= get_ue_golomb(&s->gb);
3614     if(slice_type > 9){
3615         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3616         return -1;
3617     }
3618     if(slice_type > 4){
3619         slice_type -= 5;
3620         h->slice_type_fixed=1;
3621     }else
3622         h->slice_type_fixed=0;
3623
3624     slice_type= slice_type_map[ slice_type ];
3625     if (slice_type == I_TYPE
3626         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3627         default_ref_list_done = 1;
3628     }
3629     h->slice_type= slice_type;
3630
3631     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3632
3633     pps_id= get_ue_golomb(&s->gb);
3634     if(pps_id>=MAX_PPS_COUNT){
3635         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3636         return -1;
3637     }
3638     if(!h0->pps_buffers[pps_id]) {
3639         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3640         return -1;
3641     }
3642     h->pps= *h0->pps_buffers[pps_id];
3643
3644     if(!h0->sps_buffers[h->pps.sps_id]) {
3645         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3646         return -1;
3647     }
3648     h->sps = *h0->sps_buffers[h->pps.sps_id];
3649
3650     if(h == h0 && h->dequant_coeff_pps != pps_id){
3651         h->dequant_coeff_pps = pps_id;
3652         init_dequant_tables(h);
3653     }
3654
3655     s->mb_width= h->sps.mb_width;
3656     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3657
3658     h->b_stride=  s->mb_width*4;
3659     h->b8_stride= s->mb_width*2;
3660
3661     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3662     if(h->sps.frame_mbs_only_flag)
3663         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3664     else
3665         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3666
3667     if (s->context_initialized
3668         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3669         if(h != h0)
3670             return -1;   // width / height changed during parallelized decoding
3671         free_tables(h);
3672         MPV_common_end(s);
3673     }
3674     if (!s->context_initialized) {
3675         if(h != h0)
3676             return -1;  // we cant (re-)initialize context during parallel decoding
3677         if (MPV_common_init(s) < 0)
3678             return -1;
3679
3680         init_scan_tables(h);
3681         alloc_tables(h);
3682
3683         for(i = 1; i < s->avctx->thread_count; i++) {
3684             H264Context *c;
3685             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3686             memcpy(c, h, sizeof(MpegEncContext));
3687             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3688             c->sps = h->sps;
3689             c->pps = h->pps;
3690             init_scan_tables(c);
3691             clone_tables(c, h);
3692         }
3693
3694         for(i = 0; i < s->avctx->thread_count; i++)
3695             if(context_init(h->thread_context[i]) < 0)
3696                 return -1;
3697
3698         s->avctx->width = s->width;
3699         s->avctx->height = s->height;
3700         s->avctx->sample_aspect_ratio= h->sps.sar;
3701         if(!s->avctx->sample_aspect_ratio.den)
3702             s->avctx->sample_aspect_ratio.den = 1;
3703
3704         if(h->sps.timing_info_present_flag){
3705             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3706             if(h->x264_build > 0 && h->x264_build < 44)
3707                 s->avctx->time_base.den *= 2;
3708             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3709                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3710         }
3711     }
3712
3713     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3714
3715     h->mb_mbaff = 0;
3716     h->mb_aff_frame = 0;
3717     if(h->sps.frame_mbs_only_flag){
3718         s->picture_structure= PICT_FRAME;
3719     }else{
3720         if(get_bits1(&s->gb)) { //field_pic_flag
3721             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3722             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
3723         } else {
3724             s->picture_structure= PICT_FRAME;
3725             h->mb_aff_frame = h->sps.mb_aff;
3726         }
3727     }
3728
3729     if(h0->current_slice == 0){
3730         if(frame_start(h) < 0)
3731             return -1;
3732     }
3733     if(h != h0)
3734         clone_slice(h, h0);
3735
3736     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3737
3738     assert(s->mb_num == s->mb_width * s->mb_height);
3739     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
3740        first_mb_in_slice                    >= s->mb_num){
3741         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3742         return -1;
3743     }
3744     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3745     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
3746     assert(s->mb_y < s->mb_height);
3747
3748     if(s->picture_structure==PICT_FRAME){
3749         h->curr_pic_num=   h->frame_num;
3750         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3751     }else{
3752         h->curr_pic_num= 2*h->frame_num + 1;
3753         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3754     }
3755
3756     if(h->nal_unit_type == NAL_IDR_SLICE){
3757         get_ue_golomb(&s->gb); /* idr_pic_id */
3758     }
3759
3760     if(h->sps.poc_type==0){
3761         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3762
3763         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3764             h->delta_poc_bottom= get_se_golomb(&s->gb);
3765         }
3766     }
3767
3768     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3769         h->delta_poc[0]= get_se_golomb(&s->gb);
3770
3771         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3772             h->delta_poc[1]= get_se_golomb(&s->gb);
3773     }
3774
3775     init_poc(h);
3776
3777     if(h->pps.redundant_pic_cnt_present){
3778         h->redundant_pic_count= get_ue_golomb(&s->gb);
3779     }
3780
3781     //set defaults, might be overriden a few line later
3782     h->ref_count[0]= h->pps.ref_count[0];
3783     h->ref_count[1]= h->pps.ref_count[1];
3784
3785     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3786         if(h->slice_type == B_TYPE){
3787             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3788             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
3789                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
3790         }
3791         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3792
3793         if(num_ref_idx_active_override_flag){
3794             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3795             if(h->slice_type==B_TYPE)
3796                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3797
3798             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3799                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3800                 h->ref_count[0]= h->ref_count[1]= 1;
3801                 return -1;
3802             }
3803         }
3804         if(h->slice_type == B_TYPE)
3805             h->list_count= 2;
3806         else
3807             h->list_count= 1;
3808     }else
3809         h->list_count= 0;
3810
3811     if(!default_ref_list_done){
3812         fill_default_ref_list(h);
3813     }
3814
3815     if(decode_ref_pic_list_reordering(h) < 0)
3816         return -1;
3817
3818     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
3819        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
3820         pred_weight_table(h);
3821     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
3822         implicit_weight_table(h);
3823     else
3824         h->use_weight = 0;
3825
3826     if(h->nal_ref_idc)
3827         decode_ref_pic_marking(h0, &s->gb);
3828
3829     if(FRAME_MBAFF)
3830         fill_mbaff_ref_list(h);
3831
3832     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
3833         tmp = get_ue_golomb(&s->gb);
3834         if(tmp > 2){
3835             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3836             return -1;
3837         }
3838         h->cabac_init_idc= tmp;
3839     }
3840
3841     h->last_qscale_diff = 0;
3842     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3843     if(tmp>51){
3844         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3845         return -1;
3846     }
3847     s->qscale= tmp;
3848     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3849     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3850     //FIXME qscale / qp ... stuff
3851     if(h->slice_type == SP_TYPE){
3852         get_bits1(&s->gb); /* sp_for_switch_flag */
3853     }
3854     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
3855         get_se_golomb(&s->gb); /* slice_qs_delta */
3856     }
3857
3858     h->deblocking_filter = 1;
3859     h->slice_alpha_c0_offset = 0;
3860     h->slice_beta_offset = 0;
3861     if( h->pps.deblocking_filter_parameters_present ) {
3862         tmp= get_ue_golomb(&s->gb);
3863         if(tmp > 2){
3864             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3865             return -1;
3866         }
3867         h->deblocking_filter= tmp;
3868         if(h->deblocking_filter < 2)
3869             h->deblocking_filter^= 1; // 1<->0
3870
3871         if( h->deblocking_filter ) {
3872             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3873             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3874         }
3875     }
3876
3877     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3878        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
3879        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
3880        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3881         h->deblocking_filter= 0;
3882
3883     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3884         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3885             /* Cheat slightly for speed:
3886                Dont bother to deblock across slices */
3887             h->deblocking_filter = 2;
3888         } else {
3889             h0->max_contexts = 1;
3890             if(!h0->single_decode_warning) {
3891                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3892                 h0->single_decode_warning = 1;
3893             }
3894             if(h != h0)
3895                 return 1; // deblocking switched inside frame
3896         }
3897     }
3898
3899 #if 0 //FMO
3900     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3901         slice_group_change_cycle= get_bits(&s->gb, ?);
3902 #endif
3903
3904     h0->last_slice_type = slice_type;
3905     h->slice_num = ++h0->current_slice;
3906
3907     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3908     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
3909
3910     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3911         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
3912                h->slice_num,
3913                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3914                first_mb_in_slice,
3915                av_get_pict_type_char(h->slice_type),
3916                pps_id, h->frame_num,
3917                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3918                h->ref_count[0], h->ref_count[1],
3919                s->qscale,
3920                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
3921                h->use_weight,
3922                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
3923                );
3924     }
3925
3926     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3927         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3928         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3929     }else{
3930         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3931         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3932     }
3933
3934     return 0;
3935 }
3936
3937 /**
3938  *
3939  */
3940 static inline int get_level_prefix(GetBitContext *gb){
3941     unsigned int buf;
3942     int log;
3943
3944     OPEN_READER(re, gb);
3945     UPDATE_CACHE(re, gb);
3946     buf=GET_CACHE(re, gb);
3947
3948     log= 32 - av_log2(buf);
3949 #ifdef TRACE
3950     print_bin(buf>>(32-log), log);
3951     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
3952 #endif
3953
3954     LAST_SKIP_BITS(re, gb, log);
3955     CLOSE_READER(re, gb);
3956
3957     return log-1;
3958 }
3959
3960 static inline int get_dct8x8_allowed(H264Context *h){
3961     int i;
3962     for(i=0; i<4; i++){
3963         if(!IS_SUB_8X8(h->sub_mb_type[i])
3964            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
3965             return 0;
3966     }
3967     return 1;
3968 }
3969
3970 /**
3971  * decodes a residual block.
3972  * @param n block index
3973  * @param scantable scantable
3974  * @param max_coeff number of coefficients in the block
3975  * @return <0 if an error occured
3976  */
3977 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
3978     MpegEncContext * const s = &h->s;
3979     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
3980     int level[16];
3981     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
3982
3983     //FIXME put trailing_onex into the context
3984
3985     if(n == CHROMA_DC_BLOCK_INDEX){
3986         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
3987         total_coeff= coeff_token>>2;
3988     }else{
3989         if(n == LUMA_DC_BLOCK_INDEX){
3990             total_coeff= pred_non_zero_count(h, 0);
3991             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3992             total_coeff= coeff_token>>2;
3993         }else{
3994             total_coeff= pred_non_zero_count(h, n);
3995             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3996             total_coeff= coeff_token>>2;
3997             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
3998         }
3999     }
4000
4001     //FIXME set last_non_zero?
4002
4003     if(total_coeff==0)
4004         return 0;
4005     if(total_coeff > (unsigned)max_coeff) {
4006         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4007         return -1;
4008     }
4009
4010     trailing_ones= coeff_token&3;
4011     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4012     assert(total_coeff<=16);
4013
4014     for(i=0; i<trailing_ones; i++){
4015         level[i]= 1 - 2*get_bits1(gb);
4016     }
4017
4018     if(i<total_coeff) {
4019         int level_code, mask;
4020         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4021         int prefix= get_level_prefix(gb);
4022
4023         //first coefficient has suffix_length equal to 0 or 1
4024         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4025             if(suffix_length)
4026                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4027             else
4028                 level_code= (prefix<<suffix_length); //part
4029         }else if(prefix==14){
4030             if(suffix_length)
4031                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4032             else
4033                 level_code= prefix + get_bits(gb, 4); //part
4034         }else if(prefix==15){
4035             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4036             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4037         }else{
4038             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4039             return -1;
4040         }
4041
4042         if(trailing_ones < 3) level_code += 2;
4043
4044         suffix_length = 1;
4045         if(level_code > 5)
4046             suffix_length++;
4047         mask= -(level_code&1);
4048         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4049         i++;
4050
4051         //remaining coefficients have suffix_length > 0
4052         for(;i<total_coeff;i++) {
4053             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4054             prefix = get_level_prefix(gb);
4055             if(prefix<15){
4056                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4057             }else if(prefix==15){
4058                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4059             }else{
4060                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4061                 return -1;
4062             }
4063             mask= -(level_code&1);
4064             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4065             if(level_code > suffix_limit[suffix_length])
4066                 suffix_length++;
4067         }
4068     }
4069
4070     if(total_coeff == max_coeff)
4071         zeros_left=0;
4072     else{
4073         if(n == CHROMA_DC_BLOCK_INDEX)
4074             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4075         else
4076             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4077     }
4078
4079     coeff_num = zeros_left + total_coeff - 1;
4080     j = scantable[coeff_num];
4081     if(n > 24){
4082         block[j] = level[0];
4083         for(i=1;i<total_coeff;i++) {
4084             if(zeros_left <= 0)
4085                 run_before = 0;
4086             else if(zeros_left < 7){
4087                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4088             }else{
4089                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4090             }
4091             zeros_left -= run_before;
4092             coeff_num -= 1 + run_before;
4093             j= scantable[ coeff_num ];
4094
4095             block[j]= level[i];
4096         }
4097     }else{
4098         block[j] = (level[0] * qmul[j] + 32)>>6;
4099         for(i=1;i<total_coeff;i++) {
4100             if(zeros_left <= 0)
4101                 run_before = 0;
4102             else if(zeros_left < 7){
4103                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4104             }else{
4105                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4106             }
4107             zeros_left -= run_before;
4108             coeff_num -= 1 + run_before;
4109             j= scantable[ coeff_num ];
4110
4111             block[j]= (level[i] * qmul[j] + 32)>>6;
4112         }
4113     }
4114
4115     if(zeros_left<0){
4116         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4117         return -1;
4118     }
4119
4120     return 0;
4121 }
4122
4123 static void predict_field_decoding_flag(H264Context *h){
4124     MpegEncContext * const s = &h->s;
4125     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4126     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4127                 ? s->current_picture.mb_type[mb_xy-1]
4128                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4129                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4130                 : 0;
4131     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4132 }
4133
4134 /**
4135  * decodes a P_SKIP or B_SKIP macroblock
4136  */
4137 static void decode_mb_skip(H264Context *h){
4138     MpegEncContext * const s = &h->s;
4139     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4140     int mb_type=0;
4141
4142     memset(h->non_zero_count[mb_xy], 0, 16);
4143     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4144
4145     if(MB_FIELD)
4146         mb_type|= MB_TYPE_INTERLACED;
4147
4148     if( h->slice_type == B_TYPE )
4149     {
4150         // just for fill_caches. pred_direct_motion will set the real mb_type
4151         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4152
4153         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4154         pred_direct_motion(h, &mb_type);
4155         mb_type|= MB_TYPE_SKIP;
4156     }
4157     else
4158     {
4159         int mx, my;
4160         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4161
4162         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4163         pred_pskip_motion(h, &mx, &my);
4164         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4165         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4166     }
4167
4168     write_back_motion(h, mb_type);
4169     s->current_picture.mb_type[mb_xy]= mb_type;
4170     s->current_picture.qscale_table[mb_xy]= s->qscale;
4171     h->slice_table[ mb_xy ]= h->slice_num;
4172     h->prev_mb_skipped= 1;
4173 }
4174
4175 /**
4176  * decodes a macroblock
4177  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4178  */
4179 static int decode_mb_cavlc(H264Context *h){
4180     MpegEncContext * const s = &h->s;
4181     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4182     int partition_count;
4183     unsigned int mb_type, cbp;
4184     int dct8x8_allowed= h->pps.transform_8x8_mode;
4185
4186     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4187
4188     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4189     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4190                 down the code */
4191     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4192         if(s->mb_skip_run==-1)
4193             s->mb_skip_run= get_ue_golomb(&s->gb);
4194
4195         if (s->mb_skip_run--) {
4196             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4197                 if(s->mb_skip_run==0)
4198                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4199                 else
4200                     predict_field_decoding_flag(h);
4201             }
4202             decode_mb_skip(h);
4203             return 0;
4204         }
4205     }
4206     if(FRAME_MBAFF){
4207         if( (s->mb_y&1) == 0 )
4208             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4209     }else
4210         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4211
4212     h->prev_mb_skipped= 0;
4213
4214     mb_type= get_ue_golomb(&s->gb);
4215     if(h->slice_type == B_TYPE){
4216         if(mb_type < 23){
4217             partition_count= b_mb_type_info[mb_type].partition_count;
4218             mb_type=         b_mb_type_info[mb_type].type;
4219         }else{
4220             mb_type -= 23;
4221             goto decode_intra_mb;
4222         }
4223     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4224         if(mb_type < 5){
4225             partition_count= p_mb_type_info[mb_type].partition_count;
4226             mb_type=         p_mb_type_info[mb_type].type;
4227         }else{
4228             mb_type -= 5;
4229             goto decode_intra_mb;
4230         }
4231     }else{
4232        assert(h->slice_type == I_TYPE);
4233 decode_intra_mb:
4234         if(mb_type > 25){
4235             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4236             return -1;
4237         }
4238         partition_count=0;
4239         cbp= i_mb_type_info[mb_type].cbp;
4240         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4241         mb_type= i_mb_type_info[mb_type].type;
4242     }
4243
4244     if(MB_FIELD)
4245         mb_type |= MB_TYPE_INTERLACED;
4246
4247     h->slice_table[ mb_xy ]= h->slice_num;
4248
4249     if(IS_INTRA_PCM(mb_type)){
4250         unsigned int x, y;
4251
4252         // We assume these blocks are very rare so we do not optimize it.
4253         align_get_bits(&s->gb);
4254
4255         // The pixels are stored in the same order as levels in h->mb array.
4256         for(y=0; y<16; y++){
4257             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4258             for(x=0; x<16; x++){
4259                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4260                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4261             }
4262         }
4263         for(y=0; y<8; y++){
4264             const int index= 256 + 4*(y&3) + 32*(y>>2);
4265             for(x=0; x<8; x++){
4266                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4267                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4268             }
4269         }
4270         for(y=0; y<8; y++){
4271             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4272             for(x=0; x<8; x++){
4273                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4274                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4275             }
4276         }
4277
4278         // In deblocking, the quantizer is 0
4279         s->current_picture.qscale_table[mb_xy]= 0;
4280         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4281         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4282         // All coeffs are present
4283         memset(h->non_zero_count[mb_xy], 16, 16);
4284
4285         s->current_picture.mb_type[mb_xy]= mb_type;
4286         return 0;
4287     }
4288
4289     if(MB_MBAFF){
4290         h->ref_count[0] <<= 1;
4291         h->ref_count[1] <<= 1;
4292     }
4293
4294     fill_caches(h, mb_type, 0);
4295
4296     //mb_pred
4297     if(IS_INTRA(mb_type)){
4298             int pred_mode;
4299 //            init_top_left_availability(h);
4300             if(IS_INTRA4x4(mb_type)){
4301                 int i;
4302                 int di = 1;
4303                 if(dct8x8_allowed && get_bits1(&s->gb)){
4304                     mb_type |= MB_TYPE_8x8DCT;
4305                     di = 4;
4306                 }
4307
4308 //                fill_intra4x4_pred_table(h);
4309                 for(i=0; i<16; i+=di){
4310                     int mode= pred_intra_mode(h, i);
4311
4312                     if(!get_bits1(&s->gb)){
4313                         const int rem_mode= get_bits(&s->gb, 3);
4314                         mode = rem_mode + (rem_mode >= mode);
4315                     }
4316
4317                     if(di==4)
4318                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4319                     else
4320                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4321                 }
4322                 write_back_intra_pred_mode(h);
4323                 if( check_intra4x4_pred_mode(h) < 0)
4324                     return -1;
4325             }else{
4326                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4327                 if(h->intra16x16_pred_mode < 0)
4328                     return -1;
4329             }
4330
4331             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4332             if(pred_mode < 0)
4333                 return -1;
4334             h->chroma_pred_mode= pred_mode;
4335     }else if(partition_count==4){
4336         int i, j, sub_partition_count[4], list, ref[2][4];
4337
4338         if(h->slice_type == B_TYPE){
4339             for(i=0; i<4; i++){
4340                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4341                 if(h->sub_mb_type[i] >=13){
4342                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4343                     return -1;
4344                 }
4345                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4346                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4347             }
4348             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4349                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4350                 pred_direct_motion(h, &mb_type);
4351                 h->ref_cache[0][scan8[4]] =
4352                 h->ref_cache[1][scan8[4]] =
4353                 h->ref_cache[0][scan8[12]] =
4354                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4355             }
4356         }else{
4357             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4358             for(i=0; i<4; i++){
4359                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4360                 if(h->sub_mb_type[i] >=4){
4361                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4362                     return -1;
4363                 }
4364                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4365                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4366             }
4367         }
4368
4369         for(list=0; list<h->list_count; list++){
4370             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4371             for(i=0; i<4; i++){
4372                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4373                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4374                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4375                     if(tmp>=ref_count){
4376                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4377                         return -1;
4378                     }
4379                     ref[list][i]= tmp;
4380                 }else{
4381                  //FIXME
4382                     ref[list][i] = -1;
4383                 }
4384             }
4385         }
4386
4387         if(dct8x8_allowed)
4388             dct8x8_allowed = get_dct8x8_allowed(h);
4389
4390         for(list=0; list<h->list_count; list++){
4391             for(i=0; i<4; i++){
4392                 if(IS_DIRECT(h->sub_mb_type[i])) {
4393                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4394                     continue;
4395                 }
4396                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4397                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4398
4399                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4400                     const int sub_mb_type= h->sub_mb_type[i];
4401                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4402                     for(j=0; j<sub_partition_count[i]; j++){
4403                         int mx, my;
4404                         const int index= 4*i + block_width*j;
4405                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4406                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4407                         mx += get_se_golomb(&s->gb);
4408                         my += get_se_golomb(&s->gb);
4409                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4410
4411                         if(IS_SUB_8X8(sub_mb_type)){
4412                             mv_cache[ 1 ][0]=
4413                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4414                             mv_cache[ 1 ][1]=
4415                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4416                         }else if(IS_SUB_8X4(sub_mb_type)){
4417                             mv_cache[ 1 ][0]= mx;
4418                             mv_cache[ 1 ][1]= my;
4419                         }else if(IS_SUB_4X8(sub_mb_type)){
4420                             mv_cache[ 8 ][0]= mx;
4421                             mv_cache[ 8 ][1]= my;
4422                         }
4423                         mv_cache[ 0 ][0]= mx;
4424                         mv_cache[ 0 ][1]= my;
4425                     }
4426                 }else{
4427                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4428                     p[0] = p[1]=
4429                     p[8] = p[9]= 0;
4430                 }
4431             }
4432         }
4433     }else if(IS_DIRECT(mb_type)){
4434         pred_direct_motion(h, &mb_type);
4435         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4436     }else{
4437         int list, mx, my, i;
4438          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4439         if(IS_16X16(mb_type)){
4440             for(list=0; list<h->list_count; list++){
4441                     unsigned int val;
4442                     if(IS_DIR(mb_type, 0, list)){
4443                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4444                         if(val >= h->ref_count[list]){
4445                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4446                             return -1;
4447                         }
4448                     }else
4449                         val= LIST_NOT_USED&0xFF;
4450                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4451             }
4452             for(list=0; list<h->list_count; list++){
4453                 unsigned int val;
4454                 if(IS_DIR(mb_type, 0, list)){
4455                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4456                     mx += get_se_golomb(&s->gb);
4457                     my += get_se_golomb(&s->gb);
4458                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4459
4460                     val= pack16to32(mx,my);
4461                 }else
4462                     val=0;
4463                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4464             }
4465         }
4466         else if(IS_16X8(mb_type)){
4467             for(list=0; list<h->list_count; list++){
4468                     for(i=0; i<2; i++){
4469                         unsigned int val;
4470                         if(IS_DIR(mb_type, i, list)){
4471                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4472                             if(val >= h->ref_count[list]){
4473                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4474                                 return -1;
4475                             }
4476                         }else
4477                             val= LIST_NOT_USED&0xFF;
4478                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4479                     }
4480             }
4481             for(list=0; list<h->list_count; list++){
4482                 for(i=0; i<2; i++){
4483                     unsigned int val;
4484                     if(IS_DIR(mb_type, i, list)){
4485                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4486                         mx += get_se_golomb(&s->gb);
4487                         my += get_se_golomb(&s->gb);
4488                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4489
4490                         val= pack16to32(mx,my);
4491                     }else
4492                         val=0;
4493                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4494                 }
4495             }
4496         }else{
4497             assert(IS_8X16(mb_type));
4498             for(list=0; list<h->list_count; list++){
4499                     for(i=0; i<2; i++){
4500                         unsigned int val;
4501                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4502                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4503                             if(val >= h->ref_count[list]){
4504                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4505                                 return -1;
4506                             }
4507                         }else
4508                             val= LIST_NOT_USED&0xFF;
4509                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4510                     }
4511             }
4512             for(list=0; list<h->list_count; list++){
4513                 for(i=0; i<2; i++){
4514                     unsigned int val;
4515                     if(IS_DIR(mb_type, i, list)){
4516                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4517                         mx += get_se_golomb(&s->gb);
4518                         my += get_se_golomb(&s->gb);
4519                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4520
4521                         val= pack16to32(mx,my);
4522                     }else
4523                         val=0;
4524                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4525                 }
4526             }
4527         }
4528     }
4529
4530     if(IS_INTER(mb_type))
4531         write_back_motion(h, mb_type);
4532
4533     if(!IS_INTRA16x16(mb_type)){
4534         cbp= get_ue_golomb(&s->gb);
4535         if(cbp > 47){
4536             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4537             return -1;
4538         }
4539
4540         if(IS_INTRA4x4(mb_type))
4541             cbp= golomb_to_intra4x4_cbp[cbp];
4542         else
4543             cbp= golomb_to_inter_cbp[cbp];
4544     }
4545     h->cbp = cbp;
4546
4547     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4548         if(get_bits1(&s->gb))
4549             mb_type |= MB_TYPE_8x8DCT;
4550     }
4551     s->current_picture.mb_type[mb_xy]= mb_type;
4552
4553     if(cbp || IS_INTRA16x16(mb_type)){
4554         int i8x8, i4x4, chroma_idx;
4555         int dquant;
4556         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4557         const uint8_t *scan, *scan8x8, *dc_scan;
4558
4559 //        fill_non_zero_count_cache(h);
4560
4561         if(IS_INTERLACED(mb_type)){
4562             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4563             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4564             dc_scan= luma_dc_field_scan;
4565         }else{
4566             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4567             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4568             dc_scan= luma_dc_zigzag_scan;
4569         }
4570
4571         dquant= get_se_golomb(&s->gb);
4572
4573         if( dquant > 25 || dquant < -26 ){
4574             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4575             return -1;
4576         }
4577
4578         s->qscale += dquant;
4579         if(((unsigned)s->qscale) > 51){
4580             if(s->qscale<0) s->qscale+= 52;
4581             else            s->qscale-= 52;
4582         }
4583
4584         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4585         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4586         if(IS_INTRA16x16(mb_type)){
4587             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4588                 return -1; //FIXME continue if partitioned and other return -1 too
4589             }
4590
4591             assert((cbp&15) == 0 || (cbp&15) == 15);
4592
4593             if(cbp&15){
4594                 for(i8x8=0; i8x8<4; i8x8++){
4595                     for(i4x4=0; i4x4<4; i4x4++){
4596                         const int index= i4x4 + 4*i8x8;
4597                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4598                             return -1;
4599                         }
4600                     }
4601                 }
4602             }else{
4603                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4604             }
4605         }else{
4606             for(i8x8=0; i8x8<4; i8x8++){
4607                 if(cbp & (1<<i8x8)){
4608                     if(IS_8x8DCT(mb_type)){
4609                         DCTELEM *buf = &h->mb[64*i8x8];
4610                         uint8_t *nnz;
4611                         for(i4x4=0; i4x4<4; i4x4++){
4612                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4613                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4614                                 return -1;
4615                         }
4616                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4617                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4618                     }else{
4619                         for(i4x4=0; i4x4<4; i4x4++){
4620                             const int index= i4x4 + 4*i8x8;
4621
4622                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4623                                 return -1;
4624                             }
4625                         }
4626                     }
4627                 }else{
4628                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4629                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4630                 }
4631             }
4632         }
4633
4634         if(cbp&0x30){
4635             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4636                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4637                     return -1;
4638                 }
4639         }
4640
4641         if(cbp&0x20){
4642             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4643                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4644                 for(i4x4=0; i4x4<4; i4x4++){
4645                     const int index= 16 + 4*chroma_idx + i4x4;
4646                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4647                         return -1;
4648                     }
4649                 }
4650             }
4651         }else{
4652             uint8_t * const nnz= &h->non_zero_count_cache[0];
4653             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4654             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4655         }
4656     }else{
4657         uint8_t * const nnz= &h->non_zero_count_cache[0];
4658         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4659         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4660         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4661     }
4662     s->current_picture.qscale_table[mb_xy]= s->qscale;
4663     write_back_non_zero_count(h);
4664
4665     if(MB_MBAFF){
4666         h->ref_count[0] >>= 1;
4667         h->ref_count[1] >>= 1;
4668     }
4669
4670     return 0;
4671 }
4672
4673 static int decode_cabac_field_decoding_flag(H264Context *h) {
4674     MpegEncContext * const s = &h->s;
4675     const int mb_x = s->mb_x;
4676     const int mb_y = s->mb_y & ~1;
4677     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4678     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4679
4680     unsigned int ctx = 0;
4681
4682     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4683         ctx += 1;
4684     }
4685     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4686         ctx += 1;
4687     }
4688
4689     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4690 }
4691
4692 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4693     uint8_t *state= &h->cabac_state[ctx_base];
4694     int mb_type;
4695
4696     if(intra_slice){
4697         MpegEncContext * const s = &h->s;
4698         const int mba_xy = h->left_mb_xy[0];
4699         const int mbb_xy = h->top_mb_xy;
4700         int ctx=0;
4701         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4702             ctx++;
4703         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4704             ctx++;
4705         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4706             return 0;   /* I4x4 */
4707         state += 2;
4708     }else{
4709         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4710             return 0;   /* I4x4 */
4711     }
4712
4713     if( get_cabac_terminate( &h->cabac ) )
4714         return 25;  /* PCM */
4715
4716     mb_type = 1; /* I16x16 */
4717     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4718     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4719         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4720     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4721     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4722     return mb_type;
4723 }
4724
4725 static int decode_cabac_mb_type( H264Context *h ) {
4726     MpegEncContext * const s = &h->s;
4727
4728     if( h->slice_type == I_TYPE ) {
4729         return decode_cabac_intra_mb_type(h, 3, 1);
4730     } else if( h->slice_type == P_TYPE ) {
4731         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4732             /* P-type */
4733             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4734                 /* P_L0_D16x16, P_8x8 */
4735                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4736             } else {
4737                 /* P_L0_D8x16, P_L0_D16x8 */
4738                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4739             }
4740         } else {
4741             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4742         }
4743     } else if( h->slice_type == B_TYPE ) {
4744         const int mba_xy = h->left_mb_xy[0];
4745         const int mbb_xy = h->top_mb_xy;
4746         int ctx = 0;
4747         int bits;
4748
4749         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4750             ctx++;
4751         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4752             ctx++;
4753
4754         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4755             return 0; /* B_Direct_16x16 */
4756
4757         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4758             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4759         }
4760
4761         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4762         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4763         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4764         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4765         if( bits < 8 )
4766             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4767         else if( bits == 13 ) {
4768             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4769         } else if( bits == 14 )
4770             return 11; /* B_L1_L0_8x16 */
4771         else if( bits == 15 )
4772             return 22; /* B_8x8 */
4773
4774         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4775         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4776     } else {
4777         /* TODO SI/SP frames? */
4778         return -1;
4779     }
4780 }
4781
4782 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4783     MpegEncContext * const s = &h->s;
4784     int mba_xy, mbb_xy;
4785     int ctx = 0;
4786
4787     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4788         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4789         mba_xy = mb_xy - 1;
4790         if( (mb_y&1)
4791             && h->slice_table[mba_xy] == h->slice_num
4792             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4793             mba_xy += s->mb_stride;
4794         if( MB_FIELD ){
4795             mbb_xy = mb_xy - s->mb_stride;
4796             if( !(mb_y&1)
4797                 && h->slice_table[mbb_xy] == h->slice_num
4798                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4799                 mbb_xy -= s->mb_stride;
4800         }else
4801             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4802     }else{
4803         int mb_xy = mb_x + mb_y*s->mb_stride;
4804         mba_xy = mb_xy - 1;
4805         mbb_xy = mb_xy - s->mb_stride;
4806     }
4807
4808     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4809         ctx++;
4810     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4811         ctx++;
4812
4813     if( h->slice_type == B_TYPE )
4814         ctx += 13;
4815     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4816 }
4817
4818 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4819     int mode = 0;
4820
4821     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4822         return pred_mode;
4823
4824     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4825     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4826     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4827
4828     if( mode >= pred_mode )
4829         return mode + 1;
4830     else
4831         return mode;
4832 }
4833
4834 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4835     const int mba_xy = h->left_mb_xy[0];
4836     const int mbb_xy = h->top_mb_xy;
4837
4838     int ctx = 0;
4839
4840     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4841     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4842         ctx++;
4843
4844     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4845         ctx++;
4846
4847     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4848         return 0;
4849
4850     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4851         return 1;
4852     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4853         return 2;
4854     else
4855         return 3;
4856 }
4857
4858 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4859     int cbp_b, cbp_a, ctx, cbp = 0;
4860
4861     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4862     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4863
4864     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4865     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4866     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4867     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4868     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4869     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4870     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4871     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4872     return cbp;
4873 }
4874 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4875     int ctx;
4876     int cbp_a, cbp_b;
4877
4878     cbp_a = (h->left_cbp>>4)&0x03;
4879     cbp_b = (h-> top_cbp>>4)&0x03;
4880
4881     ctx = 0;
4882     if( cbp_a > 0 ) ctx++;
4883     if( cbp_b > 0 ) ctx += 2;
4884     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4885         return 0;
4886
4887     ctx = 4;
4888     if( cbp_a == 2 ) ctx++;
4889     if( cbp_b == 2 ) ctx += 2;
4890     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4891 }
4892 static int decode_cabac_mb_dqp( H264Context *h) {
4893     int   ctx = 0;
4894     int   val = 0;
4895
4896     if( h->last_qscale_diff != 0 )
4897         ctx++;
4898
4899     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4900         if( ctx < 2 )
4901             ctx = 2;
4902         else
4903             ctx = 3;
4904         val++;
4905         if(val > 102) //prevent infinite loop
4906             return INT_MIN;
4907     }
4908
4909     if( val&0x01 )
4910         return (val + 1)/2;
4911     else
4912         return -(val + 1)/2;
4913 }
4914 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4915     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4916         return 0;   /* 8x8 */
4917     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4918         return 1;   /* 8x4 */
4919     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4920         return 2;   /* 4x8 */
4921     return 3;       /* 4x4 */
4922 }
4923 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4924     int type;
4925     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4926         return 0;   /* B_Direct_8x8 */
4927     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4928         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4929     type = 3;
4930     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4931         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4932             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4933         type += 4;
4934     }
4935     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4936     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4937     return type;
4938 }
4939
4940 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
4941     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
4942 }
4943
4944 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
4945     int refa = h->ref_cache[list][scan8[n] - 1];
4946     int refb = h->ref_cache[list][scan8[n] - 8];
4947     int ref  = 0;
4948     int ctx  = 0;
4949
4950     if( h->slice_type == B_TYPE) {
4951         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
4952             ctx++;
4953         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
4954             ctx += 2;
4955     } else {
4956         if( refa > 0 )
4957             ctx++;
4958         if( refb > 0 )
4959             ctx += 2;
4960     }
4961
4962     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
4963         ref++;
4964         if( ctx < 4 )
4965             ctx = 4;
4966         else
4967             ctx = 5;
4968         if(ref >= 32 /*h->ref_list[list]*/){
4969             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
4970             return 0; //FIXME we should return -1 and check the return everywhere
4971         }
4972     }
4973     return ref;
4974 }
4975
4976 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
4977     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
4978                abs( h->mvd_cache[list][scan8[n] - 8][l] );
4979     int ctxbase = (l == 0) ? 40 : 47;
4980     int ctx, mvd;
4981
4982     if( amvd < 3 )
4983         ctx = 0;
4984     else if( amvd > 32 )
4985         ctx = 2;
4986     else
4987         ctx = 1;
4988
4989     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
4990         return 0;
4991
4992     mvd= 1;
4993     ctx= 3;
4994     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
4995         mvd++;
4996         if( ctx < 6 )
4997             ctx++;
4998     }
4999
5000     if( mvd >= 9 ) {
5001         int k = 3;
5002         while( get_cabac_bypass( &h->cabac ) ) {
5003             mvd += 1 << k;
5004             k++;
5005             if(k>24){
5006                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5007                 return INT_MIN;
5008             }
5009         }
5010         while( k-- ) {
5011             if( get_cabac_bypass( &h->cabac ) )
5012                 mvd += 1 << k;
5013         }
5014     }
5015     return get_cabac_bypass_sign( &h->cabac, -mvd );
5016 }
5017
5018 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5019     int nza, nzb;
5020     int ctx = 0;
5021
5022     if( cat == 0 ) {
5023         nza = h->left_cbp&0x100;
5024         nzb = h-> top_cbp&0x100;
5025     } else if( cat == 1 || cat == 2 ) {
5026         nza = h->non_zero_count_cache[scan8[idx] - 1];
5027         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5028     } else if( cat == 3 ) {
5029         nza = (h->left_cbp>>(6+idx))&0x01;
5030         nzb = (h-> top_cbp>>(6+idx))&0x01;
5031     } else {
5032         assert(cat == 4);
5033         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5034         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5035     }
5036
5037     if( nza > 0 )
5038         ctx++;
5039
5040     if( nzb > 0 )
5041         ctx += 2;
5042
5043     return ctx + 4 * cat;
5044 }
5045
/**
 * Lookup table with 63 entries holding values 0..8 in non-decreasing order.
 * NOTE(review): judging by the name, this maps a coefficient position in an
 * 8x8 block to the context offset of its "last coefficient" flag; the code
 * that reads it is not visible from here -- confirm against the user.
 */
static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
5052
5053 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5054     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5055     static const int significant_coeff_flag_offset[2][6] = {
5056       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5057       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5058     };
5059     static const int last_coeff_flag_offset[2][6] = {
5060       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5061       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5062     };
5063     static const int coeff_abs_level_m1_offset[6] = {
5064         227+0, 227+10, 227+20, 227+30, 227+39, 426
5065     };
5066     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5067       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5068         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5069         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5070        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5071       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5072         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5073         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5074         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5075     };
5076
5077     int index[64];
5078
5079     int av_unused last;
5080     int coeff_count = 0;
5081
5082     int abslevel1 = 1;
5083     int abslevelgt1 = 0;
5084
5085     uint8_t *significant_coeff_ctx_base;
5086     uint8_t *last_coeff_ctx_base;
5087     uint8_t *abs_level_m1_ctx_base;
5088
5089 #ifndef ARCH_X86
5090 #define CABAC_ON_STACK
5091 #endif
5092 #ifdef CABAC_ON_STACK
5093 #define CC &cc
5094     CABACContext cc;
5095     cc.range     = h->cabac.range;
5096     cc.low       = h->cabac.low;
5097     cc.bytestream= h->cabac.bytestream;
5098 #else
5099 #define CC &h->cabac
5100 #endif
5101
5102
5103     /* cat: 0-> DC 16x16  n = 0
5104      *      1-> AC 16x16  n = luma4x4idx
5105      *      2-> Luma4x4   n = luma4x4idx
5106      *      3-> DC Chroma n = iCbCr
5107      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5108      *      5-> Luma8x8   n = 4 * luma8x8idx
5109      */
5110
5111     /* read coded block flag */
5112     if( cat != 5 ) {
5113         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5114             if( cat == 1 || cat == 2 )
5115                 h->non_zero_count_cache[scan8[n]] = 0;
5116             else if( cat == 4 )
5117                 h->non_zero_count_cache[scan8[16+n]] = 0;
5118 #ifdef CABAC_ON_STACK
5119             h->cabac.range     = cc.range     ;
5120             h->cabac.low       = cc.low       ;
5121             h->cabac.bytestream= cc.bytestream;
5122 #endif
5123             return;
5124         }
5125     }
5126
5127     significant_coeff_ctx_base = h->cabac_state
5128         + significant_coeff_flag_offset[MB_FIELD][cat];
5129     last_coeff_ctx_base = h->cabac_state
5130         + last_coeff_flag_offset[MB_FIELD][cat];
5131     abs_level_m1_ctx_base = h->cabac_state
5132         + coeff_abs_level_m1_offset[cat];
5133
5134     if( cat == 5 ) {
5135 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5136         for(last= 0; last < coefs; last++) { \
5137             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5138             if( get_cabac( CC, sig_ctx )) { \
5139                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5140                 index[coeff_count++] = last; \
5141                 if( get_cabac( CC, last_ctx ) ) { \
5142                     last= max_coeff; \
5143                     break; \
5144                 } \
5145             } \
5146         }\
5147         if( last == max_coeff -1 ) {\
5148             index[coeff_count++] = last;\
5149         }
5150         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5151 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5152         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5153     } else {
5154         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5155 #else
5156         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5157     } else {
5158         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5159 #endif
5160     }
5161     assert(coeff_count > 0);
5162
5163     if( cat == 0 )
5164         h->cbp_table[mb_xy] |= 0x100;
5165     else if( cat == 1 || cat == 2 )
5166         h->non_zero_count_cache[scan8[n]] = coeff_count;
5167     else if( cat == 3 )
5168         h->cbp_table[mb_xy] |= 0x40 << n;
5169     else if( cat == 4 )
5170         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5171     else {
5172         assert( cat == 5 );
5173         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5174     }
5175
5176     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5177         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5178         int j= scantable[index[coeff_count]];
5179
5180         if( get_cabac( CC, ctx ) == 0 ) {
5181             if( !qmul ) {
5182                 block[j] = get_cabac_bypass_sign( CC, -1);
5183             }else{
5184                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5185             }
5186
5187             abslevel1++;
5188         } else {
5189             int coeff_abs = 2;
5190             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5191             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5192                 coeff_abs++;
5193             }
5194
5195             if( coeff_abs >= 15 ) {
5196                 int j = 0;
5197                 while( get_cabac_bypass( CC ) ) {
5198                     j++;
5199                 }
5200
5201                 coeff_abs=1;
5202                 while( j-- ) {
5203                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5204                 }
5205                 coeff_abs+= 14;
5206             }
5207
5208             if( !qmul ) {
5209                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5210                 else                                block[j] =  coeff_abs;
5211             }else{
5212                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5213                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5214             }
5215
5216             abslevelgt1++;
5217         }
5218     }
5219 #ifdef CABAC_ON_STACK
5220             h->cabac.range     = cc.range     ;
5221             h->cabac.low       = cc.low       ;
5222             h->cabac.bytestream= cc.bytestream;
5223 #endif
5224
5225 }
5226
5227 static inline void compute_mb_neighbors(H264Context *h)
5228 {
5229     MpegEncContext * const s = &h->s;
5230     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5231     h->top_mb_xy     = mb_xy - s->mb_stride;
5232     h->left_mb_xy[0] = mb_xy - 1;
5233     if(FRAME_MBAFF){
5234         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5235         const int top_pair_xy      = pair_xy     - s->mb_stride;
5236         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5237         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5238         const int curr_mb_frame_flag = !MB_FIELD;
5239         const int bottom = (s->mb_y & 1);
5240         if (bottom
5241                 ? !curr_mb_frame_flag // bottom macroblock
5242                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5243                 ) {
5244             h->top_mb_xy -= s->mb_stride;
5245         }
5246         if (left_mb_frame_flag != curr_mb_frame_flag) {
5247             h->left_mb_xy[0] = pair_xy - 1;
5248         }
5249     }
5250     return;
5251 }
5252
5253 /**
5254  * decodes a macroblock
5255  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5256  */
5257 static int decode_mb_cabac(H264Context *h) {
5258     MpegEncContext * const s = &h->s;
5259     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5260     int mb_type, partition_count, cbp = 0;
5261     int dct8x8_allowed= h->pps.transform_8x8_mode;
5262
5263     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5264
5265     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5266     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5267         int skip;
5268         /* a skipped mb needs the aff flag from the following mb */
5269         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5270             predict_field_decoding_flag(h);
5271         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5272             skip = h->next_mb_skipped;
5273         else
5274             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5275         /* read skip flags */
5276         if( skip ) {
5277             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5278                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5279                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5280                 if(h->next_mb_skipped)
5281                     predict_field_decoding_flag(h);
5282                 else
5283                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5284             }
5285
5286             decode_mb_skip(h);
5287
5288             h->cbp_table[mb_xy] = 0;
5289             h->chroma_pred_mode_table[mb_xy] = 0;
5290             h->last_qscale_diff = 0;
5291
5292             return 0;
5293
5294         }
5295     }
5296     if(FRAME_MBAFF){
5297         if( (s->mb_y&1) == 0 )
5298             h->mb_mbaff =
5299             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5300     }else
5301         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5302
5303     h->prev_mb_skipped = 0;
5304
5305     compute_mb_neighbors(h);
5306     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5307         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5308         return -1;
5309     }
5310
5311     if( h->slice_type == B_TYPE ) {
5312         if( mb_type < 23 ){
5313             partition_count= b_mb_type_info[mb_type].partition_count;
5314             mb_type=         b_mb_type_info[mb_type].type;
5315         }else{
5316             mb_type -= 23;
5317             goto decode_intra_mb;
5318         }
5319     } else if( h->slice_type == P_TYPE ) {
5320         if( mb_type < 5) {
5321             partition_count= p_mb_type_info[mb_type].partition_count;
5322             mb_type=         p_mb_type_info[mb_type].type;
5323         } else {
5324             mb_type -= 5;
5325             goto decode_intra_mb;
5326         }
5327     } else {
5328        assert(h->slice_type == I_TYPE);
5329 decode_intra_mb:
5330         partition_count = 0;
5331         cbp= i_mb_type_info[mb_type].cbp;
5332         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5333         mb_type= i_mb_type_info[mb_type].type;
5334     }
5335     if(MB_FIELD)
5336         mb_type |= MB_TYPE_INTERLACED;
5337
5338     h->slice_table[ mb_xy ]= h->slice_num;
5339
5340     if(IS_INTRA_PCM(mb_type)) {
5341         const uint8_t *ptr;
5342         unsigned int x, y;
5343
5344         // We assume these blocks are very rare so we do not optimize it.
5345         // FIXME The two following lines get the bitstream position in the cabac
5346         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5347         ptr= h->cabac.bytestream;
5348         if(h->cabac.low&0x1) ptr--;
5349         if(CABAC_BITS==16){
5350             if(h->cabac.low&0x1FF) ptr--;
5351         }
5352
5353         // The pixels are stored in the same order as levels in h->mb array.
5354         for(y=0; y<16; y++){
5355             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5356             for(x=0; x<16; x++){
5357                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5358                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5359             }
5360         }
5361         for(y=0; y<8; y++){
5362             const int index= 256 + 4*(y&3) + 32*(y>>2);
5363             for(x=0; x<8; x++){
5364                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5365                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5366             }
5367         }
5368         for(y=0; y<8; y++){
5369             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5370             for(x=0; x<8; x++){
5371                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5372                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5373             }
5374         }
5375
5376         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5377
5378         // All blocks are present
5379         h->cbp_table[mb_xy] = 0x1ef;
5380         h->chroma_pred_mode_table[mb_xy] = 0;
5381         // In deblocking, the quantizer is 0
5382         s->current_picture.qscale_table[mb_xy]= 0;
5383         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5384         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5385         // All coeffs are present
5386         memset(h->non_zero_count[mb_xy], 16, 16);
5387         s->current_picture.mb_type[mb_xy]= mb_type;
5388         return 0;
5389     }
5390
5391     if(MB_MBAFF){
5392         h->ref_count[0] <<= 1;
5393         h->ref_count[1] <<= 1;
5394     }
5395
5396     fill_caches(h, mb_type, 0);
5397
5398     if( IS_INTRA( mb_type ) ) {
5399         int i, pred_mode;
5400         if( IS_INTRA4x4( mb_type ) ) {
5401             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5402                 mb_type |= MB_TYPE_8x8DCT;
5403                 for( i = 0; i < 16; i+=4 ) {
5404                     int pred = pred_intra_mode( h, i );
5405                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5406                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5407                 }
5408             } else {
5409                 for( i = 0; i < 16; i++ ) {
5410                     int pred = pred_intra_mode( h, i );
5411                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5412
5413                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5414                 }
5415             }
5416             write_back_intra_pred_mode(h);
5417             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5418         } else {
5419             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5420             if( h->intra16x16_pred_mode < 0 ) return -1;
5421         }
5422         h->chroma_pred_mode_table[mb_xy] =
5423         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5424
5425         pred_mode= check_intra_pred_mode( h, pred_mode );
5426         if( pred_mode < 0 ) return -1;
5427         h->chroma_pred_mode= pred_mode;
5428     } else if( partition_count == 4 ) {
5429         int i, j, sub_partition_count[4], list, ref[2][4];
5430
5431         if( h->slice_type == B_TYPE ) {
5432             for( i = 0; i < 4; i++ ) {
5433                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5434                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5435                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5436             }
5437             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5438                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5439                 pred_direct_motion(h, &mb_type);
5440                 h->ref_cache[0][scan8[4]] =
5441                 h->ref_cache[1][scan8[4]] =
5442                 h->ref_cache[0][scan8[12]] =
5443                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5444                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5445                     for( i = 0; i < 4; i++ )
5446                         if( IS_DIRECT(h->sub_mb_type[i]) )
5447                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5448                 }
5449             }
5450         } else {
5451             for( i = 0; i < 4; i++ ) {
5452                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5453                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5454                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5455             }
5456         }
5457
5458         for( list = 0; list < h->list_count; list++ ) {
5459                 for( i = 0; i < 4; i++ ) {
5460                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5461                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5462                         if( h->ref_count[list] > 1 )
5463                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5464                         else
5465                             ref[list][i] = 0;
5466                     } else {
5467                         ref[list][i] = -1;
5468                     }
5469                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5470                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5471                 }
5472         }
5473
5474         if(dct8x8_allowed)
5475             dct8x8_allowed = get_dct8x8_allowed(h);
5476
5477         for(list=0; list<h->list_count; list++){
5478             for(i=0; i<4; i++){
5479                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5480                 if(IS_DIRECT(h->sub_mb_type[i])){
5481                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5482                     continue;
5483                 }
5484
5485                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5486                     const int sub_mb_type= h->sub_mb_type[i];
5487                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5488                     for(j=0; j<sub_partition_count[i]; j++){
5489                         int mpx, mpy;
5490                         int mx, my;
5491                         const int index= 4*i + block_width*j;
5492                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5493                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5494                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5495
5496                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5497                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5498                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5499
5500                         if(IS_SUB_8X8(sub_mb_type)){
5501                             mv_cache[ 1 ][0]=
5502                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5503                             mv_cache[ 1 ][1]=
5504                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5505
5506                             mvd_cache[ 1 ][0]=
5507                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5508                             mvd_cache[ 1 ][1]=
5509                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5510                         }else if(IS_SUB_8X4(sub_mb_type)){
5511                             mv_cache[ 1 ][0]= mx;
5512                             mv_cache[ 1 ][1]= my;
5513
5514                             mvd_cache[ 1 ][0]= mx - mpx;
5515                             mvd_cache[ 1 ][1]= my - mpy;
5516                         }else if(IS_SUB_4X8(sub_mb_type)){
5517                             mv_cache[ 8 ][0]= mx;
5518                             mv_cache[ 8 ][1]= my;
5519
5520                             mvd_cache[ 8 ][0]= mx - mpx;
5521                             mvd_cache[ 8 ][1]= my - mpy;
5522                         }
5523                         mv_cache[ 0 ][0]= mx;
5524                         mv_cache[ 0 ][1]= my;
5525
5526                         mvd_cache[ 0 ][0]= mx - mpx;
5527                         mvd_cache[ 0 ][1]= my - mpy;
5528                     }
5529                 }else{
5530                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5531                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5532                     p[0] = p[1] = p[8] = p[9] = 0;
5533                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5534                 }
5535             }
5536         }
5537     } else if( IS_DIRECT(mb_type) ) {
5538         pred_direct_motion(h, &mb_type);
5539         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5540         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5541         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5542     } else {
5543         int list, mx, my, i, mpx, mpy;
5544         if(IS_16X16(mb_type)){
5545             for(list=0; list<h->list_count; list++){
5546                 if(IS_DIR(mb_type, 0, list)){
5547                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5548                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5549                 }else
5550                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5551             }
5552             for(list=0; list<h->list_count; list++){
5553                 if(IS_DIR(mb_type, 0, list)){
5554                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5555
5556                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5557                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5558                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5559
5560                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5561                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5562                 }else
5563                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5564             }
5565         }
5566         else if(IS_16X8(mb_type)){
5567             for(list=0; list<h->list_count; list++){
5568                     for(i=0; i<2; i++){
5569                         if(IS_DIR(mb_type, i, list)){
5570                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5571                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5572                         }else
5573                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5574                     }
5575             }
5576             for(list=0; list<h->list_count; list++){
5577                 for(i=0; i<2; i++){
5578                     if(IS_DIR(mb_type, i, list)){
5579                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5580                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5581                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5582                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5583
5584                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5585                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5586                     }else{
5587                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5588                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5589                     }
5590                 }
5591             }
5592         }else{
5593             assert(IS_8X16(mb_type));
5594             for(list=0; list<h->list_count; list++){
5595                     for(i=0; i<2; i++){
5596                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5597                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5598                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5599                         }else
5600                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5601                     }
5602             }
5603             for(list=0; list<h->list_count; list++){
5604                 for(i=0; i<2; i++){
5605                     if(IS_DIR(mb_type, i, list)){
5606                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5607                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5608                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5609
5610                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5611                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5612                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5613                     }else{
5614                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5615                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5616                     }
5617                 }
5618             }
5619         }
5620     }
5621
5622    if( IS_INTER( mb_type ) ) {
5623         h->chroma_pred_mode_table[mb_xy] = 0;
5624         write_back_motion( h, mb_type );
5625    }
5626
5627     if( !IS_INTRA16x16( mb_type ) ) {
5628         cbp  = decode_cabac_mb_cbp_luma( h );
5629         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5630     }
5631
5632     h->cbp_table[mb_xy] = h->cbp = cbp;
5633
5634     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5635         if( decode_cabac_mb_transform_size( h ) )
5636             mb_type |= MB_TYPE_8x8DCT;
5637     }
5638     s->current_picture.mb_type[mb_xy]= mb_type;
5639
5640     if( cbp || IS_INTRA16x16( mb_type ) ) {
5641         const uint8_t *scan, *scan8x8, *dc_scan;
5642         const uint32_t *qmul;
5643         int dqp;
5644
5645         if(IS_INTERLACED(mb_type)){
5646             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5647             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5648             dc_scan= luma_dc_field_scan;
5649         }else{
5650             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5651             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5652             dc_scan= luma_dc_zigzag_scan;
5653         }
5654
5655         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5656         if( dqp == INT_MIN ){
5657             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5658             return -1;
5659         }
5660         s->qscale += dqp;
5661         if(((unsigned)s->qscale) > 51){
5662             if(s->qscale<0) s->qscale+= 52;
5663             else            s->qscale-= 52;
5664         }
5665         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5666         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5667
5668         if( IS_INTRA16x16( mb_type ) ) {
5669             int i;
5670             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5671             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5672
5673             if( cbp&15 ) {
5674                 qmul = h->dequant4_coeff[0][s->qscale];
5675                 for( i = 0; i < 16; i++ ) {
5676                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5677                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5678                 }
5679             } else {
5680                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5681             }
5682         } else {
5683             int i8x8, i4x4;
5684             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5685                 if( cbp & (1<<i8x8) ) {
5686                     if( IS_8x8DCT(mb_type) ) {
5687                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5688                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5689                     } else {
5690                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5691                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5692                             const int index = 4*i8x8 + i4x4;
5693                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5694 //START_TIMER
5695                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5696 //STOP_TIMER("decode_residual")
5697                         }
5698                     }
5699                 } else {
5700                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5701                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5702                 }
5703             }
5704         }
5705
5706         if( cbp&0x30 ){
5707             int c;
5708             for( c = 0; c < 2; c++ ) {
5709                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5710                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5711             }
5712         }
5713
5714         if( cbp&0x20 ) {
5715             int c, i;
5716             for( c = 0; c < 2; c++ ) {
5717                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5718                 for( i = 0; i < 4; i++ ) {
5719                     const int index = 16 + 4 * c + i;
5720                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5721                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5722                 }
5723             }
5724         } else {
5725             uint8_t * const nnz= &h->non_zero_count_cache[0];
5726             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5727             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5728         }
5729     } else {
5730         uint8_t * const nnz= &h->non_zero_count_cache[0];
5731         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5732         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5733         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5734         h->last_qscale_diff = 0;
5735     }
5736
5737     s->current_picture.qscale_table[mb_xy]= s->qscale;
5738     write_back_non_zero_count(h);
5739
5740     if(MB_MBAFF){
5741         h->ref_count[0] >>= 1;
5742         h->ref_count[1] >>= 1;
5743     }
5744
5745     return 0;
5746 }
5747
5748
5749 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5750     int i, d;
5751     const int index_a = qp + h->slice_alpha_c0_offset;
5752     const int alpha = (alpha_table+52)[index_a];
5753     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5754
5755     if( bS[0] < 4 ) {
5756         int8_t tc[4];
5757         for(i=0; i<4; i++)
5758             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5759         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5760     } else {
5761         /* 16px edge length, because bS=4 is triggered by being at
5762          * the edge of an intra MB, so all 4 bS are the same */
5763             for( d = 0; d < 16; d++ ) {
5764                 const int p0 = pix[-1];
5765                 const int p1 = pix[-2];
5766                 const int p2 = pix[-3];
5767
5768                 const int q0 = pix[0];
5769                 const int q1 = pix[1];
5770                 const int q2 = pix[2];
5771
5772                 if( FFABS( p0 - q0 ) < alpha &&
5773                     FFABS( p1 - p0 ) < beta &&
5774                     FFABS( q1 - q0 ) < beta ) {
5775
5776                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5777                         if( FFABS( p2 - p0 ) < beta)
5778                         {
5779                             const int p3 = pix[-4];
5780                             /* p0', p1', p2' */
5781                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5782                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5783                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5784                         } else {
5785                             /* p0' */
5786                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5787                         }
5788                         if( FFABS( q2 - q0 ) < beta)
5789                         {
5790                             const int q3 = pix[3];
5791                             /* q0', q1', q2' */
5792                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5793                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5794                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5795                         } else {
5796                             /* q0' */
5797                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5798                         }
5799                     }else{
5800                         /* p0', q0' */
5801                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5802                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5803                     }
5804                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5805                 }
5806                 pix += stride;
5807             }
5808     }
5809 }
5810 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5811     int i;
5812     const int index_a = qp + h->slice_alpha_c0_offset;
5813     const int alpha = (alpha_table+52)[index_a];
5814     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5815
5816     if( bS[0] < 4 ) {
5817         int8_t tc[4];
5818         for(i=0; i<4; i++)
5819             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5820         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5821     } else {
5822         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5823     }
5824 }
5825
5826 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5827     int i;
5828     for( i = 0; i < 16; i++, pix += stride) {
5829         int index_a;
5830         int alpha;
5831         int beta;
5832
5833         int qp_index;
5834         int bS_index = (i >> 1);
5835         if (!MB_FIELD) {
5836             bS_index &= ~1;
5837             bS_index |= (i & 1);
5838         }
5839
5840         if( bS[bS_index] == 0 ) {
5841             continue;
5842         }
5843
5844         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5845         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5846         alpha = (alpha_table+52)[index_a];
5847         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5848
5849         if( bS[bS_index] < 4 ) {
5850             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5851             const int p0 = pix[-1];
5852             const int p1 = pix[-2];
5853             const int p2 = pix[-3];
5854             const int q0 = pix[0];
5855             const int q1 = pix[1];
5856             const int q2 = pix[2];
5857
5858             if( FFABS( p0 - q0 ) < alpha &&
5859                 FFABS( p1 - p0 ) < beta &&
5860                 FFABS( q1 - q0 ) < beta ) {
5861                 int tc = tc0;
5862                 int i_delta;
5863
5864                 if( FFABS( p2 - p0 ) < beta ) {
5865                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5866                     tc++;
5867                 }
5868                 if( FFABS( q2 - q0 ) < beta ) {
5869                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5870                     tc++;
5871                 }
5872
5873                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5874                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5875                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5876                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5877             }
5878         }else{
5879             const int p0 = pix[-1];
5880             const int p1 = pix[-2];
5881             const int p2 = pix[-3];
5882
5883             const int q0 = pix[0];
5884             const int q1 = pix[1];
5885             const int q2 = pix[2];
5886
5887             if( FFABS( p0 - q0 ) < alpha &&
5888                 FFABS( p1 - p0 ) < beta &&
5889                 FFABS( q1 - q0 ) < beta ) {
5890
5891                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5892                     if( FFABS( p2 - p0 ) < beta)
5893                     {
5894                         const int p3 = pix[-4];
5895                         /* p0', p1', p2' */
5896                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5897                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5898                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5899                     } else {
5900                         /* p0' */
5901                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5902                     }
5903                     if( FFABS( q2 - q0 ) < beta)
5904                     {
5905                         const int q3 = pix[3];
5906                         /* q0', q1', q2' */
5907                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5908                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5909                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5910                     } else {
5911                         /* q0' */
5912                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5913                     }
5914                 }else{
5915                     /* p0', q0' */
5916                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5917                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5918                 }
5919                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5920             }
5921         }
5922     }
5923 }
5924 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5925     int i;
5926     for( i = 0; i < 8; i++, pix += stride) {
5927         int index_a;
5928         int alpha;
5929         int beta;
5930
5931         int qp_index;
5932         int bS_index = i;
5933
5934         if( bS[bS_index] == 0 ) {
5935             continue;
5936         }
5937
5938         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5939         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5940         alpha = (alpha_table+52)[index_a];
5941         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5942
5943         if( bS[bS_index] < 4 ) {
5944             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
5945             const int p0 = pix[-1];
5946             const int p1 = pix[-2];
5947             const int q0 = pix[0];
5948             const int q1 = pix[1];
5949
5950             if( FFABS( p0 - q0 ) < alpha &&
5951                 FFABS( p1 - p0 ) < beta &&
5952                 FFABS( q1 - q0 ) < beta ) {
5953                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5954
5955                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5956                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5957                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5958             }
5959         }else{
5960             const int p0 = pix[-1];
5961             const int p1 = pix[-2];
5962             const int q0 = pix[0];
5963             const int q1 = pix[1];
5964
5965             if( FFABS( p0 - q0 ) < alpha &&
5966                 FFABS( p1 - p0 ) < beta &&
5967                 FFABS( q1 - q0 ) < beta ) {
5968
5969                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5970                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5971                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5972             }
5973         }
5974     }
5975 }
5976
5977 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5978     int i, d;
5979     const int index_a = qp + h->slice_alpha_c0_offset;
5980     const int alpha = (alpha_table+52)[index_a];
5981     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5982     const int pix_next  = stride;
5983
5984     if( bS[0] < 4 ) {
5985         int8_t tc[4];
5986         for(i=0; i<4; i++)
5987             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5988         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
5989     } else {
5990         /* 16px edge length, see filter_mb_edgev */
5991             for( d = 0; d < 16; d++ ) {
5992                 const int p0 = pix[-1*pix_next];
5993                 const int p1 = pix[-2*pix_next];
5994                 const int p2 = pix[-3*pix_next];
5995                 const int q0 = pix[0];
5996                 const int q1 = pix[1*pix_next];
5997                 const int q2 = pix[2*pix_next];
5998
5999                 if( FFABS( p0 - q0 ) < alpha &&
6000                     FFABS( p1 - p0 ) < beta &&
6001                     FFABS( q1 - q0 ) < beta ) {
6002
6003                     const int p3 = pix[-4*pix_next];
6004                     const int q3 = pix[ 3*pix_next];
6005
6006                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6007                         if( FFABS( p2 - p0 ) < beta) {
6008                             /* p0', p1', p2' */
6009                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6010                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6011                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6012                         } else {
6013                             /* p0' */
6014                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6015                         }
6016                         if( FFABS( q2 - q0 ) < beta) {
6017                             /* q0', q1', q2' */
6018                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6019                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6020                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6021                         } else {
6022                             /* q0' */
6023                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6024                         }
6025                     }else{
6026                         /* p0', q0' */
6027                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6028                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6029                     }
6030                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6031                 }
6032                 pix++;
6033             }
6034     }
6035 }
6036
6037 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6038     int i;
6039     const int index_a = qp + h->slice_alpha_c0_offset;
6040     const int alpha = (alpha_table+52)[index_a];
6041     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6042
6043     if( bS[0] < 4 ) {
6044         int8_t tc[4];
6045         for(i=0; i<4; i++)
6046             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6047         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6048     } else {
6049         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6050     }
6051 }
6052
6053 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6054     MpegEncContext * const s = &h->s;
6055     int mb_xy, mb_type;
6056     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6057
6058     mb_xy = mb_x + mb_y*s->mb_stride;
6059
6060     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6061        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6062                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6063         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6064         return;
6065     }
6066     assert(!FRAME_MBAFF);
6067
6068     mb_type = s->current_picture.mb_type[mb_xy];
6069     qp = s->current_picture.qscale_table[mb_xy];
6070     qp0 = s->current_picture.qscale_table[mb_xy-1];
6071     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6072     qpc = get_chroma_qp( h, 0, qp );
6073     qpc0 = get_chroma_qp( h, 0, qp0 );
6074     qpc1 = get_chroma_qp( h, 0, qp1 );
6075     qp0 = (qp + qp0 + 1) >> 1;
6076     qp1 = (qp + qp1 + 1) >> 1;
6077     qpc0 = (qpc + qpc0 + 1) >> 1;
6078     qpc1 = (qpc + qpc1 + 1) >> 1;
6079     qp_thresh = 15 - h->slice_alpha_c0_offset;
6080     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6081        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6082         return;
6083
6084     if( IS_INTRA(mb_type) ) {
6085         int16_t bS4[4] = {4,4,4,4};
6086         int16_t bS3[4] = {3,3,3,3};
6087         if( IS_8x8DCT(mb_type) ) {
6088             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6089             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6090             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6091             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6092         } else {
6093             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6094             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6095             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6096             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6097             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6098             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6099             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6100             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6101         }
6102         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6103         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6104         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6105         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6106         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6107         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6108         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6109         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6110         return;
6111     } else {
6112         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6113         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6114         int edges;
6115         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6116             edges = 4;
6117             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6118         } else {
6119             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6120                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6121             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6122                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6123                              ? 3 : 0;
6124             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6125             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6126             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6127                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6128         }
6129         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6130             bSv[0][0] = 0x0004000400040004ULL;
6131         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6132             bSv[1][0] = 0x0004000400040004ULL;
6133
6134 #define FILTER(hv,dir,edge)\
6135         if(bSv[dir][edge]) {\
6136             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6137             if(!(edge&1)) {\
6138                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6139                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6140             }\
6141         }
6142         if( edges == 1 ) {
6143             FILTER(v,0,0);
6144             FILTER(h,1,0);
6145         } else if( IS_8x8DCT(mb_type) ) {
6146             FILTER(v,0,0);
6147             FILTER(v,0,2);
6148             FILTER(h,1,0);
6149             FILTER(h,1,2);
6150         } else {
6151             FILTER(v,0,0);
6152             FILTER(v,0,1);
6153             FILTER(v,0,2);
6154             FILTER(v,0,3);
6155             FILTER(h,1,0);
6156             FILTER(h,1,1);
6157             FILTER(h,1,2);
6158             FILTER(h,1,3);
6159         }
6160 #undef FILTER
6161     }
6162 }
6163
6164 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6165     MpegEncContext * const s = &h->s;
6166     const int mb_xy= mb_x + mb_y*s->mb_stride;
6167     const int mb_type = s->current_picture.mb_type[mb_xy];
6168     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6169     int first_vertical_edge_done = 0;
6170     int dir;
6171     /* FIXME: A given frame may occupy more than one position in
6172      * the reference list. So ref2frm should be populated with
6173      * frame numbers, not indices. */
6174     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6175                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6176
6177     //for sufficiently low qp, filtering wouldn't do anything
6178     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6179     if(!FRAME_MBAFF){
6180         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6181         int qp = s->current_picture.qscale_table[mb_xy];
6182         if(qp <= qp_thresh
6183            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6184            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6185             return;
6186         }
6187     }
6188
6189     if (FRAME_MBAFF
6190             // left mb is in picture
6191             && h->slice_table[mb_xy-1] != 255
6192             // and current and left pair do not have the same interlaced type
6193             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6194             // and left mb is in the same slice if deblocking_filter == 2
6195             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6196         /* First vertical edge is different in MBAFF frames
6197          * There are 8 different bS to compute and 2 different Qp
6198          */
6199         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6200         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6201         int16_t bS[8];
6202         int qp[2];
6203         int bqp[2];
6204         int rqp[2];
6205         int mb_qp, mbn0_qp, mbn1_qp;
6206         int i;
6207         first_vertical_edge_done = 1;
6208
6209         if( IS_INTRA(mb_type) )
6210             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6211         else {
6212             for( i = 0; i < 8; i++ ) {
6213                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6214
6215                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6216                     bS[i] = 4;
6217                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6218                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6219                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6220                     bS[i] = 2;
6221                 else
6222                     bS[i] = 1;
6223             }
6224         }
6225
6226         mb_qp = s->current_picture.qscale_table[mb_xy];
6227         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6228         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6229         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6230         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6231                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6232         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6233                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6234         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6235         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6236                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6237         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6238                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6239
6240         /* Filter edge */
6241         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6242         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6243         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6244         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6245         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6246     }
6247     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6248     for( dir = 0; dir < 2; dir++ )
6249     {
6250         int edge;
6251         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6252         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6253         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6254
6255         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6256                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6257         // how often to recheck mv-based bS when iterating between edges
6258         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6259                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6260         // how often to recheck mv-based bS when iterating along each edge
6261         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6262
6263         if (first_vertical_edge_done) {
6264             start = 1;
6265             first_vertical_edge_done = 0;
6266         }
6267
6268         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6269             start = 1;
6270
6271         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6272             && !IS_INTERLACED(mb_type)
6273             && IS_INTERLACED(mbm_type)
6274             ) {
6275             // This is a special case in the norm where the filtering must
6276             // be done twice (one each of the field) even if we are in a
6277             // frame macroblock.
6278             //
6279             static const int nnz_idx[4] = {4,5,6,3};
6280             unsigned int tmp_linesize   = 2 *   linesize;
6281             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6282             int mbn_xy = mb_xy - 2 * s->mb_stride;
6283             int qp;
6284             int i, j;
6285             int16_t bS[4];
6286
6287             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6288                 if( IS_INTRA(mb_type) ||
6289                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6290                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6291                 } else {
6292                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6293                     for( i = 0; i < 4; i++ ) {
6294                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6295                             mbn_nnz[nnz_idx[i]] != 0 )
6296                             bS[i] = 2;
6297                         else
6298                             bS[i] = 1;
6299                     }
6300                 }
6301                 // Do not use s->qscale as luma quantizer because it has not the same
6302                 // value in IPCM macroblocks.
6303                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6304                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6305                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6306                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6307                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6308                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6309                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6310                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6311             }
6312
6313             start = 1;
6314         }
6315
6316         /* Calculate bS */
6317         for( edge = start; edge < edges; edge++ ) {
6318             /* mbn_xy: neighbor macroblock */
6319             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6320             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6321             int16_t bS[4];
6322             int qp;
6323
6324             if( (edge&1) && IS_8x8DCT(mb_type) )
6325                 continue;
6326
6327             if( IS_INTRA(mb_type) ||
6328                 IS_INTRA(mbn_type) ) {
6329                 int value;
6330                 if (edge == 0) {
6331                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6332                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6333                     ) {
6334                         value = 4;
6335                     } else {
6336                         value = 3;
6337                     }
6338                 } else {
6339                     value = 3;
6340                 }
6341                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6342             } else {
6343                 int i, l;
6344                 int mv_done;
6345
6346                 if( edge & mask_edge ) {
6347                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6348                     mv_done = 1;
6349                 }
6350                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6351                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6352                     mv_done = 1;
6353                 }
6354                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6355                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6356                     int bn_idx= b_idx - (dir ? 8:1);
6357                     int v = 0;
6358                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6359                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6360                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6361                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6362                     }
6363                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6364                     mv_done = 1;
6365                 }
6366                 else
6367                     mv_done = 0;
6368
6369                 for( i = 0; i < 4; i++ ) {
6370                     int x = dir == 0 ? edge : i;
6371                     int y = dir == 0 ? i    : edge;
6372                     int b_idx= 8 + 4 + x + 8*y;
6373                     int bn_idx= b_idx - (dir ? 8:1);
6374
6375                     if( h->non_zero_count_cache[b_idx] != 0 ||
6376                         h->non_zero_count_cache[bn_idx] != 0 ) {
6377                         bS[i] = 2;
6378                     }
6379                     else if(!mv_done)
6380                     {
6381                         bS[i] = 0;
6382                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6383                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6384                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6385                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6386                                 bS[i] = 1;
6387                                 break;
6388                             }
6389                         }
6390                     }
6391                 }
6392
6393                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6394                     continue;
6395             }
6396
6397             /* Filter edge */
6398             // Do not use s->qscale as luma quantizer because it has not the same
6399             // value in IPCM macroblocks.
6400             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6401             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6402             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6403             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6404             if( dir == 0 ) {
6405                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6406                 if( (edge&1) == 0 ) {
6407                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6408                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6409                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6410                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6411                 }
6412             } else {
6413                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6414                 if( (edge&1) == 0 ) {
6415                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6416                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6417                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6418                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6419                 }
6420             }
6421         }
6422     }
6423 }
6424
6425 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6426     MpegEncContext * const s = &h->s;
6427     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6428
6429     s->mb_skip_run= -1;
6430
6431     if( h->pps.cabac ) {
6432         int i;
6433
6434         /* realign */
6435         align_get_bits( &s->gb );
6436
6437         /* init cabac */
6438         ff_init_cabac_states( &h->cabac);
6439         ff_init_cabac_decoder( &h->cabac,
6440                                s->gb.buffer + get_bits_count(&s->gb)/8,
6441                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6442         /* calculate pre-state */
6443         for( i= 0; i < 460; i++ ) {
6444             int pre;
6445             if( h->slice_type == I_TYPE )
6446                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6447             else
6448                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6449
6450             if( pre <= 63 )
6451                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6452             else
6453                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6454         }
6455
6456         for(;;){
6457 //START_TIMER
6458             int ret = decode_mb_cabac(h);
6459             int eos;
6460 //STOP_TIMER("decode_mb_cabac")
6461
6462             if(ret>=0) hl_decode_mb(h);
6463
6464             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6465                 s->mb_y++;
6466
6467                 if(ret>=0) ret = decode_mb_cabac(h);
6468
6469                 if(ret>=0) hl_decode_mb(h);
6470                 s->mb_y--;
6471             }
6472             eos = get_cabac_terminate( &h->cabac );
6473
6474             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6475                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6476                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6477                 return -1;
6478             }
6479
6480             if( ++s->mb_x >= s->mb_width ) {
6481                 s->mb_x = 0;
6482                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6483                 ++s->mb_y;
6484                 if(FRAME_MBAFF) {
6485                     ++s->mb_y;
6486                 }
6487             }
6488
6489             if( eos || s->mb_y >= s->mb_height ) {
6490                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6491                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6492                 return 0;
6493             }
6494         }
6495
6496     } else {
6497         for(;;){
6498             int ret = decode_mb_cavlc(h);
6499
6500             if(ret>=0) hl_decode_mb(h);
6501
6502             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6503                 s->mb_y++;
6504                 ret = decode_mb_cavlc(h);
6505
6506                 if(ret>=0) hl_decode_mb(h);
6507                 s->mb_y--;
6508             }
6509
6510             if(ret<0){
6511                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6512                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6513
6514                 return -1;
6515             }
6516
6517             if(++s->mb_x >= s->mb_width){
6518                 s->mb_x=0;
6519                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6520                 ++s->mb_y;
6521                 if(FRAME_MBAFF) {
6522                     ++s->mb_y;
6523                 }
6524                 if(s->mb_y >= s->mb_height){
6525                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6526
6527                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6528                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6529
6530                         return 0;
6531                     }else{
6532                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6533
6534                         return -1;
6535                     }
6536                 }
6537             }
6538
6539             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6540                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6541                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6542                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6543
6544                     return 0;
6545                 }else{
6546                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6547
6548                     return -1;
6549                 }
6550             }
6551         }
6552     }
6553
6554 #if 0
6555     for(;s->mb_y < s->mb_height; s->mb_y++){
6556         for(;s->mb_x < s->mb_width; s->mb_x++){
6557             int ret= decode_mb(h);
6558
6559             hl_decode_mb(h);
6560
6561             if(ret<0){
6562                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6563                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6564
6565                 return -1;
6566             }
6567
6568             if(++s->mb_x >= s->mb_width){
6569                 s->mb_x=0;
6570                 if(++s->mb_y >= s->mb_height){
6571                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6572                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6573
6574                         return 0;
6575                     }else{
6576                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6577
6578                         return -1;
6579                     }
6580                 }
6581             }
6582
6583             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6584                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6585                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6586
6587                     return 0;
6588                 }else{
6589                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6590
6591                     return -1;
6592                 }
6593             }
6594         }
6595         s->mb_x=0;
6596         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6597     }
6598 #endif
6599     return -1; //not reached
6600 }
6601
6602 static int decode_unregistered_user_data(H264Context *h, int size){
6603     MpegEncContext * const s = &h->s;
6604     uint8_t user_data[16+256];
6605     int e, build, i;
6606
6607     if(size<16)
6608         return -1;
6609
6610     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6611         user_data[i]= get_bits(&s->gb, 8);
6612     }
6613
6614     user_data[i]= 0;
6615     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6616     if(e==1 && build>=0)
6617         h->x264_build= build;
6618
6619     if(s->avctx->debug & FF_DEBUG_BUGS)
6620         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6621
6622     for(; i<size; i++)
6623         skip_bits(&s->gb, 8);
6624
6625     return 0;
6626 }
6627
6628 static int decode_sei(H264Context *h){
6629     MpegEncContext * const s = &h->s;
6630
6631     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6632         int size, type;
6633
6634         type=0;
6635         do{
6636             type+= show_bits(&s->gb, 8);
6637         }while(get_bits(&s->gb, 8) == 255);
6638
6639         size=0;
6640         do{
6641             size+= show_bits(&s->gb, 8);
6642         }while(get_bits(&s->gb, 8) == 255);
6643
6644         switch(type){
6645         case 5:
6646             if(decode_unregistered_user_data(h, size) < 0)
6647                 return -1;
6648             break;
6649         default:
6650             skip_bits(&s->gb, 8*size);
6651         }
6652
6653         //FIXME check bits here
6654         align_get_bits(&s->gb);
6655     }
6656
6657     return 0;
6658 }
6659
6660 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6661     MpegEncContext * const s = &h->s;
6662     int cpb_count, i;
6663     cpb_count = get_ue_golomb(&s->gb) + 1;
6664     get_bits(&s->gb, 4); /* bit_rate_scale */
6665     get_bits(&s->gb, 4); /* cpb_size_scale */
6666     for(i=0; i<cpb_count; i++){
6667         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6668         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6669         get_bits1(&s->gb);     /* cbr_flag */
6670     }
6671     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6672     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6673     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6674     get_bits(&s->gb, 5); /* time_offset_length */
6675 }
6676
6677 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6678     MpegEncContext * const s = &h->s;
6679     int aspect_ratio_info_present_flag;
6680     unsigned int aspect_ratio_idc;
6681     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6682
6683     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6684
6685     if( aspect_ratio_info_present_flag ) {
6686         aspect_ratio_idc= get_bits(&s->gb, 8);
6687         if( aspect_ratio_idc == EXTENDED_SAR ) {
6688             sps->sar.num= get_bits(&s->gb, 16);
6689             sps->sar.den= get_bits(&s->gb, 16);
6690         }else if(aspect_ratio_idc < 14){
6691             sps->sar=  pixel_aspect[aspect_ratio_idc];
6692         }else{
6693             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6694             return -1;
6695         }
6696     }else{
6697         sps->sar.num=
6698         sps->sar.den= 0;
6699     }
6700 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6701
6702     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6703         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6704     }
6705
6706     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6707         get_bits(&s->gb, 3);    /* video_format */
6708         get_bits1(&s->gb);      /* video_full_range_flag */
6709         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6710             get_bits(&s->gb, 8); /* colour_primaries */
6711             get_bits(&s->gb, 8); /* transfer_characteristics */
6712             get_bits(&s->gb, 8); /* matrix_coefficients */
6713         }
6714     }
6715
6716     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6717         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6718         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6719     }
6720
6721     sps->timing_info_present_flag = get_bits1(&s->gb);
6722     if(sps->timing_info_present_flag){
6723         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6724         sps->time_scale = get_bits_long(&s->gb, 32);
6725         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6726     }
6727
6728     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6729     if(nal_hrd_parameters_present_flag)
6730         decode_hrd_parameters(h, sps);
6731     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6732     if(vcl_hrd_parameters_present_flag)
6733         decode_hrd_parameters(h, sps);
6734     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6735         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6736     get_bits1(&s->gb);         /* pic_struct_present_flag */
6737
6738     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6739     if(sps->bitstream_restriction_flag){
6740         unsigned int num_reorder_frames;
6741         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6742         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6743         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6744         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6745         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6746         num_reorder_frames= get_ue_golomb(&s->gb);
6747         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6748
6749         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6750             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
6751             return -1;
6752         }
6753
6754         sps->num_reorder_frames= num_reorder_frames;
6755     }
6756
6757     return 0;
6758 }
6759
6760 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6761                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6762     MpegEncContext * const s = &h->s;
6763     int i, last = 8, next = 8;
6764     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6765     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6766         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6767     else
6768     for(i=0;i<size;i++){
6769         if(next)
6770             next = (last + get_se_golomb(&s->gb)) & 0xff;
6771         if(!i && !next){ /* matrix not written, we use the preset one */
6772             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6773             break;
6774         }
6775         last = factors[scan[i]] = next ? next : last;
6776     }
6777 }
6778
6779 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6780                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6781     MpegEncContext * const s = &h->s;
6782     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6783     const uint8_t *fallback[4] = {
6784         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6785         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6786         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6787         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6788     };
6789     if(get_bits1(&s->gb)){
6790         sps->scaling_matrix_present |= is_sps;
6791         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6792         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6793         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6794         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6795         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6796         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6797         if(is_sps || pps->transform_8x8_mode){
6798             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6799             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6800         }
6801     } else if(fallback_sps) {
6802         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
6803         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
6804     }
6805 }
6806
6807 /**
6808  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
6809  */
6810 static void *
6811 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
6812                     const size_t size, const char *name)
6813 {
6814     if(id>=max) {
6815         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
6816         return NULL;
6817     }
6818
6819     if(!vec[id]) {
6820         vec[id] = av_mallocz(size);
6821         if(vec[id] == NULL)
6822             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
6823     }
6824     return vec[id];
6825 }
6826
6827 static inline int decode_seq_parameter_set(H264Context *h){
6828     MpegEncContext * const s = &h->s;
6829     int profile_idc, level_idc;
6830     unsigned int sps_id, tmp, mb_width, mb_height;
6831     int i;
6832     SPS *sps;
6833
6834     profile_idc= get_bits(&s->gb, 8);
6835     get_bits1(&s->gb);   //constraint_set0_flag
6836     get_bits1(&s->gb);   //constraint_set1_flag
6837     get_bits1(&s->gb);   //constraint_set2_flag
6838     get_bits1(&s->gb);   //constraint_set3_flag
6839     get_bits(&s->gb, 4); // reserved
6840     level_idc= get_bits(&s->gb, 8);
6841     sps_id= get_ue_golomb(&s->gb);
6842
6843     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
6844     if(sps == NULL)
6845         return -1;
6846
6847     sps->profile_idc= profile_idc;
6848     sps->level_idc= level_idc;
6849
6850     if(sps->profile_idc >= 100){ //high profile
6851         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
6852             get_bits1(&s->gb);  //residual_color_transform_flag
6853         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6854         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6855         sps->transform_bypass = get_bits1(&s->gb);
6856         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6857     }else
6858         sps->scaling_matrix_present = 0;
6859
6860     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6861     sps->poc_type= get_ue_golomb(&s->gb);
6862
6863     if(sps->poc_type == 0){ //FIXME #define
6864         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6865     } else if(sps->poc_type == 1){//FIXME #define
6866         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6867         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6868         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6869         tmp= get_ue_golomb(&s->gb);
6870
6871         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
6872             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
6873             return -1;
6874         }
6875         sps->poc_cycle_length= tmp;
6876
6877         for(i=0; i<sps->poc_cycle_length; i++)
6878             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
6879     }else if(sps->poc_type != 2){
6880         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
6881         return -1;
6882     }
6883
6884     tmp= get_ue_golomb(&s->gb);
6885     if(tmp > MAX_PICTURE_COUNT-2){
6886         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
6887     }
6888     sps->ref_frame_count= tmp;
6889     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
6890     mb_width= get_ue_golomb(&s->gb) + 1;
6891     mb_height= get_ue_golomb(&s->gb) + 1;
6892     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
6893        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
6894         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
6895         return -1;
6896     }
6897     sps->mb_width = mb_width;
6898     sps->mb_height= mb_height;
6899
6900     sps->frame_mbs_only_flag= get_bits1(&s->gb);
6901     if(!sps->frame_mbs_only_flag)
6902         sps->mb_aff= get_bits1(&s->gb);
6903     else
6904         sps->mb_aff= 0;
6905
6906     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
6907
6908 #ifndef ALLOW_INTERLACE
6909     if(sps->mb_aff)
6910         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
6911 #endif
6912     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
6913         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
6914
6915     sps->crop= get_bits1(&s->gb);
6916     if(sps->crop){
6917         sps->crop_left  = get_ue_golomb(&s->gb);
6918         sps->crop_right = get_ue_golomb(&s->gb);
6919         sps->crop_top   = get_ue_golomb(&s->gb);
6920         sps->crop_bottom= get_ue_golomb(&s->gb);
6921         if(sps->crop_left || sps->crop_top){
6922             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
6923         }
6924     }else{
6925         sps->crop_left  =
6926         sps->crop_right =
6927         sps->crop_top   =
6928         sps->crop_bottom= 0;
6929     }
6930
6931     sps->vui_parameters_present_flag= get_bits1(&s->gb);
6932     if( sps->vui_parameters_present_flag )
6933         decode_vui_parameters(h, sps);
6934
6935     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
6936         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
6937                sps_id, sps->profile_idc, sps->level_idc,
6938                sps->poc_type,
6939                sps->ref_frame_count,
6940                sps->mb_width, sps->mb_height,
6941                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
6942                sps->direct_8x8_inference_flag ? "8B8" : "",
6943                sps->crop_left, sps->crop_right,
6944                sps->crop_top, sps->crop_bottom,
6945                sps->vui_parameters_present_flag ? "VUI" : ""
6946                );
6947     }
6948     return 0;
6949 }
6950
6951 static void
6952 build_qp_table(PPS *pps, int t, int index)
6953 {
6954     int i;
6955     for(i = 0; i < 255; i++)
6956         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
6957 }
6958
6959 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
6960     MpegEncContext * const s = &h->s;
6961     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
6962     PPS *pps;
6963
6964     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
6965     if(pps == NULL)
6966         return -1;
6967
6968     tmp= get_ue_golomb(&s->gb);
6969     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
6970         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
6971         return -1;
6972     }
6973     pps->sps_id= tmp;
6974
6975     pps->cabac= get_bits1(&s->gb);
6976     pps->pic_order_present= get_bits1(&s->gb);
6977     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
6978     if(pps->slice_group_count > 1 ){
6979         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
6980         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
6981         switch(pps->mb_slice_group_map_type){
6982         case 0:
6983 #if 0
6984 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
6985 |    run_length[ i ]                                |1  |ue(v)   |
6986 #endif
6987             break;
6988         case 2:
6989 #if 0
6990 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
6991 |{                                                  |   |        |
6992 |    top_left_mb[ i ]                               |1  |ue(v)   |
6993 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
6994 |   }                                               |   |        |
6995 #endif
6996             break;
6997         case 3:
6998         case 4:
6999         case 5:
7000 #if 0
7001 |   slice_group_change_direction_flag               |1  |u(1)    |
7002 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7003 #endif
7004             break;
7005         case 6:
7006 #if 0
7007 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7008 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7009 |)                                                  |   |        |
7010 |    slice_group_id[ i ]                            |1  |u(v)    |
7011 #endif
7012             break;
7013         }
7014     }
7015     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7016     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7017     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7018         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7019         pps->ref_count[0]= pps->ref_count[1]= 1;
7020         return -1;
7021     }
7022
7023     pps->weighted_pred= get_bits1(&s->gb);
7024     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7025     pps->init_qp= get_se_golomb(&s->gb) + 26;
7026     pps->init_qs= get_se_golomb(&s->gb) + 26;
7027     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7028     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7029     pps->constrained_intra_pred= get_bits1(&s->gb);
7030     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7031
7032     pps->transform_8x8_mode= 0;
7033     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7034     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7035     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7036
7037     if(get_bits_count(&s->gb) < bit_length){
7038         pps->transform_8x8_mode= get_bits1(&s->gb);
7039         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7040         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7041     } else {
7042         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7043     }
7044
7045     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7046     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7047         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7048         h->pps.chroma_qp_diff= 1;
7049     } else
7050         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7051
7052     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7053         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7054                pps_id, pps->sps_id,
7055                pps->cabac ? "CABAC" : "CAVLC",
7056                pps->slice_group_count,
7057                pps->ref_count[0], pps->ref_count[1],
7058                pps->weighted_pred ? "weighted" : "",
7059                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7060                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7061                pps->constrained_intra_pred ? "CONSTR" : "",
7062                pps->redundant_pic_cnt_present ? "REDU" : "",
7063                pps->transform_8x8_mode ? "8x8DCT" : ""
7064                );
7065     }
7066
7067     return 0;
7068 }
7069
7070 /**
7071  * Call decode_slice() for each context.
7072  *
7073  * @param h h264 master context
7074  * @param context_count number of contexts to execute
7075  */
7076 static void execute_decode_slices(H264Context *h, int context_count){
7077     MpegEncContext * const s = &h->s;
7078     AVCodecContext * const avctx= s->avctx;
7079     H264Context *hx;
7080     int i;
7081
7082     if(context_count == 1) {
7083         decode_slice(avctx, h);
7084     } else {
7085         for(i = 1; i < context_count; i++) {
7086             hx = h->thread_context[i];
7087             hx->s.error_resilience = avctx->error_resilience;
7088             hx->s.error_count = 0;
7089         }
7090
7091         avctx->execute(avctx, (void *)decode_slice,
7092                        (void **)h->thread_context, NULL, context_count);
7093
7094         /* pull back stuff from slices to master context */
7095         hx = h->thread_context[context_count - 1];
7096         s->mb_x = hx->s.mb_x;
7097         s->mb_y = hx->s.mb_y;
7098         for(i = 1; i < context_count; i++)
7099             h->s.error_count += h->thread_context[i]->s.error_count;
7100     }
7101 }
7102
7103
7104 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7105     MpegEncContext * const s = &h->s;
7106     AVCodecContext * const avctx= s->avctx;
7107     int buf_index=0;
7108     H264Context *hx; ///< thread context
7109     int context_count = 0;
7110
7111     h->max_contexts = avctx->thread_count;
7112 #if 0
7113     int i;
7114     for(i=0; i<50; i++){
7115         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7116     }
7117 #endif
7118     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7119         h->current_slice = 0;
7120         s->current_picture_ptr= NULL;
7121     }
7122
7123     for(;;){
7124         int consumed;
7125         int dst_length;
7126         int bit_length;
7127         uint8_t *ptr;
7128         int i, nalsize = 0;
7129         int err;
7130
7131         if(h->is_avc) {
7132             if(buf_index >= buf_size) break;
7133             nalsize = 0;
7134             for(i = 0; i < h->nal_length_size; i++)
7135                 nalsize = (nalsize << 8) | buf[buf_index++];
7136             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7137                 if(nalsize == 1){
7138                     buf_index++;
7139                     continue;
7140                 }else{
7141                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7142                     break;
7143                 }
7144             }
7145         } else {
7146             // start code prefix search
7147             for(; buf_index + 3 < buf_size; buf_index++){
7148                 // This should always succeed in the first iteration.
7149                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7150                     break;
7151             }
7152
7153             if(buf_index+3 >= buf_size) break;
7154
7155             buf_index+=3;
7156         }
7157
7158         hx = h->thread_context[context_count];
7159
7160         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7161         if (ptr==NULL || dst_length < 0){
7162             return -1;
7163         }
7164         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7165             dst_length--;
7166         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7167
7168         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7169             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7170         }
7171
7172         if (h->is_avc && (nalsize != consumed))
7173             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7174
7175         buf_index += consumed;
7176
7177         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7178            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7179             continue;
7180
7181       again:
7182         err = 0;
7183         switch(hx->nal_unit_type){
7184         case NAL_IDR_SLICE:
7185             if (h->nal_unit_type != NAL_IDR_SLICE) {
7186                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7187                 return -1;
7188             }
7189             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7190         case NAL_SLICE:
7191             init_get_bits(&hx->s.gb, ptr, bit_length);
7192             hx->intra_gb_ptr=
7193             hx->inter_gb_ptr= &hx->s.gb;
7194             hx->s.data_partitioning = 0;
7195
7196             if((err = decode_slice_header(hx, h)))
7197                break;
7198
7199             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7200             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7201                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7202                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7203                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7204                && avctx->skip_frame < AVDISCARD_ALL)
7205                 context_count++;
7206             break;
7207         case NAL_DPA:
7208             init_get_bits(&hx->s.gb, ptr, bit_length);
7209             hx->intra_gb_ptr=
7210             hx->inter_gb_ptr= NULL;
7211             hx->s.data_partitioning = 1;
7212
7213             err = decode_slice_header(hx, h);
7214             break;
7215         case NAL_DPB:
7216             init_get_bits(&hx->intra_gb, ptr, bit_length);
7217             hx->intra_gb_ptr= &hx->intra_gb;
7218             break;
7219         case NAL_DPC:
7220             init_get_bits(&hx->inter_gb, ptr, bit_length);
7221             hx->inter_gb_ptr= &hx->inter_gb;
7222
7223             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7224                && s->context_initialized
7225                && s->hurry_up < 5
7226                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7227                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7228                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7229                && avctx->skip_frame < AVDISCARD_ALL)
7230                 context_count++;
7231             break;
7232         case NAL_SEI:
7233             init_get_bits(&s->gb, ptr, bit_length);
7234             decode_sei(h);
7235             break;
7236         case NAL_SPS:
7237             init_get_bits(&s->gb, ptr, bit_length);
7238             decode_seq_parameter_set(h);
7239
7240             if(s->flags& CODEC_FLAG_LOW_DELAY)
7241                 s->low_delay=1;
7242
7243             if(avctx->has_b_frames < 2)
7244                 avctx->has_b_frames= !s->low_delay;
7245             break;
7246         case NAL_PPS:
7247             init_get_bits(&s->gb, ptr, bit_length);
7248
7249             decode_picture_parameter_set(h, bit_length);
7250
7251             break;
7252         case NAL_AUD:
7253         case NAL_END_SEQUENCE:
7254         case NAL_END_STREAM:
7255         case NAL_FILLER_DATA:
7256         case NAL_SPS_EXT:
7257         case NAL_AUXILIARY_SLICE:
7258             break;
7259         default:
7260             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7261         }
7262
7263         if(context_count == h->max_contexts) {
7264             execute_decode_slices(h, context_count);
7265             context_count = 0;
7266         }
7267
7268         if (err < 0)
7269             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7270         else if(err == 1) {
7271             /* Slice could not be decoded in parallel mode, copy down
7272              * NAL unit stuff to context 0 and restart. Note that
7273              * rbsp_buffer is not transfered, but since we no longer
7274              * run in parallel mode this should not be an issue. */
7275             h->nal_unit_type = hx->nal_unit_type;
7276             h->nal_ref_idc   = hx->nal_ref_idc;
7277             hx = h;
7278             goto again;
7279         }
7280     }
7281     if(context_count)
7282         execute_decode_slices(h, context_count);
7283     return buf_index;
7284 }
7285
7286 /**
7287  * returns the number of bytes consumed for building the current frame
7288  */
7289 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7290     if(s->flags&CODEC_FLAG_TRUNCATED){
7291         pos -= s->parse_context.last_index;
7292         if(pos<0) pos=0; // FIXME remove (unneeded?)
7293
7294         return pos;
7295     }else{
7296         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7297         if(pos+10>buf_size) pos=buf_size; // oops ;)
7298
7299         return pos;
7300     }
7301 }
7302
7303 static int decode_frame(AVCodecContext *avctx,
7304                              void *data, int *data_size,
7305                              uint8_t *buf, int buf_size)
7306 {
7307     H264Context *h = avctx->priv_data;
7308     MpegEncContext *s = &h->s;
7309     AVFrame *pict = data;
7310     int buf_index;
7311
7312     s->flags= avctx->flags;
7313     s->flags2= avctx->flags2;
7314
7315    /* no supplementary picture */
7316     if (buf_size == 0) {
7317         Picture *out;
7318         int i, out_idx;
7319
7320 //FIXME factorize this with the output code below
7321         out = h->delayed_pic[0];
7322         out_idx = 0;
7323         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7324             if(h->delayed_pic[i]->poc < out->poc){
7325                 out = h->delayed_pic[i];
7326                 out_idx = i;
7327             }
7328
7329         for(i=out_idx; h->delayed_pic[i]; i++)
7330             h->delayed_pic[i] = h->delayed_pic[i+1];
7331
7332         if(out){
7333             *data_size = sizeof(AVFrame);
7334             *pict= *(AVFrame*)out;
7335         }
7336
7337         return 0;
7338     }
7339
7340     if(s->flags&CODEC_FLAG_TRUNCATED){
7341         int next= ff_h264_find_frame_end(h, buf, buf_size);
7342
7343         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7344             return buf_size;
7345 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7346     }
7347
7348     if(h->is_avc && !h->got_avcC) {
7349         int i, cnt, nalsize;
7350         unsigned char *p = avctx->extradata;
7351         if(avctx->extradata_size < 7) {
7352             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7353             return -1;
7354         }
7355         if(*p != 1) {
7356             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7357             return -1;
7358         }
7359         /* sps and pps in the avcC always have length coded with 2 bytes,
7360            so put a fake nal_length_size = 2 while parsing them */
7361         h->nal_length_size = 2;
7362         // Decode sps from avcC
7363         cnt = *(p+5) & 0x1f; // Number of sps
7364         p += 6;
7365         for (i = 0; i < cnt; i++) {
7366             nalsize = AV_RB16(p) + 2;
7367             if(decode_nal_units(h, p, nalsize) < 0) {
7368                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7369                 return -1;
7370             }
7371             p += nalsize;
7372         }
7373         // Decode pps from avcC
7374         cnt = *(p++); // Number of pps
7375         for (i = 0; i < cnt; i++) {
7376             nalsize = AV_RB16(p) + 2;
7377             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7378                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7379                 return -1;
7380             }
7381             p += nalsize;
7382         }
7383         // Now store right nal length size, that will be use to parse all other nals
7384         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7385         // Do not reparse avcC
7386         h->got_avcC = 1;
7387     }
7388
7389     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7390         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7391             return -1;
7392     }
7393
7394     buf_index=decode_nal_units(h, buf, buf_size);
7395     if(buf_index < 0)
7396         return -1;
7397
7398     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7399         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7400         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7401         return -1;
7402     }
7403
7404     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7405         Picture *out = s->current_picture_ptr;
7406         Picture *cur = s->current_picture_ptr;
7407         Picture *prev = h->delayed_output_pic;
7408         int i, pics, cross_idr, out_of_order, out_idx;
7409
7410         s->mb_y= 0;
7411
7412         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7413         s->current_picture_ptr->pict_type= s->pict_type;
7414
7415         h->prev_frame_num_offset= h->frame_num_offset;
7416         h->prev_frame_num= h->frame_num;
7417         if(s->current_picture_ptr->reference & s->picture_structure){
7418             h->prev_poc_msb= h->poc_msb;
7419             h->prev_poc_lsb= h->poc_lsb;
7420             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7421         }
7422
7423         ff_er_frame_end(s);
7424
7425         MPV_frame_end(s);
7426
7427     //FIXME do something with unavailable reference frames
7428
7429 #if 0 //decode order
7430         *data_size = sizeof(AVFrame);
7431 #else
7432         /* Sort B-frames into display order */
7433
7434         if(h->sps.bitstream_restriction_flag
7435            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7436             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7437             s->low_delay = 0;
7438         }
7439
7440         pics = 0;
7441         while(h->delayed_pic[pics]) pics++;
7442
7443         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7444
7445         h->delayed_pic[pics++] = cur;
7446         if(cur->reference == 0)
7447             cur->reference = DELAYED_PIC_REF;
7448
7449         cross_idr = 0;
7450         for(i=0; h->delayed_pic[i]; i++)
7451             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7452                 cross_idr = 1;
7453
7454         out = h->delayed_pic[0];
7455         out_idx = 0;
7456         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7457             if(h->delayed_pic[i]->poc < out->poc){
7458                 out = h->delayed_pic[i];
7459                 out_idx = i;
7460             }
7461
7462         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7463         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7464             { }
7465         else if(prev && pics <= s->avctx->has_b_frames)
7466             out = prev;
7467         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7468            || (s->low_delay &&
7469             ((!cross_idr && prev && out->poc > prev->poc + 2)
7470              || cur->pict_type == B_TYPE)))
7471         {
7472             s->low_delay = 0;
7473             s->avctx->has_b_frames++;
7474             out = prev;
7475         }
7476         else if(out_of_order)
7477             out = prev;
7478
7479         if(out_of_order || pics > s->avctx->has_b_frames){
7480             for(i=out_idx; h->delayed_pic[i]; i++)
7481                 h->delayed_pic[i] = h->delayed_pic[i+1];
7482         }
7483
7484         if(prev == out)
7485             *data_size = 0;
7486         else
7487             *data_size = sizeof(AVFrame);
7488         if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7489             prev->reference = 0;
7490         h->delayed_output_pic = out;
7491 #endif
7492
7493         if(out)
7494             *pict= *(AVFrame*)out;
7495         else
7496             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7497     }
7498
7499     assert(pict->data[0] || !*data_size);
7500     ff_print_debug_info(s, pict);
7501 //printf("out %d\n", (int)pict->data[0]);
7502 #if 0 //?
7503
7504     /* Return the Picture timestamp as the frame number */
7505     /* we substract 1 because it is added on utils.c    */
7506     avctx->frame_number = s->picture_number - 1;
7507 #endif
7508     return get_consumed_bytes(s, buf_index, buf_size);
7509 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock: entries 0..2 describe the
 * top-left, top and top-right neighbours, entry 3 the left neighbour. A
 * neighbour counts as available when it lies inside the picture and its
 * slice_table entry equals h->slice_num.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first macroblock row: nothing above */
        h->mb_avail[0]= h->mb_avail[1]= h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7529
#if 0 //selftest
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test. Round-trips unsigned and signed exp-golomb codes,
 * measures the error of the 4x4 DCT/IDCT pair and checks that
 * encode_nal()/decode_nal() reproduce a random bitstream.
 *
 * @return 0 on success, -1 if the NAL layer test fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* keep the upcoming bits for the diagnostic below */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value that was written is i - COUNT/2, not i */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the DSPContext initialized above; there is no `s` in scope */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a bitstream without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero exactly `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7703
7704
7705 static int decode_end(AVCodecContext *avctx)
7706 {
7707     H264Context *h = avctx->priv_data;
7708     MpegEncContext *s = &h->s;
7709
7710     av_freep(&h->rbsp_buffer[0]);
7711     av_freep(&h->rbsp_buffer[1]);
7712     free_tables(h); //FIXME cleanup init stuff perhaps
7713     MPV_common_end(s);
7714
7715 //    memset(h, 0, sizeof(H264Context));
7716
7717     return 0;
7718 }
7719
7720
7721 AVCodec h264_decoder = {
7722     "h264",
7723     CODEC_TYPE_VIDEO,
7724     CODEC_ID_H264,
7725     sizeof(H264Context),
7726     decode_init,
7727     NULL,
7728     decode_end,
7729     decode_frame,
7730     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
7731     .flush= flush_dpb,
7732 };
7733
7734 #include "svq3.c"