2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * H.264 / AVC / MPEG4 part10 codec.
25 * @author Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
33 #include "h264_parser.h"
// Static VLC lookup tables; the element names (coeff_token, total_zeros,
// run) match the H.264 residual-coding syntax elements. Initialisation is
// not visible in this chunk — presumably done once at decoder init; verify.
41 static VLC coeff_token_vlc[4];
42 static VLC chroma_dc_coeff_token_vlc;
44 static VLC total_zeros_vlc[15];
45 static VLC chroma_dc_total_zeros_vlc[3];
47 static VLC run_vlc[6];
// Forward declarations for SVQ3-variant IDCT helpers and the in-loop
// deblocking filters, defined later in the file.
50 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
51 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
52 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
53 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
55 static av_always_inline uint32_t pack16to32(int a, int b){
56 #ifdef WORDS_BIGENDIAN
57 return (b&0xFFFF) + (a<<16);
59 return (a&0xFFFF) + (b<<16);
/**
 * Lookup table for qp % 6, valid for QP values 0..51.
 * Avoids a runtime division when splitting a quantiser into its
 * mod-6 / div-6 components (used together with ff_div6 below).
 * The closing brace of the initialiser was missing in the visible chunk
 * and has been restored.
 */
const uint8_t ff_rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};
/**
 * Lookup table for qp / 6, valid for QP values 0..51.
 * Companion of ff_rem6 above; together they decompose a quantiser
 * without a runtime division.
 * The closing brace of the initialiser was missing in the visible chunk
 * and has been restored.
 */
const uint8_t ff_div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
74 * @param h height of the rectangle, should be a constant
75 * @param w width of the rectangle, should be a constant
76 * @param size the size of val (1 or 4), should be a constant
// Fills a small (up to 4x4 element) rectangle with val, choosing 16/32/64-bit
// stores from the scaled row width. NOTE(review): this chunk is missing the
// w==2 / w==4 / w==8 / w==16 branch scaffolding, the h==1/h==2 early returns,
// and the closing braces — code below is kept byte-identical to the visible
// lines.
78 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
79 uint8_t *p= (uint8_t*)vp;
80 assert(size==1 || size==4);
// destination must be aligned to the (scaled) row width so the wide stores
// below cannot fault on strict-alignment targets
86 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
87 assert((stride&(w-1))==0);
// 2-byte rows: replicate a byte value across a uint16 when size==1
89 const uint16_t v= size==4 ? val : val*0x0101;
90 *(uint16_t*)(p + 0*stride)= v;
92 *(uint16_t*)(p + 1*stride)= v;
94 *(uint16_t*)(p + 2*stride)= v;
95 *(uint16_t*)(p + 3*stride)= v;
// 4-byte rows: replicate a byte value across a uint32 when size==1
97 const uint32_t v= size==4 ? val : val*0x01010101;
98 *(uint32_t*)(p + 0*stride)= v;
100 *(uint32_t*)(p + 1*stride)= v;
102 *(uint32_t*)(p + 2*stride)= v;
103 *(uint32_t*)(p + 3*stride)= v;
105 //gcc can't optimize 64bit math on x86_32
106 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
// 8-byte rows on 64-bit hosts: duplicate the 32-bit val into both halves
107 const uint64_t v= val*0x0100000001ULL;
108 *(uint64_t*)(p + 0*stride)= v;
110 *(uint64_t*)(p + 1*stride)= v;
112 *(uint64_t*)(p + 2*stride)= v;
113 *(uint64_t*)(p + 3*stride)= v;
// 16-byte rows on 64-bit hosts: two 64-bit stores per row
115 const uint64_t v= val*0x0100000001ULL;
116 *(uint64_t*)(p + 0+0*stride)= v;
117 *(uint64_t*)(p + 8+0*stride)= v;
118 *(uint64_t*)(p + 0+1*stride)= v;
119 *(uint64_t*)(p + 8+1*stride)= v;
121 *(uint64_t*)(p + 0+2*stride)= v;
122 *(uint64_t*)(p + 8+2*stride)= v;
123 *(uint64_t*)(p + 0+3*stride)= v;
124 *(uint64_t*)(p + 8+3*stride)= v;
// 32-bit host fallback: 8-byte rows as two 32-bit stores per row
126 *(uint32_t*)(p + 0+0*stride)= val;
127 *(uint32_t*)(p + 4+0*stride)= val;
129 *(uint32_t*)(p + 0+1*stride)= val;
130 *(uint32_t*)(p + 4+1*stride)= val;
132 *(uint32_t*)(p + 0+2*stride)= val;
133 *(uint32_t*)(p + 4+2*stride)= val;
134 *(uint32_t*)(p + 0+3*stride)= val;
135 *(uint32_t*)(p + 4+3*stride)= val;
// 32-bit host fallback: 16-byte rows as four 32-bit stores per row
137 *(uint32_t*)(p + 0+0*stride)= val;
138 *(uint32_t*)(p + 4+0*stride)= val;
139 *(uint32_t*)(p + 8+0*stride)= val;
140 *(uint32_t*)(p +12+0*stride)= val;
141 *(uint32_t*)(p + 0+1*stride)= val;
142 *(uint32_t*)(p + 4+1*stride)= val;
143 *(uint32_t*)(p + 8+1*stride)= val;
144 *(uint32_t*)(p +12+1*stride)= val;
146 *(uint32_t*)(p + 0+2*stride)= val;
147 *(uint32_t*)(p + 4+2*stride)= val;
148 *(uint32_t*)(p + 8+2*stride)= val;
149 *(uint32_t*)(p +12+2*stride)= val;
150 *(uint32_t*)(p + 0+3*stride)= val;
151 *(uint32_t*)(p + 4+3*stride)= val;
152 *(uint32_t*)(p + 8+3*stride)= val;
153 *(uint32_t*)(p +12+3*stride)= val;
// Fills the per-macroblock neighbour caches (intra4x4 pred modes,
// non-zero-count, motion vectors/refs, mvds, direct flags, cbp) from the
// surrounding macroblocks before decoding (or, with for_deblock, before
// deblocking) the current MB. NOTE(review): many lines of this function are
// missing from the visible chunk (conditions, braces, #if scaffolding);
// the code below is kept byte-identical to what is visible.
160 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
161 MpegEncContext * const s = &h->s;
162 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
163 int topleft_xy, top_xy, topright_xy, left_xy[2];
164 int topleft_type, top_type, topright_type, left_type[2];
168 //FIXME deblocking could skip the intra and nnz parts.
169 if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
172 //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
// default (non-MBAFF) neighbour indices in raster order
174 top_xy = mb_xy - s->mb_stride;
175 topleft_xy = top_xy - 1;
176 topright_xy= top_xy + 1;
177 left_xy[1] = left_xy[0] = mb_xy-1;
// MBAFF: neighbour derivation depends on the frame/field flag of each
// MB pair; pair_xy addresses the top MB of the current pair
187 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
188 const int top_pair_xy = pair_xy - s->mb_stride;
189 const int topleft_pair_xy = top_pair_xy - 1;
190 const int topright_pair_xy = top_pair_xy + 1;
191 const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
192 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
193 const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
194 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
195 const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
196 const int bottom = (s->mb_y & 1);
197 tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
199 ? !curr_mb_frame_flag // bottom macroblock
200 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
202 top_xy -= s->mb_stride;
205 ? !curr_mb_frame_flag // bottom macroblock
206 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
208 topleft_xy -= s->mb_stride;
211 ? !curr_mb_frame_flag // bottom macroblock
212 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
214 topright_xy -= s->mb_stride;
// left neighbour pairing when the frame/field flags of current and left
// MB pairs differ
216 if (left_mb_frame_flag != curr_mb_frame_flag) {
217 left_xy[1] = left_xy[0] = pair_xy - 1;
218 if (curr_mb_frame_flag) {
239 left_xy[1] += s->mb_stride;
// publish resolved neighbour indices for later use (e.g. deblocking)
252 h->top_mb_xy = top_xy;
253 h->left_mb_xy[0] = left_xy[0];
254 h->left_mb_xy[1] = left_xy[1];
// for_deblock path: neighbours from other slices are still usable,
// slice_table==255 marks unavailable MBs
258 top_type = h->slice_table[top_xy ] < 255 ? s->current_picture.mb_type[top_xy] : 0;
259 left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
260 left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
// MBAFF deblock fast path: reload the current MB's own nnz bits and
// motion data from the frame-wide tables into the caches
262 if(FRAME_MBAFF && !IS_INTRA(mb_type)){
264 int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
266 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
267 for(list=0; list<h->list_count; list++){
268 if(USES_LIST(mb_type,list)){
269 uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
270 uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
271 int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
272 for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
278 *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
279 *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
281 *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
282 *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
284 fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
285 fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
// decode path: neighbours are only usable if they belong to this slice
290 topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
291 top_type = h->slice_table[top_xy ] == h->slice_num ? s->current_picture.mb_type[top_xy] : 0;
292 topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
293 left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
294 left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
// intra prediction: compute sample-availability bitmasks; each cleared
// bit marks a neighbouring sample group that must not be used
297 if(IS_INTRA(mb_type)){
298 h->topleft_samples_available=
299 h->top_samples_available=
300 h->left_samples_available= 0xFFFF;
301 h->topright_samples_available= 0xEEEA;
303 if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
304 h->topleft_samples_available= 0xB3FF;
305 h->top_samples_available= 0x33FF;
306 h->topright_samples_available= 0x26EA;
309 if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
310 h->topleft_samples_available&= 0xDF5F;
311 h->left_samples_available&= 0x5F5F;
315 if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
316 h->topleft_samples_available&= 0x7FFF;
318 if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
319 h->topright_samples_available&= 0xFBFF;
// intra4x4 prediction-mode cache: top row comes from the bottom row of
// the top neighbour, left column from the left neighbour
321 if(IS_INTRA4x4(mb_type)){
322 if(IS_INTRA4x4(top_type)){
323 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
324 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
325 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
326 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
329 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
334 h->intra4x4_pred_mode_cache[4+8*0]=
335 h->intra4x4_pred_mode_cache[5+8*0]=
336 h->intra4x4_pred_mode_cache[6+8*0]=
337 h->intra4x4_pred_mode_cache[7+8*0]= pred;
340 if(IS_INTRA4x4(left_type[i])){
341 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
342 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
345 if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
350 h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
351 h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
// non-zero-count cache: neighbour nnz values, or 0/64 defaults when the
// neighbour is unavailable (0 for CABAC inter, 64 otherwise)
366 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
368 h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
369 h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
370 h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
371 h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
373 h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
374 h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
376 h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
377 h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
380 h->non_zero_count_cache[4+8*0]=
381 h->non_zero_count_cache[5+8*0]=
382 h->non_zero_count_cache[6+8*0]=
383 h->non_zero_count_cache[7+8*0]=
385 h->non_zero_count_cache[1+8*0]=
386 h->non_zero_count_cache[2+8*0]=
388 h->non_zero_count_cache[1+8*3]=
389 h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
393 for (i=0; i<2; i++) {
395 h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
396 h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
397 h->non_zero_count_cache[0+8*1 + 8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
398 h->non_zero_count_cache[0+8*4 + 8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
400 h->non_zero_count_cache[3+8*1 + 2*8*i]=
401 h->non_zero_count_cache[3+8*2 + 2*8*i]=
402 h->non_zero_count_cache[0+8*1 + 8*i]=
403 h->non_zero_count_cache[0+8*4 + 8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
// CBP context for CABAC
410 h->top_cbp = h->cbp_table[top_xy];
411 } else if(IS_INTRA(mb_type)) {
418 h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
419 } else if(IS_INTRA(mb_type)) {
425 h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
428 h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
// motion-vector / reference caches for inter prediction
433 if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
435 for(list=0; list<h->list_count; list++){
436 if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
437 /*if(!h->mv_cache_clean[list]){
438 memset(h->mv_cache [list], 0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
439 memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
440 h->mv_cache_clean[list]= 1;
444 h->mv_cache_clean[list]= 0;
// top neighbour row of MVs/refs
446 if(USES_LIST(top_type, list)){
447 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
448 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
449 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
450 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
451 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
452 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
453 h->ref_cache[list][scan8[0] + 0 - 1*8]=
454 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
455 h->ref_cache[list][scan8[0] + 2 - 1*8]=
456 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
458 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
459 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
460 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
461 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
462 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
// left neighbour column of MVs/refs (two entries per left MB half)
466 int cache_idx = scan8[0] - 1 + i*2*8;
467 if(USES_LIST(left_type[i], list)){
468 const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
469 const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
470 *(uint32_t*)h->mv_cache[list][cache_idx ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
471 *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
472 h->ref_cache[list][cache_idx ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
473 h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
475 *(uint32_t*)h->mv_cache [list][cache_idx ]=
476 *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
477 h->ref_cache[list][cache_idx ]=
478 h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
482 if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
// topleft / topright single entries
485 if(USES_LIST(topleft_type, list)){
486 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
487 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
488 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
489 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
491 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
492 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
495 if(USES_LIST(topright_type, list)){
496 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
497 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
498 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
499 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
501 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
502 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
505 if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
// padding entries in the cache that no real block maps to
508 h->ref_cache[list][scan8[5 ]+1] =
509 h->ref_cache[list][scan8[7 ]+1] =
510 h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
511 h->ref_cache[list][scan8[4 ]] =
512 h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
513 *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
514 *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
515 *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
516 *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
517 *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
520 /* XXX beurk, Load mvd */
521 if(USES_LIST(top_type, list)){
522 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
523 *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
524 *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
525 *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
526 *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
528 *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
529 *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
530 *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
531 *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
533 if(USES_LIST(left_type[0], list)){
534 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
535 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
536 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
538 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
539 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
541 if(USES_LIST(left_type[1], list)){
542 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
543 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
544 *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
546 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
547 *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
549 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
550 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
551 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
552 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
553 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
// direct-mode flags for B slices
555 if(h->slice_type == B_TYPE){
556 fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
558 if(IS_DIRECT(top_type)){
559 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
560 }else if(IS_8X8(top_type)){
561 int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
562 h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
563 h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
565 *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
568 if(IS_DIRECT(left_type[0]))
569 h->direct_cache[scan8[0] - 1 + 0*8]= 1;
570 else if(IS_8X8(left_type[0]))
571 h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
573 h->direct_cache[scan8[0] - 1 + 0*8]= 0;
575 if(IS_DIRECT(left_type[1]))
576 h->direct_cache[scan8[0] - 1 + 2*8]= 1;
577 else if(IS_8X8(left_type[1]))
578 h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
580 h->direct_cache[scan8[0] - 1 + 2*8]= 0;
// MBAFF: MAP_F2F rescales cached refs/MVs of each neighbour position
// between frame and field coordinates (vertical component halved or
// doubled, ref index shifted accordingly)
586 MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
587 MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
588 MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
589 MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
590 MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
591 MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
592 MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
593 MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
594 MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
595 MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
// frame->field variant: neighbour stored as frame, current MB is field
597 #define MAP_F2F(idx, mb_type)\
598 if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
599 h->ref_cache[list][idx] <<= 1;\
600 h->mv_cache[list][idx][1] /= 2;\
601 h->mvd_cache[list][idx][1] /= 2;\
// field->frame variant: neighbour stored as field, current MB is frame
606 #define MAP_F2F(idx, mb_type)\
607 if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
608 h->ref_cache[list][idx] >>= 1;\
609 h->mv_cache[list][idx][1] <<= 1;\
610 h->mvd_cache[list][idx][1] <<= 1;\
// 8x8 DCT context for CABAC transform-size flag
620 h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
// Copies the right column and bottom row of the intra4x4 prediction-mode
// cache back into the per-macroblock table, where later macroblocks read
// their top/left neighbour modes from.
623 static inline void write_back_intra_pred_mode(H264Context *h){
624 MpegEncContext * const s = &h->s;
625 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
627 h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
628 h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
629 h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
630 h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
631 h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
632 h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
633 h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
637 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
// Validates the cached intra4x4 prediction modes against sample
// availability: modes that need unavailable top/left samples are remapped
// via the lookup tables (a -1 entry marks an illegal mode and triggers an
// error log). Returns 0 on success (error return lines not visible here).
639 static inline int check_intra4x4_pred_mode(H264Context *h){
640 MpegEncContext * const s = &h->s;
641 static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
642 static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
645 if(!(h->top_samples_available&0x8000)){
647 int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
649 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
652 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
657 if(!(h->left_samples_available&0x8000)){
659 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
661 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
664 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
670 } //FIXME cleanup like next
673 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
// Validates a 16x16/chroma intra prediction mode against top/left sample
// availability, remapping DC-style modes through the lookup tables and
// logging an error for illegal combinations. Return statements are not
// visible in this chunk.
675 static inline int check_intra_pred_mode(H264Context *h, int mode){
676 MpegEncContext * const s = &h->s;
677 static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
678 static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
681 av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
685 if(!(h->top_samples_available&0x8000)){
688 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
693 if(!(h->left_samples_available&0x8000)){
696 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
705 * gets the predicted intra4x4 prediction mode.
// Predicts the intra4x4 mode for block n as the minimum of the cached left
// and top neighbour modes; a negative minimum (unavailable neighbour) falls
// back to DC_PRED. The non-DC return path is not visible in this chunk.
707 static inline int pred_intra_mode(H264Context *h, int n){
708 const int index8= scan8[n];
709 const int left= h->intra4x4_pred_mode_cache[index8 - 1];
710 const int top = h->intra4x4_pred_mode_cache[index8 - 8];
711 const int min= FFMIN(left, top);
713 tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
715 if(min<0) return DC_PRED;
// Writes the right column / bottom row of the non-zero-count cache back to
// the per-macroblock nnz table, and packs a 16-bit bitmap of all luma nnz
// flags into bytes 14-15 for the deblocking fast path.
719 static inline void write_back_non_zero_count(H264Context *h){
720 MpegEncContext * const s = &h->s;
721 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
723 h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
724 h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
725 h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
726 h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
727 h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
728 h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
729 h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
// chroma nnz entries
731 h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
732 h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
733 h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
735 h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
736 h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
737 h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
740 // store all luma nnzs, for deblocking
743 v += (!!h->non_zero_count_cache[scan8[i]]) << i;
744 *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
749 * gets the predicted number of non zero coefficients.
750 * @param n block index
// Predicts the number of non-zero coefficients for block n from the cached
// left and top neighbour counts (i appears to be their combination; the
// combining line is not visible). Values below 64 are averaged with
// rounding; the result is masked to 5 bits for the trace output.
752 static inline int pred_non_zero_count(H264Context *h, int n){
753 const int index8= scan8[n];
754 const int left= h->non_zero_count_cache[index8 - 1];
755 const int top = h->non_zero_count_cache[index8 - 8];
758 if(i<64) i= (i+1)>>1;
760 tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
// Fetches the top-right ("diagonal") motion vector C for MV prediction and
// returns its reference index; falls back to the top-left neighbour when the
// top-right block is unavailable. Contains special-case handling for MBAFF,
// where frame/field neighbours need their vertical MV and ref index rescaled
// (done via SET_DIAG_MV). Statement order here is intricate — kept
// byte-identical.
765 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
766 const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
767 MpegEncContext *s = &h->s;
769 /* there is no consistent mapping of mvs to neighboring locations that will
770 * make mbaff happy, so we can't move all this logic to fill_caches */
772 const uint32_t *mb_types = s->current_picture_ptr->mb_type;
// scan8[0]-2 is used as scratch space for the synthesised C vector
774 *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
775 *C = h->mv_cache[list][scan8[0]-2];
778 && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
779 int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
780 if(IS_INTERLACED(mb_types[topright_xy])){
// SET_DIAG_MV loads the MV/ref at 4x4 position (x4,y4) from the frame-wide
// tables, applies MV_OP to the vertical component and REF_OP to the ref
781 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
782 const int x4 = X4, y4 = Y4;\
783 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
784 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
785 return LIST_NOT_USED;\
786 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
787 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
788 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
789 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
791 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
794 if(topright_ref == PART_NOT_AVAILABLE
795 && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
796 && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
798 && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
799 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
802 && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
804 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
805 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
// non-MBAFF path: use the top-right entry directly, else fall back to
// the top-left neighbour
811 if(topright_ref != PART_NOT_AVAILABLE){
812 *C= h->mv_cache[list][ i - 8 + part_width ];
815 tprintf(s->avctx, "topright MV not available\n");
817 *C= h->mv_cache[list][ i - 8 - 1 ];
818 return h->ref_cache[list][ i - 8 - 1 ];
823 * gets the predicted MV.
824 * @param n the block index
825 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
826 * @param mx the x component of the predicted motion vector
827 * @param my the y component of the predicted motion vector
// Median motion-vector prediction: takes the left (A), top (B) and diagonal
// (C) neighbour MVs and predicts (mx,my). With more than one neighbour using
// the same ref, the component-wise median applies; with exactly one match
// that neighbour's MV is used; the special left-only case is handled below.
829 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
830 const int index8= scan8[n];
831 const int top_ref= h->ref_cache[list][ index8 - 8 ];
832 const int left_ref= h->ref_cache[list][ index8 - 1 ];
833 const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
834 const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
836 int diagonal_ref, match_count;
838 assert(part_width==1 || part_width==2 || part_width==4);
848 diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
849 match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
850 tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
851 if(match_count > 1){ //most common
852 *mx= mid_pred(A[0], B[0], C[0]);
853 *my= mid_pred(A[1], B[1], C[1]);
854 }else if(match_count==1){
858 }else if(top_ref==ref){
// if only the left neighbour is available, its MV is used directly
866 if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
870 *mx= mid_pred(A[0], B[0], C[0]);
871 *my= mid_pred(A[1], B[1], C[1]);
875 tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
879 * gets the directionally predicted 16x8 MV.
880 * @param n the block index
881 * @param mx the x component of the predicted motion vector
882 * @param my the y component of the predicted motion vector
// Directional MV prediction for 16x8 partitions: the top partition prefers
// the top neighbour's MV, the bottom partition prefers the left neighbour's
// (when its ref matches); otherwise falls back to median pred_motion.
884 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
886 const int top_ref= h->ref_cache[list][ scan8[0] - 8 ];
887 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
889 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
897 const int left_ref= h->ref_cache[list][ scan8[8] - 1 ];
898 const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
900 tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
910 pred_motion(h, n, 4, list, ref, mx, my);
914 * gets the directionally predicted 8x16 MV.
915 * @param n the block index
916 * @param mx the x component of the predicted motion vector
917 * @param my the y component of the predicted motion vector
// Directional MV prediction for 8x16 partitions: the left partition prefers
// the left neighbour's MV, the right partition prefers the diagonal
// neighbour's (when its ref matches); otherwise falls back to pred_motion.
919 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
921 const int left_ref= h->ref_cache[list][ scan8[0] - 1 ];
922 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
924 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
935 diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
937 tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
939 if(diagonal_ref == ref){
947 pred_motion(h, n, 2, list, ref, mx, my);
// MV prediction for P-skip macroblocks: the MV is zero when either the top
// or left neighbour is unavailable, or when either uses ref 0 with a zero
// MV; otherwise the normal 16x16 median prediction applies.
950 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
951 const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
952 const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
954 tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
956 if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
957 || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
958 || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
964 pred_motion(h, 0, 4, 0, 0, mx, my);
// Computes the temporal-direct distance scale factor per list-0 reference
// from POC distances (tb/td clipped to [-128,127]); td==0 uses the neutral
// factor 256. The field variant duplicates each entry for both fields.
969 static inline void direct_dist_scale_factor(H264Context * const h){
970 const int poc = h->s.current_picture_ptr->poc;
971 const int poc1 = h->ref_list[1][0].poc;
973 for(i=0; i<h->ref_count[0]; i++){
974 int poc0 = h->ref_list[0][i].poc;
975 int td = av_clip(poc1 - poc0, -128, 127);
976 if(td == 0 /* FIXME || pic0 is a long-term ref */){
977 h->dist_scale_factor[i] = 256;
979 int tb = av_clip(poc - poc0, -128, 127);
// tx = round(16384/td); combined with tb this yields (tb/td) in Q8
980 int tx = (16384 + (FFABS(td) >> 1)) / td;
981 h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
985 for(i=0; i<h->ref_count[0]; i++){
986 h->dist_scale_factor_field[2*i] =
987 h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
// Stores the current picture's reference counts/POCs and, for temporal
// direct mode, builds map_col_to_list0: for each reference of the co-located
// picture (ref_list[1][0]), the index of the list-0 reference with the same
// POC (0 when no match is found). The field variant doubles each mapping.
991 static inline void direct_ref_list_init(H264Context * const h){
992 MpegEncContext * const s = &h->s;
993 Picture * const ref1 = &h->ref_list[1][0];
994 Picture * const cur = s->current_picture_ptr;
996 if(cur->pict_type == I_TYPE)
997 cur->ref_count[0] = 0;
998 if(cur->pict_type != B_TYPE)
999 cur->ref_count[1] = 0;
1000 for(list=0; list<2; list++){
1001 cur->ref_count[list] = h->ref_count[list];
1002 for(j=0; j<h->ref_count[list]; j++)
1003 cur->ref_poc[list][j] = h->ref_list[list][j].poc;
// spatial direct mode does not need the POC mapping below
1005 if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1007 for(list=0; list<2; list++){
1008 for(i=0; i<ref1->ref_count[list]; i++){
1009 const int poc = ref1->ref_poc[list][i];
1010 h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1011 for(j=0; j<h->ref_count[list]; j++)
1012 if(h->ref_list[list][j].poc == poc){
1013 h->map_col_to_list0[list][i] = j;
1019 for(list=0; list<2; list++){
1020 for(i=0; i<ref1->ref_count[list]; i++){
1021 j = h->map_col_to_list0[list][i];
1022 h->map_col_to_list0_field[list][2*i] = 2*j;
1023 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
/**
 * Derives motion vectors and reference indices for B-direct macroblocks
 * (both spatial and temporal direct modes, H.264 8.4.1.1/8.4.1.2), filling
 * h->mv_cache / h->ref_cache and refining *mb_type / h->sub_mb_type.
 *
 * Spatial mode: ref = min of the neighbors' refs per list, MVs from median
 * prediction, with a colocated-is-(near-)static override that zeroes MVs.
 * Temporal mode: scales the colocated list-1 picture's MVs by the POC
 * distance factors from direct_dist_scale_factor(), including frame<->field
 * rescaling when the interlacing of the current MB and the colocated MB
 * differ.
 *
 * NOTE(review): this excerpt elides many original lines (declarations,
 * else-branches, closing braces); comments below describe only the visible
 * code.
 */
1029 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1030 MpegEncContext * const s = &h->s;
1031 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
1032 const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1033 const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
/* Co-located macroblock data from the first list-1 reference picture. */
1034 const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1035 const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1036 const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1037 const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1038 const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1039 const int is_b8x8 = IS_8X8(*mb_type);
1040 unsigned int sub_mb_type;
1043 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
/* Choose partitioning from the colocated MB type and direct_8x8_inference. */
1044 if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1045 /* FIXME save sub mb types from previous frames (or derive from MVs)
1046 * so we know exactly what block size to use */
1047 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1048 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1049 }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1050 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1051 *mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1053 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1054 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1057 *mb_type |= MB_TYPE_DIRECT2;
1059 *mb_type |= MB_TYPE_INTERLACED;
1061 tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
/* ---- spatial direct mode ---- */
1063 if(h->direct_spatial_mv_pred){
1068 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1070 /* ref = min(neighbors) */
1071 for(list=0; list<2; list++){
1072 int refa = h->ref_cache[list][scan8[0] - 1];
1073 int refb = h->ref_cache[list][scan8[0] - 8];
1074 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
/* Top-right unavailable: fall back to top-left neighbor (elided condition). */
1076 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1078 if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1080 if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
/* No valid neighbor ref in either list -> ref 0 with zero MVs. */
1086 if(ref[0] < 0 && ref[1] < 0){
1087 ref[0] = ref[1] = 0;
1088 mv[0][0] = mv[0][1] =
1089 mv[1][0] = mv[1][1] = 0;
1091 for(list=0; list<2; list++){
1093 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1095 mv[list][0] = mv[list][1] = 0;
/* A list with no valid ref does not participate in prediction. */
1100 *mb_type &= ~MB_TYPE_P0L1;
1101 sub_mb_type &= ~MB_TYPE_P0L1;
1102 }else if(ref[0] < 0){
1103 *mb_type &= ~MB_TYPE_P0L0;
1104 sub_mb_type &= ~MB_TYPE_P0L0;
1107 if(IS_16X16(*mb_type)){
1110 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1111 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
/* Colocated block is static (ref 0, |mv| <= 1): zero out the MVs.
 * The x264_build check works around a bug in old x264 encoders. */
1112 if(!IS_INTRA(mb_type_col)
1113 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1114 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1115 && (h->x264_build>33 || !h->x264_build)))){
1117 a= pack16to32(mv[0][0],mv[0][1]);
1119 b= pack16to32(mv[1][0],mv[1][1]);
1121 a= pack16to32(mv[0][0],mv[0][1]);
1122 b= pack16to32(mv[1][0],mv[1][1]);
1124 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1125 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
/* 8x8 partition path: apply the same logic per 8x8 block. */
1127 for(i8=0; i8<4; i8++){
1128 const int x8 = i8&1;
1129 const int y8 = i8>>1;
1131 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1133 h->sub_mb_type[i8] = sub_mb_type;
1135 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1136 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1137 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1138 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
/* Per-8x8 colocated-static check (mirrors the 16x16 case above). */
1141 if(!IS_INTRA(mb_type_col) && ( l1ref0[x8 + y8*h->b8_stride] == 0
1142 || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1143 && (h->x264_build>33 || !h->x264_build)))){
1144 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1145 if(IS_SUB_8X8(sub_mb_type)){
1146 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1147 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1149 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1151 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1154 for(i4=0; i4<4; i4++){
1155 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1156 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1158 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1160 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1166 }else{ /* direct temporal mv pred */
1167 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1168 const int *dist_scale_factor = h->dist_scale_factor;
/* Field MBs use the field variants of the colocated map and scale factors. */
1171 if(IS_INTERLACED(*mb_type)){
1172 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1173 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1174 dist_scale_factor = h->dist_scale_factor_field;
/* Frame/field mismatch between current MB and colocated MB: rescale. */
1176 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1177 /* FIXME assumes direct_8x8_inference == 1 */
1178 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1179 int mb_types_col[2];
1182 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1183 | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1184 | (*mb_type & MB_TYPE_INTERLACED);
1185 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1187 if(IS_INTERLACED(*mb_type)){
1188 /* frame to field scaling */
1189 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1190 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
/* Rebase the colocated pointers to the top of the MB pair. */
1192 l1ref0 -= 2*h->b8_stride;
1193 l1ref1 -= 2*h->b8_stride;
1194 l1mv0 -= 4*h->b_stride;
1195 l1mv1 -= 4*h->b_stride;
1199 if( (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1200 && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1202 *mb_type |= MB_TYPE_16x8;
1204 *mb_type |= MB_TYPE_8x8;
1206 /* field to frame scaling */
1207 /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1208 * but in MBAFF, top and bottom POC are equal */
1209 int dy = (s->mb_y&1) ? 1 : 2;
1211 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1212 l1ref0 += dy*h->b8_stride;
1213 l1ref1 += dy*h->b8_stride;
1214 l1mv0 += 2*dy*h->b_stride;
1215 l1mv1 += 2*dy*h->b_stride;
1218 if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1220 *mb_type |= MB_TYPE_16x16;
1222 *mb_type |= MB_TYPE_8x8;
/* Mixed frame/field case: per-8x8 derivation with y_shift rescaling. */
1225 for(i8=0; i8<4; i8++){
1226 const int x8 = i8&1;
1227 const int y8 = i8>>1;
1229 const int16_t (*l1mv)[2]= l1mv0;
1231 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1233 h->sub_mb_type[i8] = sub_mb_type;
/* Temporal direct always uses list-1 ref 0. */
1235 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1236 if(IS_INTRA(mb_types_col[y8])){
1237 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1238 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1239 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1243 ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1245 ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1247 ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1250 scale = dist_scale_factor[ref0];
1251 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
/* Scale the colocated MV: x directly, y adjusted for frame/field units. */
1254 const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1255 int my_col = (mv_col[1]<<y_shift)/2;
1256 int mx = (scale * mv_col[0] + 128) >> 8;
1257 int my = (scale * my_col + 128) >> 8;
1258 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1259 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1266 /* one-to-one mv scaling */
1268 if(IS_16X16(*mb_type)){
1271 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1272 if(IS_INTRA(mb_type_col)){
/* Colocated MB intra: use ref 0 / zero MVs (elided branch). */
1275 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1276 : map_col_to_list0[1][l1ref1[0]];
1277 const int scale = dist_scale_factor[ref0];
1278 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
/* L0 MV = scaled colocated MV; L1 MV = L0 MV minus colocated MV (8.4.1.2.3). */
1280 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1281 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1283 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1284 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1286 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1287 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1288 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1290 for(i8=0; i8<4; i8++){
1291 const int x8 = i8&1;
1292 const int y8 = i8>>1;
1294 const int16_t (*l1mv)[2]= l1mv0;
1296 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1298 h->sub_mb_type[i8] = sub_mb_type;
1299 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1300 if(IS_INTRA(mb_type_col)){
1301 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1302 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1303 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1307 ref0 = l1ref0[x8 + y8*h->b8_stride];
1309 ref0 = map_col_to_list0[0][ref0];
1311 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1314 scale = dist_scale_factor[ref0];
1316 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1317 if(IS_SUB_8X8(sub_mb_type)){
1318 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1319 int mx = (scale * mv_col[0] + 128) >> 8;
1320 int my = (scale * mv_col[1] + 128) >> 8;
1321 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1322 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1324 for(i4=0; i4<4; i4++){
1325 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1326 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1327 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1328 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1329 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1330 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
/**
 * Copies the per-macroblock motion data from the decode caches
 * (h->mv_cache / h->ref_cache / h->mvd_cache) back into the frame-wide
 * tables of the current picture. Also stores the per-8x8 direct flags
 * for CABAC B slices.
 * NOTE(review): loop headers and some branches are elided in this excerpt;
 * the 64-bit copies move two 4-byte MVs per statement.
 */
1337 static inline void write_back_motion(H264Context *h, int mb_type){
1338 MpegEncContext * const s = &h->s;
1339 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1340 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
/* Mark list-0 refs unused so MV prediction of later MBs sees a clean state. */
1343 if(!USES_LIST(mb_type, 0))
1344 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1346 for(list=0; list<h->list_count; list++){
1348 if(!USES_LIST(mb_type, list))
/* Copy the 4x4 MV grid (two uint64 stores per row; elided row loop). */
1352 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1353 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1355 if( h->pps.cabac ) {
1356 if(IS_SKIP(mb_type))
1357 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1360 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1361 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
/* One reference index per 8x8 block. */
1366 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1367 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1368 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1369 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1370 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
/* CABAC B slices: persist which 8x8 sub-blocks used direct prediction. */
1374 if(h->slice_type == B_TYPE && h->pps.cabac){
1375 if(IS_8X8(mb_type)){
1376 uint8_t *direct_table = &h->direct_table[b8_xy];
1377 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1378 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1379 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1385 * Decodes a network abstraction layer unit.
1386 * @param consumed is the number of bytes used as input
1387 * @param length is the length of the array
1388 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1389 * @returns decoded bytes, might be src+1 if no escapes
1391 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
/* Parse the one-byte NAL header: ref_idc (bits 5-6) and unit type (bits 0-4). */
1396 // src[0]&0x80; //forbidden bit
1397 h->nal_ref_idc= src[0]>>5;
1398 h->nal_unit_type= src[0]&0x1F;
1402 for(i=0; i<length; i++)
1403 printf("%2X ", src[i]);
/* First pass: scan for the first 00 00 (01|02|03) emulation/startcode pattern. */
1405 for(i=0; i+1<length; i+=2){
1406 if(src[i]) continue;
1407 if(i>0 && src[i-1]==0) i--;
1408 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1410 /* startcode, so we must be past the end */
/* No escape sequence found: return the input buffer directly, no copy. */
1417 if(i>=length-1){ //no escaped 0
1418 *dst_length= length;
1419 *consumed= length+1; //+1 for the header
1423 bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1424 h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
/* NOTE(review): the realloc result is used unchecked; OOM would yield a
 * NULL dst below — elided lines may or may not guard this. */
1425 dst= h->rbsp_buffer[bufidx];
1431 //printf("decoding esc\n");
1434 //remove escapes (very rare 1:2^22)
1435 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1436 if(src[si+2]==3){ //escape
1441 }else //next start code
1445 dst[di++]= src[si++];
1449 *consumed= si + 1;//+1 for the header
1450 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1455 * identifies the exact end of the bitstream
1456 * @return the length of the trailing, or 0 if damaged
1458 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
/* NOTE(review): the rbsp_stop_one_bit scan is elided in this excerpt. */
1462 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1472 * idct tranforms the 16 dc values and dequantize them.
1473 * @param qp quantization parameter
1475 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1478 int temp[16]; //FIXME check if this is a good idea
/* DC coefficients are scattered through the 16x16 block at these offsets
 * (stride is 16 DCTELEMs per row of 4x4 blocks). */
1479 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1480 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1482 //memset(block, 64, 2*256);
/* Vertical pass of the 4x4 Hadamard transform into temp[]. */
1485 const int offset= y_offset[i];
1486 const int z0= block[offset+stride*0] + block[offset+stride*4];
1487 const int z1= block[offset+stride*0] - block[offset+stride*4];
1488 const int z2= block[offset+stride*1] - block[offset+stride*5];
1489 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* Horizontal pass, then dequantize with qmul and round (>>8). */
1498 const int offset= x_offset[i];
1499 const int z0= temp[4*0+i] + temp[4*2+i];
1500 const int z1= temp[4*0+i] - temp[4*2+i];
1501 const int z2= temp[4*1+i] - temp[4*3+i];
1502 const int z3= temp[4*1+i] + temp[4*3+i];
1504 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1505 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1506 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1507 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1513 * dct tranforms the 16 dc values.
1514 * @param qp quantization parameter ??? FIXME
1516 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1517 // const int qmul= dequant_coeff[qp][0];
1519 int temp[16]; //FIXME check if this is a good idea
/* Same scattered DC layout as h264_luma_dc_dequant_idct_c above. */
1520 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1521 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
/* Vertical Hadamard pass into temp[]. */
1524 const int offset= y_offset[i];
1525 const int z0= block[offset+stride*0] + block[offset+stride*4];
1526 const int z1= block[offset+stride*0] - block[offset+stride*4];
1527 const int z2= block[offset+stride*1] - block[offset+stride*5];
1528 const int z3= block[offset+stride*1] + block[offset+stride*5];
/* Horizontal pass; forward transform normalizes with >>1 (no dequant). */
1537 const int offset= x_offset[i];
1538 const int z0= temp[4*0+i] + temp[4*2+i];
1539 const int z1= temp[4*0+i] - temp[4*2+i];
1540 const int z2= temp[4*1+i] - temp[4*3+i];
1541 const int z3= temp[4*1+i] + temp[4*3+i];
1543 block[stride*0 +offset]= (z0 + z3)>>1;
1544 block[stride*2 +offset]= (z1 + z2)>>1;
1545 block[stride*8 +offset]= (z1 - z2)>>1;
1546 block[stride*10+offset]= (z0 - z3)>>1;
/**
 * 2x2 inverse Hadamard transform + dequantization of the chroma DC
 * coefficients, in place. The four DCs sit at the corners of a
 * 2x2 grid with x stride 16 and y stride 32 within the coeff block.
 * NOTE(review): the butterfly lines computing e from a/b/c/d are elided
 * in this excerpt.
 */
1554 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1555 const int stride= 16*2;
1556 const int xStride= 16;
1559 a= block[stride*0 + xStride*0];
1560 b= block[stride*0 + xStride*1];
1561 c= block[stride*1 + xStride*0];
1562 d= block[stride*1 + xStride*1];
/* Write back dequantized results; >>7 folds in the transform scaling. */
1569 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1570 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1571 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1572 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
/**
 * Forward 2x2 Hadamard transform of the chroma DC coefficients, in place
 * (encoder-side counterpart of chroma_dc_dequant_idct_c; same layout).
 * NOTE(review): the intermediate butterfly lines are elided in this excerpt.
 */
1576 static void chroma_dc_dct_c(DCTELEM *block){
1577 const int stride= 16*2;
1578 const int xStride= 16;
1581 a= block[stride*0 + xStride*0];
1582 b= block[stride*0 + xStride*1];
1583 c= block[stride*1 + xStride*0];
1584 d= block[stride*1 + xStride*1];
1591 block[stride*0 + xStride*0]= (a+c);
1592 block[stride*0 + xStride*1]= (e+b);
1593 block[stride*1 + xStride*0]= (a-c);
1594 block[stride*1 + xStride*1]= (e-b);
1599 * gets the chroma qp.
/* Maps a luma qscale to the chroma QP via the PPS lookup table;
 * t selects the Cb/Cr table index. The &0xff guards against
 * out-of-range qscale values indexing past the table. */
1601 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1602 return h->pps.chroma_qp_table[t][qscale & 0xff];
1605 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1606 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes a block of coefficients (encoder path).
 * Uses the threshold1/threshold2 trick: one unsigned comparison decides
 * whether |level| clears the dead zone in either direction, avoiding a
 * branch for the common all-below-threshold case. intra selects a larger
 * bias (1/3 vs 1/6); separate_dc applies a different shift to block[0].
 * @return index of the last non-zero coefficient.
 * NOTE(review): loop headers, the luma/chroma DC branch selector, and
 * several store/closing lines are elided in this excerpt.
 */
1607 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1609 const int * const quant_table= quant_coeff[qscale];
1610 const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1611 const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1612 const unsigned int threshold2= (threshold1<<1);
/* Separate-DC path, variant with QUANT_SHIFT-2 (luma DC per visible code). */
1618 const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1619 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1620 const unsigned int dc_threshold2= (dc_threshold1<<1);
1622 int level= block[0]*quant_coeff[qscale+18][0];
1623 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1625 level= (dc_bias + level)>>(QUANT_SHIFT-2);
1628 level= (dc_bias - level)>>(QUANT_SHIFT-2);
1631 // last_non_zero = i;
/* Separate-DC path, variant with QUANT_SHIFT+1. */
1636 const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1637 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1638 const unsigned int dc_threshold2= (dc_threshold1<<1);
1640 int level= block[0]*quant_table[0];
1641 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1643 level= (dc_bias + level)>>(QUANT_SHIFT+1);
1646 level= (dc_bias - level)>>(QUANT_SHIFT+1);
1649 // last_non_zero = i;
/* AC coefficients in scan order. */
1662 const int j= scantable[i];
1663 int level= block[j]*quant_table[j];
1665 // if( bias+level >= (1<<(QMAT_SHIFT - 3))
1666 // || bias-level >= (1<<(QMAT_SHIFT - 3))){
1667 if(((unsigned)(level+threshold1))>threshold2){
1669 level= (bias + level)>>QUANT_SHIFT;
1672 level= (bias - level)>>QUANT_SHIFT;
1681 return last_non_zero;
/**
 * Motion compensation for one partition in one direction (list).
 * Computes quarter-pel luma / eighth-pel chroma source positions from the
 * cached MV, falls back to ff_emulated_edge_mc when the reference area
 * extends outside the picture, then applies the luma qpel and chroma MC
 * functions.
 * NOTE(review): several lines (emu flag handling, square/delta second luma
 * call condition) are elided in this excerpt.
 */
1684 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1685 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1686 int src_x_offset, int src_y_offset,
1687 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1688 MpegEncContext * const s = &h->s;
/* MV in quarter-pel units, offset to absolute picture coordinates. */
1689 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1690 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1691 const int luma_xy= (mx&3) + ((my&3)<<2);
1692 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1693 uint8_t * src_cb, * src_cr;
1694 int extra_width= h->emu_edge_width;
1695 int extra_height= h->emu_edge_height;
1697 const int full_mx= mx>>2;
1698 const int full_my= my>>2;
1699 const int pic_width = 16*s->mb_width;
1700 const int pic_height = 16*s->mb_height >> MB_MBAFF;
1702 if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
/* Sub-pel filters need 2 extra pixels on each side; shrink the slack. */
1705 if(mx&7) extra_width -= 3;
1706 if(my&7) extra_height -= 3;
/* Reference block (16x16 + filter margin) leaves the picture: emulate edges. */
1708 if( full_mx < 0-extra_width
1709 || full_my < 0-extra_height
1710 || full_mx + 16/*FIXME*/ > pic_width + extra_width
1711 || full_my + 16/*FIXME*/ > pic_height + extra_height){
1712 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1713 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1717 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1719 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1722 if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1725 // chroma offset when predicting from a field of opposite parity
1726 my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
1727 emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1729 src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1730 src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1733 ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1734 src_cb= s->edge_emu_buffer;
1736 chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1739 ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1740 src_cr= s->edge_emu_buffer;
1742 chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
/**
 * Unweighted MC for one partition: predicts from list 0 with the "put"
 * functions, then (when bi-predicted) from list 1 with the "avg" functions
 * so the two predictions are averaged into the destination.
 * NOTE(review): the if(list0)/if(list1) guards and the put->avg switch
 * lines are elided in this excerpt.
 */
1745 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1746 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1747 int x_offset, int y_offset,
1748 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1749 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1750 int list0, int list1){
1751 MpegEncContext * const s = &h->s;
1752 qpel_mc_func *qpix_op= qpix_put;
1753 h264_chroma_mc_func chroma_op= chroma_put;
/* Advance destinations to this partition; offsets are in 8-pel units. */
1755 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1756 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1757 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1758 x_offset += 8*s->mb_x;
1759 y_offset += 8*(s->mb_y >> MB_MBAFF);
1762 Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1763 mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1764 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1765 qpix_op, chroma_op);
/* Switch to averaging ops so the list-1 prediction blends with list 0. */
1768 chroma_op= chroma_avg;
1772 Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1773 mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1774 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1775 qpix_op, chroma_op);
/**
 * Weighted MC for one partition. Bi-predicted blocks render list 1 into a
 * scratch buffer and blend with either implicit weights (use_weight == 2)
 * or explicit per-ref luma/chroma weights and offsets. Uni-predicted
 * blocks apply the explicit weight in place.
 * NOTE(review): the list0&&list1 guard and some branch/closing lines are
 * elided in this excerpt.
 */
1779 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1780 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1781 int x_offset, int y_offset,
1782 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1783 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1784 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1785 int list0, int list1){
1786 MpegEncContext * const s = &h->s;
1788 dest_y += 2*x_offset + 2*y_offset*h-> mb_linesize;
1789 dest_cb += x_offset + y_offset*h->mb_uvlinesize;
1790 dest_cr += x_offset + y_offset*h->mb_uvlinesize;
1791 x_offset += 8*s->mb_x;
1792 y_offset += 8*(s->mb_y >> MB_MBAFF);
1795 /* don't optimize for luma-only case, since B-frames usually
1796 * use implicit weights => chroma too. */
1797 uint8_t *tmp_cb = s->obmc_scratchpad;
1798 uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1799 uint8_t *tmp_y = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1800 int refn0 = h->ref_cache[0][ scan8[n] ];
1801 int refn1 = h->ref_cache[1][ scan8[n] ];
/* List 0 goes to the real destination, list 1 to the scratchpad. */
1803 mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1804 dest_y, dest_cb, dest_cr,
1805 x_offset, y_offset, qpix_put, chroma_put);
1806 mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1807 tmp_y, tmp_cb, tmp_cr,
1808 x_offset, y_offset, qpix_put, chroma_put);
/* Implicit weighting: weights sum to 64, log2 denom 5, zero offset. */
1810 if(h->use_weight == 2){
1811 int weight0 = h->implicit_weight[refn0][refn1];
1812 int weight1 = 64 - weight0;
1813 luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
1814 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1815 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
/* Explicit bi-directional weights and summed offsets. */
1817 luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1818 h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1819 h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1820 chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1821 h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1822 h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1823 chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1824 h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1825 h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
/* Uni-prediction: render directly, then apply the explicit weight. */
1828 int list = list1 ? 1 : 0;
1829 int refn = h->ref_cache[list][ scan8[n] ];
1830 Picture *ref= &h->ref_list[list][refn];
1831 mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1832 dest_y, dest_cb, dest_cr, x_offset, y_offset,
1833 qpix_put, chroma_put);
1835 luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1836 h->luma_weight[list][refn], h->luma_offset[list][refn]);
1837 if(h->use_weight_chroma){
1838 chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1839 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1840 chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1841 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
/**
 * Dispatches one partition to weighted or standard MC. Weighted MC is
 * used for explicit weighting (use_weight == 1) and for implicit
 * weighting when the weight is not the trivial 32/32 split (in which
 * case a plain average is equivalent and cheaper).
 * NOTE(review): the else keyword before the mc_part_std call is elided
 * in this excerpt.
 */
1846 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1847 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1848 int x_offset, int y_offset,
1849 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1850 qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1851 h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1852 int list0, int list1){
1853 if((h->use_weight==2 && list0 && list1
1854 && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1855 || h->use_weight==1)
1856 mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1857 x_offset, y_offset, qpix_put, chroma_put,
1858 weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1860 mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1861 x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1864 static inline void prefetch_motion(H264Context *h, int list){
1865 /* fetch pixels for estimated mv 4 macroblocks ahead
1866 * optimized for 64byte cache lines */
1867 MpegEncContext * const s = &h->s;
1868 const int refn = h->ref_cache[list][scan8[0]];
/* NOTE(review): an elided guard presumably skips invalid refn — confirm. */
/* MV converted from quarter-pel to integer pel; +16*mb_x+8 looks ahead. */
1870 const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1871 const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1872 uint8_t **src= h->ref_list[list][refn].data;
1873 int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1874 s->dsp.prefetch(src[0]+off, s->linesize, 4);
/* Cb and Cr planes are assumed contiguous: their spacing (src[2]-src[1])
 * is passed as the prefetch stride so one call covers both. */
1875 off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1876 s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
/**
 * Performs motion compensation for a whole inter macroblock, dispatching
 * to mc_part per partition shape (16x16, 16x8, 8x16, or per-8x8 sub-types
 * down to 4x4). Prefetches list-0 reference pixels before and list-1
 * after the work.
 * NOTE(review): the 8x8 loop header and some closing braces are elided in
 * this excerpt.
 */
1880 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1881 qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1882 qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1883 h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1884 MpegEncContext * const s = &h->s;
1885 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1886 const int mb_type= s->current_picture.mb_type[mb_xy];
1888 assert(IS_INTER(mb_type));
1890 prefetch_motion(h, 0);
1892 if(IS_16X16(mb_type)){
1893 mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1894 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1895 &weight_op[0], &weight_avg[0],
1896 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1897 }else if(IS_16X8(mb_type)){
1898 mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1899 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1900 &weight_op[1], &weight_avg[1],
1901 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1902 mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1903 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1904 &weight_op[1], &weight_avg[1],
1905 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1906 }else if(IS_8X16(mb_type)){
1907 mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1908 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1909 &weight_op[2], &weight_avg[2],
1910 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1911 mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1912 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1913 &weight_op[2], &weight_avg[2],
1914 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
/* 8x8 mode: each 8x8 block carries its own sub_mb_type. */
1918 assert(IS_8X8(mb_type));
1921 const int sub_mb_type= h->sub_mb_type[i];
1923 int x_offset= (i&1)<<2;
1924 int y_offset= (i&2)<<1;
1926 if(IS_SUB_8X8(sub_mb_type)){
1927 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1928 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1929 &weight_op[3], &weight_avg[3],
1930 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1931 }else if(IS_SUB_8X4(sub_mb_type)){
1932 mc_part(h, n , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1933 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1934 &weight_op[4], &weight_avg[4],
1935 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1936 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1937 qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1938 &weight_op[4], &weight_avg[4],
1939 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1940 }else if(IS_SUB_4X8(sub_mb_type)){
1941 mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1942 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1943 &weight_op[5], &weight_avg[5],
1944 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1945 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1946 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1947 &weight_op[5], &weight_avg[5],
1948 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1951 assert(IS_SUB_4X4(sub_mb_type));
1953 int sub_x_offset= x_offset + 2*(j&1);
1954 int sub_y_offset= y_offset + (j&2);
1955 mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1956 qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1957 &weight_op[6], &weight_avg[6],
1958 IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1964 prefetch_motion(h, 1);
/**
 * One-time initialization of the CAVLC tables (coeff_token, total_zeros,
 * run VLCs and their chroma-DC variants) from the static length/bits
 * arrays. Guarded by a static flag so repeated calls are no-ops.
 * NOTE(review): the done-flag check, loop headers, and closing braces are
 * elided in this excerpt; the guard is not thread-safe by itself —
 * presumably callers serialize init.
 */
1967 static void decode_init_vlc(void){
1968 static int done = 0;
1974 init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1975 &chroma_dc_coeff_token_len [0], 1, 1,
1976 &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1979 init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1980 &coeff_token_len [i][0], 1, 1,
1981 &coeff_token_bits[i][0], 1, 1, 1);
1985 init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1986 &chroma_dc_total_zeros_len [i][0], 1, 1,
1987 &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1989 for(i=0; i<15; i++){
1990 init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1991 &total_zeros_len [i][0], 1, 1,
1992 &total_zeros_bits[i][0], 1, 1, 1);
1996 init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1997 &run_len [i][0], 1, 1,
1998 &run_bits[i][0], 1, 1, 1);
/* run_before for zeros_left > 6 uses a single dedicated table. */
2000 init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2001 &run_len [6][0], 1, 1,
2002 &run_bits[6][0], 1, 1, 1);
/**
 * Free all per-decoder tables allocated by alloc_tables()/context_init(),
 * the SPS/PPS buffers, and the per-thread-context scratch buffers.
 * av_freep() NULLs each pointer, so a later alloc_tables() starts clean.
 */
2006 static void free_tables(H264Context *h){
2009 av_freep(&h->intra4x4_pred_mode);
2010 av_freep(&h->chroma_pred_mode_table);
2011 av_freep(&h->cbp_table);
2012 av_freep(&h->mvd_table[0]);
2013 av_freep(&h->mvd_table[1]);
2014 av_freep(&h->direct_table);
2015 av_freep(&h->non_zero_count);
2016 av_freep(&h->slice_table_base);
/* slice_table points into slice_table_base (see alloc_tables), so only
 * the base is freed and the derived pointer is cleared */
2017 h->slice_table= NULL;
2019 av_freep(&h->mb2b_xy);
2020 av_freep(&h->mb2b8_xy);
2022 for(i = 0; i < MAX_SPS_COUNT; i++)
2023 av_freep(h->sps_buffers + i);
2025 for(i = 0; i < MAX_PPS_COUNT; i++)
2026 av_freep(h->pps_buffers + i);
/* per-thread contexts own their own border/scratch/edge buffers */
2028 for(i = 0; i < h->s.avctx->thread_count; i++) {
2029 hx = h->thread_context[i];
2031 av_freep(&hx->top_borders[1]);
2032 av_freep(&hx->top_borders[0]);
2033 av_freep(&hx->s.obmc_scratchpad);
2034 av_freep(&hx->s.allocated_edge_emu_buffer);
/**
 * Precompute the 8x8 dequantization tables for all 52 QP values,
 * combining the standard init values with the PPS scaling matrices.
 * If both 8x8 scaling matrices are identical, table 1 aliases table 0.
 */
2038 static void init_dequant8_coeff_table(H264Context *h){
/* transposed layout is used when a non-C (SIMD) idct8 is active */
2040 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2041 h->dequant8_coeff[0] = h->dequant8_buffer[0];
2042 h->dequant8_coeff[1] = h->dequant8_buffer[1];
2044 for(i=0; i<2; i++ ){
2045 if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2046 h->dequant8_coeff[1] = h->dequant8_buffer[0];
2050 for(q=0; q<52; q++){
/* qp = 6*div6[q] + rem6[q]; shift scales by 2^(qp/6), idx selects the base row */
2051 int shift = ff_div6[q];
2052 int idx = ff_rem6[q];
2054 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2055 ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2056 h->pps.scaling_matrix8[i][x]) << shift;
/**
 * Precompute the 4x4 dequantization tables (6 matrices: intra/inter x
 * Y/Cb/Cr) for all 52 QP values. Matrices with identical PPS scaling
 * lists share one buffer to save memory.
 */
2061 static void init_dequant4_coeff_table(H264Context *h){
/* transposed layout is used when a non-C (SIMD) idct is active */
2063 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2064 for(i=0; i<6; i++ ){
2065 h->dequant4_coeff[i] = h->dequant4_buffer[i];
/* alias an earlier buffer when the scaling lists are equal */
2067 if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2068 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2075 for(q=0; q<52; q++){
/* +2 keeps extra precision relative to the 8x8 case */
2076 int shift = ff_div6[q] + 2;
2077 int idx = ff_rem6[q];
2079 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2080 ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2081 h->pps.scaling_matrix4[i][x]) << shift;
/**
 * (Re)build all dequant tables; 8x8 tables only when the PPS enables
 * 8x8 transforms. With lossless transform_bypass, QP 0 entries are
 * forced to the identity scale (1<<6) so dequant is a no-op there.
 */
2086 static void init_dequant_tables(H264Context *h){
2088 init_dequant4_coeff_table(h);
2089 if(h->pps.transform_8x8_mode)
2090 init_dequant8_coeff_table(h);
2091 if(h->sps.transform_bypass){
2094 h->dequant4_coeff[i][0][x] = 1<<6;
2095 if(h->pps.transform_8x8_mode)
2098 h->dequant8_coeff[i][0][x] = 1<<6;
2105 * needs width/height
2107 static int alloc_tables(H264Context *h){
2108 MpegEncContext * const s = &h->s;
/* +1 row of macroblocks so neighbor accesses above row 0 stay in bounds */
2109 const int big_mb_num= s->mb_stride * (s->mb_height+1);
2112 CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t))
2114 CHECKED_ALLOCZ(h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t))
2115 CHECKED_ALLOCZ(h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2116 CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
/* CABAC-only side tables (mvd, direct, chroma pred mode) */
2118 if( h->pps.cabac ) {
2119 CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2120 CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2121 CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2122 CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
/* -1 marks "no slice"; slice_table is offset so out-of-frame neighbors
 * land in valid (sentinel) memory */
2125 memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(uint8_t));
2126 h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
/* macroblock index -> motion-vector-block / 8x8-block index maps */
2128 CHECKED_ALLOCZ(h->mb2b_xy , big_mb_num * sizeof(uint32_t));
2129 CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2130 for(y=0; y<s->mb_height; y++){
2131 for(x=0; x<s->mb_width; x++){
2132 const int mb_xy= x + y*s->mb_stride;
2133 const int b_xy = 4*x + 4*y*h->b_stride;
2134 const int b8_xy= 2*x + 2*y*h->b8_stride;
2136 h->mb2b_xy [mb_xy]= b_xy;
2137 h->mb2b8_xy[mb_xy]= b8_xy;
/* allocated lazily in frame_start() once linesize is known */
2141 s->obmc_scratchpad = NULL;
2143 if(!h->dequant4_coeff[0])
2144 init_dequant_tables(h);
2153 * Mimic alloc_tables(), but for every context thread.
/* Shares (does NOT copy) the table pointers from src; ownership stays
 * with src, so free_tables() must only free them once. obmc_scratchpad
 * stays per-thread and is allocated later in frame_start(). */
2155 static void clone_tables(H264Context *dst, H264Context *src){
2156 dst->intra4x4_pred_mode = src->intra4x4_pred_mode;
2157 dst->non_zero_count = src->non_zero_count;
2158 dst->slice_table = src->slice_table;
2159 dst->cbp_table = src->cbp_table;
2160 dst->mb2b_xy = src->mb2b_xy;
2161 dst->mb2b8_xy = src->mb2b8_xy;
2162 dst->chroma_pred_mode_table = src->chroma_pred_mode_table;
2163 dst->mvd_table[0] = src->mvd_table[0];
2164 dst->mvd_table[1] = src->mvd_table[1];
2165 dst->direct_table = src->direct_table;
2167 dst->s.obmc_scratchpad = NULL;
2168 ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2173 * Allocate buffers which are not shared amongst multiple threads.
2175 static int context_init(H264Context *h){
2176 MpegEncContext * const s = &h->s;
/* one row of saved top-border samples: 16 luma + 8 Cb + 8 Cr per MB */
2178 CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2179 CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2181 // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2182 CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2183 (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2184 s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
2187 return -1; // free_tables will clean up for us
/**
 * Initialization shared by the H.264 and SVQ3 entry points: dimensions,
 * prediction function pointers, and flat (all-16) default scaling
 * matrices so dequant works before any SPS/PPS is parsed.
 */
2190 static void common_init(H264Context *h){
2191 MpegEncContext * const s = &h->s;
2193 s->width = s->avctx->width;
2194 s->height = s->avctx->height;
2195 s->codec_id= s->avctx->codec->id;
2197 ff_h264_pred_init(&h->hpc, s->codec_id);
/* -1 = "no PPS seen yet", forces dequant table rebuild on first PPS */
2199 h->dequant_coeff_pps= -1;
2200 s->unrestricted_mv=1;
2201 s->decode=1; //FIXME
/* 16 is the spec's flat scaling-list value (no scaling) */
2203 memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2204 memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
/**
 * AVCodec.init callback for the H.264 decoder.
 * NOTE(review): several lines (common_init call, avcc-extradata branch
 * body, return) are elided in this extraction; confirm against full file.
 */
2207 static int decode_init(AVCodecContext *avctx){
2208 H264Context *h= avctx->priv_data;
2209 MpegEncContext * const s = &h->s;
2211 MPV_decode_defaults(s);
2216 s->out_format = FMT_H264;
2217 s->workaround_bugs= avctx->workaround_bugs;
2220 // s->decode_mb= ff_h263_decode_mb;
2221 s->quarter_sample = 1;
2223 avctx->pix_fmt= PIX_FMT_YUV420P;
/* extradata starting with byte 1 == avcC (length-prefixed NAL) layout */
2227 if(avctx->extradata_size > 0 && avctx->extradata &&
2228 *(char *)avctx->extradata == 1){
/* slice 0 is decoded on the main context */
2235 h->thread_context[0] = h;
/**
 * Per-frame setup: starts the MPV frame and error resilience, computes
 * the block_offset table (frame layout in [0..23], field/MBAFF layout
 * with doubled stride in [24..47]), and lazily allocates the per-thread
 * obmc scratchpad now that linesize is known.
 */
2239 static int frame_start(H264Context *h){
2240 MpegEncContext * const s = &h->s;
2243 if(MPV_frame_start(s, s->avctx) < 0)
2245 ff_er_frame_start(s);
2247 assert(s->linesize && s->uvlinesize);
/* scan8 maps block index -> cache position; &7 = x, >>3 = y in 4x4 units */
2249 for(i=0; i<16; i++){
2250 h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2251 h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2254 h->block_offset[16+i]=
2255 h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2256 h->block_offset[24+16+i]=
2257 h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2260 /* can't be in alloc_tables because linesize isn't known there.
2261 * FIXME: redo bipred weight to not require extra buffer? */
2262 for(i = 0; i < s->avctx->thread_count; i++)
2263 if(!h->thread_context[i]->s.obmc_scratchpad)
2264 h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2266 /* some macroblocks will be accessed before they're available */
2267 if(FRAME_MBAFF || s->avctx->thread_count > 1)
2268 memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2270 // s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
/**
 * Save the bottom row and right column of the just-decoded macroblock
 * into top_borders[0]/left_border, so the deblocking filter of the
 * neighbors below/right can read the unfiltered samples later.
 */
2274 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2275 MpegEncContext * const s = &h->s;
2279 src_cb -= uvlinesize;
2280 src_cr -= uvlinesize;
2282 // There are two lines saved, the line above the top macroblock of a pair,
2283 // and the line above the bottom macroblock
2284 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2285 for(i=1; i<17; i++){
2286 h->left_border[i]= src_y[15+i* linesize];
/* save the MB's last luma row (16 bytes as two 64-bit stores) */
2289 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
2290 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
/* chroma is skipped in gray-only decoding */
2292 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2293 h->left_border[17 ]= h->top_borders[0][s->mb_x][16+7];
2294 h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2296 h->left_border[i+17 ]= src_cb[7+i*uvlinesize];
2297 h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2299 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2300 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
/**
 * Swap (xchg=1) or restore (xchg=0) the saved top/left border samples
 * with the picture, so intra prediction sees unfiltered neighbors while
 * the deblocking filter is enabled. With deblocking_filter==2 the swap
 * only happens across same-slice neighbors.
 */
2304 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2305 MpegEncContext * const s = &h->s;
2312 if(h->deblocking_filter == 2) {
2313 mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2314 deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2315 deblock_top = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2317 deblock_left = (s->mb_x > 0);
2318 deblock_top = (s->mb_y > 0);
/* step back to the row above / column left of the macroblock */
2321 src_y -= linesize + 1;
2322 src_cb -= uvlinesize + 1;
2323 src_cr -= uvlinesize + 1;
2325 #define XCHG(a,b,t,xchg)\
2332 for(i = !deblock_top; i<17; i++){
2333 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
2338 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2339 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
/* top-right luma samples of the next MB are needed by some pred modes */
2340 if(s->mb_x+1 < s->mb_width){
2341 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2345 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2347 for(i = !deblock_top; i<9; i++){
2348 XCHG(h->left_border[i+17 ], src_cb[i*uvlinesize], temp8, xchg);
2349 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2353 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2354 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
/**
 * MBAFF variant of backup_mb_border(): saves the borders for a whole
 * macroblock pair (two rows into top_borders[0]/[1], 2x16 luma + 2x8
 * chroma left-column samples into left_border).
 */
2359 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2360 MpegEncContext * const s = &h->s;
2363 src_y -= 2 * linesize;
2364 src_cb -= 2 * uvlinesize;
2365 src_cr -= 2 * uvlinesize;
2367 // There are two lines saved, the line above the top macroblock of a pair,
2368 // and the line above the bottom macroblock
2369 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2370 h->left_border[1]= h->top_borders[1][s->mb_x][15];
2371 for(i=2; i<34; i++){
2372 h->left_border[i]= src_y[15+i* linesize];
/* last two luma rows of the pair (rows 32 and 33 relative to src_y) */
2375 *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y + 32*linesize);
2376 *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2377 *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y + 33*linesize);
2378 *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2380 if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2381 h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7];
2382 h->left_border[34+ 1]= h->top_borders[1][s->mb_x][16+7];
2383 h->left_border[34+18 ]= h->top_borders[0][s->mb_x][24+7];
2384 h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2385 for(i=2; i<18; i++){
2386 h->left_border[i+34 ]= src_cb[7+i*uvlinesize];
2387 h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2389 *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2390 *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2391 *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2392 *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
/**
 * MBAFF variant of xchg_mb_border(): swaps/restores the saved borders
 * of a macroblock pair (both top_borders rows, 34 luma + 2x18 chroma
 * left samples) around intra prediction vs. deblocking.
 */
2396 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2397 MpegEncContext * const s = &h->s;
2400 int deblock_left = (s->mb_x > 0);
/* > 1 because the top neighbor of a pair is two MB rows up */
2401 int deblock_top = (s->mb_y > 1);
2403 tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2405 src_y -= 2 * linesize + 1;
2406 src_cb -= 2 * uvlinesize + 1;
2407 src_cr -= 2 * uvlinesize + 1;
2409 #define XCHG(a,b,t,xchg)\
2416 for(i = (!deblock_top)<<1; i<34; i++){
2417 XCHG(h->left_border[i ], src_y [i* linesize], temp8, xchg);
2422 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2423 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2424 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2425 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2426 if(s->mb_x+1 < s->mb_width){
2427 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2428 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2432 if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2434 for(i = (!deblock_top) << 1; i<18; i++){
2435 XCHG(h->left_border[i+34 ], src_cb[i*uvlinesize], temp8, xchg);
2436 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2440 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2441 XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2442 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2443 XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
/**
 * Reconstruct one macroblock: prediction (intra or motion compensation),
 * residual IDCT+add for luma and chroma, then deblocking. `simple`
 * selects a fast path that assumes progressive H.264 (no MBAFF, no PCM,
 * no SVQ3, no gray-only) and is resolved at compile time via
 * av_always_inline from the two thin wrappers below.
 */
2448 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2449 MpegEncContext * const s = &h->s;
2450 const int mb_x= s->mb_x;
2451 const int mb_y= s->mb_y;
2452 const int mb_xy= mb_x + mb_y*s->mb_stride;
2453 const int mb_type= s->current_picture.mb_type[mb_xy];
2454 uint8_t *dest_y, *dest_cb, *dest_cr;
2455 int linesize, uvlinesize /*dct_offset*/;
2457 int *block_offset = &h->block_offset[0];
2458 const unsigned int bottom = mb_y & 1;
2459 const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2460 void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2461 void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2463 dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2464 dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2465 dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2467 s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2468 s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
/* field macroblock: double the stride, use the field block_offset table,
 * and for the bottom field shift dest up into the interleaved lines */
2470 if (!simple && MB_FIELD) {
2471 linesize = h->mb_linesize = s->linesize * 2;
2472 uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2473 block_offset = &h->block_offset[24];
2474 if(mb_y&1){ //FIXME move out of this func?
2475 dest_y -= s->linesize*15;
2476 dest_cb-= s->uvlinesize*7;
2477 dest_cr-= s->uvlinesize*7;
2481 for(list=0; list<h->list_count; list++){
2482 if(!USES_LIST(mb_type, list))
2484 if(IS_16X16(mb_type)){
2485 int8_t *ref = &h->ref_cache[list][scan8[0]];
2486 fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
2488 for(i=0; i<16; i+=4){
2489 //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2490 int ref = h->ref_cache[list][scan8[i]];
2492 fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
2498 linesize = h->mb_linesize = s->linesize;
2499 uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2500 // dct_offset = s->linesize * 16;
/* pick the residual-add functions for this MB's transform size */
2503 if(transform_bypass){
2505 idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2506 }else if(IS_8x8DCT(mb_type)){
2507 idct_dc_add = s->dsp.h264_idct8_dc_add;
2508 idct_add = s->dsp.h264_idct8_add;
2510 idct_dc_add = s->dsp.h264_idct_dc_add;
2511 idct_add = s->dsp.h264_idct_add;
/* MBAFF intra: temporarily restore unfiltered borders of the MB pair */
2514 if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2515 && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2516 int mbt_y = mb_y&~1;
2517 uint8_t *top_y = s->current_picture.data[0] + (mbt_y * 16* s->linesize ) + mb_x * 16;
2518 uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2519 uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2520 xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
/* I_PCM: raw samples were parsed into h->mb; just copy them out */
2523 if (!simple && IS_INTRA_PCM(mb_type)) {
2526 // The pixels are stored in h->mb array in the same order as levels,
2527 // copy them in output in the correct order.
2528 for(i=0; i<16; i++) {
2529 for (y=0; y<4; y++) {
2530 for (x=0; x<4; x++) {
2531 *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2535 for(i=16; i<16+4; i++) {
2536 for (y=0; y<4; y++) {
2537 for (x=0; x<4; x++) {
2538 *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2542 for(i=20; i<20+4; i++) {
2543 for (y=0; y<4; y++) {
2544 for (x=0; x<4; x++) {
2545 *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
/* ---- intra prediction path ---- */
2550 if(IS_INTRA(mb_type)){
2551 if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2552 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2554 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2555 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2556 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2559 if(IS_INTRA4x4(mb_type)){
2560 if(simple || !s->encoding){
2561 if(IS_8x8DCT(mb_type)){
2562 for(i=0; i<16; i+=4){
2563 uint8_t * const ptr= dest_y + block_offset[i];
2564 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2565 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2566 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2567 (h->topright_samples_available<<i)&0x4000, linesize);
2569 if(nnz == 1 && h->mb[i*16])
2570 idct_dc_add(ptr, h->mb + i*16, linesize);
2572 idct_add(ptr, h->mb + i*16, linesize);
2576 for(i=0; i<16; i++){
2577 uint8_t * const ptr= dest_y + block_offset[i];
2579 const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
/* these modes read top-right samples; replicate the last available
 * top sample when the real top-right block is unavailable */
2582 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2583 const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2584 assert(mb_y || linesize <= block_offset[i]);
2585 if(!topright_avail){
2586 tr= ptr[3 - linesize]*0x01010101;
2587 topright= (uint8_t*) &tr;
2589 topright= ptr + 4 - linesize;
2593 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2594 nnz = h->non_zero_count_cache[ scan8[i] ];
2597 if(nnz == 1 && h->mb[i*16])
2598 idct_dc_add(ptr, h->mb + i*16, linesize);
2600 idct_add(ptr, h->mb + i*16, linesize);
2602 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2607 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2609 if(!transform_bypass)
2610 h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
2612 svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2614 if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2615 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
/* ---- inter path: motion compensation ---- */
2617 hl_motion(h, dest_y, dest_cb, dest_cr,
2618 s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2619 s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2620 s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
/* ---- luma residual ---- */
2624 if(!IS_INTRA4x4(mb_type)){
2626 if(IS_INTRA16x16(mb_type)){
2627 for(i=0; i<16; i++){
2628 if(h->non_zero_count_cache[ scan8[i] ])
2629 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2630 else if(h->mb[i*16])
2631 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2634 const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2635 for(i=0; i<16; i+=di){
2636 int nnz = h->non_zero_count_cache[ scan8[i] ];
2638 if(nnz==1 && h->mb[i*16])
2639 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2641 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2646 for(i=0; i<16; i++){
2647 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2648 uint8_t * const ptr= dest_y + block_offset[i];
2649 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
/* ---- chroma residual (skipped for gray-only decoding) ---- */
2655 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2656 uint8_t *dest[2] = {dest_cb, dest_cr};
2657 if(transform_bypass){
2658 idct_add = idct_dc_add = s->dsp.add_pixels4;
2660 idct_add = s->dsp.h264_idct_add;
2661 idct_dc_add = s->dsp.h264_idct_dc_add;
2662 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2663 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2666 for(i=16; i<16+8; i++){
2667 if(h->non_zero_count_cache[ scan8[i] ])
2668 idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2669 else if(h->mb[i*16])
2670 idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2673 for(i=16; i<16+8; i++){
2674 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2675 uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2676 svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
/* ---- deblocking ---- */
2682 if(h->deblocking_filter) {
2683 if (!simple && FRAME_MBAFF) {
2684 //FIXME try deblocking one mb at a time?
2685 // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
/* MBAFF filters the whole pair once the bottom MB is reconstructed */
2686 const int mb_y = s->mb_y - 1;
2687 uint8_t *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2688 const int mb_xy= mb_x + mb_y*s->mb_stride;
2689 const int mb_type_top = s->current_picture.mb_type[mb_xy];
2690 const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2691 if (!bottom) return;
2692 pair_dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
2693 pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2694 pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2696 if(IS_INTRA(mb_type_top | mb_type_bottom))
2697 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2699 backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2703 tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2704 fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2705 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2706 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2707 filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2710 tprintf(h->s.avctx, "call mbaff filter_mb\n");
2711 fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2712 h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2713 h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2714 filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2716 tprintf(h->s.avctx, "call filter_mb\n");
2717 backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2718 fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2719 filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2725 * Process a macroblock; this case avoids checks for expensive uncommon cases.
/* simple=1 is folded away by av_always_inline in hl_decode_mb_internal */
2727 static void hl_decode_mb_simple(H264Context *h){
2728 hl_decode_mb_internal(h, 1);
2732 * Process a macroblock; this handles edge cases, such as interlacing.
/* av_noinline keeps the rarely-taken slow path out of the hot caller */
2734 static void av_noinline hl_decode_mb_complex(H264Context *h){
2735 hl_decode_mb_internal(h, 0);
/**
 * Dispatch reconstruction of the current macroblock to the fast path
 * (progressive H.264) or the complex path (MBAFF/field, PCM, SVQ3,
 * gray-only, or encoding).
 */
2738 static void hl_decode_mb(H264Context *h){
2739 MpegEncContext * const s = &h->s;
2740 const int mb_x= s->mb_x;
2741 const int mb_y= s->mb_y;
2742 const int mb_xy= mb_x + mb_y*s->mb_stride;
2743 const int mb_type= s->current_picture.mb_type[mb_xy];
2744 int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2750 hl_decode_mb_complex(h);
2751 else hl_decode_mb_simple(h);
2755 * fills the default_ref_list.
/**
 * Builds the default reference picture lists per H.264 §8.2.4.2:
 * P slices get short-term refs by descending frame_num then long-term
 * refs; B slices get short-term refs ordered by POC distance (L0:
 * past-then-future, L1: future-then-past), falling back across the
 * current POC, then long-term refs. L0/L1 are de-duplicated by swapping
 * L1's first two entries when the lists would be identical.
 * Fixed vs. previous revision: stray empty statements (";;") after the
 * long-term pic_id assignments removed, and the List1 debug tprintf now
 * prints list 1's data pointer instead of list 0's.
 */
2757 static int fill_default_ref_list(H264Context *h){
2758 MpegEncContext * const s = &h->s;
2760 int smallest_poc_greater_than_current = -1;
2761 Picture sorted_short_ref[32];
2763 if(h->slice_type==B_TYPE){
2767 /* sort frame according to poc in B slice */
2768 for(out_i=0; out_i<h->short_ref_count; out_i++){
2770 int best_poc=INT_MAX;
/* selection sort: pick the smallest poc above the previous one */
2772 for(i=0; i<h->short_ref_count; i++){
2773 const int poc= h->short_ref[i]->poc;
2774 if(poc > limit && poc < best_poc){
2780 assert(best_i != INT_MIN);
2783 sorted_short_ref[out_i]= *h->short_ref[best_i];
2784 tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2785 if (-1 == smallest_poc_greater_than_current) {
2786 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2787 smallest_poc_greater_than_current = out_i;
2793 if(s->picture_structure == PICT_FRAME){
2794 if(h->slice_type==B_TYPE){
2796 tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2798 // find the largest poc
2799 for(list=0; list<2; list++){
/* L0 walks backward (past refs first), L1 forward (future refs first) */
2802 int step= list ? -1 : 1;
2804 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2805 while(j<0 || j>= h->short_ref_count){
2806 if(j != -99 && step == (list ? -1 : 1))
/* reverse direction once one side of the current POC is exhausted */
2809 j= smallest_poc_greater_than_current + (step>>1);
2811 if(sorted_short_ref[j].reference != 3) continue;
2812 h->default_ref_list[list][index ]= sorted_short_ref[j];
2813 h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
/* long-term refs are appended after the short-term ones */
2816 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2817 if(h->long_ref[i] == NULL) continue;
2818 if(h->long_ref[i]->reference != 3) continue;
2820 h->default_ref_list[ list ][index ]= *h->long_ref[i];
2821 h->default_ref_list[ list ][index++].pic_id= i;
2824 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2825 // swap the two first elements of L1 when
2826 // L0 and L1 are identical
2827 Picture temp= h->default_ref_list[1][0];
2828 h->default_ref_list[1][0] = h->default_ref_list[1][1];
2829 h->default_ref_list[1][1] = temp;
2832 if(index < h->ref_count[ list ])
2833 memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
/* P/SP slice: short-term refs in marking order, then long-term refs */
2837 for(i=0; i<h->short_ref_count; i++){
2838 if(h->short_ref[i]->reference != 3) continue; //FIXME reference field handling
2839 h->default_ref_list[0][index ]= *h->short_ref[i];
2840 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2842 for(i = 0; i < 16; i++){
2843 if(h->long_ref[i] == NULL) continue;
2844 if(h->long_ref[i]->reference != 3) continue;
2845 h->default_ref_list[0][index ]= *h->long_ref[i];
2846 h->default_ref_list[0][index++].pic_id= i;
2848 if(index < h->ref_count[0])
2849 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2852 if(h->slice_type==B_TYPE){
2854 //FIXME second field blah
2858 for (i=0; i<h->ref_count[0]; i++) {
2859 tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2861 if(h->slice_type==B_TYPE){
2862 for (i=0; i<h->ref_count[1]; i++) {
2863 tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2870 static void print_short_term(H264Context *h);
2871 static void print_long_term(H264Context *h);
/**
 * Parse ref_pic_list_reordering() from the slice header and apply it:
 * starts from the default lists, then for each reordering command
 * (idc 0/1 = short-term by pic_num delta, idc 2 = long-term by index,
 * idc 3 = end) moves the named picture to the current index, shifting
 * the rest down. Missing references are logged and zeroed.
 * Returns 0 on success, -1 on bitstream errors (elided returns).
 */
2873 static int decode_ref_pic_list_reordering(H264Context *h){
2874 MpegEncContext * const s = &h->s;
2877 print_short_term(h);
2879 if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
2881 for(list=0; list<h->list_count; list++){
2882 memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
/* ref_pic_list_reordering_flag_lX */
2884 if(get_bits1(&s->gb)){
2885 int pred= h->curr_pic_num;
2887 for(index=0; ; index++){
2888 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2889 unsigned int pic_id;
2891 Picture *ref = NULL;
2893 if(reordering_of_pic_nums_idc==3)
2896 if(index >= h->ref_count[list]){
2897 av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2901 if(reordering_of_pic_nums_idc<3){
2902 if(reordering_of_pic_nums_idc<2){
/* short-term: pred is updated by +-abs_diff, modulo max_pic_num */
2903 const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2905 if(abs_diff_pic_num >= h->max_pic_num){
2906 av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2910 if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2911 else pred+= abs_diff_pic_num;
2912 pred &= h->max_pic_num - 1;
2914 for(i= h->short_ref_count-1; i>=0; i--){
2915 ref = h->short_ref[i];
2916 assert(ref->reference == 3);
2917 assert(!ref->long_ref);
2918 if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
2922 ref->pic_id= ref->frame_num;
2924 pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2926 av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2929 ref = h->long_ref[pic_id];
2931 ref->pic_id= pic_id;
2932 assert(ref->reference == 3);
2933 assert(ref->long_ref);
2941 av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2942 memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
/* find the old position of ref (if present), then shift entries down
 * and insert ref at the current index */
2944 for(i=index; i+1<h->ref_count[list]; i++){
2945 if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2948 for(; i > index; i--){
2949 h->ref_list[list][i]= h->ref_list[list][i-1];
2951 h->ref_list[list][index]= *ref;
2954 av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
/* replace any remaining empty slots with the current picture */
2960 for(list=0; list<h->list_count; list++){
2961 for(index= 0; index < h->ref_count[list]; index++){
2962 if(!h->ref_list[list][index].data[0])
2963 h->ref_list[list][index]= s->current_picture;
2967 if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
2968 direct_dist_scale_factor(h);
2969 direct_ref_list_init(h);
/**
 * For MBAFF: derive field reference entries from each frame reference.
 * Entry 16+2*i is the top field, 16+2*i+1 the bottom field of frame i
 * (doubled linesize, bottom shifted by one line), and the per-ref
 * weighted/implicit prediction parameters are duplicated accordingly.
 */
2973 static void fill_mbaff_ref_list(H264Context *h){
2975 for(list=0; list<2; list++){ //FIXME try list_count
2976 for(i=0; i<h->ref_count[list]; i++){
2977 Picture *frame = &h->ref_list[list][i];
2978 Picture *field = &h->ref_list[list][16+2*i];
2981 field[0].linesize[j] <<= 1;
2982 field[1] = field[0];
/* bottom field starts one (frame) line below the top field */
2984 field[1].data[j] += frame->linesize[j];
2986 h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2987 h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2989 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2990 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
/* implicit weights are indexed by both lists; copy rows and columns */
2994 for(j=0; j<h->ref_count[1]; j++){
2995 for(i=0; i<h->ref_count[0]; i++)
2996 h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2997 memcpy(h->implicit_weight[16+2*j], h->implicit_weight[j], sizeof(*h->implicit_weight));
2998 memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
/**
 * Parse pred_weight_table() from the slice header (H.264 explicit
 * weighted prediction): log2 denominators, then per-reference optional
 * luma and chroma weight/offset pairs; absent entries get the default
 * weight (1<<denom) and offset 0. Sets h->use_weight/use_weight_chroma
 * when any entry differs from the defaults. List 1 is parsed only for
 * B slices.
 */
3002 static int pred_weight_table(H264Context *h){
3003 MpegEncContext * const s = &h->s;
3005 int luma_def, chroma_def;
3008 h->use_weight_chroma= 0;
3009 h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3010 h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
/* default weight = 1.0 in fixed point, i.e. 1 << denom */
3011 luma_def = 1<<h->luma_log2_weight_denom;
3012 chroma_def = 1<<h->chroma_log2_weight_denom;
3014 for(list=0; list<2; list++){
3015 for(i=0; i<h->ref_count[list]; i++){
3016 int luma_weight_flag, chroma_weight_flag;
3018 luma_weight_flag= get_bits1(&s->gb);
3019 if(luma_weight_flag){
3020 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3021 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3022 if( h->luma_weight[list][i] != luma_def
3023 || h->luma_offset[list][i] != 0)
3026 h->luma_weight[list][i]= luma_def;
3027 h->luma_offset[list][i]= 0;
3030 chroma_weight_flag= get_bits1(&s->gb);
3031 if(chroma_weight_flag){
3034 h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3035 h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3036 if( h->chroma_weight[list][i][j] != chroma_def
3037 || h->chroma_offset[list][i][j] != 0)
3038 h->use_weight_chroma= 1;
3043 h->chroma_weight[list][i][j]= chroma_def;
3044 h->chroma_offset[list][i][j]= 0;
3048 if(h->slice_type != B_TYPE) break;
3050 h->use_weight= h->use_weight || h->use_weight_chroma;
/* Derives the implicit bipred weight table (weighted_bipred_idc == 2)
 * from POC distances, per the H.264 spec's tx/dist_scale_factor formulas.
 * Falls back to equal 32/32 weights when the scale factor is out of range
 * or the POCs are degenerate.
 * NOTE(review): sampled excerpt — the early-return branch bodies and some
 * closing braces are not visible between the numbered lines. */
3054 static void implicit_weight_table(H264Context *h){
3055 MpegEncContext * const s = &h->s;
3057 int cur_poc = s->current_picture_ptr->poc;
// single symmetric reference pair around the current POC: no weighting needed
3059 if( h->ref_count[0] == 1 && h->ref_count[1] == 1
3060 && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3062 h->use_weight_chroma= 0;
3067 h->use_weight_chroma= 2;
3068 h->luma_log2_weight_denom= 5;
3069 h->chroma_log2_weight_denom= 5;
3071 for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3072 int poc0 = h->ref_list[0][ref0].poc;
3073 for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3074 int poc1 = h->ref_list[1][ref1].poc;
// td/tb clipping and tx computation follow H.264 8.4.2.3.2
3075 int td = av_clip(poc1 - poc0, -128, 127);
3077 int tb = av_clip(cur_poc - poc0, -128, 127);
3078 int tx = (16384 + (FFABS(td) >> 1)) / td;
3079 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
// out-of-range scale factor -> default to equal weights
3080 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3081 h->implicit_weight[ref0][ref1] = 32;
3083 h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3085 h->implicit_weight[ref0][ref1] = 32;
/* Drops the decoder's reference claim on a picture unless it is still
 * pending output (delayed_output_pic or in the delayed_pic[] queue).
 * NOTE(review): sampled excerpt — the branch bodies that actually clear
 * pic->reference are not visible between the numbered lines. */
3090 static inline void unreference_pic(H264Context *h, Picture *pic){
3093 if(pic == h->delayed_output_pic)
// scan the NULL-terminated delayed-picture queue for this picture
3096 for(i = 0; h->delayed_pic[i]; i++)
3097 if(pic == h->delayed_pic[i]){
3105 * instantaneous decoder refresh.
/* Clears all long-term and short-term reference pictures, as required at
 * an IDR access unit. */
3107 static void idr(H264Context *h){
// release every occupied long-term slot (fixed table of 16 entries)
3110 for(i=0; i<16; i++){
3111 if (h->long_ref[i] != NULL) {
3112 unreference_pic(h, h->long_ref[i]);
3113 h->long_ref[i]= NULL;
3116 h->long_ref_count=0;
// release all short-term references as well
3118 for(i=0; i<h->short_ref_count; i++){
3119 unreference_pic(h, h->short_ref[i]);
3120 h->short_ref[i]= NULL;
3122 h->short_ref_count=0;
3125 /* forget old pics after a seek */
/* AVCodecContext.flush callback: drops all delayed/output-pending pictures
 * and the current picture so decoding can restart cleanly after a seek. */
3126 static void flush_dpb(AVCodecContext *avctx){
3127 H264Context *h= avctx->priv_data;
3129 for(i=0; i<16; i++) {
3130 if(h->delayed_pic[i])
3131 h->delayed_pic[i]->reference= 0;
3132 h->delayed_pic[i]= NULL;
3134 if(h->delayed_output_pic)
3135 h->delayed_output_pic->reference= 0;
3136 h->delayed_output_pic= NULL;
// NOTE(review): the full function presumably also calls idr()/MPEG flush
// helpers in lines elided from this sampled excerpt — confirm upstream.
3138 if(h->s.current_picture_ptr)
3139 h->s.current_picture_ptr->reference= 0;
3144 * @return the removed picture or NULL if an error occurs
/* Removes (and returns) the short-term reference whose frame_num matches,
 * compacting the short_ref[] array to stay contiguous. */
3146 static Picture * remove_short(H264Context *h, int frame_num){
3147 MpegEncContext * const s = &h->s;
3150 if(s->avctx->debug&FF_DEBUG_MMCO)
3151 av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3153 for(i=0; i<h->short_ref_count; i++){
3154 Picture *pic= h->short_ref[i];
3155 if(s->avctx->debug&FF_DEBUG_MMCO)
3156 av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3157 if(pic->frame_num == frame_num){
3158 h->short_ref[i]= NULL;
// close the gap left by the removed entry (count was just decremented)
3159 if (--h->short_ref_count)
3160 memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3169 * @return the removed picture or NULL if an error occurs
3171 static Picture * remove_long(H264Context *h, int i){
3174 pic= h->long_ref[i];
3175 h->long_ref[i]= NULL;
3176 if(pic) h->long_ref_count--;
3182 * print short term list
/* Debug helper: dumps the short-term reference list when FF_DEBUG_MMCO
 * is enabled; no effect otherwise. */
3184 static void print_short_term(H264Context *h) {
3186 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3187 av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3188 for(i=0; i<h->short_ref_count; i++){
3189 Picture *pic= h->short_ref[i];
3190 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3196 * print long term list
/* Debug helper: dumps all 16 long-term reference slots when FF_DEBUG_MMCO
 * is enabled; no effect otherwise.  NOTE(review): the NULL check for each
 * slot is elided in this sampled excerpt — confirm it guards line 3205. */
3198 static void print_long_term(H264Context *h) {
3200 if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3201 av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3202 for(i = 0; i < 16; i++){
3203 Picture *pic= h->long_ref[i];
3205 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3212 * Executes the reference picture marking (memory management control operations).
3214 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3215 MpegEncContext * const s = &h->s;
3217 int current_is_long=0;
3220 if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3221 av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3223 for(i=0; i<mmco_count; i++){
3224 if(s->avctx->debug&FF_DEBUG_MMCO)
3225 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3227 switch(mmco[i].opcode){
3228 case MMCO_SHORT2UNUSED:
3229 pic= remove_short(h, mmco[i].short_frame_num);
3231 unreference_pic(h, pic);
3232 else if(s->avctx->debug&FF_DEBUG_MMCO)
3233 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3235 case MMCO_SHORT2LONG:
3236 pic= remove_long(h, mmco[i].long_index);
3237 if(pic) unreference_pic(h, pic);
3239 h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3240 if (h->long_ref[ mmco[i].long_index ]){
3241 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3242 h->long_ref_count++;
3245 case MMCO_LONG2UNUSED:
3246 pic= remove_long(h, mmco[i].long_index);
3248 unreference_pic(h, pic);
3249 else if(s->avctx->debug&FF_DEBUG_MMCO)
3250 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3253 pic= remove_long(h, mmco[i].long_index);
3254 if(pic) unreference_pic(h, pic);
3256 h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3257 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3258 h->long_ref_count++;
3262 case MMCO_SET_MAX_LONG:
3263 assert(mmco[i].long_index <= 16);
3264 // just remove the long term which index is greater than new max
3265 for(j = mmco[i].long_index; j<16; j++){
3266 pic = remove_long(h, j);
3267 if (pic) unreference_pic(h, pic);
3271 while(h->short_ref_count){
3272 pic= remove_short(h, h->short_ref[0]->frame_num);
3273 if(pic) unreference_pic(h, pic);
3275 for(j = 0; j < 16; j++) {
3276 pic= remove_long(h, j);
3277 if(pic) unreference_pic(h, pic);
3284 if(!current_is_long){
3285 pic= remove_short(h, s->current_picture_ptr->frame_num);
3287 unreference_pic(h, pic);
3288 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3291 if(h->short_ref_count)
3292 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3294 h->short_ref[0]= s->current_picture_ptr;
3295 h->short_ref[0]->long_ref=0;
3296 h->short_ref_count++;
3299 print_short_term(h);
/* Parses dec_ref_pic_marking() from the slice header into h->mmco[].
 * For IDR slices it reads no_output_of_prior_pics/long_term_reference flags;
 * otherwise it reads the adaptive MMCO list, or synthesizes a sliding-window
 * MMCO_SHORT2UNUSED when the reference buffer is full.
 * NOTE(review): sampled excerpt — error returns and mmco_index bookkeeping
 * are not visible between the numbered lines. */
3304 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3305 MpegEncContext * const s = &h->s;
3308 if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
// get_bits1() - 1 maps flag {0,1} to {-1,0}
3309 s->broken_link= get_bits1(gb) -1;
3310 h->mmco[0].long_index= get_bits1(gb) - 1; // current_long_term_idx
3311 if(h->mmco[0].long_index == -1)
3314 h->mmco[0].opcode= MMCO_LONG;
3318 if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3319 for(i= 0; i<MAX_MMCO_COUNT; i++) {
3320 MMCOOpcode opcode= get_ue_golomb(gb);
3322 h->mmco[i].opcode= opcode;
3323 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
// difference_of_pic_nums_minus1 -> absolute frame_num, wrapped to its range
3324 h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3325 /* if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
3326 av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3330 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3331 unsigned int long_index= get_ue_golomb(gb);
3332 if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
3333 av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3336 h->mmco[i].long_index= long_index;
3339 if(opcode > (unsigned)MMCO_LONG){
3340 av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3343 if(opcode == MMCO_END)
// sliding window: when the DPB is full, drop the oldest short-term ref
3348 assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3350 if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3351 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3352 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
/* Computes the picture order count (POC) for the current picture using
 * whichever of the three H.264 poc_type derivations the SPS selects,
 * then stores the field/frame POCs into the current Picture.
 * NOTE(review): sampled excerpt — field_poc declaration, several else
 * branches and poc_type==0 lsb/msb setup lines are not visible here. */
3362 static int init_poc(H264Context *h){
3363 MpegEncContext * const s = &h->s;
3364 const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3367 if(h->nal_unit_type == NAL_IDR_SLICE){
3368 h->frame_num_offset= 0;
// frame_num wrapped since the previous picture -> advance the offset
3370 if(h->frame_num < h->prev_frame_num)
3371 h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3373 h->frame_num_offset= h->prev_frame_num_offset;
3376 if(h->sps.poc_type==0){
3377 const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3379 if(h->nal_unit_type == NAL_IDR_SLICE){
// standard poc_msb wrap detection (H.264 8.2.1.1)
3384 if (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3385 h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3386 else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3387 h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3389 h->poc_msb = h->prev_poc_msb;
3390 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3392 field_poc[1] = h->poc_msb + h->poc_lsb;
3393 if(s->picture_structure == PICT_FRAME)
3394 field_poc[1] += h->delta_poc_bottom;
3395 }else if(h->sps.poc_type==1){
3396 int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3399 if(h->sps.poc_cycle_length != 0)
3400 abs_frame_num = h->frame_num_offset + h->frame_num;
3404 if(h->nal_ref_idc==0 && abs_frame_num > 0)
3407 expected_delta_per_poc_cycle = 0;
3408 for(i=0; i < h->sps.poc_cycle_length; i++)
3409 expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3411 if(abs_frame_num > 0){
3412 int poc_cycle_cnt = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3413 int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3415 expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3416 for(i = 0; i <= frame_num_in_poc_cycle; i++)
3417 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3421 if(h->nal_ref_idc == 0)
3422 expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3424 field_poc[0] = expectedpoc + h->delta_poc[0];
3425 field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3427 if(s->picture_structure == PICT_FRAME)
3428 field_poc[1] += h->delta_poc[1];
// poc_type == 2: POC derived directly from frame_num
3431 if(h->nal_unit_type == NAL_IDR_SLICE){
3434 if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3435 else poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3441 if(s->picture_structure != PICT_BOTTOM_FIELD)
3442 s->current_picture_ptr->field_poc[0]= field_poc[0];
3443 if(s->picture_structure != PICT_TOP_FIELD)
3444 s->current_picture_ptr->field_poc[1]= field_poc[1];
// frame POC is the smaller of the two field POCs
3445 if(s->picture_structure == PICT_FRAME) // FIXME field pix?
3446 s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3453 * initialize scan tables
3455 static void init_scan_tables(H264Context *h){
3456 MpegEncContext * const s = &h->s;
3458 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3459 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3460 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
3462 for(i=0; i<16; i++){
3463 #define T(x) (x>>2) | ((x<<2) & 0xF)
3464 h->zigzag_scan[i] = T(zigzag_scan[i]);
3465 h-> field_scan[i] = T( field_scan[i]);
3469 if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3470 memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
3471 memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3472 memcpy(h->field_scan8x8, field_scan8x8, 64*sizeof(uint8_t));
3473 memcpy(h->field_scan8x8_cavlc, field_scan8x8_cavlc, 64*sizeof(uint8_t));
3475 for(i=0; i<64; i++){
3476 #define T(x) (x>>3) | ((x&7)<<3)
3477 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
3478 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3479 h->field_scan8x8[i] = T(field_scan8x8[i]);
3480 h->field_scan8x8_cavlc[i] = T(field_scan8x8_cavlc[i]);
3484 if(h->sps.transform_bypass){ //FIXME same ugly
3485 h->zigzag_scan_q0 = zigzag_scan;
3486 h->zigzag_scan8x8_q0 = zigzag_scan8x8;
3487 h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3488 h->field_scan_q0 = field_scan;
3489 h->field_scan8x8_q0 = field_scan8x8;
3490 h->field_scan8x8_cavlc_q0 = field_scan8x8_cavlc;
3492 h->zigzag_scan_q0 = h->zigzag_scan;
3493 h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
3494 h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3495 h->field_scan_q0 = h->field_scan;
3496 h->field_scan8x8_q0 = h->field_scan8x8;
3497 h->field_scan8x8_cavlc_q0 = h->field_scan8x8_cavlc;
3502 * Replicates H264 "master" context to thread contexts.
3504 static void clone_slice(H264Context *dst, H264Context *src)
3506 memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
3507 dst->s.current_picture_ptr = src->s.current_picture_ptr;
3508 dst->s.current_picture = src->s.current_picture;
3509 dst->s.linesize = src->s.linesize;
3510 dst->s.uvlinesize = src->s.uvlinesize;
3512 dst->prev_poc_msb = src->prev_poc_msb;
3513 dst->prev_poc_lsb = src->prev_poc_lsb;
3514 dst->prev_frame_num_offset = src->prev_frame_num_offset;
3515 dst->prev_frame_num = src->prev_frame_num;
3516 dst->short_ref_count = src->short_ref_count;
3518 memcpy(dst->short_ref, src->short_ref, sizeof(dst->short_ref));
3519 memcpy(dst->long_ref, src->long_ref, sizeof(dst->long_ref));
3520 memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3521 memcpy(dst->ref_list, src->ref_list, sizeof(dst->ref_list));
3523 memcpy(dst->dequant4_coeff, src->dequant4_coeff, sizeof(src->dequant4_coeff));
3524 memcpy(dst->dequant8_coeff, src->dequant8_coeff, sizeof(src->dequant8_coeff));
3528 * decodes a slice header.
3529 * this will also call MPV_common_init() and frame_start() as needed
3531 * @param h h264context
3532 * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3534 * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
/* NOTE(review): this is a sampled excerpt — numerous error returns,
 * else-branches and closing braces fall between the numbered lines and are
 * not visible here. */
3536 static int decode_slice_header(H264Context *h, H264Context *h0){
3537 MpegEncContext * const s = &h->s;
3538 unsigned int first_mb_in_slice;
3539 unsigned int pps_id;
3540 int num_ref_idx_active_override_flag;
3541 static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3542 unsigned int slice_type, tmp, i;
3543 int default_ref_list_done = 0;
// a picture is a reference iff nal_ref_idc != 0; dropable is its inverse
3545 s->current_picture.reference= h->nal_ref_idc != 0;
3546 s->dropable= h->nal_ref_idc == 0;
3548 first_mb_in_slice= get_ue_golomb(&s->gb);
3550 if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3551 h0->current_slice = 0;
3552 s->current_picture_ptr= NULL;
// slice_type values 5..9 mean "fixed for the whole picture" (value - 5)
3555 slice_type= get_ue_golomb(&s->gb);
3557 av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3562 h->slice_type_fixed=1;
3564 h->slice_type_fixed=0;
3566 slice_type= slice_type_map[ slice_type ];
// the default ref list can be reused for I slices and repeated slice types
3567 if (slice_type == I_TYPE
3568 || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3569 default_ref_list_done = 1;
3571 h->slice_type= slice_type;
3573 s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
// --- PPS/SPS activation ---
3575 pps_id= get_ue_golomb(&s->gb);
3576 if(pps_id>=MAX_PPS_COUNT){
3577 av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3580 if(!h0->pps_buffers[pps_id]) {
3581 av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3584 h->pps= *h0->pps_buffers[pps_id];
3586 if(!h0->sps_buffers[h->pps.sps_id]) {
3587 av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3590 h->sps = *h0->sps_buffers[h->pps.sps_id];
// dequant tables depend on the PPS; rebuild only when it changed
3592 if(h == h0 && h->dequant_coeff_pps != pps_id){
3593 h->dequant_coeff_pps = pps_id;
3594 init_dequant_tables(h);
// --- geometry from the SPS (cropping offsets are in 2-pixel units) ---
3597 s->mb_width= h->sps.mb_width;
3598 s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3600 h->b_stride= s->mb_width*4;
3601 h->b8_stride= s->mb_width*2;
3603 s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3604 if(h->sps.frame_mbs_only_flag)
3605 s->height= 16*s->mb_height - 2*(h->sps.crop_top + h->sps.crop_bottom);
3607 s->height= 16*s->mb_height - 4*(h->sps.crop_top + h->sps.crop_bottom); //FIXME recheck
3609 if (s->context_initialized
3610 && ( s->width != s->avctx->width || s->height != s->avctx->height)) {
3612 return -1; // width / height changed during parallelized decoding
// --- lazy (re)initialization of the MPEG context and thread contexts ---
3616 if (!s->context_initialized) {
3618 return -1; // we cant (re-)initialize context during parallel decoding
3619 if (MPV_common_init(s) < 0)
3622 init_scan_tables(h);
3625 for(i = 1; i < s->avctx->thread_count; i++) {
3627 c = h->thread_context[i] = av_malloc(sizeof(H264Context));
// copy only the MpegEncContext part; the H264-specific tail is re-derived
3628 memcpy(c, h, sizeof(MpegEncContext));
3629 memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3632 init_scan_tables(c);
3636 for(i = 0; i < s->avctx->thread_count; i++)
3637 if(context_init(h->thread_context[i]) < 0)
3640 s->avctx->width = s->width;
3641 s->avctx->height = s->height;
3642 s->avctx->sample_aspect_ratio= h->sps.sar;
3643 if(!s->avctx->sample_aspect_ratio.den)
3644 s->avctx->sample_aspect_ratio.den = 1;
3646 if(h->sps.timing_info_present_flag){
3647 s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
// workaround for old x264 builds that wrote half the timescale
3648 if(h->x264_build > 0 && h->x264_build < 44)
3649 s->avctx->time_base.den *= 2;
3650 av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3651 s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3655 if(h0->current_slice == 0){
3656 if(frame_start(h) < 0)
3662 s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
3663 h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
// --- picture structure (PAFF fields are parsed but not implemented) ---
3666 h->mb_aff_frame = 0;
3667 if(h->sps.frame_mbs_only_flag){
3668 s->picture_structure= PICT_FRAME;
3670 if(get_bits1(&s->gb)) { //field_pic_flag
3671 s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3672 av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
3674 s->picture_structure= PICT_FRAME;
3675 h->mb_aff_frame = h->sps.mb_aff;
3678 assert(s->mb_num == s->mb_width * s->mb_height);
3679 if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
3680 first_mb_in_slice >= s->mb_num){
3681 av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3684 s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3685 s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
3686 assert(s->mb_y < s->mb_height);
// fields use 2*frame_num and one extra pic_num bit
3688 if(s->picture_structure==PICT_FRAME){
3689 h->curr_pic_num= h->frame_num;
3690 h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3692 h->curr_pic_num= 2*h->frame_num;
3693 h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3696 if(h->nal_unit_type == NAL_IDR_SLICE){
3697 get_ue_golomb(&s->gb); /* idr_pic_id */
// --- POC-related slice-header fields ---
3700 if(h->sps.poc_type==0){
3701 h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3703 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3704 h->delta_poc_bottom= get_se_golomb(&s->gb);
3708 if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3709 h->delta_poc[0]= get_se_golomb(&s->gb);
3711 if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3712 h->delta_poc[1]= get_se_golomb(&s->gb);
3717 if(h->pps.redundant_pic_cnt_present){
3718 h->redundant_pic_count= get_ue_golomb(&s->gb);
3721 //set defaults, might be overriden a few line later
3722 h->ref_count[0]= h->pps.ref_count[0];
3723 h->ref_count[1]= h->pps.ref_count[1];
3725 if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3726 if(h->slice_type == B_TYPE){
3727 h->direct_spatial_mv_pred= get_bits1(&s->gb);
3728 if(h->sps.mb_aff && h->direct_spatial_mv_pred)
3729 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
3731 num_ref_idx_active_override_flag= get_bits1(&s->gb);
3733 if(num_ref_idx_active_override_flag){
3734 h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3735 if(h->slice_type==B_TYPE)
3736 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
// unsigned trick: also catches ref_count == 0 (wraps to a huge value)
3738 if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3739 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3740 h->ref_count[0]= h->ref_count[1]= 1;
3744 if(h->slice_type == B_TYPE)
// --- reference list construction, reordering and weighting ---
3751 if(!default_ref_list_done){
3752 fill_default_ref_list(h);
3755 if(decode_ref_pic_list_reordering(h) < 0)
3758 if( (h->pps.weighted_pred && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
3759 || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
3760 pred_weight_table(h);
3761 else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
3762 implicit_weight_table(h);
// ref pic marking is parsed on the master context (h0)
3766 if(s->current_picture.reference)
3767 decode_ref_pic_marking(h0, &s->gb);
3770 fill_mbaff_ref_list(h);
3772 if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
3773 tmp = get_ue_golomb(&s->gb);
3775 av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3778 h->cabac_init_idc= tmp;
3781 h->last_qscale_diff = 0;
3782 tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3784 av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3788 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3789 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3790 //FIXME qscale / qp ... stuff
3791 if(h->slice_type == SP_TYPE){
3792 get_bits1(&s->gb); /* sp_for_switch_flag */
3794 if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
3795 get_se_golomb(&s->gb); /* slice_qs_delta */
// --- deblocking filter parameters ---
3798 h->deblocking_filter = 1;
3799 h->slice_alpha_c0_offset = 0;
3800 h->slice_beta_offset = 0;
3801 if( h->pps.deblocking_filter_parameters_present ) {
3802 tmp= get_ue_golomb(&s->gb);
3804 av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
// bitstream idc: 0 = on, 1 = off -> swap to the internal 1 = on convention
3807 h->deblocking_filter= tmp;
3808 if(h->deblocking_filter < 2)
3809 h->deblocking_filter^= 1; // 1<->0
3811 if( h->deblocking_filter ) {
3812 h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3813 h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
// cross-slice deblocking (type 1) cannot run with parallel slice contexts
3817 if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3818 if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3819 /* Cheat slightly for speed:
3820 Don't bother to deblock across slices */
3821 h->deblocking_filter = 2;
3823 h0->max_contexts = 1;
3824 if(!h0->single_decode_warning) {
3825 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3826 h0->single_decode_warning = 1;
3829 return 1; // deblocking switched inside frame
3833 if( s->avctx->skip_loop_filter >= AVDISCARD_ALL
3834 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
3835 ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR && h->slice_type == B_TYPE)
3836 ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3837 h->deblocking_filter= 0;
3840 if( h->pps.num_slice_groups > 1 && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
// NOTE(review): the literal `?` is a placeholder; presumably this line sits
// inside a disabled #if in the full source (FMO unimplemented) — confirm.
3841 slice_group_change_cycle= get_bits(&s->gb, ?);
3844 h0->last_slice_type = slice_type;
3845 h->slice_num = ++h0->current_slice;
3847 h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3848 h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
3850 if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3851 av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
3853 (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3855 av_get_pict_type_char(h->slice_type),
3856 pps_id, h->frame_num,
3857 s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3858 h->ref_count[0], h->ref_count[1],
3860 h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
3862 h->use_weight==1 && h->use_weight_chroma ? "c" : ""
// non-reference frames may use the cheaper 2-tap qpel interpolation
3866 if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
3867 s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3868 s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3870 s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3871 s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
/* Reads a CAVLC level_prefix: the number of leading zero bits before the
 * terminating 1, decoded via the bitstream-reader cache macros.
 * NOTE(review): sampled excerpt — the debug-print guard and the final
 * return statement are not visible between the numbered lines. */
3880 static inline int get_level_prefix(GetBitContext *gb){
3884 OPEN_READER(re, gb);
3885 UPDATE_CACHE(re, gb);
3886 buf=GET_CACHE(re, gb);
// position of the first set bit from the MSB side gives prefix length + 1
3888 log= 32 - av_log2(buf);
3890 print_bin(buf>>(32-log), log);
3891 av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
3894 LAST_SKIP_BITS(re, gb, log);
3895 CLOSE_READER(re, gb);
/* Returns whether an 8x8 transform may be used for the current macroblock:
 * every sub-partition must be 8x8 (or direct with 8x8 inference enabled).
 * NOTE(review): sampled excerpt — the loop header and return statements
 * are not visible between the numbered lines. */
3900 static inline int get_dct8x8_allowed(H264Context *h){
3903 if(!IS_SUB_8X8(h->sub_mb_type[i])
3904 || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
3911 * decodes a residual block.
3912 * @param n block index
3913 * @param scantable scantable
3914 * @param max_coeff number of coefficients in the block
3915 * @return <0 if an error occurred
/* CAVLC residual decoding: coeff_token (trailing ones + total coefficients),
 * then levels, total_zeros and run_before values; finally the coefficients
 * are scattered into `block` via `scantable`, optionally dequantized by
 * `qmul`.  NOTE(review): sampled excerpt — declarations, some braces and
 * a few branch lines are not visible between the numbered lines. */
3917 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
3918 MpegEncContext * const s = &h->s;
// maps the predicted nnz count to one of the 4 coeff_token VLC tables
3919 static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
3921 int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
3923 //FIXME put trailing_ones into the context
// chroma DC uses its own small coeff_token table; other blocks pick a table
// from the predicted non-zero count of neighbouring blocks
3925 if(n == CHROMA_DC_BLOCK_INDEX){
3926 coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
3927 total_coeff= coeff_token>>2;
3929 if(n == LUMA_DC_BLOCK_INDEX){
3930 total_coeff= pred_non_zero_count(h, 0);
3931 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3932 total_coeff= coeff_token>>2;
3934 total_coeff= pred_non_zero_count(h, n);
3935 coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
3936 total_coeff= coeff_token>>2;
3937 h->non_zero_count_cache[ scan8[n] ]= total_coeff;
3941 //FIXME set last_non_zero?
3945 if(total_coeff > (unsigned)max_coeff) {
3946 av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
// low 2 bits of coeff_token encode the trailing +/-1 count
3950 trailing_ones= coeff_token&3;
3951 tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
3952 assert(total_coeff<=16);
// trailing ones need only a sign bit each: 0 -> +1, 1 -> -1
3954 for(i=0; i<trailing_ones; i++){
3955 level[i]= 1 - 2*get_bits1(gb);
3959 int level_code, mask;
3960 int suffix_length = total_coeff > 10 && trailing_ones < 3;
3961 int prefix= get_level_prefix(gb);
3963 //first coefficient has suffix_length equal to 0 or 1
3964 if(prefix<14){ //FIXME try to build a large unified VLC table for all this
3966 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
3968 level_code= (prefix<<suffix_length); //part
3969 }else if(prefix==14){
3971 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
3973 level_code= prefix + get_bits(gb, 4); //part
3974 }else if(prefix==15){
3975 level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
3976 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
3978 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
// with <3 trailing ones, levels +/-1 cannot occur here, so bias by 2
3982 if(trailing_ones < 3) level_code += 2;
// map even/odd level_code to positive/negative level (branchless sign)
3987 mask= -(level_code&1);
3988 level[i]= (((2+level_code)>>1) ^ mask) - mask;
3991 //remaining coefficients have suffix_length > 0
3992 for(;i<total_coeff;i++) {
// thresholds at which suffix_length grows for subsequent levels
3993 static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
3994 prefix = get_level_prefix(gb);
3996 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
3997 }else if(prefix==15){
3998 level_code = (prefix<<suffix_length) + get_bits(gb, 12);
4000 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4003 mask= -(level_code&1);
4004 level[i]= (((2+level_code)>>1) ^ mask) - mask;
4005 if(level_code > suffix_limit[suffix_length])
// a full block has no zeros to distribute; otherwise decode total_zeros
4010 if(total_coeff == max_coeff)
4013 if(n == CHROMA_DC_BLOCK_INDEX)
4014 zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4016 zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
// two scatter loops: plain store (qmul == NULL path) vs dequantizing store
4019 coeff_num = zeros_left + total_coeff - 1;
4020 j = scantable[coeff_num];
4022 block[j] = level[0];
4023 for(i=1;i<total_coeff;i++) {
4026 else if(zeros_left < 7){
4027 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4029 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4031 zeros_left -= run_before;
4032 coeff_num -= 1 + run_before;
4033 j= scantable[ coeff_num ];
4038 block[j] = (level[0] * qmul[j] + 32)>>6;
4039 for(i=1;i<total_coeff;i++) {
4042 else if(zeros_left < 7){
4043 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4045 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4047 zeros_left -= run_before;
4048 coeff_num -= 1 + run_before;
4049 j= scantable[ coeff_num ];
4051 block[j]= (level[i] * qmul[j] + 32)>>6;
4056 av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
/* Predicts mb_field_decoding_flag for a skipped MBAFF pair from the left
 * neighbour if it belongs to this slice, else from the above neighbour,
 * defaulting to frame (progressive) coding otherwise. */
4063 static void predict_field_decoding_flag(H264Context *h){
4064 MpegEncContext * const s = &h->s;
4065 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
// left neighbour first, then top; both must be in the current slice
4066 int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4067 ? s->current_picture.mb_type[mb_xy-1]
4068 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4069 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4071 h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4075 * decodes a P_SKIP or B_SKIP macroblock
4077 static void decode_mb_skip(H264Context *h){
4078 MpegEncContext * const s = &h->s;
4079 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4082 memset(h->non_zero_count[mb_xy], 0, 16);
4083 memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4086 mb_type|= MB_TYPE_INTERLACED;
4088 if( h->slice_type == B_TYPE )
4090 // just for fill_caches. pred_direct_motion will set the real mb_type
4091 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4093 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4094 pred_direct_motion(h, &mb_type);
4095 mb_type|= MB_TYPE_SKIP;
4100 mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4102 fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4103 pred_pskip_motion(h, &mx, &my);
4104 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4105 fill_rectangle( h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4108 write_back_motion(h, mb_type);
4109 s->current_picture.mb_type[mb_xy]= mb_type;
4110 s->current_picture.qscale_table[mb_xy]= s->qscale;
4111 h->slice_table[ mb_xy ]= h->slice_num;
4112 h->prev_mb_skipped= 1;
4116 * decodes a macroblock
4117 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
/*
 * CAVLC macroblock layer: skip-run handling, mb_type, intra prediction modes
 * or inter refs+MVs, CBP, delta-QP and finally the residual blocks.
 * NOTE(review): this listing is elided — some original lines are not shown.
 */
4119 static int decode_mb_cavlc(H264Context *h){
4120 MpegEncContext * const s = &h->s;
4121 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4122 int partition_count;
4123 unsigned int mb_type, cbp;
4124 int dct8x8_allowed= h->pps.transform_8x8_mode;
4126 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4128 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4129 cbp = 0; /* avoid warning. FIXME: find a solution without slowing
/* ---- skip-run handling (P/B slices only) ---- */
4131 if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4132 if(s->mb_skip_run==-1)
4133 s->mb_skip_run= get_ue_golomb(&s->gb);
4135 if (s->mb_skip_run--) {
/* MBAFF: the field flag of a skipped top MB is read with the pair */
4136 if(FRAME_MBAFF && (s->mb_y&1) == 0){
4137 if(s->mb_skip_run==0)
4138 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4140 predict_field_decoding_flag(h);
4147 if( (s->mb_y&1) == 0 )
4148 h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4150 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4152 h->prev_mb_skipped= 0;
/* ---- mb_type: remapped through the per-slice-type info tables ---- */
4154 mb_type= get_ue_golomb(&s->gb);
4155 if(h->slice_type == B_TYPE){
4157 partition_count= b_mb_type_info[mb_type].partition_count;
4158 mb_type= b_mb_type_info[mb_type].type;
/* large mb_type values in P/B slices select intra modes */
4161 goto decode_intra_mb;
4163 }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4165 partition_count= p_mb_type_info[mb_type].partition_count;
4166 mb_type= p_mb_type_info[mb_type].type;
4169 goto decode_intra_mb;
4172 assert(h->slice_type == I_TYPE);
4175 av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4179 cbp= i_mb_type_info[mb_type].cbp;
4180 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4181 mb_type= i_mb_type_info[mb_type].type;
4185 mb_type |= MB_TYPE_INTERLACED;
4187 h->slice_table[ mb_xy ]= h->slice_num;
/* ---- I_PCM: raw byte-aligned samples, no prediction/transform ---- */
4189 if(IS_INTRA_PCM(mb_type)){
4192 // We assume these blocks are very rare so we do not optimize it.
4193 align_get_bits(&s->gb);
4195 // The pixels are stored in the same order as levels in h->mb array.
4196 for(y=0; y<16; y++){
4197 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4198 for(x=0; x<16; x++){
4199 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4200 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4204 const int index= 256 + 4*(y&3) + 32*(y>>2);
4206 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4207 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4211 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4213 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4214 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4218 // In deblocking, the quantizer is 0
4219 s->current_picture.qscale_table[mb_xy]= 0;
4220 h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4221 h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4222 // All coeffs are present
4223 memset(h->non_zero_count[mb_xy], 16, 16);
4225 s->current_picture.mb_type[mb_xy]= mb_type;
/* MBAFF field MBs address references per field: double the counts */
4230 h->ref_count[0] <<= 1;
4231 h->ref_count[1] <<= 1;
4234 fill_caches(h, mb_type, 0);
/* ---- intra prediction modes ---- */
4237 if(IS_INTRA(mb_type)){
4239 // init_top_left_availability(h);
4240 if(IS_INTRA4x4(mb_type)){
4243 if(dct8x8_allowed && get_bits1(&s->gb)){
4244 mb_type |= MB_TYPE_8x8DCT;
4248 // fill_intra4x4_pred_table(h);
4249 for(i=0; i<16; i+=di){
4250 int mode= pred_intra_mode(h, i);
/* prev_intra4x4_pred_mode_flag==0: rem_mode skips the predicted mode */
4252 if(!get_bits1(&s->gb)){
4253 const int rem_mode= get_bits(&s->gb, 3);
4254 mode = rem_mode + (rem_mode >= mode);
4258 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4260 h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4262 write_back_intra_pred_mode(h);
4263 if( check_intra4x4_pred_mode(h) < 0)
4266 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4267 if(h->intra16x16_pred_mode < 0)
4271 pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4274 h->chroma_pred_mode= pred_mode;
/* ---- inter, 8x8 partitions: sub_mb_type, refs and per-sub-block MVs ---- */
4275 }else if(partition_count==4){
4276 int i, j, sub_partition_count[4], list, ref[2][4];
4278 if(h->slice_type == B_TYPE){
4280 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4281 if(h->sub_mb_type[i] >=13){
4282 av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4285 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4286 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4288 if( IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4289 || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4290 pred_direct_motion(h, &mb_type);
/* mark interior edges unavailable so MV prediction ignores them */
4291 h->ref_cache[0][scan8[4]] =
4292 h->ref_cache[1][scan8[4]] =
4293 h->ref_cache[0][scan8[12]] =
4294 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4297 assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4299 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4300 if(h->sub_mb_type[i] >=4){
4301 av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4304 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4305 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
/* reference indices, one per 8x8 (te0 coded; 1 forces ref 0 for REF0 MBs) */
4309 for(list=0; list<h->list_count; list++){
4310 int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4312 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4313 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4314 unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4316 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4328 dct8x8_allowed = get_dct8x8_allowed(h);
4330 for(list=0; list<h->list_count; list++){
4332 if(IS_DIRECT(h->sub_mb_type[i])) {
4333 h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4336 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4337 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4339 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4340 const int sub_mb_type= h->sub_mb_type[i];
4341 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4342 for(j=0; j<sub_partition_count[i]; j++){
4344 const int index= 4*i + block_width*j;
4345 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
/* MV = median prediction + coded delta */
4346 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4347 mx += get_se_golomb(&s->gb);
4348 my += get_se_golomb(&s->gb);
4349 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
/* replicate the MV over the cache cells the sub-partition covers */
4351 if(IS_SUB_8X8(sub_mb_type)){
4353 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4355 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4356 }else if(IS_SUB_8X4(sub_mb_type)){
4357 mv_cache[ 1 ][0]= mx;
4358 mv_cache[ 1 ][1]= my;
4359 }else if(IS_SUB_4X8(sub_mb_type)){
4360 mv_cache[ 8 ][0]= mx;
4361 mv_cache[ 8 ][1]= my;
4363 mv_cache[ 0 ][0]= mx;
4364 mv_cache[ 0 ][1]= my;
4367 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4373 }else if(IS_DIRECT(mb_type)){
4374 pred_direct_motion(h, &mb_type);
4375 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
/* ---- inter, 16x16 / 16x8 / 8x16 partitions ---- */
4377 int list, mx, my, i;
4378 //FIXME we should set ref_idx_l? to 0 if we use that later ...
4379 if(IS_16X16(mb_type)){
4380 for(list=0; list<h->list_count; list++){
4382 if(IS_DIR(mb_type, 0, list)){
4383 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4384 if(val >= h->ref_count[list]){
4385 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4389 val= LIST_NOT_USED&0xFF;
4390 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4392 for(list=0; list<h->list_count; list++){
4394 if(IS_DIR(mb_type, 0, list)){
4395 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4396 mx += get_se_golomb(&s->gb);
4397 my += get_se_golomb(&s->gb);
4398 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4400 val= pack16to32(mx,my);
4403 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4406 else if(IS_16X8(mb_type)){
4407 for(list=0; list<h->list_count; list++){
4410 if(IS_DIR(mb_type, i, list)){
4411 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4412 if(val >= h->ref_count[list]){
4413 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4417 val= LIST_NOT_USED&0xFF;
4418 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4421 for(list=0; list<h->list_count; list++){
4424 if(IS_DIR(mb_type, i, list)){
4425 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4426 mx += get_se_golomb(&s->gb);
4427 my += get_se_golomb(&s->gb);
4428 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4430 val= pack16to32(mx,my);
4433 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4437 assert(IS_8X16(mb_type));
4438 for(list=0; list<h->list_count; list++){
4441 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4442 val= get_te0_golomb(&s->gb, h->ref_count[list]);
4443 if(val >= h->ref_count[list]){
4444 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4448 val= LIST_NOT_USED&0xFF;
4449 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4452 for(list=0; list<h->list_count; list++){
4455 if(IS_DIR(mb_type, i, list)){
4456 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4457 mx += get_se_golomb(&s->gb);
4458 my += get_se_golomb(&s->gb);
4459 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4461 val= pack16to32(mx,my);
4464 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4470 if(IS_INTER(mb_type))
4471 write_back_motion(h, mb_type);
/* ---- coded block pattern (mapped from the Exp-Golomb index) ---- */
4473 if(!IS_INTRA16x16(mb_type)){
4474 cbp= get_ue_golomb(&s->gb);
4476 av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4480 if(IS_INTRA4x4(mb_type))
4481 cbp= golomb_to_intra4x4_cbp[cbp];
4483 cbp= golomb_to_inter_cbp[cbp];
/* transform_size_8x8_flag (inter case; only when luma residual present) */
4487 if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4488 if(get_bits1(&s->gb))
4489 mb_type |= MB_TYPE_8x8DCT;
4491 s->current_picture.mb_type[mb_xy]= mb_type;
/* ---- residual: delta-QP then luma/chroma coefficient blocks ---- */
4493 if(cbp || IS_INTRA16x16(mb_type)){
4494 int i8x8, i4x4, chroma_idx;
4496 GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4497 const uint8_t *scan, *scan8x8, *dc_scan;
4499 // fill_non_zero_count_cache(h);
4501 if(IS_INTERLACED(mb_type)){
4502 scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4503 scan= s->qscale ? h->field_scan : h->field_scan_q0;
4504 dc_scan= luma_dc_field_scan;
4506 scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4507 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4508 dc_scan= luma_dc_zigzag_scan;
4511 dquant= get_se_golomb(&s->gb);
4513 if( dquant > 25 || dquant < -26 ){
4514 av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
/* QP wraps modulo 52 per the spec */
4518 s->qscale += dquant;
4519 if(((unsigned)s->qscale) > 51){
4520 if(s->qscale<0) s->qscale+= 52;
4521 else s->qscale-= 52;
4524 h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4525 h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4526 if(IS_INTRA16x16(mb_type)){
/* Intra16x16: separate luma DC plane, then 15-coeff AC blocks */
4527 if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4528 return -1; //FIXME continue if partitioned and other return -1 too
4531 assert((cbp&15) == 0 || (cbp&15) == 15);
4534 for(i8x8=0; i8x8<4; i8x8++){
4535 for(i4x4=0; i4x4<4; i4x4++){
4536 const int index= i4x4 + 4*i8x8;
4537 if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4543 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
/* non-Intra16x16: 4x4 or 8x8 luma blocks as selected by cbp bits */
4546 for(i8x8=0; i8x8<4; i8x8++){
4547 if(cbp & (1<<i8x8)){
4548 if(IS_8x8DCT(mb_type)){
4549 DCTELEM *buf = &h->mb[64*i8x8];
4551 for(i4x4=0; i4x4<4; i4x4++){
4552 if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4553 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4556 nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4557 nnz[0] += nnz[1] + nnz[8] + nnz[9];
4559 for(i4x4=0; i4x4<4; i4x4++){
4560 const int index= i4x4 + 4*i8x8;
4562 if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4568 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4569 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
/* chroma: 2x2 DC blocks (unquantized here) then 15-coeff AC blocks */
4575 for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4576 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4582 for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4583 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4584 for(i4x4=0; i4x4<4; i4x4++){
4585 const int index= 16 + 4*chroma_idx + i4x4;
4586 if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4592 uint8_t * const nnz= &h->non_zero_count_cache[0];
4593 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4594 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4597 uint8_t * const nnz= &h->non_zero_count_cache[0];
4598 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4599 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4600 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4602 s->current_picture.qscale_table[mb_xy]= s->qscale;
4603 write_back_non_zero_count(h);
/* undo the MBAFF ref_count doubling done above */
4606 h->ref_count[0] >>= 1;
4607 h->ref_count[1] >>= 1;
/*
 * Decodes mb_field_decoding_flag (CABAC). The context (0..2) counts how many
 * of the left/top neighbouring MB pairs in the same slice are field-coded.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4613 static int decode_cabac_field_decoding_flag(H264Context *h) {
4614 MpegEncContext * const s = &h->s;
4615 const int mb_x = s->mb_x;
/* address the top MB of the current MB pair */
4616 const int mb_y = s->mb_y & ~1;
4617 const int mba_xy = mb_x - 1 + mb_y *s->mb_stride;
4618 const int mbb_xy = mb_x + (mb_y-2)*s->mb_stride;
4620 unsigned int ctx = 0;
4622 if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4625 if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
/* state 70..72 = mb_field_decoding_flag contexts */
4629 return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
/*
 * Decodes an intra mb_type with CABAC: returns 0 for I_4x4, 25 for I_PCM,
 * or 1..24 encoding the I_16x16 variant (cbp_luma, cbp_chroma, pred mode).
 * ctx_base selects the state group; intra_slice adds the neighbour-derived
 * context increment used only in I slices.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4632 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4633 uint8_t *state= &h->cabac_state[ctx_base];
4637 MpegEncContext * const s = &h->s;
4638 const int mba_xy = h->left_mb_xy[0];
4639 const int mbb_xy = h->top_mb_xy;
/* context increment: count non-I4x4 neighbours in the same slice */
4641 if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4643 if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4645 if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4646 return 0; /* I4x4 */
4649 if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4650 return 0; /* I4x4 */
/* terminate symbol distinguishes I_PCM */
4653 if( get_cabac_terminate( &h->cabac ) )
4654 return 25; /* PCM */
4656 mb_type = 1; /* I16x16 */
4657 mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4658 if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4659 mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
/* two bits of intra16x16 prediction mode */
4660 mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4661 mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
/*
 * Decodes mb_type with CABAC for I, P and B slices; the returned index is
 * later remapped through i_/p_/b_mb_type_info by the caller.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4665 static int decode_cabac_mb_type( H264Context *h ) {
4666 MpegEncContext * const s = &h->s;
4668 if( h->slice_type == I_TYPE ) {
4669 return decode_cabac_intra_mb_type(h, 3, 1);
4670 } else if( h->slice_type == P_TYPE ) {
/* state 14: inter (0) vs intra (1) macroblock */
4671 if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4673 if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4674 /* P_L0_D16x16, P_8x8 */
4675 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4677 /* P_L0_D8x16, P_L0_D16x8 */
4678 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
/* intra types in a P slice start at index 5 */
4681 return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4683 } else if( h->slice_type == B_TYPE ) {
4684 const int mba_xy = h->left_mb_xy[0];
4685 const int mbb_xy = h->top_mb_xy;
/* context: number of non-direct neighbours in the same slice */
4689 if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4691 if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4694 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4695 return 0; /* B_Direct_16x16 */
4697 if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4698 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
/* 4-bit suffix selects among the remaining B types */
4701 bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4702 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4703 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4704 bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4706 return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4707 else if( bits == 13 ) {
/* intra types in a B slice start at index 23 */
4708 return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4709 } else if( bits == 14 )
4710 return 11; /* B_L1_L0_8x16 */
4711 else if( bits == 15 )
4712 return 22; /* B_8x8 */
4714 bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4715 return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4717 /* TODO SI/SP frames? */
/*
 * Decodes mb_skip_flag (CABAC) for the MB at (mb_x, mb_y). The context
 * (0..2) counts left/top neighbours in the same slice that are NOT skipped;
 * the MBAFF path adjusts the neighbour addresses across field/frame pairs.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4722 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4723 MpegEncContext * const s = &h->s;
4727 if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4728 int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4731 && h->slice_table[mba_xy] == h->slice_num
4732 && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] )
4733 mba_xy += s->mb_stride;
4735 mbb_xy = mb_xy - s->mb_stride;
4737 && h->slice_table[mbb_xy] == h->slice_num
4738 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4739 mbb_xy -= s->mb_stride;
4741 mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
/* non-MBAFF: plain left / top neighbours */
4743 int mb_xy = mb_x + mb_y*s->mb_stride;
4745 mbb_xy = mb_xy - s->mb_stride;
4748 if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4750 if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
/* B slices use a separate context group (offset visible as 11+ctx below) */
4753 if( h->slice_type == B_TYPE )
4755 return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
/*
 * Decodes one intra4x4 prediction mode (CABAC): first bin says "use the
 * predicted mode"; otherwise 3 bins give rem_intra4x4_pred_mode, which is
 * shifted up by one when it is >= the predicted mode.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4758 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4761 if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
/* 3-bit remaining mode, LSB first (state 69 for all three bins) */
4764 mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4765 mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4766 mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4768 if( mode >= pred_mode )
/*
 * Decodes intra_chroma_pred_mode (CABAC, truncated unary of max length 3).
 * Context 0..2 counts neighbours with a non-zero chroma prediction mode.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4774 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4775 const int mba_xy = h->left_mb_xy[0];
4776 const int mbb_xy = h->top_mb_xy;
4780 /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4781 if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4784 if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4787 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
/* remaining bins share context 64+3 */
4790 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4792 if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
/* x coordinate (in 4x4-block units) of each of the 16 luma 4x4 blocks,
 * in decoding order (raster within each 8x8 quadrant). */
4798 static const uint8_t block_idx_x[16] = {
4799 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
/* matching y coordinate for each 4x4 block index */
4801 static const uint8_t block_idx_y[16] = {
4802 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
/* inverse mapping [x][y] -> block index; contents not visible in this
 * elided listing — presumably the inverse of the two tables above. */
4804 static const uint8_t block_idx_xy[4][4] = {
/*
 * Decodes the 4 luma CBP bits (one per 8x8 quadrant) with CABAC. The
 * context for each bit depends on whether the left/top neighbouring 8x8
 * blocks (possibly in the left/top MB, via left_cbp/top_cbp) were coded.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4811 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4816 if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
4818 tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
4821 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
/* coordinates of this quadrant's first 4x4 block */
4826 x = block_idx_x[4*i8x8];
4827 y = block_idx_y[4*i8x8];
4831 else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
4832 cbp_a = h->left_cbp;
4833 tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
4839 /* No need to test for skip as we put 0 for skip block */
4840 /* No need to test for IPCM as we put 1 for IPCM block */
4842 int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
4843 if( ((cbp_a >> i8x8a)&0x01) == 0 )
4848 int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
4849 if( ((cbp_b >> i8x8b)&0x01) == 0 )
/* states 73..76 = coded_block_pattern luma contexts */
4853 if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
/*
 * Decodes the 2-bit chroma CBP (0 = none, 1 = DC only, 2 = DC+AC) with
 * CABAC; contexts derive from the neighbours' chroma CBP (bits 4-5 of
 * left_cbp/top_cbp).
 * NOTE(review): listing elided — some original lines are not shown.
 */
4859 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4863 cbp_a = (h->left_cbp>>4)&0x03;
4864 cbp_b = (h-> top_cbp>>4)&0x03;
/* first bin: any chroma coefficients at all? */
4867 if( cbp_a > 0 ) ctx++;
4868 if( cbp_b > 0 ) ctx += 2;
4869 if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
/* second bin: AC coefficients too? context from neighbours' value 2 */
4873 if( cbp_a == 2 ) ctx++;
4874 if( cbp_b == 2 ) ctx += 2;
4875 return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
/*
 * Decodes mb_qp_delta (CABAC): a unary value mapped to the signed delta
 * (even -> positive, odd -> negative). Context depends on whether the
 * previous MB (in decoding order) had a non-zero delta.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4877 static int decode_cabac_mb_dqp( H264Context *h) {
4878 MpegEncContext * const s = &h->s;
4884 mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
/* wrap to the last MB of the previous row at the left edge */
4886 mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
4888 if( h->last_qscale_diff != 0 )
4891 while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4897 if(val > 102) //prevent infinite loop
/* odd unary values map to negative deltas */
4904 return -(val + 1)/2;
/*
 * Decodes sub_mb_type for a P-slice 8x8 partition (CABAC, states 21-23);
 * the result indexes p_sub_mb_type_info.
 * NOTE(review): listing elided — returns between the bins are not shown.
 */
4906 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4907 if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4909 if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4911 if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
/*
 * Decodes sub_mb_type for a B-slice 8x8 partition (CABAC, states 36-39);
 * the result indexes b_sub_mb_type_info.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4915 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4917 if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4918 return 0; /* B_Direct_8x8 */
4919 if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4920 return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4922 if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4923 if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4924 return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
/* two more suffix bins share state 39 */
4927 type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4928 type += get_cabac( &h->cabac, &h->cabac_state[39] );
/* Decodes transform_size_8x8_flag; context = number of neighbouring MBs
 * already using the 8x8 transform (h->neighbor_transform_size). */
4932 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
4933 return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
/*
 * Decodes ref_idx for block n of the given list (CABAC, unary-coded,
 * states 54+). Context derives from the left/top cached ref indices; in
 * B slices direct-predicted neighbours do not count.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4936 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
4937 int refa = h->ref_cache[list][scan8[n] - 1];
4938 int refb = h->ref_cache[list][scan8[n] - 8];
4942 if( h->slice_type == B_TYPE) {
4943 if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
4945 if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
4954 while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
/* sanity cap: a conformant stream never reaches 32 references */
4960 if(ref >= 32 /*h->ref_list[list]*/){
4961 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
4962 return 0; //FIXME we should return -1 and check the return everywhere
/*
 * Decodes one motion vector difference component (l = 0 horizontal,
 * 1 vertical) for block n: UEG3 binarization — context-coded prefix,
 * Exp-Golomb bypass suffix, bypass sign. The initial context comes from
 * the magnitude of the neighbouring MVDs.
 * NOTE(review): listing elided — some original lines are not shown.
 */
4968 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
4969 int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
4970 abs( h->mvd_cache[list][scan8[n] - 8][l] );
4971 int ctxbase = (l == 0) ? 40 : 47;
4976 else if( amvd > 32 )
4981 if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
/* unary prefix, at most 9 context-coded bins */
4986 while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
/* Exp-Golomb(k=3) bypass suffix */
4994 while( get_cabac_bypass( &h->cabac ) ) {
4998 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5003 if( get_cabac_bypass( &h->cabac ) )
/* bypass-coded sign applied to the magnitude */
5007 return get_cabac_bypass_sign( &h->cabac, -mvd );
/*
 * Computes the coded_block_flag context for block category `cat` (see the
 * cat table in decode_cabac_residual) and block index `idx`: ctx =
 * (left nz) + 2*(top nz) + 4*cat, with the neighbour non-zero info coming
 * from cbp bits (DC cases) or the non_zero_count cache (AC/4x4 cases).
 * NOTE(review): listing elided — some original lines are not shown.
 */
5010 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
/* luma DC: bit 8 of the neighbour cbp */
5015 nza = h->left_cbp&0x100;
5016 nzb = h-> top_cbp&0x100;
5017 } else if( cat == 1 || cat == 2 ) {
5018 nza = h->non_zero_count_cache[scan8[idx] - 1];
5019 nzb = h->non_zero_count_cache[scan8[idx] - 8];
5020 } else if( cat == 3 ) {
/* chroma DC: bits 6/7 of the neighbour cbp, per component */
5021 nza = (h->left_cbp>>(6+idx))&0x01;
5022 nzb = (h-> top_cbp>>(6+idx))&0x01;
5025 nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5026 nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5035 return ctx + 4 * cat;
/* Context offset for the last_significant_coeff flag in 8x8 CABAC residual
 * decoding, indexed by scan position (used by DECODE_SIGNIFICANCE below). */
5038 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5039 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5040 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5041 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5042 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
/*
 * Decodes one residual block with CABAC: coded_block_flag, significance
 * map + last flag, then coefficient levels (context-coded up to 14, then
 * Exp-Golomb bypass) and bypass signs. Coefficients are written to
 * `block` at positions given by `scantable`, dequantized with `qmul`
 * (NULL = leave DC unscaled).
 * NOTE(review): listing elided — some original lines are not shown.
 */
5045 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5046 const int mb_xy = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
/* context-state base offsets per [field, cat] per the H.264 spec tables */
5047 static const int significant_coeff_flag_offset[2][6] = {
5048 { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5049 { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5051 static const int last_coeff_flag_offset[2][6] = {
5052 { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5053 { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5055 static const int coeff_abs_level_m1_offset[6] = {
5056 227+0, 227+10, 227+20, 227+30, 227+39, 426
/* per-scan-position significance context for 8x8 blocks, [frame/field] */
5058 static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5059 { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5060 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5061 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5062 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5063 { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5064 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5065 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5066 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5072 int coeff_count = 0;
5075 int abslevelgt1 = 0;
5077 uint8_t *significant_coeff_ctx_base;
5078 uint8_t *last_coeff_ctx_base;
5079 uint8_t *abs_level_m1_ctx_base;
/* keep the CABAC state in a stack copy while decoding (speed) */
5082 #define CABAC_ON_STACK
5084 #ifdef CABAC_ON_STACK
5087 cc.range = h->cabac.range;
5088 cc.low = h->cabac.low;
5089 cc.bytestream= h->cabac.bytestream;
5091 #define CC &h->cabac
5095 /* cat: 0-> DC 16x16 n = 0
5096 * 1-> AC 16x16 n = luma4x4idx
5097 * 2-> Luma4x4 n = luma4x4idx
5098 * 3-> DC Chroma n = iCbCr
5099 * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5100 * 5-> Luma8x8 n = 4 * luma8x8idx
5103 /* read coded block flag */
5105 if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
/* block entirely zero: just clear the nnz cache and write state back */
5106 if( cat == 1 || cat == 2 )
5107 h->non_zero_count_cache[scan8[n]] = 0;
5109 h->non_zero_count_cache[scan8[16+n]] = 0;
5110 #ifdef CABAC_ON_STACK
5111 h->cabac.range = cc.range ;
5112 h->cabac.low = cc.low ;
5113 h->cabac.bytestream= cc.bytestream;
5119 significant_coeff_ctx_base = h->cabac_state
5120 + significant_coeff_flag_offset[MB_FIELD][cat];
5121 last_coeff_ctx_base = h->cabac_state
5122 + last_coeff_flag_offset[MB_FIELD][cat];
5123 abs_level_m1_ctx_base = h->cabac_state
5124 + coeff_abs_level_m1_offset[cat];
/* significance map: records scan positions of non-zero coefficients in
 * index[], stopping early when the last_coeff flag fires */
5127 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5128 for(last= 0; last < coefs; last++) { \
5129 uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5130 if( get_cabac( CC, sig_ctx )) { \
5131 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5132 index[coeff_count++] = last; \
5133 if( get_cabac( CC, last_ctx ) ) { \
5139 if( last == max_coeff -1 ) {\
5140 index[coeff_count++] = last;\
5142 const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
/* hand-written asm variants where available */
5143 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5144 coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5146 coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5148 DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5150 DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5153 assert(coeff_count > 0);
/* record non-zero counts / DC presence per category */
5156 h->cbp_table[mb_xy] |= 0x100;
5157 else if( cat == 1 || cat == 2 )
5158 h->non_zero_count_cache[scan8[n]] = coeff_count;
5160 h->cbp_table[mb_xy] |= 0x40 << n;
5162 h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5165 fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
/* levels, decoded from the highest scan position downwards */
5168 for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5169 uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5170 int j= scantable[index[coeff_count]];
5172 if( get_cabac( CC, ctx ) == 0 ) {
/* |level| == 1: sign only; dequantize unless qmul is NULL (DC) */
5174 block[j] = get_cabac_bypass_sign( CC, -1);
5176 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5182 ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5183 while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
/* |level| >= 15: Exp-Golomb bypass escape */
5187 if( coeff_abs >= 15 ) {
5189 while( get_cabac_bypass( CC ) ) {
5195 coeff_abs += coeff_abs + get_cabac_bypass( CC );
5201 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5202 else block[j] = coeff_abs;
5204 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5205 else block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
/* flush the stack copy of the CABAC state back */
5211 #ifdef CABAC_ON_STACK
5212 h->cabac.range = cc.range ;
5213 h->cabac.low = cc.low ;
5214 h->cabac.bytestream= cc.bytestream;
/*
 * Computes h->top_mb_xy and h->left_mb_xy[0] for the current MB. The plain
 * case is top = mb_xy - stride, left = mb_xy - 1; the MBAFF branch corrects
 * both when the current pair and a neighbouring pair differ in frame/field
 * coding.
 * NOTE(review): listing elided — some original lines are not shown.
 */
5219 static inline void compute_mb_neighbors(H264Context *h)
5221 MpegEncContext * const s = &h->s;
5222 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5223 h->top_mb_xy = mb_xy - s->mb_stride;
5224 h->left_mb_xy[0] = mb_xy - 1;
5226 const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
5227 const int top_pair_xy = pair_xy - s->mb_stride;
5228 const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5229 const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5230 const int curr_mb_frame_flag = !MB_FIELD;
5231 const int bottom = (s->mb_y & 1);
5233 ? !curr_mb_frame_flag // bottom macroblock
5234 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5236 h->top_mb_xy -= s->mb_stride;
/* left neighbour: use the top MB of the left pair on a coding mismatch */
5238 if (left_mb_frame_flag != curr_mb_frame_flag) {
5239 h->left_mb_xy[0] = pair_xy - 1;
5246 * decodes a macroblock
5247 * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5249 static int decode_mb_cabac(H264Context *h) {
5250 MpegEncContext * const s = &h->s;
5251 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5252 int mb_type, partition_count, cbp = 0;
5253 int dct8x8_allowed= h->pps.transform_8x8_mode;
5255 s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5257 tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5258 if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5260 /* a skipped mb needs the aff flag from the following mb */
5261 if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5262 predict_field_decoding_flag(h);
5263 if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5264 skip = h->next_mb_skipped;
5266 skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5267 /* read skip flags */
5269 if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5270 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5271 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5272 if(h->next_mb_skipped)
5273 predict_field_decoding_flag(h);
5275 h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5280 h->cbp_table[mb_xy] = 0;
5281 h->chroma_pred_mode_table[mb_xy] = 0;
5282 h->last_qscale_diff = 0;
5289 if( (s->mb_y&1) == 0 )
5291 h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5293 h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5295 h->prev_mb_skipped = 0;
5297 compute_mb_neighbors(h);
5298 if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5299 av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5303 if( h->slice_type == B_TYPE ) {
5305 partition_count= b_mb_type_info[mb_type].partition_count;
5306 mb_type= b_mb_type_info[mb_type].type;
5309 goto decode_intra_mb;
5311 } else if( h->slice_type == P_TYPE ) {
5313 partition_count= p_mb_type_info[mb_type].partition_count;
5314 mb_type= p_mb_type_info[mb_type].type;
5317 goto decode_intra_mb;
5320 assert(h->slice_type == I_TYPE);
5322 partition_count = 0;
5323 cbp= i_mb_type_info[mb_type].cbp;
5324 h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5325 mb_type= i_mb_type_info[mb_type].type;
5328 mb_type |= MB_TYPE_INTERLACED;
5330 h->slice_table[ mb_xy ]= h->slice_num;
5332 if(IS_INTRA_PCM(mb_type)) {
5336 // We assume these blocks are very rare so we do not optimize it.
5337 // FIXME The two following lines get the bitstream position in the cabac
5338 // decode, I think it should be done by a function in cabac.h (or cabac.c).
5339 ptr= h->cabac.bytestream;
5340 if(h->cabac.low&0x1) ptr--;
5342 if(h->cabac.low&0x1FF) ptr--;
5345 // The pixels are stored in the same order as levels in h->mb array.
5346 for(y=0; y<16; y++){
5347 const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5348 for(x=0; x<16; x++){
5349 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5350 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5354 const int index= 256 + 4*(y&3) + 32*(y>>2);
5356 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5357 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5361 const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5363 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5364 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5368 ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5370 // All blocks are present
5371 h->cbp_table[mb_xy] = 0x1ef;
5372 h->chroma_pred_mode_table[mb_xy] = 0;
5373 // In deblocking, the quantizer is 0
5374 s->current_picture.qscale_table[mb_xy]= 0;
5375 h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5376 h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5377 // All coeffs are present
5378 memset(h->non_zero_count[mb_xy], 16, 16);
5379 s->current_picture.mb_type[mb_xy]= mb_type;
5384 h->ref_count[0] <<= 1;
5385 h->ref_count[1] <<= 1;
5388 fill_caches(h, mb_type, 0);
5390 if( IS_INTRA( mb_type ) ) {
5392 if( IS_INTRA4x4( mb_type ) ) {
5393 if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5394 mb_type |= MB_TYPE_8x8DCT;
5395 for( i = 0; i < 16; i+=4 ) {
5396 int pred = pred_intra_mode( h, i );
5397 int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5398 fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5401 for( i = 0; i < 16; i++ ) {
5402 int pred = pred_intra_mode( h, i );
5403 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5405 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5408 write_back_intra_pred_mode(h);
5409 if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5411 h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5412 if( h->intra16x16_pred_mode < 0 ) return -1;
5414 h->chroma_pred_mode_table[mb_xy] =
5415 pred_mode = decode_cabac_mb_chroma_pre_mode( h );
5417 pred_mode= check_intra_pred_mode( h, pred_mode );
5418 if( pred_mode < 0 ) return -1;
5419 h->chroma_pred_mode= pred_mode;
5420 } else if( partition_count == 4 ) {
5421 int i, j, sub_partition_count[4], list, ref[2][4];
5423 if( h->slice_type == B_TYPE ) {
5424 for( i = 0; i < 4; i++ ) {
5425 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5426 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5427 h->sub_mb_type[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5429 if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5430 h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5431 pred_direct_motion(h, &mb_type);
5432 h->ref_cache[0][scan8[4]] =
5433 h->ref_cache[1][scan8[4]] =
5434 h->ref_cache[0][scan8[12]] =
5435 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5436 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5437 for( i = 0; i < 4; i++ )
5438 if( IS_DIRECT(h->sub_mb_type[i]) )
5439 fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5443 for( i = 0; i < 4; i++ ) {
5444 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5445 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5446 h->sub_mb_type[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5450 for( list = 0; list < h->list_count; list++ ) {
5451 for( i = 0; i < 4; i++ ) {
5452 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5453 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5454 if( h->ref_count[list] > 1 )
5455 ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5461 h->ref_cache[list][ scan8[4*i]+1 ]=
5462 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5467 dct8x8_allowed = get_dct8x8_allowed(h);
5469 for(list=0; list<h->list_count; list++){
5471 h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ];
5472 if(IS_DIRECT(h->sub_mb_type[i])){
5473 fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5477 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5478 const int sub_mb_type= h->sub_mb_type[i];
5479 const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5480 for(j=0; j<sub_partition_count[i]; j++){
5483 const int index= 4*i + block_width*j;
5484 int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5485 int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5486 pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5488 mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5489 my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5490 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5492 if(IS_SUB_8X8(sub_mb_type)){
5494 mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5496 mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5499 mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5501 mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5502 }else if(IS_SUB_8X4(sub_mb_type)){
5503 mv_cache[ 1 ][0]= mx;
5504 mv_cache[ 1 ][1]= my;
5506 mvd_cache[ 1 ][0]= mx - mpx;
5507 mvd_cache[ 1 ][1]= my - mpy;
5508 }else if(IS_SUB_4X8(sub_mb_type)){
5509 mv_cache[ 8 ][0]= mx;
5510 mv_cache[ 8 ][1]= my;
5512 mvd_cache[ 8 ][0]= mx - mpx;
5513 mvd_cache[ 8 ][1]= my - mpy;
5515 mv_cache[ 0 ][0]= mx;
5516 mv_cache[ 0 ][1]= my;
5518 mvd_cache[ 0 ][0]= mx - mpx;
5519 mvd_cache[ 0 ][1]= my - mpy;
5522 uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5523 uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5524 p[0] = p[1] = p[8] = p[9] = 0;
5525 pd[0]= pd[1]= pd[8]= pd[9]= 0;
5529 } else if( IS_DIRECT(mb_type) ) {
5530 pred_direct_motion(h, &mb_type);
5531 fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5532 fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5533 dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5535 int list, mx, my, i, mpx, mpy;
5536 if(IS_16X16(mb_type)){
5537 for(list=0; list<h->list_count; list++){
5538 if(IS_DIR(mb_type, 0, list)){
5539 const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5540 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5542 fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5544 for(list=0; list<h->list_count; list++){
5545 if(IS_DIR(mb_type, 0, list)){
5546 pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5548 mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5549 my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5550 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5552 fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5553 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5555 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5558 else if(IS_16X8(mb_type)){
5559 for(list=0; list<h->list_count; list++){
5561 if(IS_DIR(mb_type, i, list)){
5562 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5563 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5565 fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5568 for(list=0; list<h->list_count; list++){
5570 if(IS_DIR(mb_type, i, list)){
5571 pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5572 mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5573 my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5574 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5576 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5577 fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5579 fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5580 fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5585 assert(IS_8X16(mb_type));
5586 for(list=0; list<h->list_count; list++){
5588 if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5589 const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5590 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5592 fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5595 for(list=0; list<h->list_count; list++){
5597 if(IS_DIR(mb_type, i, list)){
5598 pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5599 mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5600 my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5602 tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5603 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5604 fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5606 fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5607 fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5614 if( IS_INTER( mb_type ) ) {
5615 h->chroma_pred_mode_table[mb_xy] = 0;
5616 write_back_motion( h, mb_type );
5619 if( !IS_INTRA16x16( mb_type ) ) {
5620 cbp = decode_cabac_mb_cbp_luma( h );
5621 cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5624 h->cbp_table[mb_xy] = h->cbp = cbp;
5626 if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5627 if( decode_cabac_mb_transform_size( h ) )
5628 mb_type |= MB_TYPE_8x8DCT;
5630 s->current_picture.mb_type[mb_xy]= mb_type;
5632 if( cbp || IS_INTRA16x16( mb_type ) ) {
5633 const uint8_t *scan, *scan8x8, *dc_scan;
5636 if(IS_INTERLACED(mb_type)){
5637 scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5638 scan= s->qscale ? h->field_scan : h->field_scan_q0;
5639 dc_scan= luma_dc_field_scan;
5641 scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5642 scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5643 dc_scan= luma_dc_zigzag_scan;
5646 h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5647 if( dqp == INT_MIN ){
5648 av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5652 if(((unsigned)s->qscale) > 51){
5653 if(s->qscale<0) s->qscale+= 52;
5654 else s->qscale-= 52;
5656 h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5657 h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5659 if( IS_INTRA16x16( mb_type ) ) {
5661 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5662 decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5665 for( i = 0; i < 16; i++ ) {
5666 //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5667 decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15);
5670 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5674 for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5675 if( cbp & (1<<i8x8) ) {
5676 if( IS_8x8DCT(mb_type) ) {
5677 decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5678 scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5680 for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5681 const int index = 4*i8x8 + i4x4;
5682 //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5684 decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16);
5685 //STOP_TIMER("decode_residual")
5688 uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5689 nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5696 for( c = 0; c < 2; c++ ) {
5697 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5698 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5704 for( c = 0; c < 2; c++ ) {
5705 const uint32_t *qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5706 for( i = 0; i < 4; i++ ) {
5707 const int index = 16 + 4 * c + i;
5708 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5709 decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5713 uint8_t * const nnz= &h->non_zero_count_cache[0];
5714 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5715 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5718 uint8_t * const nnz= &h->non_zero_count_cache[0];
5719 fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5720 nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5721 nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5722 h->last_qscale_diff = 0;
5725 s->current_picture.qscale_table[mb_xy]= s->qscale;
5726 write_back_non_zero_count(h);
5729 h->ref_count[0] >>= 1;
5730 h->ref_count[1] >>= 1;
5737 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5739 const int index_a = qp + h->slice_alpha_c0_offset;
5740 const int alpha = (alpha_table+52)[index_a];
5741 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5746 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5747 h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5749 /* 16px edge length, because bS=4 is triggered by being at
5750 * the edge of an intra MB, so all 4 bS are the same */
5751 for( d = 0; d < 16; d++ ) {
5752 const int p0 = pix[-1];
5753 const int p1 = pix[-2];
5754 const int p2 = pix[-3];
5756 const int q0 = pix[0];
5757 const int q1 = pix[1];
5758 const int q2 = pix[2];
5760 if( FFABS( p0 - q0 ) < alpha &&
5761 FFABS( p1 - p0 ) < beta &&
5762 FFABS( q1 - q0 ) < beta ) {
5764 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5765 if( FFABS( p2 - p0 ) < beta)
5767 const int p3 = pix[-4];
5769 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5770 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5771 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5774 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5776 if( FFABS( q2 - q0 ) < beta)
5778 const int q3 = pix[3];
5780 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5781 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5782 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5785 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5789 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5790 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5792 tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
/* Deblock one vertical chroma edge.  Same alpha/beta derivation as the luma
 * variant; both the normal (bS < 4) and intra (bS == 4) paths are delegated
 * to the DSP chroma loop filters.  NOTE(review): intermediate lines are
 * elided in this extract. */
5798 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5800 const int index_a = qp + h->slice_alpha_c0_offset;
5801 const int alpha = (alpha_table+52)[index_a];
5802 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
/* chroma uses tc0+1 (per the spec's chroma clipping) and 0 for "skip" */
5807 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5808 h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
/* bS == 4: strong intra chroma filter, no tc clipping needed */
5810 h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Deblock the first vertical luma edge of an MBAFF macroblock pair.
 * Unlike the plain edge filter this takes 8 boundary strengths (bS[8]) and
 * two QPs (qp[2]) because the left neighbour pair may be coded in the other
 * field/frame mode, so each row can face a different neighbour.
 * Works pixel row by pixel row (no DSP fast path).
 * NOTE(review): this extract elides some original lines, so the visible body
 * is not complete. */
5814 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5816 for( i = 0; i < 16; i++, pix += stride) {
/* map row i to one of the 8 bS entries; mapping depends on field mode */
5822 int bS_index = (i >> 1);
5825 bS_index |= (i & 1);
5828 if( bS[bS_index] == 0 ) {
/* pick which of the two QPs applies to this row */
5832 qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5833 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5834 alpha = (alpha_table+52)[index_a];
5835 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
/* normal (bS < 4) path: tc0-clipped delta filter */
5837 if( bS[bS_index] < 4 ) {
5838 const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5839 const int p0 = pix[-1];
5840 const int p1 = pix[-2];
5841 const int p2 = pix[-3];
5842 const int q0 = pix[0];
5843 const int q1 = pix[1];
5844 const int q2 = pix[2];
5846 if( FFABS( p0 - q0 ) < alpha &&
5847 FFABS( p1 - p0 ) < beta &&
5848 FFABS( q1 - q0 ) < beta ) {
/* optionally adjust p1/q1 when the second pixel is smooth enough */
5852 if( FFABS( p2 - p0 ) < beta ) {
5853 pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5856 if( FFABS( q2 - q0 ) < beta ) {
5857 pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
/* clipped delta applied symmetrically to p0/q0 */
5861 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5862 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
5863 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
5864 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* bS == 4 path: strong intra-edge filter, same maths as filter_mb_edgev */
5867 const int p0 = pix[-1];
5868 const int p1 = pix[-2];
5869 const int p2 = pix[-3];
5871 const int q0 = pix[0];
5872 const int q1 = pix[1];
5873 const int q2 = pix[2];
5875 if( FFABS( p0 - q0 ) < alpha &&
5876 FFABS( p1 - p0 ) < beta &&
5877 FFABS( q1 - q0 ) < beta ) {
5879 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5880 if( FFABS( p2 - p0 ) < beta)
5882 const int p3 = pix[-4];
5884 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5885 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5886 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5889 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5891 if( FFABS( q2 - q0 ) < beta)
5893 const int q3 = pix[3];
5895 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5896 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5897 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5900 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5904 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5905 pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5907 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Deblock the first vertical chroma edge of an MBAFF macroblock pair.
 * Chroma edges are 8 rows tall; like the luma MBAFF variant it takes 8
 * boundary strengths and 2 QPs, filtering one pixel row per iteration.
 * NOTE(review): intermediate lines are elided in this extract. */
5912 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5914 for( i = 0; i < 8; i++, pix += stride) {
5922 if( bS[bS_index] == 0 ) {
/* choose which neighbour QP applies to this row (field vs frame pairing) */
5926 qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5927 index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5928 alpha = (alpha_table+52)[index_a];
5929 beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
/* normal path: chroma uses tc0+1 and only touches p0/q0 */
5931 if( bS[bS_index] < 4 ) {
5932 const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
5933 const int p0 = pix[-1];
5934 const int p1 = pix[-2];
5935 const int q0 = pix[0];
5936 const int q1 = pix[1];
5938 if( FFABS( p0 - q0 ) < alpha &&
5939 FFABS( p1 - p0 ) < beta &&
5940 FFABS( q1 - q0 ) < beta ) {
5941 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5943 pix[-1] = av_clip_uint8( p0 + i_delta ); /* p0' */
5944 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
5945 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
/* bS == 4 path: strong intra chroma filter (3-tap, unclipped) */
5948 const int p0 = pix[-1];
5949 const int p1 = pix[-2];
5950 const int q0 = pix[0];
5951 const int q1 = pix[1];
5953 if( FFABS( p0 - q0 ) < alpha &&
5954 FFABS( p1 - p0 ) < beta &&
5955 FFABS( q1 - q0 ) < beta ) {
5957 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
5958 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
5959 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
/* Deblock one horizontal luma edge (16 pixels wide).  Mirror of
 * filter_mb_edgev with pixel offsets measured in strides instead of bytes:
 * p0..p3 are the rows above the edge, q0..q3 the rows below.
 * NOTE(review): intermediate lines are elided in this extract. */
5965 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5967 const int index_a = qp + h->slice_alpha_c0_offset;
5968 const int alpha = (alpha_table+52)[index_a];
5969 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
5970 const int pix_next = stride;
/* per-segment clipping value; -1 marks "no filtering" for bS == 0 */
5975 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5976 h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
5978 /* 16px edge length, see filter_mb_edgev */
5979 for( d = 0; d < 16; d++ ) {
5980 const int p0 = pix[-1*pix_next];
5981 const int p1 = pix[-2*pix_next];
5982 const int p2 = pix[-3*pix_next];
5983 const int q0 = pix[0];
5984 const int q1 = pix[1*pix_next];
5985 const int q2 = pix[2*pix_next];
/* filter only where the edge looks like a blocking artifact */
5987 if( FFABS( p0 - q0 ) < alpha &&
5988 FFABS( p1 - p0 ) < beta &&
5989 FFABS( q1 - q0 ) < beta ) {
5991 const int p3 = pix[-4*pix_next];
5992 const int q3 = pix[ 3*pix_next];
/* strong-filter eligibility test on the edge step size */
5994 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5995 if( FFABS( p2 - p0 ) < beta) {
5997 pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5998 pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5999 pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6002 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6004 if( FFABS( q2 - q0 ) < beta) {
6006 pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6007 pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6008 pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6011 pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* weak fallback: only the two pixels straddling the edge are changed */
6015 pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6016 pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6018 tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
/* Deblock one horizontal chroma edge.  Mirror of filter_mb_edgecv using the
 * vertical-direction DSP chroma loop filters.  NOTE(review): intermediate
 * lines are elided in this extract. */
6025 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6027 const int index_a = qp + h->slice_alpha_c0_offset;
6028 const int alpha = (alpha_table+52)[index_a];
6029 const int beta = (beta_table+52)[qp + h->slice_beta_offset];
/* chroma uses tc0+1 for clipping and 0 to mark "skip" segments */
6034 tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6035 h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
/* bS == 4: strong intra chroma filter */
6037 h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
/* Fast-path deblocking of one macroblock.  Handles only the common cases
 * (no MBAFF, no per-plane chroma QP difference, not at the picture border,
 * DSP boundary-strength helper available); everything else falls back to
 * the full filter_mb().  NOTE(review): intermediate lines are elided in
 * this extract, so the visible body is not complete. */
6041 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6042 MpegEncContext * const s = &h->s;
6044 int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6046 mb_xy = mb_x + mb_y*s->mb_stride;
/* fall back to the generic filter whenever a fast-path precondition fails */
6048 if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6049 (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6050 h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6051 filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6054 assert(!FRAME_MBAFF);
/* average QPs with left (qp0) and top (qp1) neighbours for the MB edges */
6056 mb_type = s->current_picture.mb_type[mb_xy];
6057 qp = s->current_picture.qscale_table[mb_xy];
6058 qp0 = s->current_picture.qscale_table[mb_xy-1];
6059 qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6060 qpc = get_chroma_qp( h, 0, qp );
6061 qpc0 = get_chroma_qp( h, 0, qp0 );
6062 qpc1 = get_chroma_qp( h, 0, qp1 );
6063 qp0 = (qp + qp0 + 1) >> 1;
6064 qp1 = (qp + qp1 + 1) >> 1;
6065 qpc0 = (qpc + qpc0 + 1) >> 1;
6066 qpc1 = (qpc + qpc1 + 1) >> 1;
/* below this QP threshold the filter would be a no-op, so skip entirely */
6067 qp_thresh = 15 - h->slice_alpha_c0_offset;
6068 if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6069 qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
/* intra MBs have constant bS (4 on MB edges, 3 inside), so filter directly */
6072 if( IS_INTRA(mb_type) ) {
6073 int16_t bS4[4] = {4,4,4,4};
6074 int16_t bS3[4] = {3,3,3,3};
/* 8x8 transform: only every other internal edge exists */
6075 if( IS_8x8DCT(mb_type) ) {
6076 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6077 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6078 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6079 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6081 filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6082 filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6083 filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6084 filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6085 filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6086 filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6087 filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6088 filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
/* chroma: only MB edge and middle edge exist (8x8 chroma block) */
6090 filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6091 filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6092 filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6093 filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6094 filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6095 filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6096 filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6097 filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
/* inter MB: compute bS per edge; bSv aliases bS for 64-bit whole-row tests */
6100 DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6101 uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6103 if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6105 bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
/* mask_edge*: how often mv-based bS must be rechecked between edges */
6107 int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6108 (mb_type & MB_TYPE_16x8) ? 1 : 0;
6109 int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6110 && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6112 int step = IS_8x8DCT(mb_type) ? 2 : 1;
6113 edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6114 s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6115 (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
/* MB edges touching an intra neighbour always get bS == 4 */
6117 if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6118 bSv[0][0] = 0x0004000400040004ULL;
6119 if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6120 bSv[1][0] = 0x0004000400040004ULL;
/* apply one edge: dir 0 = vertical, 1 = horizontal; edge 0 uses the
 * averaged neighbour QP (qp0/qp1), interior edges use this MB's QP */
6122 #define FILTER(hv,dir,edge)\
6123 if(bSv[dir][edge]) {\
6124 filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6126 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6127 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6133 } else if( IS_8x8DCT(mb_type) ) {
6152 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6153 MpegEncContext * const s = &h->s;
6154 const int mb_xy= mb_x + mb_y*s->mb_stride;
6155 const int mb_type = s->current_picture.mb_type[mb_xy];
6156 const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6157 int first_vertical_edge_done = 0;
6159 /* FIXME: A given frame may occupy more than one position in
6160 * the reference list. So ref2frm should be populated with
6161 * frame numbers, not indices. */
6162 static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6163 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6165 //for sufficiently low qp, filtering wouldn't do anything
6166 //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6168 int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6169 int qp = s->current_picture.qscale_table[mb_xy];
6171 && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6172 && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6178 // left mb is in picture
6179 && h->slice_table[mb_xy-1] != 255
6180 // and current and left pair do not have the same interlaced type
6181 && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6182 // and left mb is in the same slice if deblocking_filter == 2
6183 && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6184 /* First vertical edge is different in MBAFF frames
6185 * There are 8 different bS to compute and 2 different Qp
6187 const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6188 const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6193 int mb_qp, mbn0_qp, mbn1_qp;
6195 first_vertical_edge_done = 1;
6197 if( IS_INTRA(mb_type) )
6198 bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6200 for( i = 0; i < 8; i++ ) {
6201 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6203 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6205 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6206 /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6207 h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6214 mb_qp = s->current_picture.qscale_table[mb_xy];
6215 mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6216 mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6217 qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6218 bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6219 get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6220 rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6221 get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6222 qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6223 bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6224 get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6225 rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6226 get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6229 tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6230 { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6231 filter_mb_mbaff_edgev ( h, &img_y [0], linesize, bS, qp );
6232 filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6233 filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6235 /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6236 for( dir = 0; dir < 2; dir++ )
6239 const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6240 const int mbm_type = s->current_picture.mb_type[mbm_xy];
6241 int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6243 const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6244 == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6245 // how often to recheck mv-based bS when iterating between edges
6246 const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6247 (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6248 // how often to recheck mv-based bS when iterating along each edge
6249 const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6251 if (first_vertical_edge_done) {
6253 first_vertical_edge_done = 0;
6256 if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6259 if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6260 && !IS_INTERLACED(mb_type)
6261 && IS_INTERLACED(mbm_type)
6263 // This is a special case in the norm where the filtering must
6264 // be done twice (one each of the field) even if we are in a
6265 // frame macroblock.
6267 static const int nnz_idx[4] = {4,5,6,3};
6268 unsigned int tmp_linesize = 2 * linesize;
6269 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6270 int mbn_xy = mb_xy - 2 * s->mb_stride;
6275 for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6276 if( IS_INTRA(mb_type) ||
6277 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6278 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6280 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6281 for( i = 0; i < 4; i++ ) {
6282 if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6283 mbn_nnz[nnz_idx[i]] != 0 )
6289 // Do not use s->qscale as luma quantizer because it has not the same
6290 // value in IPCM macroblocks.
6291 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6292 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6293 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6294 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6295 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6296 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6297 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6298 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6305 for( edge = start; edge < edges; edge++ ) {
6306 /* mbn_xy: neighbor macroblock */
6307 const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6308 const int mbn_type = s->current_picture.mb_type[mbn_xy];
6312 if( (edge&1) && IS_8x8DCT(mb_type) )
6315 if( IS_INTRA(mb_type) ||
6316 IS_INTRA(mbn_type) ) {
6319 if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6320 || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6329 bS[0] = bS[1] = bS[2] = bS[3] = value;
6334 if( edge & mask_edge ) {
6335 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6338 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6339 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6342 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6343 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6344 int bn_idx= b_idx - (dir ? 8:1);
6346 for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6347 v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6348 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6349 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6351 bS[0] = bS[1] = bS[2] = bS[3] = v;
6357 for( i = 0; i < 4; i++ ) {
6358 int x = dir == 0 ? edge : i;
6359 int y = dir == 0 ? i : edge;
6360 int b_idx= 8 + 4 + x + 8*y;
6361 int bn_idx= b_idx - (dir ? 8:1);
6363 if( h->non_zero_count_cache[b_idx] != 0 ||
6364 h->non_zero_count_cache[bn_idx] != 0 ) {
6370 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6371 if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6372 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6373 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6381 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6386 // Do not use s->qscale as luma quantizer because it has not the same
6387 // value in IPCM macroblocks.
6388 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6389 //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6390 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6391 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6393 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6394 if( (edge&1) == 0 ) {
6395 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6396 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6397 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6398 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6401 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6402 if( (edge&1) == 0 ) {
6403 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6404 ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6405 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6406 ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6413 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6414 MpegEncContext * const s = &h->s;
6415 const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6419 if( h->pps.cabac ) {
6423 align_get_bits( &s->gb );
6426 ff_init_cabac_states( &h->cabac);
6427 ff_init_cabac_decoder( &h->cabac,
6428 s->gb.buffer + get_bits_count(&s->gb)/8,
6429 ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6430 /* calculate pre-state */
6431 for( i= 0; i < 460; i++ ) {
6433 if( h->slice_type == I_TYPE )
6434 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6436 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6439 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6441 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6446 int ret = decode_mb_cabac(h);
6448 //STOP_TIMER("decode_mb_cabac")
6450 if(ret>=0) hl_decode_mb(h);
6452 if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6455 if(ret>=0) ret = decode_mb_cabac(h);
6457 if(ret>=0) hl_decode_mb(h);
6460 eos = get_cabac_terminate( &h->cabac );
6462 if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6463 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6464 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6468 if( ++s->mb_x >= s->mb_width ) {
6470 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6477 if( eos || s->mb_y >= s->mb_height ) {
6478 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6479 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6486 int ret = decode_mb_cavlc(h);
6488 if(ret>=0) hl_decode_mb(h);
6490 if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6492 ret = decode_mb_cavlc(h);
6494 if(ret>=0) hl_decode_mb(h);
6499 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6500 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6505 if(++s->mb_x >= s->mb_width){
6507 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6512 if(s->mb_y >= s->mb_height){
6513 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6515 if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6516 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6520 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6527 if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6528 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6529 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6530 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6534 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6543 for(;s->mb_y < s->mb_height; s->mb_y++){
6544 for(;s->mb_x < s->mb_width; s->mb_x++){
6545 int ret= decode_mb(h);
6550 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6551 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6556 if(++s->mb_x >= s->mb_width){
6558 if(++s->mb_y >= s->mb_height){
6559 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6560 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6564 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6571 if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6572 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6573 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6577 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6584 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6587 return -1; //not reached
/**
 * Parse an SEI "unregistered user data" payload.
 * The first 16 bytes are a UUID; the rest is free-form text.  The text is
 * scanned for an x264 version banner so encoder-specific bug workarounds
 * can be enabled via h->x264_build.
 *
 * @param size payload size in bytes
 * NOTE(review): extract is incomplete — declarations of i/e/build and
 * the trailing skip loop/return are partly outside this view.
 */
static int decode_unregistered_user_data(H264Context *h, int size){
MpegEncContext * const s = &h->s;
uint8_t user_data[16+256];
/* copy at most sizeof(user_data)-1 bytes so the buffer stays NUL-terminable */
for(i=0; i<sizeof(user_data)-1 && i<size; i++){
user_data[i]= get_bits(&s->gb, 8);
/* skip the 16-byte UUID and look for the x264 core version number */
e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
if(e==1 && build>=0)
h->x264_build= build;
if(s->avctx->debug & FF_DEBUG_BUGS)
av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
/* consume any payload bytes beyond what was buffered */
skip_bits(&s->gb, 8);
/**
 * Parse an SEI NAL unit.
 * SEI messages encode their type and size as sequences of 0xFF bytes plus a
 * final byte (ff_byte escape coding); unknown payload types are skipped.
 */
static int decode_sei(H264Context *h){
MpegEncContext * const s = &h->s;
while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
/* payloadType: accumulate 255 per 0xFF byte, terminated by a non-0xFF byte */
type+= show_bits(&s->gb, 8);
}while(get_bits(&s->gb, 8) == 255);
/* payloadSize: same escape coding */
size+= show_bits(&s->gb, 8);
}while(get_bits(&s->gb, 8) == 255);
if(decode_unregistered_user_data(h, size) < 0)
/* unhandled payload type: skip it wholesale */
skip_bits(&s->gb, 8*size);
//FIXME check bits here
align_get_bits(&s->gb);
/**
 * Parse hrd_parameters() (H.264 Annex E).  All fields are read to keep the
 * bitstream position correct but none are stored — the decoder does not
 * use HRD buffering information.
 */
static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
MpegEncContext * const s = &h->s;
cpb_count = get_ue_golomb(&s->gb) + 1;
get_bits(&s->gb, 4); /* bit_rate_scale */
get_bits(&s->gb, 4); /* cpb_size_scale */
for(i=0; i<cpb_count; i++){
get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
get_bits1(&s->gb); /* cbr_flag */
get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
get_bits(&s->gb, 5); /* time_offset_length */
/**
 * Parse vui_parameters() (H.264 Annex E) into the SPS.
 * Stores sample aspect ratio, timing info and the bitstream restriction
 * data (num_reorder_frames); most other VUI fields are parsed and discarded.
 *
 * @return 0 on success (return statements are outside this extract —
 *         NOTE(review): confirm error paths in the full source)
 */
static inline int decode_vui_parameters(H264Context *h, SPS *sps){
MpegEncContext * const s = &h->s;
int aspect_ratio_info_present_flag;
unsigned int aspect_ratio_idc;
int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
aspect_ratio_info_present_flag= get_bits1(&s->gb);
if( aspect_ratio_info_present_flag ) {
aspect_ratio_idc= get_bits(&s->gb, 8);
if( aspect_ratio_idc == EXTENDED_SAR ) {
/* explicit 16-bit numerator/denominator */
sps->sar.num= get_bits(&s->gb, 16);
sps->sar.den= get_bits(&s->gb, 16);
}else if(aspect_ratio_idc < 14){
/* table lookup for the predefined SAR indices */
sps->sar= pixel_aspect[aspect_ratio_idc];
av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
//            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
if(get_bits1(&s->gb)){ /* overscan_info_present_flag */
get_bits1(&s->gb); /* overscan_appropriate_flag */
if(get_bits1(&s->gb)){ /* video_signal_type_present_flag */
get_bits(&s->gb, 3); /* video_format */
get_bits1(&s->gb); /* video_full_range_flag */
if(get_bits1(&s->gb)){ /* colour_description_present_flag */
get_bits(&s->gb, 8); /* colour_primaries */
get_bits(&s->gb, 8); /* transfer_characteristics */
get_bits(&s->gb, 8); /* matrix_coefficients */
if(get_bits1(&s->gb)){ /* chroma_location_info_present_flag */
get_ue_golomb(&s->gb); /* chroma_sample_location_type_top_field */
get_ue_golomb(&s->gb); /* chroma_sample_location_type_bottom_field */
sps->timing_info_present_flag = get_bits1(&s->gb);
if(sps->timing_info_present_flag){
sps->num_units_in_tick = get_bits_long(&s->gb, 32);
sps->time_scale = get_bits_long(&s->gb, 32);
sps->fixed_frame_rate_flag = get_bits1(&s->gb);
nal_hrd_parameters_present_flag = get_bits1(&s->gb);
if(nal_hrd_parameters_present_flag)
decode_hrd_parameters(h, sps);
vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
if(vcl_hrd_parameters_present_flag)
decode_hrd_parameters(h, sps);
if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
get_bits1(&s->gb); /* low_delay_hrd_flag */
get_bits1(&s->gb); /* pic_struct_present_flag */
sps->bitstream_restriction_flag = get_bits1(&s->gb);
if(sps->bitstream_restriction_flag){
unsigned int num_reorder_frames;
get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
num_reorder_frames= get_ue_golomb(&s->gb);
get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
/* sanity bound: DPB can hold at most 16 frames */
if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
sps->num_reorder_frames= num_reorder_frames;
/**
 * Parse one scaling list (scaling_list() syntax).
 *
 * @param factors       output array of `size` (16 or 64) weights
 * @param size          16 for 4x4 lists, 64 for 8x8 lists
 * @param jvt_list      spec default list, used when the stream signals
 *                      "use default" (first delta makes next == 0)
 * @param fallback_list list used when the matrix is absent from the stream
 */
static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
const uint8_t *jvt_list, const uint8_t *fallback_list){
MpegEncContext * const s = &h->s;
int i, last = 8, next = 8;
/* deltas are stored in zigzag order; pick the matching scan table */
const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
memcpy(factors, fallback_list, size*sizeof(uint8_t));
for(i=0;i<size;i++){
next = (last + get_se_golomb(&s->gb)) & 0xff;
if(!i && !next){ /* matrix not written, we use the preset one */
memcpy(factors, jvt_list, size*sizeof(uint8_t));
/* next==0 means "repeat previous value" for the rest of the list */
last = factors[scan[i]] = next ? next : last;
/**
 * Parse the full set of scaling matrices for an SPS or PPS.
 * Fallback rules per the spec: a PPS list falls back to the corresponding
 * SPS list when present, otherwise to the flat/default lists; each list in
 * the sequence may also fall back to the previously decoded one.
 *
 * @param is_sps non-zero when called from SPS parsing (enables 8x8 lists
 *               unconditionally; PPS gates them on transform_8x8_mode)
 */
static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
MpegEncContext * const s = &h->s;
int fallback_sps = !is_sps && sps->scaling_matrix_present;
const uint8_t *fallback[4] = {
fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
if(get_bits1(&s->gb)){
/* only mark the SPS as carrying matrices when parsing an SPS */
sps->scaling_matrix_present |= is_sps;
decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
if(is_sps || pps->transform_8x8_mode){
decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y
decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y
} else if(fallback_sps) {
/* no matrices in the stream: inherit the SPS ones wholesale */
memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
 * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
 *
 * Validates `id` against `max`, allocates a zeroed structure of `size`
 * bytes into vec[id] if needed, and logs with `name` on failure.
 * NOTE(review): the return type/statements of this helper fall outside
 * this extract.
alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
const size_t size, const char *name)
av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
vec[id] = av_mallocz(size);
av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
/**
 * Parse a sequence parameter set (SPS) NAL unit.
 * Allocates/looks up the SPS slot by sps_id, fills in profile/level, POC
 * parameters, picture dimensions, MBAFF/cropping flags, scaling matrices
 * (high profile) and optional VUI parameters.
 *
 * @return 0 on success, negative on error (return statements are outside
 *         this extract)
 */
static inline int decode_seq_parameter_set(H264Context *h){
MpegEncContext * const s = &h->s;
int profile_idc, level_idc;
unsigned int sps_id, tmp, mb_width, mb_height;
profile_idc= get_bits(&s->gb, 8);
get_bits1(&s->gb);   //constraint_set0_flag
get_bits1(&s->gb);   //constraint_set1_flag
get_bits1(&s->gb);   //constraint_set2_flag
get_bits1(&s->gb);   //constraint_set3_flag
get_bits(&s->gb, 4); // reserved
level_idc= get_bits(&s->gb, 8);
sps_id= get_ue_golomb(&s->gb);
sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
sps->profile_idc= profile_idc;
sps->level_idc= level_idc;
if(sps->profile_idc >= 100){ //high profile
/* chroma format / bit depth fields are parsed but only 4:2:0 8-bit
 * is actually supported by this decoder */
if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
get_bits1(&s->gb);  //residual_color_transform_flag
get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
sps->transform_bypass = get_bits1(&s->gb);
decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
sps->scaling_matrix_present = 0;
sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
sps->poc_type= get_ue_golomb(&s->gb);
if(sps->poc_type == 0){ //FIXME #define
sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
} else if(sps->poc_type == 1){//FIXME #define
sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
tmp= get_ue_golomb(&s->gb);
/* bound poc_cycle_length by the fixed-size offset table */
if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
sps->poc_cycle_length= tmp;
for(i=0; i<sps->poc_cycle_length; i++)
sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
}else if(sps->poc_type != 2){
av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
tmp= get_ue_golomb(&s->gb);
if(tmp > MAX_PICTURE_COUNT-2){
av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
sps->ref_frame_count= tmp;
sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
mb_width= get_ue_golomb(&s->gb) + 1;
mb_height= get_ue_golomb(&s->gb) + 1;
/* reject dimensions that would overflow 16*mb arithmetic */
if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
sps->mb_width = mb_width;
sps->mb_height= mb_height;
sps->frame_mbs_only_flag= get_bits1(&s->gb);
if(!sps->frame_mbs_only_flag)
sps->mb_aff= get_bits1(&s->gb);
sps->direct_8x8_inference_flag= get_bits1(&s->gb);
#ifndef ALLOW_INTERLACE
av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
if(!sps->direct_8x8_inference_flag && sps->mb_aff)
av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
sps->crop= get_bits1(&s->gb);
sps->crop_left  = get_ue_golomb(&s->gb);
sps->crop_right = get_ue_golomb(&s->gb);
sps->crop_top   = get_ue_golomb(&s->gb);
sps->crop_bottom= get_ue_golomb(&s->gb);
if(sps->crop_left || sps->crop_top){
av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
sps->crop_bottom= 0;
sps->vui_parameters_present_flag= get_bits1(&s->gb);
if( sps->vui_parameters_present_flag )
decode_vui_parameters(h, sps);
if(s->avctx->debug&FF_DEBUG_PICT_INFO){
av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
sps_id, sps->profile_idc, sps->level_idc,
sps->ref_frame_count,
sps->mb_width, sps->mb_height,
sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
sps->direct_8x8_inference_flag ? "8B8" : "",
sps->crop_left, sps->crop_right,
sps->crop_top, sps->crop_bottom,
sps->vui_parameters_present_flag ? "VUI" : ""
/**
 * Precompute the luma-QP -> chroma-QP mapping for one chroma_qp_index_offset.
 * @param t     which of the two per-PPS tables to fill (0 or 1)
 * @param index chroma_qp_index_offset added to the luma QP before clipping
 *              to the valid [0,51] range
 * NOTE(review): the loop bound is 255, so entry [t][255] is never written
 * here — confirm against the full source whether that entry is initialized
 * elsewhere; this looks like an off-by-one.
 */
build_qp_table(PPS *pps, int t, int index)
for(i = 0; i < 255; i++)
pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
/**
 * Parse a picture parameter set (PPS) NAL unit.
 * Allocates/looks up the PPS slot by pps_id, validates the referenced
 * sps_id, reads entropy-coding mode, reference counts, QP offsets,
 * deblocking/weighted-pred flags and (when present) the 8x8 transform
 * flag and PPS-level scaling matrices; finally builds the chroma-QP
 * lookup tables.
 *
 * @param bit_length exact RBSP length in bits, used to detect the
 *                   optional trailing high-profile fields
 * @return 0 on success, negative on error (returns outside this extract)
 */
static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
MpegEncContext * const s = &h->s;
unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
tmp= get_ue_golomb(&s->gb);
if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
pps->cabac= get_bits1(&s->gb);
pps->pic_order_present= get_bits1(&s->gb);
pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
if(pps->slice_group_count > 1 ){
pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
switch(pps->mb_slice_group_map_type){
/* The table fragments below are the FMO slice-group syntax excerpted
 * from the H.264 spec (part of a block comment in the original file);
 * FMO parsing is not implemented. */
|       for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
|           run_length[ i ]                             |1  |ue(v)   |
|       for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
|           top_left_mb[ i ]                            |1  |ue(v)   |
|           bottom_right_mb[ i ]                        |1  |ue(v)   |
|       slice_group_change_direction_flag               |1  |u(1)    |
|       slice_group_change_rate_minus1                  |1  |ue(v)   |
|       slice_group_id_cnt_minus1                       |1  |ue(v)   |
|       for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
|           slice_group_id[ i ]                         |1  |u(v)    |
pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
pps->ref_count[0]= pps->ref_count[1]= 1;
pps->weighted_pred= get_bits1(&s->gb);
pps->weighted_bipred_idc= get_bits(&s->gb, 2);
pps->init_qp= get_se_golomb(&s->gb) + 26;
pps->init_qs= get_se_golomb(&s->gb) + 26;
pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
pps->constrained_intra_pred= get_bits1(&s->gb);
pps->redundant_pic_cnt_present = get_bits1(&s->gb);
pps->transform_8x8_mode= 0;
h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
/* optional trailing fields exist only if bits remain in the RBSP */
if(get_bits_count(&s->gb) < bit_length){
pps->transform_8x8_mode= get_bits1(&s->gb);
decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
h->pps.chroma_qp_diff= 1;
memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
if(s->avctx->debug&FF_DEBUG_PICT_INFO){
av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
pps_id, pps->sps_id,
pps->cabac ? "CABAC" : "CAVLC",
pps->slice_group_count,
pps->ref_count[0], pps->ref_count[1],
pps->weighted_pred ? "weighted" : "",
pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
pps->deblocking_filter_parameters_present ? "LPAR" : "",
pps->constrained_intra_pred ? "CONSTR" : "",
pps->redundant_pic_cnt_present ? "REDU" : "",
pps->transform_8x8_mode ? "8x8DCT" : ""
 * Call decode_slice() for each context.
 *
 * With one context decode_slice() is called directly; with more, the
 * per-thread contexts are primed and run through avctx->execute, then
 * the MB position and error counts are pulled back into the master.
 *
 * @param h h264 master context
 * @param context_count number of contexts to execute
static void execute_decode_slices(H264Context *h, int context_count){
MpegEncContext * const s = &h->s;
AVCodecContext * const avctx= s->avctx;
if(context_count == 1) {
decode_slice(avctx, h);
for(i = 1; i < context_count; i++) {
hx = h->thread_context[i];
hx->s.error_resilience = avctx->error_resilience;
hx->s.error_count = 0;
avctx->execute(avctx, (void *)decode_slice,
(void **)h->thread_context, NULL, context_count);
/* pull back stuff from slices to master context */
hx = h->thread_context[context_count - 1];
s->mb_x = hx->s.mb_x;
s->mb_y = hx->s.mb_y;
for(i = 1; i < context_count; i++)
h->s.error_count += h->thread_context[i]->s.error_count;
/**
 * Split the input buffer into NAL units and decode each one.
 * Handles both AVC (length-prefixed, h->is_avc) and Annex-B (start-code)
 * framing, unescapes each NAL via decode_nal(), then dispatches on
 * nal_unit_type (slices, partitions A/B/C, SEI, SPS, PPS, ...).  Slice
 * NALs are queued into thread contexts and flushed through
 * execute_decode_slices() when h->max_contexts are pending.
 *
 * @return number of bytes consumed (return statements outside this extract)
 */
static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
MpegEncContext * const s = &h->s;
AVCodecContext * const avctx= s->avctx;
H264Context *hx; ///< thread context
int context_count = 0;
h->max_contexts = avctx->thread_count;
/* debug dump of the first input bytes */
for(i=0; i<50; i++){
av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
h->current_slice = 0;
s->current_picture_ptr= NULL;
if(buf_index >= buf_size) break;
/* AVC framing: read the big-endian NAL length prefix */
for(i = 0; i < h->nal_length_size; i++)
nalsize = (nalsize << 8) | buf[buf_index++];
if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
// start code prefix search
for(; buf_index + 3 < buf_size; buf_index++){
// This should always succeed in the first iteration.
if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
if(buf_index+3 >= buf_size) break;
hx = h->thread_context[context_count];
ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
if (ptr==NULL || dst_length < 0){
/* strip trailing zero bytes.
 * NOTE(review): ptr[dst_length - 1] is read before the
 * dst_length > 0 check — reads one byte before the buffer when
 * dst_length is 0; confirm/fix in the full source. */
while(ptr[dst_length - 1] == 0 && dst_length > 0)
bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
if(s->avctx->debug&FF_DEBUG_STARTCODE){
av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
if (h->is_avc && (nalsize != consumed))
av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
buf_index += consumed;
/* discard non-reference NALs when hurrying / skipping */
if( (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
switch(hx->nal_unit_type){
if (h->nal_unit_type != NAL_IDR_SLICE) {
av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
idr(h); //FIXME ensure we don't loose some frames if there is reordering
init_get_bits(&hx->s.gb, ptr, bit_length);
hx->inter_gb_ptr= &hx->s.gb;
hx->s.data_partitioning = 0;
if((err = decode_slice_header(hx, h)))
s->current_picture_ptr->key_frame= (hx->nal_unit_type == NAL_IDR_SLICE);
if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
&& (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
&& (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
&& (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
&& avctx->skip_frame < AVDISCARD_ALL)
/* data partition A: slice header + its own bit reader */
init_get_bits(&hx->s.gb, ptr, bit_length);
hx->inter_gb_ptr= NULL;
hx->s.data_partitioning = 1;
err = decode_slice_header(hx, h);
/* data partition B: intra residuals */
init_get_bits(&hx->intra_gb, ptr, bit_length);
hx->intra_gb_ptr= &hx->intra_gb;
/* data partition C: inter residuals */
init_get_bits(&hx->inter_gb, ptr, bit_length);
hx->inter_gb_ptr= &hx->inter_gb;
if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
&& s->context_initialized
&& (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
&& (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
&& (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
&& avctx->skip_frame < AVDISCARD_ALL)
init_get_bits(&s->gb, ptr, bit_length);
init_get_bits(&s->gb, ptr, bit_length);
decode_seq_parameter_set(h);
if(s->flags& CODEC_FLAG_LOW_DELAY)
if(avctx->has_b_frames < 2)
avctx->has_b_frames= !s->low_delay;
init_get_bits(&s->gb, ptr, bit_length);
decode_picture_parameter_set(h, bit_length);
case NAL_END_SEQUENCE:
case NAL_END_STREAM:
case NAL_FILLER_DATA:
case NAL_AUXILIARY_SLICE:
av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
if(context_count == h->max_contexts) {
execute_decode_slices(h, context_count);
av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
/* Slice could not be decoded in parallel mode, copy down
 * NAL unit stuff to context 0 and restart. Note that
 * rbsp_buffer is not transfered, but since we no longer
 * run in parallel mode this should not be an issue. */
h->nal_unit_type = hx->nal_unit_type;
h->nal_ref_idc   = hx->nal_ref_idc;
execute_decode_slices(h, context_count);
 * returns the number of bytes consumed for building the current frame
 *
 * In truncated mode the parser's unconsumed tail is subtracted; the
 * result is clamped to [1, buf_size] to keep callers making progress.
static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
if(s->flags&CODEC_FLAG_TRUNCATED){
pos -= s->parse_context.last_index;
if(pos<0) pos=0; // FIXME remove (unneeded?)
if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
if(pos+10>buf_size) pos=buf_size; // oops ;)
7291 static int decode_frame(AVCodecContext *avctx,
7292 void *data, int *data_size,
7293 uint8_t *buf, int buf_size)
7295 H264Context *h = avctx->priv_data;
7296 MpegEncContext *s = &h->s;
7297 AVFrame *pict = data;
7300 s->flags= avctx->flags;
7301 s->flags2= avctx->flags2;
7303 /* no supplementary picture */
7304 if (buf_size == 0) {
7308 //FIXME factorize this with the output code below
7309 out = h->delayed_pic[0];
7311 for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7312 if(h->delayed_pic[i]->poc < out->poc){
7313 out = h->delayed_pic[i];
7317 for(i=out_idx; h->delayed_pic[i]; i++)
7318 h->delayed_pic[i] = h->delayed_pic[i+1];
7321 *data_size = sizeof(AVFrame);
7322 *pict= *(AVFrame*)out;
7328 if(s->flags&CODEC_FLAG_TRUNCATED){
7329 int next= ff_h264_find_frame_end(h, buf, buf_size);
7331 if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7333 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7336 if(h->is_avc && !h->got_avcC) {
7337 int i, cnt, nalsize;
7338 unsigned char *p = avctx->extradata;
7339 if(avctx->extradata_size < 7) {
7340 av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7344 av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7347 /* sps and pps in the avcC always have length coded with 2 bytes,
7348 so put a fake nal_length_size = 2 while parsing them */
7349 h->nal_length_size = 2;
7350 // Decode sps from avcC
7351 cnt = *(p+5) & 0x1f; // Number of sps
7353 for (i = 0; i < cnt; i++) {
7354 nalsize = AV_RB16(p) + 2;
7355 if(decode_nal_units(h, p, nalsize) < 0) {
7356 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7361 // Decode pps from avcC
7362 cnt = *(p++); // Number of pps
7363 for (i = 0; i < cnt; i++) {
7364 nalsize = AV_RB16(p) + 2;
7365 if(decode_nal_units(h, p, nalsize) != nalsize) {
7366 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7371 // Now store right nal length size, that will be use to parse all other nals
7372 h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7373 // Do not reparse avcC
7377 if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7378 if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7382 buf_index=decode_nal_units(h, buf, buf_size);
7386 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7387 if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7388 av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7392 if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7393 Picture *out = s->current_picture_ptr;
7394 Picture *cur = s->current_picture_ptr;
7395 Picture *prev = h->delayed_output_pic;
7396 int i, pics, cross_idr, out_of_order, out_idx;
7400 s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7401 s->current_picture_ptr->pict_type= s->pict_type;
7403 h->prev_frame_num_offset= h->frame_num_offset;
7404 h->prev_frame_num= h->frame_num;
7405 if(s->current_picture_ptr->reference){
7406 h->prev_poc_msb= h->poc_msb;
7407 h->prev_poc_lsb= h->poc_lsb;
7409 if(s->current_picture_ptr->reference)
7410 execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7416 //FIXME do something with unavailable reference frames
7418 #if 0 //decode order
7419 *data_size = sizeof(AVFrame);
7421 /* Sort B-frames into display order */
7423 if(h->sps.bitstream_restriction_flag
7424 && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7425 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7430 while(h->delayed_pic[pics]) pics++;
7432 assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7434 h->delayed_pic[pics++] = cur;
7435 if(cur->reference == 0)
7439 for(i=0; h->delayed_pic[i]; i++)
7440 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7443 out = h->delayed_pic[0];
7445 for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7446 if(h->delayed_pic[i]->poc < out->poc){
7447 out = h->delayed_pic[i];
7451 out_of_order = !cross_idr && prev && out->poc < prev->poc;
7452 if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7454 else if(prev && pics <= s->avctx->has_b_frames)
7456 else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7458 ((!cross_idr && prev && out->poc > prev->poc + 2)
7459 || cur->pict_type == B_TYPE)))
7462 s->avctx->has_b_frames++;
7465 else if(out_of_order)
7468 if(out_of_order || pics > s->avctx->has_b_frames){
7469 for(i=out_idx; h->delayed_pic[i]; i++)
7470 h->delayed_pic[i] = h->delayed_pic[i+1];
7476 *data_size = sizeof(AVFrame);
7477 if(prev && prev != out && prev->reference == 1)
7478 prev->reference = 0;
7479 h->delayed_output_pic = out;
7483 *pict= *(AVFrame*)out;
7485 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7488 assert(pict->data[0] || !*data_size);
7489 ff_print_debug_info(s, pict);
7490 //printf("out %d\n", (int)pict->data[0]);
7493 /* Return the Picture timestamp as the frame number */
7494 /* we subtract 1 because it is added in utils.c */
7495 avctx->frame_number = s->picture_number - 1;
7497 return get_consumed_bytes(s, buf_index, buf_size);
// Compute the availability of the six neighbouring macroblocks of the
// current MB (top-left, top, top-right, left, plus two fixed slots).
// A neighbour counts as available only when it lies inside the picture
// AND belongs to the same slice as the current macroblock (same entry
// in slice_table).
7500 static inline void fill_mb_avail(H264Context *h){
7501 MpegEncContext * const s = &h->s;
// Linear index of the current macroblock within slice_table.
7502 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
// Top row of neighbours: [0]=top-left, [1]=top, [2]=top-right.
// NOTE(review): the visible code indexes mb_xy - s->mb_stride
// unconditionally; presumably a guard for the first macroblock row
// (s->mb_y == 0) exists in lines not shown in this excerpt — confirm
// against the full file before relying on this.
7505 h->mb_avail[0]= s->mb_x && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7506 h->mb_avail[1]= h->slice_table[mb_xy - s->mb_stride ] == h->slice_num;
7507 h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
// Left neighbour [3]: available unless we are in picture column 0.
7513 h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
// Slots [4] and [5] are constants for every macroblock.
7514 h->mb_avail[4]= 1; //FIXME move out
7515 h->mb_avail[5]= 0; //FIXME move out
7522 #define SIZE (COUNT*40)
7528 // int int_temp[10000];
7530 AVCodecContext avctx;
7532 dsputil_init(&dsp, &avctx);
7534 init_put_bits(&pb, temp, SIZE);
7535 printf("testing unsigned exp golomb\n");
7536 for(i=0; i<COUNT; i++){
7538 set_ue_golomb(&pb, i);
7539 STOP_TIMER("set_ue_golomb");
7541 flush_put_bits(&pb);
7543 init_get_bits(&gb, temp, 8*SIZE);
7544 for(i=0; i<COUNT; i++){
7547 s= show_bits(&gb, 24);
7550 j= get_ue_golomb(&gb);
7552 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7555 STOP_TIMER("get_ue_golomb");
7559 init_put_bits(&pb, temp, SIZE);
7560 printf("testing signed exp golomb\n");
7561 for(i=0; i<COUNT; i++){
7563 set_se_golomb(&pb, i - COUNT/2);
7564 STOP_TIMER("set_se_golomb");
7566 flush_put_bits(&pb);
7568 init_get_bits(&gb, temp, 8*SIZE);
7569 for(i=0; i<COUNT; i++){
7572 s= show_bits(&gb, 24);
7575 j= get_se_golomb(&gb);
7576 if(j != i - COUNT/2){
7577 printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7580 STOP_TIMER("get_se_golomb");
7583 printf("testing 4x4 (I)DCT\n");
7586 uint8_t src[16], ref[16];
7587 uint64_t error= 0, max_error=0;
7589 for(i=0; i<COUNT; i++){
7591 // printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7592 for(j=0; j<16; j++){
7593 ref[j]= random()%255;
7594 src[j]= random()%255;
7597 h264_diff_dct_c(block, src, ref, 4);
7600 for(j=0; j<16; j++){
7601 // printf("%d ", block[j]);
7602 block[j]= block[j]*4;
7603 if(j&1) block[j]= (block[j]*4 + 2)/5;
7604 if(j&4) block[j]= (block[j]*4 + 2)/5;
7608 s->dsp.h264_idct_add(ref, block, 4);
7609 /* for(j=0; j<16; j++){
7610 printf("%d ", ref[j]);
7614 for(j=0; j<16; j++){
7615 int diff= FFABS(src[j] - ref[j]);
7618 max_error= FFMAX(max_error, diff);
7621 printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7623 printf("testing quantizer\n");
7624 for(qp=0; qp<52; qp++){
7626 src1_block[i]= src2_block[i]= random()%255;
7630 printf("Testing NAL layer\n");
7632 uint8_t bitstream[COUNT];
7633 uint8_t nal[COUNT*2];
7635 memset(&h, 0, sizeof(H264Context));
7637 for(i=0; i<COUNT; i++){
7645 for(j=0; j<COUNT; j++){
7646 bitstream[j]= (random() % 255) + 1;
7649 for(j=0; j<zeros; j++){
7650 int pos= random() % COUNT;
7651 while(bitstream[pos] == 0){
7660 nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7662 printf("encoding failed\n");
7666 out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7670 if(out_length != COUNT){
7671 printf("incorrect length %d %d\n", out_length, COUNT);
7675 if(consumed != nal_length){
7676 printf("incorrect consumed length %d %d\n", nal_length, consumed);
7680 if(memcmp(bitstream, out, COUNT)){
7681 printf("mismatch\n");
7686 printf("Testing RBSP\n");
// AVCodec close callback: release per-context H.264 decoder resources.
// Frees the two RBSP (raw byte sequence payload) scratch buffers used
// for un-escaping NAL units, then the per-macroblock tables.
// NOTE(review): the tail of this function (presumably MPV_common_end(s)
// and the return statement) is not visible in this excerpt — confirm
// against the full file.
7694 static int decode_end(AVCodecContext *avctx)
7696 H264Context *h = avctx->priv_data;
7697 MpegEncContext *s = &h->s;
// av_freep() frees and NULLs the pointers, so a later double-close is safe.
7699 av_freep(&h->rbsp_buffer[0]);
7700 av_freep(&h->rbsp_buffer[1]);
7701 free_tables(h); //FIXME cleanup init stuff perhaps
7704 // memset(h, 0, sizeof(H264Context));
7710 AVCodec h264_decoder = {
7714 sizeof(H264Context),
7719 /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,