git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  /* File-scope VLC tables. Their initialisation and use are outside this excerpt.
   * NOTE(review): the names suggest the coeff_token / total_zeros / run_before
   * code tables of the CAVLC residual decoder -- confirm against the init code. */
  41 static VLC coeff_token_vlc[4];
  42 static VLC chroma_dc_coeff_token_vlc;
  43
  44 static VLC total_zeros_vlc[15];
  45 static VLC chroma_dc_total_zeros_vlc[3];
  46
  47 static VLC run_vlc[6];
  48 static VLC run7_vlc;
  49
  /* Forward declarations of static functions whose definitions are not part of
   * this excerpt; they are declared here so code above the definitions can call them. */
  50 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  51 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  52 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  53 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  54
  55 static av_always_inline uint32_t pack16to32(int a, int b){
  56 #ifdef WORDS_BIGENDIAN
  57    return (b&0xFFFF) + (a<<16);
  58 #else
  59    return (a&0xFFFF) + (b<<16);
  60 #endif
  61 }
  62
  /** Lookup table of remainders: ff_rem6[i] == i % 6 for i in 0..51. */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  66
  /** Lookup table of quotients: ff_div6[i] == i / 6 for i in 0..51. */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  70
  71
  72 /**
  73  * fill a rectangle.
      * Stores val into a w x h block of elements starting at vp. w and stride are given in
      * elements and are multiplied by size on entry, so the stores below work in bytes.
      * With size==1 the byte val is replicated across each 16- or 32-bit store; with size==4
      * val is stored as a 32-bit element (duplicated into both halves of a 64-bit store).
      * Only byte widths (w*size) of 2, 4, 8 and 16 are handled; anything else hits assert(0).
      * Widths 2, 4 and 8 accept h of 1, 2 or 4; width 16 has no h==1 exit, so only 2 or 4.
      * @param vp destination, asserted to be aligned to FFMIN(w*size, STRIDE_ALIGN) bytes
  74  * @param h height of the rectangle, should be a constant
  75  * @param w width of the rectangle, should be a constant
      * @param stride distance between rows in elements; stride*size must be a multiple of w*size
      * @param val value to store (a byte for size==1, a 32-bit element for size==4)
  76  * @param size the size of val (1 or 4), should be a constant
  77  */
  78 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
  79     uint8_t *p= (uint8_t*)vp;
  80     assert(size==1 || size==4);
  81     assert(w<=4);
  82
         // from here on w and stride are byte counts
  83     w      *= size;
  84     stride *= size;
  85
  86     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
  87     assert((stride&(w-1))==0);
  88     if(w==2){
             // 2 bytes per row: one 16-bit store per row
  89         const uint16_t v= size==4 ? val : val*0x0101;
  90         *(uint16_t*)(p + 0*stride)= v;
  91         if(h==1) return;
  92         *(uint16_t*)(p + 1*stride)= v;
  93         if(h==2) return;
  94         *(uint16_t*)(p + 2*stride)=
  95         *(uint16_t*)(p + 3*stride)= v;
  96     }else if(w==4){
             // 4 bytes per row: one 32-bit store per row
  97         const uint32_t v= size==4 ? val : val*0x01010101;
  98         *(uint32_t*)(p + 0*stride)= v;
  99         if(h==1) return;
 100         *(uint32_t*)(p + 1*stride)= v;
 101         if(h==2) return;
 102         *(uint32_t*)(p + 2*stride)=
 103         *(uint32_t*)(p + 3*stride)= v;
 104     }else if(w==8){
 105     //gcc can't optimize 64bit math on x86_32
         // The 8- and 16-byte cases exist twice: with 64-bit stores when the
         // condition below holds, with 32-bit stores otherwise. The preprocessor
         // selects exactly one pair, so the else-if chain stays well formed.
 106 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
             // val duplicated into both 32-bit halves of a 64-bit word
 107         const uint64_t v= val*0x0100000001ULL;
 108         *(uint64_t*)(p + 0*stride)= v;
 109         if(h==1) return;
 110         *(uint64_t*)(p + 1*stride)= v;
 111         if(h==2) return;
 112         *(uint64_t*)(p + 2*stride)=
 113         *(uint64_t*)(p + 3*stride)= v;
 114     }else if(w==16){
 115         const uint64_t v= val*0x0100000001ULL;
             // rows 0 and 1 are always written; there is no h==1 exit for this width
 116         *(uint64_t*)(p + 0+0*stride)=
 117         *(uint64_t*)(p + 8+0*stride)=
 118         *(uint64_t*)(p + 0+1*stride)=
 119         *(uint64_t*)(p + 8+1*stride)= v;
 120         if(h==2) return;
 121         *(uint64_t*)(p + 0+2*stride)=
 122         *(uint64_t*)(p + 8+2*stride)=
 123         *(uint64_t*)(p + 0+3*stride)=
 124         *(uint64_t*)(p + 8+3*stride)= v;
 125 #else
 126         *(uint32_t*)(p + 0+0*stride)=
 127         *(uint32_t*)(p + 4+0*stride)= val;
 128         if(h==1) return;
 129         *(uint32_t*)(p + 0+1*stride)=
 130         *(uint32_t*)(p + 4+1*stride)= val;
 131         if(h==2) return;
 132         *(uint32_t*)(p + 0+2*stride)=
 133         *(uint32_t*)(p + 4+2*stride)=
 134         *(uint32_t*)(p + 0+3*stride)=
 135         *(uint32_t*)(p + 4+3*stride)= val;
 136     }else if(w==16){
             // rows 0 and 1 are always written; there is no h==1 exit for this width
 137         *(uint32_t*)(p + 0+0*stride)=
 138         *(uint32_t*)(p + 4+0*stride)=
 139         *(uint32_t*)(p + 8+0*stride)=
 140         *(uint32_t*)(p +12+0*stride)=
 141         *(uint32_t*)(p + 0+1*stride)=
 142         *(uint32_t*)(p + 4+1*stride)=
 143         *(uint32_t*)(p + 8+1*stride)=
 144         *(uint32_t*)(p +12+1*stride)= val;
 145         if(h==2) return;
 146         *(uint32_t*)(p + 0+2*stride)=
 147         *(uint32_t*)(p + 4+2*stride)=
 148         *(uint32_t*)(p + 8+2*stride)=
 149         *(uint32_t*)(p +12+2*stride)=
 150         *(uint32_t*)(p + 0+3*stride)=
 151         *(uint32_t*)(p + 4+3*stride)=
 152         *(uint32_t*)(p + 8+3*stride)=
 153         *(uint32_t*)(p +12+3*stride)= val;
 154 #endif
 155     }else
 156         assert(0);
         // only reached when all four rows were written
 157     assert(h==4);
 158 }
 159
 /**
  * Loads data of the neighbouring macroblocks (top, left, top-left, top-right) of the
  * current macroblock (s->mb_x, s->mb_y) into the per-macroblock caches of h.
  * Depending on mb_type and the stream settings this fills: the *_samples_available masks
  * and intra4x4_pred_mode_cache (intra macroblocks), non_zero_count_cache, top_cbp/left_cbp
  * (when pps.cabac is set), mv_cache/ref_cache/mvd_cache/direct_cache (inter or direct
  * macroblocks) and neighbor_transform_size. It also stores the neighbour addresses in
  * h->top_mb_xy and h->left_mb_xy[].
  * @param h decoder context
  * @param mb_type type flags of the current macroblock
  * @param for_deblock non-zero when called for deblocking: neighbour types are then taken
  *                    from any macroblock whose slice_table entry is < 255 instead of only
  *                    from the current slice, and the function may return early
  */
 160 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 161     MpegEncContext * const s = &h->s;
 162     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 163     int topleft_xy, top_xy, topright_xy, left_xy[2];
 164     int topleft_type, top_type, topright_type, left_type[2];
 165     int left_block[8];
 166     int i;
 167
 168     //FIXME deblocking could skip the intra and nnz parts.
         // Deblocking early exit: taken outside MBAFF frames when slice_num is 1 or the
         // macroblock above has the same slice_table entry as the current one.
 169     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 170         return;
 171
 172     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 173
         // Default neighbour addresses and left_block index map; the FRAME_MBAFF
         // branch below may override them.
 174     top_xy     = mb_xy  - s->mb_stride;
 175     topleft_xy = top_xy - 1;
 176     topright_xy= top_xy + 1;
 177     left_xy[1] = left_xy[0] = mb_xy-1;
 178     left_block[0]= 0;
 179     left_block[1]= 1;
 180     left_block[2]= 2;
 181     left_block[3]= 3;
 182     left_block[4]= 7;
 183     left_block[5]= 10;
 184     left_block[6]= 8;
 185     left_block[7]= 11;
 186     if(FRAME_MBAFF){
             // Addresses are derived per macroblock pair (mb_y rounded down to even); the
             // *_mb_frame_flag values are 1 when the respective pair is not interlaced.
 187         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 188         const int top_pair_xy      = pair_xy     - s->mb_stride;
 189         const int topleft_pair_xy  = top_pair_xy - 1;
 190         const int topright_pair_xy = top_pair_xy + 1;
 191         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 192         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 193         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 194         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 195         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 196         const int bottom = (s->mb_y & 1);
 197         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 198         if (bottom
 199                 ? !curr_mb_frame_flag // bottom macroblock
 200                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 201                 ) {
 202             top_xy -= s->mb_stride;
 203         }
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 207                 ) {
 208             topleft_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 213                 ) {
 214             topright_xy -= s->mb_stride;
 215         }
             // Left pair coded differently (frame vs. field) from the current macroblock:
             // both left entries start at the top of the left pair and left_block is remapped.
 216         if (left_mb_frame_flag != curr_mb_frame_flag) {
 217             left_xy[1] = left_xy[0] = pair_xy - 1;
 218             if (curr_mb_frame_flag) {
 219                 if (bottom) {
 220                     left_block[0]= 2;
 221                     left_block[1]= 2;
 222                     left_block[2]= 3;
 223                     left_block[3]= 3;
 224                     left_block[4]= 8;
 225                     left_block[5]= 11;
 226                     left_block[6]= 8;
 227                     left_block[7]= 11;
 228                 } else {
 229                     left_block[0]= 0;
 230                     left_block[1]= 0;
 231                     left_block[2]= 1;
 232                     left_block[3]= 1;
 233                     left_block[4]= 7;
 234                     left_block[5]= 10;
 235                     left_block[6]= 7;
 236                     left_block[7]= 10;
 237                 }
 238             } else {
                     // current is field, left pair is frame: second left entry is the
                     // bottom macroblock of the left pair
 239                 left_xy[1] += s->mb_stride;
 240                 //left_block[0]= 0;
 241                 left_block[1]= 2;
 242                 left_block[2]= 0;
 243                 left_block[3]= 2;
 244                 //left_block[4]= 7;
 245                 left_block[5]= 10;
 246                 left_block[6]= 7;
 247                 left_block[7]= 10;
 248             }
 249         }
 250     }
 251
         // publish the neighbour addresses in the context
 252     h->top_mb_xy = top_xy;
 253     h->left_mb_xy[0] = left_xy[0];
 254     h->left_mb_xy[1] = left_xy[1];
 255     if(for_deblock){
             // Deblocking: top-left/top-right are not used; top and left count as
             // present whenever their slice_table entry is below 255.
 256         topleft_type = 0;
 257         topright_type = 0;
 258         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 259         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 260         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 261
 262         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
                 // Reload the current macroblock's own data into the caches: one bit per
                 // block from the 16-bit word at non_zero_count[mb_xy][14], plus motion
                 // vectors and reference indices from the picture tables.
 263             int list;
 264             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 265             for(i=0; i<16; i++)
 266                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 267             for(list=0; list<h->list_count; list++){
 268                 if(USES_LIST(mb_type,list)){
 269                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 270                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 271                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 272                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 273                         dst[0] = src[0];
 274                         dst[1] = src[1];
 275                         dst[2] = src[2];
 276                         dst[3] = src[3];
 277                     }
 278                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 279                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 280                     ref += h->b8_stride;
 281                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 282                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 283                 }else{
 284                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 285                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 286                 }
 287             }
 288         }
 289     }else{
             // Decoding: a neighbour only counts if it belongs to the current slice.
 290         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 291         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 292         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 293         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 294         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 295     }
 296
 297     if(IS_INTRA(mb_type)){
             // Start from "everything available" and clear bits for each neighbour that is
             // missing (type 0) or non-intra while pps.constrained_intra_pred is set.
 298         h->topleft_samples_available=
 299         h->top_samples_available=
 300         h->left_samples_available= 0xFFFF;
 301         h->topright_samples_available= 0xEEEA;
 302
 303         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 304             h->topleft_samples_available= 0xB3FF;
 305             h->top_samples_available= 0x33FF;
 306             h->topright_samples_available= 0x26EA;
 307         }
 308         for(i=0; i<2; i++){
 309             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 310                 h->topleft_samples_available&= 0xDF5F;
 311                 h->left_samples_available&= 0x5F5F;
 312             }
 313         }
 314
 315         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 316             h->topleft_samples_available&= 0x7FFF;
 317
 318         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 319             h->topright_samples_available&= 0xFBFF;
 320
 321         if(IS_INTRA4x4(mb_type)){
                 // Neighbouring 4x4 prediction modes: copied from intra4x4 neighbours,
                 // otherwise -1 (missing, or inter with constrained_intra_pred) or 2.
 322             if(IS_INTRA4x4(top_type)){
 323                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 324                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 325                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 326                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 327             }else{
 328                 int pred;
 329                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 330                     pred= -1;
 331                 else{
 332                     pred= 2;
 333                 }
 334                 h->intra4x4_pred_mode_cache[4+8*0]=
 335                 h->intra4x4_pred_mode_cache[5+8*0]=
 336                 h->intra4x4_pred_mode_cache[6+8*0]=
 337                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 338             }
 339             for(i=0; i<2; i++){
 340                 if(IS_INTRA4x4(left_type[i])){
 341                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 342                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 343                 }else{
 344                     int pred;
 345                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 346                         pred= -1;
 347                     else{
 348                         pred= 2;
 349                     }
 350                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 351                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 352                 }
 353             }
 354         }
 355     }
 356
 357
 358 /*
 359 0 . T T. T T T T
 360 1 L . .L . . . .
 361 2 L . .L . . . .
 362 3 . T TL . . . .
 363 4 L . .L . . . .
 364 5 L . .. . . . .
 365 */
 366 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         // Non-zero coefficient counts of the top neighbour; when it is missing the
         // entries get 0 (CABAC and current macroblock not intra) or 64.
 367     if(top_type){
 368         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 369         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 370         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 371         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 372
 373         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 374         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 375
 376         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 377         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 378
 379     }else{
 380         h->non_zero_count_cache[4+8*0]=
 381         h->non_zero_count_cache[5+8*0]=
 382         h->non_zero_count_cache[6+8*0]=
 383         h->non_zero_count_cache[7+8*0]=
 384
 385         h->non_zero_count_cache[1+8*0]=
 386         h->non_zero_count_cache[2+8*0]=
 387
 388         h->non_zero_count_cache[1+8*3]=
 389         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 390
 391     }
 392
         // Same for the two left neighbours, using the left_block index map.
 393     for (i=0; i<2; i++) {
 394         if(left_type[i]){
 395             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 396             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 397             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 398             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 399         }else{
 400             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 401             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 402             h->non_zero_count_cache[0+8*1 +   8*i]=
 403             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 404         }
 405     }
 406
 407     if( h->pps.cabac ) {
             // Coded block patterns of the neighbours; a missing neighbour yields 0x1C0
             // when the current macroblock is intra and 0 otherwise.
 408         // top_cbp
 409         if(top_type) {
 410             h->top_cbp = h->cbp_table[top_xy];
 411         } else if(IS_INTRA(mb_type)) {
 412             h->top_cbp = 0x1C0;
 413         } else {
 414             h->top_cbp = 0;
 415         }
 416         // left_cbp
 417         if (left_type[0]) {
 418             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 419         } else if(IS_INTRA(mb_type)) {
 420             h->left_cbp = 0x1C0;
 421         } else {
 422             h->left_cbp = 0;
 423         }
             // bits 1 and 3 of left_cbp are picked from the cbp of the respective left
             // macroblock, at a bit position derived from left_block
 424         if (left_type[0]) {
 425             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 426         }
 427         if (left_type[1]) {
 428             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 429         }
 430     }
 431
 432 #if 1
 433     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
             // Motion data of the neighbours, per reference list. Neighbours that do not
             // use the list get zero vectors and LIST_NOT_USED (neighbour present) or
             // PART_NOT_AVAILABLE (neighbour missing) as reference.
 434         int list;
 435         for(list=0; list<h->list_count; list++){
 436             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 437                 /*if(!h->mv_cache_clean[list]){
 438                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 439                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 440                     h->mv_cache_clean[list]= 1;
 441                 }*/
 442                 continue;
 443             }
 444             h->mv_cache_clean[list]= 0;
 445
                 // top neighbour: its bottom row of vectors and references
 446             if(USES_LIST(top_type, list)){
 447                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 448                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 449                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 450                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 451                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 452                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 453                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 454                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 455                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 456                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 457             }else{
 458                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 459                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 460                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 461                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 462                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 463             }
 464
                 // left neighbours: their right column, rows selected through left_block
 465             for(i=0; i<2; i++){
 466                 int cache_idx = scan8[0] - 1 + i*2*8;
 467                 if(USES_LIST(left_type[i], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 469                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 470                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 471                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 472                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 473                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 474                 }else{
 475                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 476                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 477                     h->ref_cache[list][cache_idx  ]=
 478                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 479                 }
 480             }
 481
                 // outside MBAFF frames, deblocking and temporal direct stop here
 482             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 483                 continue;
 484
 485             if(USES_LIST(topleft_type, list)){
 486                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 487                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 488                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 489                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 490             }else{
 491                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 492                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 493             }
 494
 495             if(USES_LIST(topright_type, list)){
 496                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 497                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 498                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 499                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 500             }else{
 501                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 502                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 503             }
 504
                 // outside MBAFF frames, skip and direct macroblocks need nothing further
 505             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 506                 continue;
 507
                 // mark fixed cache positions as unavailable with zero vectors
 508             h->ref_cache[list][scan8[5 ]+1] =
 509             h->ref_cache[list][scan8[7 ]+1] =
 510             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 511             h->ref_cache[list][scan8[4 ]] =
 512             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 513             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 514             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 515             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 516             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 517             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 518
 519             if( h->pps.cabac ) {
 520                 /* XXX beurk, Load mvd */
 521                 if(USES_LIST(top_type, list)){
 522                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 523                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 524                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 525                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 526                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 527                 }else{
 528                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 529                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 530                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 531                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 532                 }
 533                 if(USES_LIST(left_type[0], list)){
 534                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 535                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 536                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 537                 }else{
 538                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 539                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 540                 }
 541                 if(USES_LIST(left_type[1], list)){
 542                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 543                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 544                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 545                 }else{
 546                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 547                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 548                 }
 549                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 550                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 551                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 552                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 553                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 554
 555                 if(h->slice_type == B_TYPE){
                         // direct flags: the current macroblock's area is cleared, the
                         // neighbour entries come from the neighbour type or direct_table
 556                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 557
 558                     if(IS_DIRECT(top_type)){
 559                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 560                     }else if(IS_8X8(top_type)){
 561                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 562                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 563                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 564                     }else{
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 566                     }
 567
 568                     if(IS_DIRECT(left_type[0]))
 569                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 570                     else if(IS_8X8(left_type[0]))
 571                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 572                     else
 573                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 574
 575                     if(IS_DIRECT(left_type[1]))
 576                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 577                     else if(IS_8X8(left_type[1]))
 578                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 579                     else
 580                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 581                 }
 582             }
 583
 584             if(FRAME_MBAFF){
                     // Convert neighbour entries between frame and field representation:
                     // MAP_MVS applies MAP_F2F to every neighbour cache position. For a field
                     // macroblock, entries from non-interlaced neighbours get the reference
                     // index doubled and the vertical mv/mvd halved; for a frame macroblock,
                     // entries from interlaced neighbours get the inverse.
 585 #define MAP_MVS\
 586                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 587                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 588                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 589                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 590                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 591                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 592                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 593                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 594                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 595                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 596                 if(MB_FIELD){
 597 #define MAP_F2F(idx, mb_type)\
 598                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 599                         h->ref_cache[list][idx] <<= 1;\
 600                         h->mv_cache[list][idx][1] /= 2;\
 601                         h->mvd_cache[list][idx][1] /= 2;\
 602                     }
 603                     MAP_MVS
 604 #undef MAP_F2F
 605                 }else{
 606 #define MAP_F2F(idx, mb_type)\
 607                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 608                         h->ref_cache[list][idx] >>= 1;\
 609                         h->mv_cache[list][idx][1] <<= 1;\
 610                         h->mvd_cache[list][idx][1] <<= 1;\
 611                     }
 612                     MAP_MVS
 613 #undef MAP_F2F
 614                 }
 615             }
 616         }
 617     }
 618 #endif
 619
         // number (0..2) of top/left[0] neighbours for which IS_8x8DCT holds
 620     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 621 }
 622
 623 static inline void write_back_intra_pred_mode(H264Context *h){
 624     MpegEncContext * const s = &h->s;
 625     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 626
 627     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 628     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 629     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 630     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 631     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 632     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 633     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 634 }
 635
 636 /**
 637  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 638  */
 639 static inline int check_intra4x4_pred_mode(H264Context *h){
 640     MpegEncContext * const s = &h->s;
 641     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 642     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 643     int i;
 644
 645     if(!(h->top_samples_available&0x8000)){
 646         for(i=0; i<4; i++){
 647             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 648             if(status<0){
 649                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 650                 return -1;
 651             } else if(status){
 652                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 653             }
 654         }
 655     }
 656
 657     if(!(h->left_samples_available&0x8000)){
 658         for(i=0; i<4; i++){
 659             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 660             if(status<0){
 661                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 662                 return -1;
 663             } else if(status){
 664                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 665             }
 666         }
 667     }
 668
 669     return 0;
 670 } //FIXME cleanup like next
 671
 672 /**
 673  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 674  */
 675 static inline int check_intra_pred_mode(H264Context *h, int mode){
 676     MpegEncContext * const s = &h->s;
 677     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 678     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 679
 680     if(mode > 6U) {
 681         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 682         return -1;
 683     }
 684
 685     if(!(h->top_samples_available&0x8000)){
 686         mode= top[ mode ];
 687         if(mode<0){
 688             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 689             return -1;
 690         }
 691     }
 692
 693     if(!(h->left_samples_available&0x8000)){
 694         mode= left[ mode ];
 695         if(mode<0){
 696             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 697             return -1;
 698         }
 699     }
 700
 701     return mode;
 702 }
 703
 704 /**
 705  * gets the predicted intra4x4 prediction mode.
 706  */
 707 static inline int pred_intra_mode(H264Context *h, int n){
 708     const int index8= scan8[n];
 709     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 710     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 711     const int min= FFMIN(left, top);
 712
 713     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 714
 715     if(min<0) return DC_PRED;
 716     else      return min;
 717 }
 718
 719 static inline void write_back_non_zero_count(H264Context *h){
 720     MpegEncContext * const s = &h->s;
 721     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 722
 723     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 724     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 725     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 726     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 727     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 728     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 729     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 730
 731     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 732     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 733     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 734
 735     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 736     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 737     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 738
 739     if(FRAME_MBAFF){
 740         // store all luma nnzs, for deblocking
 741         int v = 0, i;
 742         for(i=0; i<16; i++)
 743             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 744         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 745     }
 746 }
 747
 748 /**
 749  * gets the predicted number of non zero coefficients.
 750  * @param n block index
 751  */
 752 static inline int pred_non_zero_count(H264Context *h, int n){
 753     const int index8= scan8[n];
 754     const int left= h->non_zero_count_cache[index8 - 1];
 755     const int top = h->non_zero_count_cache[index8 - 8];
 756     int i= left + top;
 757
 758     if(i<64) i= (i+1)>>1;
 759
 760     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 761
 762     return i&31;
 763 }
 764
 /**
  * Fetches the motion vector and reference index of the diagonal neighbour
  * of a partition, as used by the motion vector prediction.
  * The top-right neighbour (cache position i - 8 + part_width) is used when
  * its reference is not PART_NOT_AVAILABLE, otherwise the top-left neighbour
  * (cache position i - 8 - 1). With FRAME_MBAFF some cases are instead read
  * directly from the current picture via SET_DIAG_MV.
  * @param C receives a pointer to the selected motion vector
  * @param i cache position of the partition (callers pass scan8[n])
  * @param list reference list
  * @param part_width width of the partition in cache columns
  * @return the reference index belonging to *C
  */
 765 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 766     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 767     MpegEncContext *s = &h->s;
 768
 769     /* there is no consistent mapping of mvs to neighboring locations that will
 770      * make mbaff happy, so we can't move all this logic to fill_caches */
 771     if(FRAME_MBAFF){
 772         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 773         const int16_t *mv;
             /* cache slot scan8[0]-2 serves as scratch storage: it is zeroed here
              * and *C points at it; SET_DIAG_MV below fills it in */
 774         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 775         *C = h->mv_cache[list][scan8[0]-2];
 776
             /* frame macroblock in an odd row, partition in the first cache row,
              * top-right neighbour available but coded interlaced */
 777         if(!MB_FIELD
 778            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 779             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 780             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV reads the macroblock type, motion vector and reference
                  * index at the 4x4 block coordinates (X4,Y4) of the current picture,
                  * stores the vector (vertical component modified by MV_OP) in the
                  * scratch slot and RETURNS the reference index modified by REF_OP.
                  * It returns LIST_NOT_USED when that macroblock neither uses the
                  * list nor is an 8x8 type. Every expansion therefore leaves
                  * fetch_diagonal_mv. */
 781 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 782                 const int x4 = X4, y4 = Y4;\
 783                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 784                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 785                     return LIST_NOT_USED;\
 786                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 787                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 788                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 789                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 790
 791                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 792             }
 793         }
             /* top-right not available, partition in cache column 4 and the left
              * neighbour is available: take the vector from the left macroblock when
              * its frame/field coding differs from the current macroblock */
 794         if(topright_ref == PART_NOT_AVAILABLE
 795            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 796            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 797             if(!MB_FIELD
 798                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 799                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 800             }
 801             if(MB_FIELD
 802                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 803                && i >= scan8[0]+8){
 804                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 805                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 806             }
 807         }
 808 #undef SET_DIAG_MV
 809     }
 810
         /* regular case: top-right neighbour from the cache, else top-left */
 811     if(topright_ref != PART_NOT_AVAILABLE){
 812         *C= h->mv_cache[list][ i - 8 + part_width ];
 813         return topright_ref;
 814     }else{
 815         tprintf(s->avctx, "topright MV not available\n");
 816
 817         *C= h->mv_cache[list][ i - 8 - 1 ];
 818         return h->ref_cache[list][ i - 8 - 1 ];
 819     }
 820 }
 821
 822 /**
 823  * gets the predicted MV.
 824  * @param n the block index
 825  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 826  * @param mx the x component of the predicted motion vector
 827  * @param my the y component of the predicted motion vector
 828  */
 829 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 830     const int index8= scan8[n];
 831     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 832     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 833     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 834     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 835     const int16_t * C;
 836     int diagonal_ref, match_count;
 837
 838     assert(part_width==1 || part_width==2 || part_width==4);
 839
 840 /* mv_cache
 841   B . . A T T T T
 842   U . . L . . , .
 843   U . . L . . . .
 844   U . . L . . , .
 845   . . . L . . . .
 846 */
 847
 848     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 849     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 850     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 851     if(match_count > 1){ //most common
 852         *mx= mid_pred(A[0], B[0], C[0]);
 853         *my= mid_pred(A[1], B[1], C[1]);
 854     }else if(match_count==1){
 855         if(left_ref==ref){
 856             *mx= A[0];
 857             *my= A[1];
 858         }else if(top_ref==ref){
 859             *mx= B[0];
 860             *my= B[1];
 861         }else{
 862             *mx= C[0];
 863             *my= C[1];
 864         }
 865     }else{
 866         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 867             *mx= A[0];
 868             *my= A[1];
 869         }else{
 870             *mx= mid_pred(A[0], B[0], C[0]);
 871             *my= mid_pred(A[1], B[1], C[1]);
 872         }
 873     }
 874
 875     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 876 }
 877
 878 /**
 879  * gets the directionally predicted 16x8 MV.
 880  * @param n the block index
 881  * @param mx the x component of the predicted motion vector
 882  * @param my the y component of the predicted motion vector
 883  */
 884 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 885     if(n==0){
 886         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 887         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 888
 889         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 890
 891         if(top_ref == ref){
 892             *mx= B[0];
 893             *my= B[1];
 894             return;
 895         }
 896     }else{
 897         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 898         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 899
 900         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 901
 902         if(left_ref == ref){
 903             *mx= A[0];
 904             *my= A[1];
 905             return;
 906         }
 907     }
 908
 909     //RARE
 910     pred_motion(h, n, 4, list, ref, mx, my);
 911 }
 912
 913 /**
 914  * gets the directionally predicted 8x16 MV.
 915  * @param n the block index
 916  * @param mx the x component of the predicted motion vector
 917  * @param my the y component of the predicted motion vector
 918  */
 919 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 920     if(n==0){
 921         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 922         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 923
 924         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 925
 926         if(left_ref == ref){
 927             *mx= A[0];
 928             *my= A[1];
 929             return;
 930         }
 931     }else{
 932         const int16_t * C;
 933         int diagonal_ref;
 934
 935         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 936
 937         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 938
 939         if(diagonal_ref == ref){
 940             *mx= C[0];
 941             *my= C[1];
 942             return;
 943         }
 944     }
 945
 946     //RARE
 947     pred_motion(h, n, 2, list, ref, mx, my);
 948 }
 949
 950 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 951     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 952     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 953
 954     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 955
 956     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 957        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 958        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 959
 960         *mx = *my = 0;
 961         return;
 962     }
 963
 964     pred_motion(h, 0, 4, 0, 0, mx, my);
 965
 966     return;
 967 }
 968
 969 static inline void direct_dist_scale_factor(H264Context * const h){
 970     const int poc = h->s.current_picture_ptr->poc;
 971     const int poc1 = h->ref_list[1][0].poc;
 972     int i;
 973     for(i=0; i<h->ref_count[0]; i++){
 974         int poc0 = h->ref_list[0][i].poc;
 975         int td = av_clip(poc1 - poc0, -128, 127);
 976         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 977             h->dist_scale_factor[i] = 256;
 978         }else{
 979             int tb = av_clip(poc - poc0, -128, 127);
 980             int tx = (16384 + (FFABS(td) >> 1)) / td;
 981             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 982         }
 983     }
 984     if(FRAME_MBAFF){
 985         for(i=0; i<h->ref_count[0]; i++){
 986             h->dist_scale_factor_field[2*i] =
 987             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 988         }
 989     }
 990 }
 991 static inline void direct_ref_list_init(H264Context * const h){
 992     MpegEncContext * const s = &h->s;
 993     Picture * const ref1 = &h->ref_list[1][0];
 994     Picture * const cur = s->current_picture_ptr;
 995     int list, i, j;
 996     if(cur->pict_type == I_TYPE)
 997         cur->ref_count[0] = 0;
 998     if(cur->pict_type != B_TYPE)
 999         cur->ref_count[1] = 0;
1000     for(list=0; list<2; list++){
1001         cur->ref_count[list] = h->ref_count[list];
1002         for(j=0; j<h->ref_count[list]; j++)
1003             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1004     }
1005     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1006         return;
1007     for(list=0; list<2; list++){
1008         for(i=0; i<ref1->ref_count[list]; i++){
1009             const int poc = ref1->ref_poc[list][i];
1010             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1011             for(j=0; j<h->ref_count[list]; j++)
1012                 if(h->ref_list[list][j].poc == poc){
1013                     h->map_col_to_list0[list][i] = j;
1014                     break;
1015                 }
1016         }
1017     }
1018     if(FRAME_MBAFF){
1019         for(list=0; list<2; list++){
1020             for(i=0; i<ref1->ref_count[list]; i++){
1021                 j = h->map_col_to_list0[list][i];
1022                 h->map_col_to_list0_field[list][2*i] = 2*j;
1023                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1024             }
1025         }
1026     }
1027 }
1028
/**
 * Derives reference indices and motion vectors of a direct predicted
 * macroblock (or of its direct 8x8 sub-blocks) and stores them in
 * h->ref_cache[] / h->mv_cache[].
 * Spatial prediction from the cached neighbours is used when
 * h->direct_spatial_mv_pred is set, otherwise temporal prediction from the
 * colocated macroblock of h->ref_list[1][0].
 * When *mb_type is an 8x8 type on entry, only sub-blocks whose
 * h->sub_mb_type[] is direct are processed.
 * @param mb_type in: type of the current macroblock; out: type with the
 *                partitioning and prediction direction chosen here
 */
1029 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1030     MpegEncContext * const s = &h->s;
1031     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1032     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1033     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices of the colocated
          * macroblock in the first list 1 reference */
1034     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1035     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1036     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1037     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1038     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1039     const int is_b8x8 = IS_8X8(*mb_type);
1040     unsigned int sub_mb_type;
1041     int i8, i4;
1042
         /* mask matching 16x16 and all intra macroblock types */
1043 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
         /* choose the partitioning from the type of the colocated macroblock */
1044     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1045         /* FIXME save sub mb types from previous frames (or derive from MVs)
1046          * so we know exactly what block size to use */
1047         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1048         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1049     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1050         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1051         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1052     }else{
1053         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1054         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1055     }
1056     if(!is_b8x8)
1057         *mb_type |= MB_TYPE_DIRECT2;
1058     if(MB_FIELD)
1059         *mb_type |= MB_TYPE_INTERLACED;
1060
1061     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1062
1063     if(h->direct_spatial_mv_pred){
1064         int ref[2];
1065         int mv[2][2];
1066         int list;
1067
1068         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1069
1070         /* ref = min(neighbors) */
1071         for(list=0; list<2; list++){
1072             int refa = h->ref_cache[list][scan8[0] - 1];
1073             int refb = h->ref_cache[list][scan8[0] - 8];
1074             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* -2 is presumably PART_NOT_AVAILABLE (TODO confirm): fall
                  * back from the top-right to the top-left neighbour */
1075             if(refc == -2)
1076                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* smallest non-negative reference of the three, -1 if none */
1077             ref[list] = refa;
1078             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1079                 ref[list] = refb;
1080             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1081                 ref[list] = refc;
1082             if(ref[list] < 0)
1083                 ref[list] = -1;
1084         }
1085
             /* no neighbour reference in either list: reference 0 with zero
              * motion for both lists; otherwise predict a vector per used list */
1086         if(ref[0] < 0 && ref[1] < 0){
1087             ref[0] = ref[1] = 0;
1088             mv[0][0] = mv[0][1] =
1089             mv[1][0] = mv[1][1] = 0;
1090         }else{
1091             for(list=0; list<2; list++){
1092                 if(ref[list] >= 0)
1093                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1094                 else
1095                     mv[list][0] = mv[list][1] = 0;
1096             }
1097         }
1098
             /* remove the prediction direction of a list without reference */
1099         if(ref[1] < 0){
1100             *mb_type &= ~MB_TYPE_P0L1;
1101             sub_mb_type &= ~MB_TYPE_P0L1;
1102         }else if(ref[0] < 0){
1103             *mb_type &= ~MB_TYPE_P0L0;
1104             sub_mb_type &= ~MB_TYPE_P0L0;
1105         }
1106
1107         if(IS_16X16(*mb_type)){
1108             int a=0, b=0;
1109
1110             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1111             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* colocated block is inter, uses reference 0 and moves by at
                  * most 1 in each component: a/b keep their zero value unless
                  * the reference index of the list is > 0 */
1112             if(!IS_INTRA(mb_type_col)
1113                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1114                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1115                        && (h->x264_build>33 || !h->x264_build)))){
1116                 if(ref[0] > 0)
1117                     a= pack16to32(mv[0][0],mv[0][1]);
1118                 if(ref[1] > 0)
1119                     b= pack16to32(mv[1][0],mv[1][1]);
1120             }else{
1121                 a= pack16to32(mv[0][0],mv[0][1]);
1122                 b= pack16to32(mv[1][0],mv[1][1]);
1123             }
1124             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1125             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1126         }else{
1127             for(i8=0; i8<4; i8++){
1128                 const int x8 = i8&1;
1129                 const int y8 = i8>>1;
1130
                     /* in an explicit 8x8 macroblock only direct sub-blocks are handled */
1131                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1132                     continue;
1133                 h->sub_mb_type[i8] = sub_mb_type;
1134
1135                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1136                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1137                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1138                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1139
1140                 /* col_zero_flag */
1141                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1142                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1143                                                   && (h->x264_build>33 || !h->x264_build)))){
1144                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1145                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one colocated vector per 8x8 block, taken at offset
                              * x8*3, y8*3 within the macroblock */
1146                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1147                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1148                             if(ref[0] == 0)
1149                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1150                             if(ref[1] == 0)
1151                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1152                         }
1153                     }else
                         /* otherwise test the colocated vector of every 4x4 block */
1154                     for(i4=0; i4<4; i4++){
1155                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1156                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1157                             if(ref[0] == 0)
1158                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1159                             if(ref[1] == 0)
1160                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1161                         }
1162                     }
1163                 }
1164             }
1165         }
1166     }else{ /* direct temporal mv pred */
1167         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1168         const int *dist_scale_factor = h->dist_scale_factor;
1169
1170         if(FRAME_MBAFF){
                 /* interlaced macroblocks use the field variants of the tables */
1171             if(IS_INTERLACED(*mb_type)){
1172                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1173                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1174                 dist_scale_factor = h->dist_scale_factor_field;
1175             }
                 /* current and colocated macroblock differ in frame/field coding;
                  * this branch handles the whole macroblock and returns */
1176             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1177                 /* FIXME assumes direct_8x8_inference == 1 */
1178                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1179                 int mb_types_col[2];
1180                 int y_shift;
1181
1182                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1183                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1184                          | (*mb_type & MB_TYPE_INTERLACED);
1185                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1186
1187                 if(IS_INTERLACED(*mb_type)){
1188                     /* frame to field scaling */
1189                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1190                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                         /* for the lower macroblock of the pair, move the colocated
                          * pointers up by one macroblock row */
1191                     if(s->mb_y&1){
1192                         l1ref0 -= 2*h->b8_stride;
1193                         l1ref1 -= 2*h->b8_stride;
1194                         l1mv0 -= 4*h->b_stride;
1195                         l1mv1 -= 4*h->b_stride;
1196                     }
1197                     y_shift = 0;
1198
1199                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1200                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1201                        && !is_b8x8)
1202                         *mb_type |= MB_TYPE_16x8;
1203                     else
1204                         *mb_type |= MB_TYPE_8x8;
1205                 }else{
1206                     /* field to frame scaling */
1207                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1208                      * but in MBAFF, top and bottom POC are equal */
1209                     int dy = (s->mb_y&1) ? 1 : 2;
1210                     mb_types_col[0] =
1211                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1212                     l1ref0 += dy*h->b8_stride;
1213                     l1ref1 += dy*h->b8_stride;
1214                     l1mv0 += 2*dy*h->b_stride;
1215                     l1mv1 += 2*dy*h->b_stride;
1216                     y_shift = 2;
1217
1218                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1219                        && !is_b8x8)
1220                         *mb_type |= MB_TYPE_16x16;
1221                     else
1222                         *mb_type |= MB_TYPE_8x8;
1223                 }
1224
1225                 for(i8=0; i8<4; i8++){
1226                     const int x8 = i8&1;
1227                     const int y8 = i8>>1;
1228                     int ref0, scale;
1229                     const int16_t (*l1mv)[2]= l1mv0;
1230
1231                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1232                         continue;
1233                     h->sub_mb_type[i8] = sub_mb_type;
1234
                         /* the list 1 reference index is always 0 */
1235                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra colocated macroblock: reference 0, zero vectors */
1236                     if(IS_INTRA(mb_types_col[y8])){
1237                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1238                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1239                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1240                         continue;
1241                     }
1242
                         /* use the colocated list 0 reference and vector if present,
                          * else the list 1 ones; map the index to the current list 0 */
1243                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1244                     if(ref0 >= 0)
1245                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1246                     else{
1247                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1248                         l1mv= l1mv1;
1249                     }
1250                     scale = dist_scale_factor[ref0];
1251                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1252
1253                     {
1254                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component: halved for y_shift 0, doubled for y_shift 2 */
1255                         int my_col = (mv_col[1]<<y_shift)/2;
1256                         int mx = (scale * mv_col[0] + 128) >> 8;
1257                         int my = (scale * my_col + 128) >> 8;
1258                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1259                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1260                     }
1261                 }
1262                 return;
1263             }
1264         }
1265
1266         /* one-to-one mv scaling */
1267
1268         if(IS_16X16(*mb_type)){
1269             int ref, mv0, mv1;
1270
1271             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1272             if(IS_INTRA(mb_type_col)){
1273                 ref=mv0=mv1=0;
1274             }else{
1275                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1276                                                 : map_col_to_list0[1][l1ref1[0]];
1277                 const int scale = dist_scale_factor[ref0];
1278                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1279                 int mv_l0[2];
                     /* list 0 vector: colocated vector scaled by scale/256 with
                      * rounding; list 1 vector: list 0 vector minus colocated vector */
1280                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1281                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1282                 ref= ref0;
1283                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1284                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1285             }
1286             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1287             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1288             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1289         }else{
1290             for(i8=0; i8<4; i8++){
1291                 const int x8 = i8&1;
1292                 const int y8 = i8>>1;
1293                 int ref0, scale;
1294                 const int16_t (*l1mv)[2]= l1mv0;
1295
1296                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1297                     continue;
1298                 h->sub_mb_type[i8] = sub_mb_type;
1299                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1300                 if(IS_INTRA(mb_type_col)){
1301                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1302                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1303                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1304                     continue;
1305                 }
1306
1307                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1308                 if(ref0 >= 0)
1309                     ref0 = map_col_to_list0[0][ref0];
1310                 else{
1311                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1312                     l1mv= l1mv1;
1313                 }
1314                 scale = dist_scale_factor[ref0];
1315
1316                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1317                 if(IS_SUB_8X8(sub_mb_type)){
1318                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1319                     int mx = (scale * mv_col[0] + 128) >> 8;
1320                     int my = (scale * mv_col[1] + 128) >> 8;
1321                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1322                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1323                 }else
                     /* scale the colocated vector of every 4x4 block separately */
1324                 for(i4=0; i4<4; i4++){
1325                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1326                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1327                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1328                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1329                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1330                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1331                 }
1332             }
1333         }
1334     }
1335 }
1336
1337 static inline void write_back_motion(H264Context *h, int mb_type){
1338     MpegEncContext * const s = &h->s;
1339     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1340     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1341     int list;
1342
1343     if(!USES_LIST(mb_type, 0))
1344         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1345
1346     for(list=0; list<h->list_count; list++){
1347         int y;
1348         if(!USES_LIST(mb_type, list))
1349             continue;
1350
1351         for(y=0; y<4; y++){
1352             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1353             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1354         }
1355         if( h->pps.cabac ) {
1356             if(IS_SKIP(mb_type))
1357                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1358             else
1359             for(y=0; y<4; y++){
1360                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1361                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1362             }
1363         }
1364
1365         {
1366             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1367             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1368             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1369             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1370             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1371         }
1372     }
1373
1374     if(h->slice_type == B_TYPE && h->pps.cabac){
1375         if(IS_8X8(mb_type)){
1376             uint8_t *direct_table = &h->direct_table[b8_xy];
1377             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1378             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1379             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1380         }
1381     }
1382 }
1383
1384 /**
1385  * Decodes a network abstraction layer unit.
1386  * @param consumed is the number of bytes used as input
1387  * @param length is the length of the array
1388  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1389  * @returns decoded bytes, might be src+1 if no escapes
1390  */
1391 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1392     int i, si, di;
1393     uint8_t *dst;
1394     int bufidx;
1395
1396 //    src[0]&0x80;                //forbidden bit
1397     h->nal_ref_idc= src[0]>>5;
1398     h->nal_unit_type= src[0]&0x1F;
1399
1400     src++; length--;
1401 #if 0
1402     for(i=0; i<length; i++)
1403         printf("%2X ", src[i]);
1404 #endif
1405     for(i=0; i+1<length; i+=2){
1406         if(src[i]) continue;
1407         if(i>0 && src[i-1]==0) i--;
1408         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1409             if(src[i+2]!=3){
1410                 /* startcode, so we must be past the end */
1411                 length=i;
1412             }
1413             break;
1414         }
1415     }
1416
1417     if(i>=length-1){ //no escaped 0
1418         *dst_length= length;
1419         *consumed= length+1; //+1 for the header
1420         return src;
1421     }
1422
1423     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1424     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1425     dst= h->rbsp_buffer[bufidx];
1426
1427     if (dst == NULL){
1428         return NULL;
1429     }
1430
1431 //printf("decoding esc\n");
1432     si=di=0;
1433     while(si<length){
1434         //remove escapes (very rare 1:2^22)
1435         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 break;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447
1448     *dst_length= di;
1449     *consumed= si + 1;//+1 for the header
1450 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1451     return dst;
1452 }
1453
1454 /**
1455  * identifies the exact end of the bitstream
1456  * @return the length of the trailing, or 0 if damaged
1457  */
1458 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1459     int v= *src;
1460     int r;
1461
1462     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1463
1464     for(r=1; r<9; r++){
1465         if(v&1) return r;
1466         v>>=1;
1467     }
1468     return 0;
1469 }
1470
1471 /**
1472  * idct tranforms the 16 dc values and dequantize them.
1473  * @param qp quantization parameter
1474  */
1475 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1476 #define stride 16
1477     int i;
1478     int temp[16]; //FIXME check if this is a good idea
1479     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1480     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1481
1482 //memset(block, 64, 2*256);
1483 //return;
1484     for(i=0; i<4; i++){
1485         const int offset= y_offset[i];
1486         const int z0= block[offset+stride*0] + block[offset+stride*4];
1487         const int z1= block[offset+stride*0] - block[offset+stride*4];
1488         const int z2= block[offset+stride*1] - block[offset+stride*5];
1489         const int z3= block[offset+stride*1] + block[offset+stride*5];
1490
1491         temp[4*i+0]= z0+z3;
1492         temp[4*i+1]= z1+z2;
1493         temp[4*i+2]= z1-z2;
1494         temp[4*i+3]= z0-z3;
1495     }
1496
1497     for(i=0; i<4; i++){
1498         const int offset= x_offset[i];
1499         const int z0= temp[4*0+i] + temp[4*2+i];
1500         const int z1= temp[4*0+i] - temp[4*2+i];
1501         const int z2= temp[4*1+i] - temp[4*3+i];
1502         const int z3= temp[4*1+i] + temp[4*3+i];
1503
1504         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1505         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1506         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1507         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1508     }
1509 }
1510
#if 0
/**
 * DCT transforms the 16 DC values (compiled out).
 *
 * Uses the same two butterfly passes and the same 16 positions inside block
 * as h264_luma_dc_dequant_idct_c, but halves the results (>>1) instead of
 * dequantizing them.
 *
 * @param block coefficient array, modified in place
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04 +sum15;
        out[1]= diff04+diff15;
        out[2]= diff04-diff15;
        out[3]= sum04 -sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif

/* the stride macro is only meant for the luma DC transforms above */
#undef xStride
#undef stride
1553
1554 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1555     const int stride= 16*2;
1556     const int xStride= 16;
1557     int a,b,c,d,e;
1558
1559     a= block[stride*0 + xStride*0];
1560     b= block[stride*0 + xStride*1];
1561     c= block[stride*1 + xStride*0];
1562     d= block[stride*1 + xStride*1];
1563
1564     e= a-b;
1565     a= a+b;
1566     b= c-d;
1567     c= c+d;
1568
1569     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1570     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1571     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1572     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1573 }
1574
#if 0
/**
 * 2x2 butterfly transform of the four chroma DC values (compiled out).
 *
 * Works on block[0], block[16], block[32] and block[48]; no scaling is
 * applied to the results.
 *
 * @param block coefficient array, modified in place
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    const int top_left = block[stride*0 + xStride*0];
    const int top_right= block[stride*0 + xStride*1];
    const int bot_left = block[stride*1 + xStride*0];
    const int bot_right= block[stride*1 + xStride*1];

    const int top_sum = top_left + top_right;
    const int top_diff= top_left - top_right;
    const int bot_sum = bot_left + bot_right;
    const int bot_diff= bot_left - bot_right;

    block[stride*0 + xStride*0]= top_sum  + bot_sum;
    block[stride*0 + xStride*1]= top_diff + bot_diff;
    block[stride*1 + xStride*0]= top_sum  - bot_sum;
    block[stride*1 + xStride*1]= top_diff - bot_diff;
}
#endif
1597
1598 /**
1599  * gets the chroma qp.
1600  */
1601 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1602     return h->pps.chroma_qp_table[t][qscale & 0xff];
1603 }
1604
1605 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1606 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1607 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1608     int i;
1609     const int * const quant_table= quant_coeff[qscale];
1610     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1611     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1612     const unsigned int threshold2= (threshold1<<1);
1613     int last_non_zero;
1614
1615     if(separate_dc){
1616         if(qscale<=18){
1617             //avoid overflows
1618             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1619             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1620             const unsigned int dc_threshold2= (dc_threshold1<<1);
1621
1622             int level= block[0]*quant_coeff[qscale+18][0];
1623             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1624                 if(level>0){
1625                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1626                     block[0]= level;
1627                 }else{
1628                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1629                     block[0]= -level;
1630                 }
1631 //                last_non_zero = i;
1632             }else{
1633                 block[0]=0;
1634             }
1635         }else{
1636             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1637             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1638             const unsigned int dc_threshold2= (dc_threshold1<<1);
1639
1640             int level= block[0]*quant_table[0];
1641             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1642                 if(level>0){
1643                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1644                     block[0]= level;
1645                 }else{
1646                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1647                     block[0]= -level;
1648                 }
1649 //                last_non_zero = i;
1650             }else{
1651                 block[0]=0;
1652             }
1653         }
1654         last_non_zero= 0;
1655         i=1;
1656     }else{
1657         last_non_zero= -1;
1658         i=0;
1659     }
1660
1661     for(; i<16; i++){
1662         const int j= scantable[i];
1663         int level= block[j]*quant_table[j];
1664
1665 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1666 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1667         if(((unsigned)(level+threshold1))>threshold2){
1668             if(level>0){
1669                 level= (bias + level)>>QUANT_SHIFT;
1670                 block[j]= level;
1671             }else{
1672                 level= (bias - level)>>QUANT_SHIFT;
1673                 block[j]= -level;
1674             }
1675             last_non_zero = i;
1676         }else{
1677             block[j]=0;
1678         }
1679     }
1680
1681     return last_non_zero;
1682 }
1683
/**
 * 4x4 vertical prediction: copies the row above the block into all 4 rows.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(const uint32_t*)(src-stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= top_row;
}
1691
/**
 * 4x4 horizontal prediction: fills every row with the pixel to its left.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        /* unsigned multiply: replicating a pixel >= 128 overflows a signed int */
        *(uint32_t*)(src+y*stride)= src[-1+y*stride]*0x01010101U;
    }
}
1698
/**
 * 4x4 DC prediction: fills the block with the rounded mean of the 4 pixels
 * above and the 4 pixels to the left.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const unsigned int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                            + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* unsigned multiply: a mean >= 128 overflows a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= fill;
}
1708
/**
 * 4x4 DC prediction from the left edge only: fills the block with the
 * rounded mean of the 4 pixels to the left.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const unsigned int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* unsigned multiply: a mean >= 128 overflows a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= fill;
}
1717
/**
 * 4x4 DC prediction from the top edge only: fills the block with the
 * rounded mean of the 4 pixels above.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const unsigned int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* unsigned multiply: a mean >= 128 overflows a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= fill;
}
1726
/**
 * 4x4 DC prediction without any neighbours: fills the block with 128.
 *
 * @param src      top-left pixel of the block; accessed as 32-bit words
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t fill= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= fill;
}
1733
1734
/*
 * Edge loaders for the 4x4 predictors. Each macro expands to four
 * declarations, so it has to be placed among the declarations of the
 * function that uses it and needs no trailing semicolon. The variables are
 * tagged av_unused because not every predictor reads all of them.
 */

/* t4..t7: the four pixels at topright[0..3] */
#define LOAD_TOP_RIGHT_EDGE \
    const int av_unused t4= topright[0]; \
    const int av_unused t5= topright[1]; \
    const int av_unused t6= topright[2]; \
    const int av_unused t7= topright[3];

/* l0..l3: the pixel to the left of each of the four rows of src */
#define LOAD_LEFT_EDGE \
    const int av_unused l0= src[-1+0*stride]; \
    const int av_unused l1= src[-1+1*stride]; \
    const int av_unused l2= src[-1+2*stride]; \
    const int av_unused l3= src[-1+3*stride];

/* t0..t3: the four pixels in the row above src */
#define LOAD_TOP_EDGE \
    const int av_unused t0= src[ 0-1*stride]; \
    const int av_unused t1= src[ 1-1*stride]; \
    const int av_unused t2= src[ 2-1*stride]; \
    const int av_unused t3= src[ 3-1*stride];

1753 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
1754     const int lt= src[-1-1*stride];
1755     LOAD_TOP_EDGE
1756     LOAD_LEFT_EDGE
1757
1758     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
1759     src[0+2*stride]=
1760     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
1761     src[0+1*stride]=
1762     src[1+2*stride]=
1763     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
1764     src[0+0*stride]=
1765     src[1+1*stride]=
1766     src[2+2*stride]=
1767     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1768     src[1+0*stride]=
1769     src[2+1*stride]=
1770     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
1771     src[2+0*stride]=
1772     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1773     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1774 }
1775
1776 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
1777     LOAD_TOP_EDGE
1778     LOAD_TOP_RIGHT_EDGE
1779 //    LOAD_LEFT_EDGE
1780
1781     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
1782     src[1+0*stride]=
1783     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
1784     src[2+0*stride]=
1785     src[1+1*stride]=
1786     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
1787     src[3+0*stride]=
1788     src[2+1*stride]=
1789     src[1+2*stride]=
1790     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
1791     src[3+1*stride]=
1792     src[2+2*stride]=
1793     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
1794     src[3+2*stride]=
1795     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
1796     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
1797 }
1798
1799 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
1800     const int lt= src[-1-1*stride];
1801     LOAD_TOP_EDGE
1802     LOAD_LEFT_EDGE
1803
1804     src[0+0*stride]=
1805     src[1+2*stride]=(lt + t0 + 1)>>1;
1806     src[1+0*stride]=
1807     src[2+2*stride]=(t0 + t1 + 1)>>1;
1808     src[2+0*stride]=
1809     src[3+2*stride]=(t1 + t2 + 1)>>1;
1810     src[3+0*stride]=(t2 + t3 + 1)>>1;
1811     src[0+1*stride]=
1812     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1813     src[1+1*stride]=
1814     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
1815     src[2+1*stride]=
1816     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1817     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1818     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1819     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1820 }
1821
1822 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
1823     LOAD_TOP_EDGE
1824     LOAD_TOP_RIGHT_EDGE
1825
1826     src[0+0*stride]=(t0 + t1 + 1)>>1;
1827     src[1+0*stride]=
1828     src[0+2*stride]=(t1 + t2 + 1)>>1;
1829     src[2+0*stride]=
1830     src[1+2*stride]=(t2 + t3 + 1)>>1;
1831     src[3+0*stride]=
1832     src[2+2*stride]=(t3 + t4+ 1)>>1;
1833     src[3+2*stride]=(t4 + t5+ 1)>>1;
1834     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1835     src[1+1*stride]=
1836     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1837     src[2+1*stride]=
1838     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
1839     src[3+1*stride]=
1840     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
1841     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
1842 }
1843
1844 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
1845     LOAD_LEFT_EDGE
1846
1847     src[0+0*stride]=(l0 + l1 + 1)>>1;
1848     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1849     src[2+0*stride]=
1850     src[0+1*stride]=(l1 + l2 + 1)>>1;
1851     src[3+0*stride]=
1852     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1853     src[2+1*stride]=
1854     src[0+2*stride]=(l2 + l3 + 1)>>1;
1855     src[3+1*stride]=
1856     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
1857     src[3+2*stride]=
1858     src[1+3*stride]=
1859     src[0+3*stride]=
1860     src[2+2*stride]=
1861     src[2+3*stride]=
1862     src[3+3*stride]=l3;
1863 }
1864
1865 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
1866     const int lt= src[-1-1*stride];
1867     LOAD_TOP_EDGE
1868     LOAD_LEFT_EDGE
1869
1870     src[0+0*stride]=
1871     src[2+1*stride]=(lt + l0 + 1)>>1;
1872     src[1+0*stride]=
1873     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
1874     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
1875     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1876     src[0+1*stride]=
1877     src[2+2*stride]=(l0 + l1 + 1)>>1;
1878     src[1+1*stride]=
1879     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1880     src[0+2*stride]=
1881     src[2+3*stride]=(l1 + l2+ 1)>>1;
1882     src[1+2*stride]=
1883     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1884     src[0+3*stride]=(l2 + l3 + 1)>>1;
1885     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1886 }
1887
/**
 * 16x16 vertical prediction: copies the row above the block into all 16 rows.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t *top= (const uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    const uint32_t w2= top[2];
    const uint32_t w3= top[3];
    uint8_t *row= src;
    int y;

    for(y=0; y<16; y++, row+=stride){
        uint32_t *dst= (uint32_t*)row;
        dst[0]= w0;
        dst[1]= w1;
        dst[2]= w2;
        dst[3]= w3;
    }
}
1902
/**
 * 16x16 horizontal prediction: fills every row with the pixel to its left.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int y;

    for(y=0; y<16; y++){
        uint32_t *dst= (uint32_t*)(src+y*stride);
        /* unsigned multiply: replicating a pixel >= 128 overflows a signed int */
        const uint32_t fill= src[-1+y*stride]*0x01010101U;

        dst[0]= fill;
        dst[1]= fill;
        dst[2]= fill;
        dst[3]= fill;
    }
}
1913
/**
 * 16x16 DC prediction: fills the block with the rounded mean of the 16
 * pixels above and the 16 pixels to the left.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    unsigned int sum= 0;
    uint32_t fill;
    int i;

    for(i=0; i<16; i++)
        sum+= src[-1+i*stride] + src[i-stride];

    /* unsigned multiply: a mean >= 128 overflows a signed int */
    fill= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        uint32_t *dst= (uint32_t*)(src+i*stride);
        dst[0]= fill;
        dst[1]= fill;
        dst[2]= fill;
        dst[3]= fill;
    }
}
1934
/**
 * 16x16 DC prediction from the left edge only: fills the block with the
 * rounded mean of the 16 pixels to the left.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
    unsigned int sum= 0;
    uint32_t fill;
    int i;

    for(i=0; i<16; i++)
        sum+= src[-1+i*stride];

    /* unsigned multiply: a mean >= 128 overflows a signed int */
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t *dst= (uint32_t*)(src+i*stride);
        dst[0]= fill;
        dst[1]= fill;
        dst[2]= fill;
        dst[3]= fill;
    }
}
1951
/**
 * 16x16 DC prediction from the top edge only: fills the block with the
 * rounded mean of the 16 pixels above.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
    unsigned int sum= 0;
    uint32_t fill;
    int i;

    for(i=0; i<16; i++)
        sum+= src[i-stride];

    /* unsigned multiply: a mean >= 128 overflows a signed int */
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t *dst= (uint32_t*)(src+i*stride);
        dst[0]= fill;
        dst[1]= fill;
        dst[2]= fill;
        dst[3]= fill;
    }
}
1967
/**
 * 16x16 DC prediction without any neighbours: fills the block with 128.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    uint8_t *row= src;
    int y;

    for(y=0; y<16; y++, row+=stride){
        uint32_t *dst= (uint32_t*)row;
        dst[0]= fill;
        dst[1]= fill;
        dst[2]= fill;
        dst[3]= fill;
    }
}
1978
1979 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
1980   int i, j, k;
1981   int a;
1982   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1983   const uint8_t * const src0 = src+7-stride;
1984   const uint8_t *src1 = src+8*stride-1;
1985   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
1986   int H = src0[1] - src0[-1];
1987   int V = src1[0] - src2[ 0];
1988   for(k=2; k<=8; ++k) {
1989     src1 += stride; src2 -= stride;
1990     H += k*(src0[k] - src0[-k]);
1991     V += k*(src1[0] - src2[ 0]);
1992   }
1993   if(svq3){
1994     H = ( 5*(H/4) ) / 16;
1995     V = ( 5*(V/4) ) / 16;
1996
1997     /* required for 100% accuracy */
1998     i = H; H = V; V = i;
1999   }else{
2000     H = ( 5*H+32 ) >> 6;
2001     V = ( 5*V+32 ) >> 6;
2002   }
2003
2004   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2005   for(j=16; j>0; --j) {
2006     int b = a;
2007     a += V;
2008     for(i=-16; i<0; i+=4) {
2009       src[16+i] = cm[ (b    ) >> 5 ];
2010       src[17+i] = cm[ (b+  H) >> 5 ];
2011       src[18+i] = cm[ (b+2*H) >> 5 ];
2012       src[19+i] = cm[ (b+3*H) >> 5 ];
2013       b += 4*H;
2014     }
2015     src += stride;
2016   }
2017 }
2018
/**
 * 16x16 plane prediction: pred16x16_plane_compat_c with the SVQ3 variant
 * switched off.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3_compat= 0;

    pred16x16_plane_compat_c(src, stride, svq3_compat);
}
2022
/**
 * 8x8 vertical prediction: copies the row above the block into all 8 rows.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t *top= (const uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    uint8_t *row= src;
    int y;

    for(y=0; y<8; y++, row+=stride){
        uint32_t *dst= (uint32_t*)row;
        dst[0]= w0;
        dst[1]= w1;
    }
}
2033
/**
 * 8x8 horizontal prediction: fills every row with the pixel to its left.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int y;

    for(y=0; y<8; y++){
        uint32_t *dst= (uint32_t*)(src+y*stride);
        /* unsigned multiply: replicating a pixel >= 128 overflows a signed int */
        const uint32_t fill= src[-1+y*stride]*0x01010101U;

        dst[0]= fill;
        dst[1]= fill;
    }
}
2042
/**
 * 8x8 DC prediction without any neighbours: fills the block with 128.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    uint8_t *row= src;
    int y;

    for(y=0; y<8; y++, row+=stride){
        uint32_t *dst= (uint32_t*)row;
        dst[0]= fill;
        dst[1]= fill;
    }
}
2051
/**
 * 8x8 DC prediction from the left edge only.
 *
 * The upper four rows are filled with the rounded mean of the 4 left pixels
 * of rows 0..3, the lower four rows with that of rows 4..7.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
    unsigned int sum_upper= 0, sum_lower= 0;
    uint32_t fill_upper, fill_lower;
    int i;

    for(i=0; i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiply: a mean >= 128 overflows a signed int */
    fill_upper= 0x01010101U*((sum_upper + 2)>>2);
    fill_lower= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t *dst= (uint32_t*)(src+i*stride);
        const uint32_t fill= i<4 ? fill_upper : fill_lower;

        dst[0]= fill;
        dst[1]= fill;
    }
}
2073
/**
 * 8x8 DC prediction from the top edge only.
 *
 * The left four columns are filled with the rounded mean of the 4 pixels
 * above columns 0..3, the right four columns with that of columns 4..7.
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
    unsigned int sum_left= 0, sum_right= 0;
    uint32_t fill_left, fill_right;
    int i;

    for(i=0; i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned multiply: a mean >= 128 overflows a signed int */
    fill_left = 0x01010101U*((sum_left  + 2)>>2);
    fill_right= 0x01010101U*((sum_right + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t *dst= (uint32_t*)(src+i*stride);

        dst[0]= fill_left;
        dst[1]= fill_right;
    }
}
2095
2096
/**
 * 8x8 DC prediction, computed separately for each 4x4 quadrant.
 *
 * - top-left:     mean of the 4 pixels above and the 4 pixels to the left
 *                 of the quadrant
 * - top-right:    mean of the 4 pixels above the quadrant
 * - bottom-left:  mean of the 4 pixels to the left of the quadrant
 * - bottom-right: mean of the 4 pixels above the top-right and the 4 pixels
 *                 to the left of the bottom-left quadrant
 *
 * @param src    top-left pixel of the block; accessed as 32-bit words
 * @param stride distance in bytes between two rows
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    unsigned int sum_tl= 0, sum_top_right= 0, sum_left_lower= 0;
    uint32_t fill_tl, fill_tr, fill_bl, fill_br;
    int i;

    for(i=0; i<4; i++){
        sum_tl        += src[-1+i*stride] + src[i-stride];
        sum_top_right += src[4+i-stride];
        sum_left_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: a mean >= 128 overflows a signed int */
    fill_br= 0x01010101U*((sum_top_right + sum_left_lower + 4)>>3);
    fill_tl= 0x01010101U*((sum_tl + 4)>>3);
    fill_tr= 0x01010101U*((sum_top_right + 2)>>2);
    fill_bl= 0x01010101U*((sum_left_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= fill_tl;
        ((uint32_t*)(src+i*stride))[1]= fill_tr;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= fill_bl;
        ((uint32_t*)(src+i*stride))[1]= fill_br;
    }
}
2121
2122 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2123   int j, k;
2124   int a;
2125   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2126   const uint8_t * const src0 = src+3-stride;
2127   const uint8_t *src1 = src+4*stride-1;
2128   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2129   int H = src0[1] - src0[-1];
2130   int V = src1[0] - src2[ 0];
2131   for(k=2; k<=4; ++k) {
2132     src1 += stride; src2 -= stride;
2133     H += k*(src0[k] - src0[-k]);
2134     V += k*(src1[0] - src2[ 0]);
2135   }
2136   H = ( 17*H+16 ) >> 5;
2137   V = ( 17*V+16 ) >> 5;
2138
2139   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2140   for(j=8; j>0; --j) {
2141     int b = a;
2142     a += V;
2143     src[0] = cm[ (b    ) >> 5 ];
2144     src[1] = cm[ (b+  H) >> 5 ];
2145     src[2] = cm[ (b+2*H) >> 5 ];
2146     src[3] = cm[ (b+3*H) >> 5 ];
2147     src[4] = cm[ (b+4*H) >> 5 ];
2148     src[5] = cm[ (b+5*H) >> 5 ];
2149     src[6] = cm[ (b+6*H) >> 5 ];
2150     src[7] = cm[ (b+7*H) >> 5 ];
2151     src += stride;
2152   }
2153 }
2154
/*
 * Helpers for the 8x8 luma predictors. They expect the variables src and
 * stride and, where used, has_topleft and has_topright to be in scope.
 */

/* pixel in column x, row y relative to src */
#define SRC(x,y) src[(x)+(y)*stride]

/* declares l<y>: the left neighbour of row y, (1,2,1)/4 filtered along the column */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;

/* declares l0..l7; without a top-left pixel SRC(-1,0) takes its place.
   The caller supplies the final semicolon. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* declares t<x>: the top neighbour of column x, (1,2,1)/4 filtered along the row */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* declares t0..t7; a missing top-left / top-right pixel is replaced by
   SRC(0,-1) / SRC(7,-1). The caller supplies the final semicolon. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* assigns the already declared t<x> with the filtered top neighbour of column x */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;

/* declares and sets t8..t15: filtered top-right pixels if has_topright,
   otherwise all equal to the unfiltered SRC(7,-1) */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* declares lt: the filtered top-left corner. The caller supplies the final semicolon. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* fills 8 rows of 8 pixels with the 32-bit pattern v; declares y and
   advances src by 8 rows */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2192
/**
 * 8x8 luma DC prediction without any neighbours: fills the block with 128.
 *
 * @param src          top-left pixel of the block; accessed as 32-bit words
 * @param has_topleft  not used
 * @param has_topright not used
 * @param stride       distance in bytes between two rows
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    const uint32_t mid_grey = 0x80808080;
    PREDICT_8x8_DC(mid_grey);
}
2197 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2198 {
2199     PREDICT_8x8_LOAD_LEFT;
2200     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2201     PREDICT_8x8_DC(dc);
2202 }
2203 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2204 {
2205     PREDICT_8x8_LOAD_TOP;
2206     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2207     PREDICT_8x8_DC(dc);
2208 }
2209 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2210 {
2211     PREDICT_8x8_LOAD_LEFT;
2212     PREDICT_8x8_LOAD_TOP;
2213     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2214                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2215     PREDICT_8x8_DC(dc);
2216 }
2217 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2218 {
2219     PREDICT_8x8_LOAD_LEFT;
2220 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2221                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2222     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2223 #undef ROW
2224 }
2225 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2226 {
2227     int y;
2228     PREDICT_8x8_LOAD_TOP;
2229     src[0] = t0;
2230     src[1] = t1;
2231     src[2] = t2;
2232     src[3] = t3;
2233     src[4] = t4;
2234     src[5] = t5;
2235     src[6] = t6;
2236     src[7] = t7;
2237     for( y = 1; y < 8; y++ )
2238         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2239 }
/**
 * 8x8 luma intra diagonal down-left prediction.
 * Uses t0..t15, introduced by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined outside this chunk).
 * Every statement assigns one value to all SRC(a,b) positions whose index sum
 * a+b is the same; the value is the rounded 3-tap filter
 * (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2 with k = a+b. The last position, (7,7),
 * uses (t14 + 3*t15 + 2) >> 2 because there is no t16.
 * SRC(a,b) is defined outside this chunk; presumably it addresses column a of
 * row b relative to src -- confirm against its definition.
 */
2240 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2241 {
2242     PREDICT_8x8_LOAD_TOP;
2243     PREDICT_8x8_LOAD_TOPRIGHT;
2244     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2245     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2246     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2247     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2248     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2249     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2250     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2251     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2252     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2253     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2254     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2255     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2256     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2257     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2258     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2259 }
/**
 * 8x8 luma intra diagonal down-right prediction.
 * Uses t0..t7, l0..l7 and lt, introduced by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this chunk).
 * Every statement assigns one rounded 3-tap value, (a + 2*b + c + 2) >> 2, to
 * all SRC(a,b) positions sharing the same index difference a-b. The statements
 * run from the l-based values (difference -7) through the one centred on lt
 * (difference 0) to the t-based values (difference +7).
 * SRC(a,b) is defined outside this chunk; presumably column a, row b -- confirm.
 */
2260 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2261 {
2262     PREDICT_8x8_LOAD_TOP;
2263     PREDICT_8x8_LOAD_LEFT;
2264     PREDICT_8x8_LOAD_TOPLEFT;
2265     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2266     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2267     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2268     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2269     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2270     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2271     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2272     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2273     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2274     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2275     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2276     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2277     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2278     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2279     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2280
2281 }
/**
 * 8x8 luma intra vertical-right prediction.
 * Uses t0..t7, l0..l6 and lt, introduced by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this chunk).
 * Each statement writes one value to a set of SRC(a,b) positions in which a
 * grows by 1 while b grows by 2. The values alternate between rounded 2-tap
 * averages, (a + b + 1) >> 1, and rounded 3-tap filters,
 * (a + 2*b + c + 2) >> 2.
 * SRC(a,b) is defined outside this chunk; presumably column a, row b -- confirm.
 */
2282 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2283 {
2284     PREDICT_8x8_LOAD_TOP;
2285     PREDICT_8x8_LOAD_LEFT;
2286     PREDICT_8x8_LOAD_TOPLEFT;
2287     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2288     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2289     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2290     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2291     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2292     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2293     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2294     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2295     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2296     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2297     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2298     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2299     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2300     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2301     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2302     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2303     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2304     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2305     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2306     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2307     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2308     SRC(7,0)= (t6 + t7 + 1) >> 1;
2309 }
/**
 * 8x8 luma intra horizontal-down prediction.
 * Uses t0..t6, l0..l7 and lt, introduced by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this chunk).
 * Each statement writes one value to a set of SRC(a,b) positions in which a
 * grows by 2 while b grows by 1. The l-based values alternate between rounded
 * 2-tap averages, (a + b + 1) >> 1, and rounded 3-tap filters,
 * (a + 2*b + c + 2) >> 2; the t-based values in the first row are all 3-tap.
 * SRC(a,b) is defined outside this chunk; presumably column a, row b -- confirm.
 */
2310 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2311 {
2312     PREDICT_8x8_LOAD_TOP;
2313     PREDICT_8x8_LOAD_LEFT;
2314     PREDICT_8x8_LOAD_TOPLEFT;
2315     SRC(0,7)= (l6 + l7 + 1) >> 1;
2316     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2317     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2318     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2319     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2320     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2321     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2322     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2323     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2324     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2325     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2326     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2327     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2328     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2329     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2330     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2331     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2332     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2333     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2334     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2335     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2336     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2337 }
/**
 * 8x8 luma intra vertical-left prediction.
 * Uses t0..t12, introduced by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined outside this chunk).
 * Each statement writes one value to a set of SRC(a,b) positions in which a
 * grows by 1 while b shrinks by 2. Positions with even b get a rounded 2-tap
 * average, (a + b + 1) >> 1; positions with odd b get a rounded 3-tap filter,
 * (a + 2*b + c + 2) >> 2.
 * SRC(a,b) is defined outside this chunk; presumably column a, row b -- confirm.
 */
2338 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2339 {
2340     PREDICT_8x8_LOAD_TOP;
2341     PREDICT_8x8_LOAD_TOPRIGHT;
2342     SRC(0,0)= (t0 + t1 + 1) >> 1;
2343     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2344     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2345     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2346     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2347     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2348     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2349     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2350     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2351     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2352     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2353     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2354     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2355     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2356     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2357     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2358     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2359     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2360     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2361     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2362     SRC(7,6)= (t10 + t11 + 1) >> 1;
2363     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2364 }
/**
 * 8x8 luma intra horizontal-up prediction.
 * Uses only l0..l7, introduced by PREDICT_8x8_LOAD_LEFT (macro defined outside
 * this chunk).
 * Each statement writes one value to a set of SRC(a,b) positions in which a
 * grows by 2 while b shrinks by 1. The values alternate between rounded 2-tap
 * averages, (a + b + 1) >> 1, and rounded 3-tap filters,
 * (a + 2*b + c + 2) >> 2, ending with (l6 + 3*l7 + 2) >> 2. The remaining
 * twenty positions are set to l7 unchanged.
 * SRC(a,b) is defined outside this chunk; presumably column a, row b -- confirm.
 */
2365 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2366 {
2367     PREDICT_8x8_LOAD_LEFT;
2368     SRC(0,0)= (l0 + l1 + 1) >> 1;
2369     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2370     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2371     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2372     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2373     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2374     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2375     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2376     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2377     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2378     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2379     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2380     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2381     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
     /* everything past the last filtered value is a plain copy of l7 */
2382     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2383     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2384     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2385     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2386 }
2387 #undef PREDICT_8x8_LOAD_LEFT
2388 #undef PREDICT_8x8_LOAD_TOP
2389 #undef PREDICT_8x8_LOAD_TOPLEFT
2390 #undef PREDICT_8x8_LOAD_TOPRIGHT
2391 #undef PREDICT_8x8_DC
2392 #undef PTR
2393 #undef PT
2394 #undef PL
2395 #undef SRC
2396
2397 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2398                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2399                            int src_x_offset, int src_y_offset,
2400                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2401     MpegEncContext * const s = &h->s;
2402     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2403     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2404     const int luma_xy= (mx&3) + ((my&3)<<2);
2405     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2406     uint8_t * src_cb, * src_cr;
2407     int extra_width= h->emu_edge_width;
2408     int extra_height= h->emu_edge_height;
2409     int emu=0;
2410     const int full_mx= mx>>2;
2411     const int full_my= my>>2;
2412     const int pic_width  = 16*s->mb_width;
2413     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2414
2415     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2416         return;
2417
2418     if(mx&7) extra_width -= 3;
2419     if(my&7) extra_height -= 3;
2420
2421     if(   full_mx < 0-extra_width
2422        || full_my < 0-extra_height
2423        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2424        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2425         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2426             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2427         emu=1;
2428     }
2429
2430     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2431     if(!square){
2432         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2433     }
2434
2435     if(s->flags&CODEC_FLAG_GRAY) return;
2436
2437     if(MB_MBAFF){
2438         // chroma offset when predicting from a field of opposite parity
2439         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2440         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2441     }
2442     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2443     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2444
2445     if(emu){
2446         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2447             src_cb= s->edge_emu_buffer;
2448     }
2449     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2450
2451     if(emu){
2452         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2453             src_cr= s->edge_emu_buffer;
2454     }
2455     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2456 }
2457
2458 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2459                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2460                            int x_offset, int y_offset,
2461                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2462                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2463                            int list0, int list1){
2464     MpegEncContext * const s = &h->s;
2465     qpel_mc_func *qpix_op=  qpix_put;
2466     h264_chroma_mc_func chroma_op= chroma_put;
2467
2468     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2469     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2470     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2471     x_offset += 8*s->mb_x;
2472     y_offset += 8*(s->mb_y >> MB_MBAFF);
2473
2474     if(list0){
2475         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2476         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2477                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2478                            qpix_op, chroma_op);
2479
2480         qpix_op=  qpix_avg;
2481         chroma_op= chroma_avg;
2482     }
2483
2484     if(list1){
2485         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2486         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2487                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2488                            qpix_op, chroma_op);
2489     }
2490 }
2491
2492 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2493                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2494                            int x_offset, int y_offset,
2495                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2496                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2497                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2498                            int list0, int list1){
2499     MpegEncContext * const s = &h->s;
2500
2501     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2502     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2503     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2504     x_offset += 8*s->mb_x;
2505     y_offset += 8*(s->mb_y >> MB_MBAFF);
2506
2507     if(list0 && list1){
2508         /* don't optimize for luma-only case, since B-frames usually
2509          * use implicit weights => chroma too. */
2510         uint8_t *tmp_cb = s->obmc_scratchpad;
2511         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2512         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2513         int refn0 = h->ref_cache[0][ scan8[n] ];
2514         int refn1 = h->ref_cache[1][ scan8[n] ];
2515
2516         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2517                     dest_y, dest_cb, dest_cr,
2518                     x_offset, y_offset, qpix_put, chroma_put);
2519         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2520                     tmp_y, tmp_cb, tmp_cr,
2521                     x_offset, y_offset, qpix_put, chroma_put);
2522
2523         if(h->use_weight == 2){
2524             int weight0 = h->implicit_weight[refn0][refn1];
2525             int weight1 = 64 - weight0;
2526             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2527             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2528             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2529         }else{
2530             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2531                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2532                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2533             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2534                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2535                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2536             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2537                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2538                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2539         }
2540     }else{
2541         int list = list1 ? 1 : 0;
2542         int refn = h->ref_cache[list][ scan8[n] ];
2543         Picture *ref= &h->ref_list[list][refn];
2544         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2545                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2546                     qpix_put, chroma_put);
2547
2548         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2549                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2550         if(h->use_weight_chroma){
2551             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2552                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2553             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2554                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2555         }
2556     }
2557 }
2558
2559 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2560                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2561                            int x_offset, int y_offset,
2562                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2563                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2564                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2565                            int list0, int list1){
2566     if((h->use_weight==2 && list0 && list1
2567         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2568        || h->use_weight==1)
2569         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2570                          x_offset, y_offset, qpix_put, chroma_put,
2571                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2572     else
2573         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2574                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2575 }
2576
2577 static inline void prefetch_motion(H264Context *h, int list){
2578     /* fetch pixels for estimated mv 4 macroblocks ahead
2579      * optimized for 64byte cache lines */
2580     MpegEncContext * const s = &h->s;
2581     const int refn = h->ref_cache[list][scan8[0]];
2582     if(refn >= 0){
2583         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2584         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2585         uint8_t **src= h->ref_list[list][refn].data;
2586         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2587         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2588         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2589         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2590     }
2591 }
2592
/**
 * Performs inter prediction for the current macroblock.
 *
 * The macroblock type is read from s->current_picture.mb_type at the current
 * mb_x/mb_y and must be an inter type (asserted). prefetch_motion() is called
 * for list 0 before and for list 1 after the prediction.
 *
 * Dispatch by partitioning:
 *  - 16x16: one mc_part() call
 *  - 16x8 / 8x16: two mc_part() calls, one per partition
 *  - otherwise (asserted 8x8): four sub-macroblocks, each handled according to
 *    h->sub_mb_type[i] as 8x8 (1 call), 8x4 (2 calls), 4x8 (2 calls) or
 *    4x4 (4 calls, asserted)
 *
 * @param dest_y,dest_cb,dest_cr destination planes for the macroblock
 * @param qpix_put,qpix_avg      luma function tables; the first index (0..2)
 *                               is chosen per partition size here
 * @param chroma_put,chroma_avg  chroma function tables, indexed 0..2 here
 * @param weight_op,weight_avg   weighting function tables; entries 0..6 are
 *                               selected here and passed on by address
 */
2593 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2594                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2595                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2596                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2597     MpegEncContext * const s = &h->s;
2598     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2599     const int mb_type= s->current_picture.mb_type[mb_xy];
2600
2601     assert(IS_INTER(mb_type));
2602
2603     prefetch_motion(h, 0);
2604
2605     if(IS_16X16(mb_type)){
2606         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2607                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2608                 &weight_op[0], &weight_avg[0],
2609                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2610     }else if(IS_16X8(mb_type)){
2611         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2612                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2613                 &weight_op[1], &weight_avg[1],
2614                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2615         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2616                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2617                 &weight_op[1], &weight_avg[1],
2618                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2619     }else if(IS_8X16(mb_type)){
2620         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2621                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2622                 &weight_op[2], &weight_avg[2],
2623                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2624         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2625                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2626                 &weight_op[2], &weight_avg[2],
2627                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2628     }else{
2629         int i;
2630
2631         assert(IS_8X8(mb_type));
2632
         /* one iteration per 8x8 sub-macroblock */
2633         for(i=0; i<4; i++){
2634             const int sub_mb_type= h->sub_mb_type[i];
2635             const int n= 4*i;
2636             int x_offset= (i&1)<<2;
2637             int y_offset= (i&2)<<1;
2638
2639             if(IS_SUB_8X8(sub_mb_type)){
2640                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2641                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2642                     &weight_op[3], &weight_avg[3],
2643                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2644             }else if(IS_SUB_8X4(sub_mb_type)){
2645                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2646                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2647                     &weight_op[4], &weight_avg[4],
2648                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2649                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2650                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2651                     &weight_op[4], &weight_avg[4],
2652                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2653             }else if(IS_SUB_4X8(sub_mb_type)){
2654                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2655                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2656                     &weight_op[5], &weight_avg[5],
2657                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2658                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2659                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2660                     &weight_op[5], &weight_avg[5],
2661                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2662             }else{
2663                 int j;
2664                 assert(IS_SUB_4X4(sub_mb_type));
2665                 for(j=0; j<4; j++){
2666                     int sub_x_offset= x_offset + 2*(j&1);
2667                     int sub_y_offset= y_offset +   (j&2);
2668                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2669                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2670                         &weight_op[6], &weight_avg[6],
2671                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2672                 }
2673             }
2674         }
2675     }
2676
2677     prefetch_motion(h, 1);
2678 }
2679
2680 static void decode_init_vlc(void){
2681     static int done = 0;
2682
2683     if (!done) {
2684         int i;
2685         done = 1;
2686
2687         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2688                  &chroma_dc_coeff_token_len [0], 1, 1,
2689                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2690
2691         for(i=0; i<4; i++){
2692             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2693                      &coeff_token_len [i][0], 1, 1,
2694                      &coeff_token_bits[i][0], 1, 1, 1);
2695         }
2696
2697         for(i=0; i<3; i++){
2698             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2699                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2700                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2701         }
2702         for(i=0; i<15; i++){
2703             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2704                      &total_zeros_len [i][0], 1, 1,
2705                      &total_zeros_bits[i][0], 1, 1, 1);
2706         }
2707
2708         for(i=0; i<6; i++){
2709             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2710                      &run_len [i][0], 1, 1,
2711                      &run_bits[i][0], 1, 1, 1);
2712         }
2713         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2714                  &run_len [6][0], 1, 1,
2715                  &run_bits[6][0], 1, 1, 1);
2716     }
2717 }
2718
2719 /**
2720  * Sets the intra prediction function pointers.
2721  */
2722 static void init_pred_ptrs(H264Context *h){
2723 //    MpegEncContext * const s = &h->s;
2724
2725     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2726     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2727     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2728     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2729     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2730     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2731     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2732     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2733     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2734     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2735     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2736     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2737
2738     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2739     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2740     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2741     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2742     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2743     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2744     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2745     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2746     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2747     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2748     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2749     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2750
2751     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
2752     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
2753     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
2754     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
2755     h->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
2756     h->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
2757     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
2758
2759     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
2760     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
2761     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
2762     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
2763     h->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
2764     h->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
2765     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
2766 }
2767
2768 static void free_tables(H264Context *h){
2769     int i;
2770     av_freep(&h->intra4x4_pred_mode);
2771     av_freep(&h->chroma_pred_mode_table);
2772     av_freep(&h->cbp_table);
2773     av_freep(&h->mvd_table[0]);
2774     av_freep(&h->mvd_table[1]);
2775     av_freep(&h->direct_table);
2776     av_freep(&h->non_zero_count);
2777     av_freep(&h->slice_table_base);
2778     av_freep(&h->top_borders[1]);
2779     av_freep(&h->top_borders[0]);
2780     h->slice_table= NULL;
2781
2782     av_freep(&h->mb2b_xy);
2783     av_freep(&h->mb2b8_xy);
2784
2785     av_freep(&h->s.obmc_scratchpad);
2786
2787     for(i = 0; i < MAX_SPS_COUNT; i++)
2788         av_freep(h->sps_buffers + i);
2789
2790     for(i = 0; i < MAX_PPS_COUNT; i++)
2791         av_freep(h->pps_buffers + i);
2792 }
2793
2794 static void init_dequant8_coeff_table(H264Context *h){
2795     int i,q,x;
2796     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2797     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2798     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2799
2800     for(i=0; i<2; i++ ){
2801         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2802             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2803             break;
2804         }
2805
2806         for(q=0; q<52; q++){
2807             int shift = ff_div6[q];
2808             int idx = ff_rem6[q];
2809             for(x=0; x<64; x++)
2810                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2811                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2812                     h->pps.scaling_matrix8[i][x]) << shift;
2813         }
2814     }
2815 }
2816
2817 static void init_dequant4_coeff_table(H264Context *h){
2818     int i,j,q,x;
2819     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2820     for(i=0; i<6; i++ ){
2821         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2822         for(j=0; j<i; j++){
2823             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2824                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2825                 break;
2826             }
2827         }
2828         if(j<i)
2829             continue;
2830
2831         for(q=0; q<52; q++){
2832             int shift = ff_div6[q] + 2;
2833             int idx = ff_rem6[q];
2834             for(x=0; x<16; x++)
2835                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2836                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2837                     h->pps.scaling_matrix4[i][x]) << shift;
2838         }
2839     }
2840 }
2841
2842 static void init_dequant_tables(H264Context *h){
2843     int i,x;
2844     init_dequant4_coeff_table(h);
2845     if(h->pps.transform_8x8_mode)
2846         init_dequant8_coeff_table(h);
2847     if(h->sps.transform_bypass){
2848         for(i=0; i<6; i++)
2849             for(x=0; x<16; x++)
2850                 h->dequant4_coeff[i][0][x] = 1<<6;
2851         if(h->pps.transform_8x8_mode)
2852             for(i=0; i<2; i++)
2853                 for(x=0; x<64; x++)
2854                     h->dequant8_coeff[i][0][x] = 1<<6;
2855     }
2856 }
2857
2858
2859 /**
2860  * allocates tables.
2861  * needs width/height
2862  */
2863 static int alloc_tables(H264Context *h){
2864     MpegEncContext * const s = &h->s;
2865     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2866     int x,y;
2867
2868     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2869
2870     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2871     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2872     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2873     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2874     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2875
2876     if( h->pps.cabac ) {
2877         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2878         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2879         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2880         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2881     }
2882
2883     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2884     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2885
2886     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2887     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2888     for(y=0; y<s->mb_height; y++){
2889         for(x=0; x<s->mb_width; x++){
2890             const int mb_xy= x + y*s->mb_stride;
2891             const int b_xy = 4*x + 4*y*h->b_stride;
2892             const int b8_xy= 2*x + 2*y*h->b8_stride;
2893
2894             h->mb2b_xy [mb_xy]= b_xy;
2895             h->mb2b8_xy[mb_xy]= b8_xy;
2896         }
2897     }
2898
2899     s->obmc_scratchpad = NULL;
2900
2901     if(!h->dequant4_coeff[0])
2902         init_dequant_tables(h);
2903
2904     return 0;
2905 fail:
2906     free_tables(h);
2907     return -1;
2908 }
2909
2910 static void common_init(H264Context *h){
2911     MpegEncContext * const s = &h->s;
2912
2913     s->width = s->avctx->width;
2914     s->height = s->avctx->height;
2915     s->codec_id= s->avctx->codec->id;
2916
2917     init_pred_ptrs(h);
2918
2919     h->dequant_coeff_pps= -1;
2920     s->unrestricted_mv=1;
2921     s->decode=1; //FIXME
2922
2923     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2924     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2925 }
2926
2927 static int decode_init(AVCodecContext *avctx){
2928     H264Context *h= avctx->priv_data;
2929     MpegEncContext * const s = &h->s;
2930
2931     MPV_decode_defaults(s);
2932
2933     s->avctx = avctx;
2934     common_init(h);
2935
2936     s->out_format = FMT_H264;
2937     s->workaround_bugs= avctx->workaround_bugs;
2938
2939     // set defaults
2940 //    s->decode_mb= ff_h263_decode_mb;
2941     s->quarter_sample = 1;
2942     s->low_delay= 1;
2943     avctx->pix_fmt= PIX_FMT_YUV420P;
2944
2945     decode_init_vlc();
2946
2947     if(avctx->extradata_size > 0 && avctx->extradata &&
2948        *(char *)avctx->extradata == 1){
2949         h->is_avc = 1;
2950         h->got_avcC = 0;
2951     } else {
2952         h->is_avc = 0;
2953     }
2954
2955     return 0;
2956 }
2957
2958 static int frame_start(H264Context *h){
2959     MpegEncContext * const s = &h->s;
2960     int i;
2961
2962     if(MPV_frame_start(s, s->avctx) < 0)
2963         return -1;
2964     ff_er_frame_start(s);
2965
2966     assert(s->linesize && s->uvlinesize);
2967
2968     for(i=0; i<16; i++){
2969         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2970         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2971     }
2972     for(i=0; i<4; i++){
2973         h->block_offset[16+i]=
2974         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2975         h->block_offset[24+16+i]=
2976         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2977     }
2978
2979     /* can't be in alloc_tables because linesize isn't known there.
2980      * FIXME: redo bipred weight to not require extra buffer? */
2981     if(!s->obmc_scratchpad)
2982         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2983
2984     /* some macroblocks will be accessed before they're available */
2985     if(FRAME_MBAFF)
2986         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2987
2988 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2989     return 0;
2990 }
2991
2992 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2993     MpegEncContext * const s = &h->s;
2994     int i;
2995
2996     src_y  -=   linesize;
2997     src_cb -= uvlinesize;
2998     src_cr -= uvlinesize;
2999
3000     // There are two lines saved, the line above the the top macroblock of a pair,
3001     // and the line above the bottom macroblock
3002     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3003     for(i=1; i<17; i++){
3004         h->left_border[i]= src_y[15+i*  linesize];
3005     }
3006
3007     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3008     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3009
3010     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3011         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3012         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3013         for(i=1; i<9; i++){
3014             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3015             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3016         }
3017         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3018         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3019     }
3020 }
3021
3022 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
3023     MpegEncContext * const s = &h->s;
3024     int temp8, i;
3025     uint64_t temp64;
3026     int deblock_left;
3027     int deblock_top;
3028     int mb_xy;
3029
3030     if(h->deblocking_filter == 2) {
3031         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
3032         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
3033         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
3034     } else {
3035         deblock_left = (s->mb_x > 0);
3036         deblock_top =  (s->mb_y > 0);
3037     }
3038
3039     src_y  -=   linesize + 1;
3040     src_cb -= uvlinesize + 1;
3041     src_cr -= uvlinesize + 1;
3042
3043 #define XCHG(a,b,t,xchg)\
3044 t= a;\
3045 if(xchg)\
3046     a= b;\
3047 b= t;
3048
3049     if(deblock_left){
3050         for(i = !deblock_top; i<17; i++){
3051             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3052         }
3053     }
3054
3055     if(deblock_top){
3056         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3057         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3058         if(s->mb_x+1 < s->mb_width){
3059             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3060         }
3061     }
3062
3063     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3064         if(deblock_left){
3065             for(i = !deblock_top; i<9; i++){
3066                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3067                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3068             }
3069         }
3070         if(deblock_top){
3071             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3072             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3073         }
3074     }
3075 }
3076
3077 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3078     MpegEncContext * const s = &h->s;
3079     int i;
3080
3081     src_y  -= 2 *   linesize;
3082     src_cb -= 2 * uvlinesize;
3083     src_cr -= 2 * uvlinesize;
3084
3085     // There are two lines saved, the line above the the top macroblock of a pair,
3086     // and the line above the bottom macroblock
3087     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3088     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3089     for(i=2; i<34; i++){
3090         h->left_border[i]= src_y[15+i*  linesize];
3091     }
3092
3093     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3094     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3095     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3096     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3097
3098     if(!(s->flags&CODEC_FLAG_GRAY)){
3099         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3100         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3101         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3102         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3103         for(i=2; i<18; i++){
3104             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3105             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3106         }
3107         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3108         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3109         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3110         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3111     }
3112 }
3113
3114 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3115     MpegEncContext * const s = &h->s;
3116     int temp8, i;
3117     uint64_t temp64;
3118     int deblock_left = (s->mb_x > 0);
3119     int deblock_top  = (s->mb_y > 1);
3120
3121     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3122
3123     src_y  -= 2 *   linesize + 1;
3124     src_cb -= 2 * uvlinesize + 1;
3125     src_cr -= 2 * uvlinesize + 1;
3126
3127 #define XCHG(a,b,t,xchg)\
3128 t= a;\
3129 if(xchg)\
3130     a= b;\
3131 b= t;
3132
3133     if(deblock_left){
3134         for(i = (!deblock_top)<<1; i<34; i++){
3135             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3136         }
3137     }
3138
3139     if(deblock_top){
3140         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3141         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3142         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3143         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3144         if(s->mb_x+1 < s->mb_width){
3145             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3146             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3147         }
3148     }
3149
3150     if(!(s->flags&CODEC_FLAG_GRAY)){
3151         if(deblock_left){
3152             for(i = (!deblock_top) << 1; i<18; i++){
3153                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3154                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3155             }
3156         }
3157         if(deblock_top){
3158             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3159             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3160             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3161             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3162         }
3163     }
3164 }
3165
/**
 * Reconstructs the macroblock at (s->mb_x, s->mb_y) into s->current_picture:
 * intra prediction or motion compensation (hl_motion), then the residual is
 * added through the selected idct / pixel-add routines, and finally, if
 * h->deblocking_filter is set, the borders are backed up and the loop filter
 * is run.
 *
 * @param h      decoder context
 * @param simple when nonzero, every branch guarded by !simple is bypassed
 *               (field / MBAFF handling, intra PCM, the CODEC_FLAG_GRAY and
 *               s->encoding checks, and the svq3 paths, since is_h264 is then
 *               forced to 1). Both callers in this file pass a constant.
 */
3166 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
3167     MpegEncContext * const s = &h->s;
3168     const int mb_x= s->mb_x;
3169     const int mb_y= s->mb_y;
3170     const int mb_xy= mb_x + mb_y*s->mb_stride;
3171     const int mb_type= s->current_picture.mb_type[mb_xy];
3172     uint8_t  *dest_y, *dest_cb, *dest_cr;
3173     int linesize, uvlinesize /*dct_offset*/;
3174     int i;
3175     int *block_offset = &h->block_offset[0];
          // 1 for a macroblock in an odd row
3176     const unsigned int bottom = mb_y & 1;
3177     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
3178     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3179     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3180
          /* first sample of this macroblock in each plane: 16x16 luma, 8x8 per chroma plane */
3181     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3182     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3183     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3184
3185     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3186     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3187
          /* field macroblock: use doubled line strides and the block offsets that
           * frame_start() computed for doubled strides (block_offset[24..]) */
3188     if (!simple && MB_FIELD) {
3189         linesize   = h->mb_linesize   = s->linesize * 2;
3190         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3191         block_offset = &h->block_offset[24];
              // odd row: step back 15 luma / 7 chroma lines, i.e. to the second
              // line of the macroblock above
3192         if(mb_y&1){ //FIXME move out of this func?
3193             dest_y -= s->linesize*15;
3194             dest_cb-= s->uvlinesize*7;
3195             dest_cr-= s->uvlinesize*7;
3196         }
3197         if(FRAME_MBAFF) {
3198             int list;
                  // rewrite the cached reference indices of every used list;
                  // note that + binds tighter than ^, so the stored value is
                  // (16+ref) ^ (mb_y&1)
3199             for(list=0; list<h->list_count; list++){
3200                 if(!USES_LIST(mb_type, list))
3201                     continue;
3202                 if(IS_16X16(mb_type)){
3203                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3204                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3205                 }else{
3206                     for(i=0; i<16; i+=4){
3207                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3208                         int ref = h->ref_cache[list][scan8[i]];
3209                         if(ref >= 0)
3210                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3211                     }
3212                 }
3213             }
3214         }
3215     } else {
3216         linesize   = h->mb_linesize   = s->linesize;
3217         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3218 //        dct_offset = s->linesize * 16;
3219     }
3220
          /* luma residual routines: plain pixel add for transform bypass,
           * otherwise the 8x8 or the 4x4 idct depending on the macroblock */
3221     if(transform_bypass){
3222         idct_dc_add =
3223         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3224     }else if(IS_8x8DCT(mb_type)){
3225         idct_dc_add = s->dsp.h264_idct8_dc_add;
3226         idct_add = s->dsp.h264_idct8_add;
3227     }else{
3228         idct_dc_add = s->dsp.h264_idct_dc_add;
3229         idct_add = s->dsp.h264_idct_add;
3230     }
3231
          /* MBAFF with deblocking: for an intra macroblock that is either in an
           * even row, or in an odd row below a non-intra macroblock, exchange the
           * pair borders first (mbt_y is mb_y rounded down to even) */
3232     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3233        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3234         int mbt_y = mb_y&~1;
3235         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3236         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3237         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3238         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3239     }
3240
3241     if (!simple && IS_INTRA_PCM(mb_type)) {
3242         unsigned int x, y;
3243
3244         // The pixels are stored in h->mb array in the same order as levels,
3245         // copy them in output in the correct order.
              // blocks 0..15 are luma, 16..19 cb, 20..23 cr; 16 samples per block
3246         for(i=0; i<16; i++) {
3247             for (y=0; y<4; y++) {
3248                 for (x=0; x<4; x++) {
3249                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3250                 }
3251             }
3252         }
3253         for(i=16; i<16+4; i++) {
3254             for (y=0; y<4; y++) {
3255                 for (x=0; x<4; x++) {
3256                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3257                 }
3258             }
3259         }
3260         for(i=20; i<20+4; i++) {
3261             for (y=0; y<4; y++) {
3262                 for (x=0; x<4; x++) {
3263                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3264                 }
3265             }
3266         }
3267     } else {
3268         if(IS_INTRA(mb_type)){
                  // non-MBAFF with deblocking: put the saved border samples into
                  // the picture before predicting; the matching call with xchg=0
                  // follows after the prediction below
3269             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3270                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
3271
3272             if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3273                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3274                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3275             }
3276
3277             if(IS_INTRA4x4(mb_type)){
3278                 if(simple || !s->encoding){
3279                     if(IS_8x8DCT(mb_type)){
                              // four 8x8 blocks (i = 0, 4, 8, 12): predict, then add
                              // the residual right away
3280                         for(i=0; i<16; i+=4){
3281                             uint8_t * const ptr= dest_y + block_offset[i];
3282                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3283                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3284                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3285                                                    (h->topright_samples_available<<i)&0x4000, linesize);
3286                             if(nnz){
3287                                 if(nnz == 1 && h->mb[i*16])
3288                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3289                                 else
3290                                     idct_add(ptr, h->mb + i*16, linesize);
3291                             }
3292                         }
3293                     }else
                          // sixteen 4x4 blocks: predict, then add the residual
3294                     for(i=0; i<16; i++){
3295                         uint8_t * const ptr= dest_y + block_offset[i];
3296                         uint8_t *topright;
3297                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3298                         int nnz, tr;
3299
                              // only these two modes get a top-right pointer; when
                              // the top-right samples are unavailable, the sample
                              // at ptr[3 - linesize] is replicated into 4 bytes
3300                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3301                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3302                             assert(mb_y || linesize <= block_offset[i]);
3303                             if(!topright_avail){
3304                                 tr= ptr[3 - linesize]*0x01010101;
3305                                 topright= (uint8_t*) &tr;
3306                             }else
3307                                 topright= ptr + 4 - linesize;
3308                         }else
3309                             topright= NULL;
3310
3311                         h->pred4x4[ dir ](ptr, topright, linesize);
3312                         nnz = h->non_zero_count_cache[ scan8[i] ];
3313                         if(nnz){
3314                             if(is_h264){
3315                                 if(nnz == 1 && h->mb[i*16])
3316                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3317                                 else
3318                                     idct_add(ptr, h->mb + i*16, linesize);
3319                             }else
3320                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3321                         }
3322                     }
3323                 }
3324             }else{
                      // not intra4x4: predict the whole 16x16 block and transform
                      // the luma DC values; the per-block residual is added in
                      // the !IS_INTRA4x4 section further down
3325                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3326                 if(is_h264){
3327                     if(!transform_bypass)
3328                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3329                 }else
3330                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3331             }
3332             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3333                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
3334         }else if(is_h264){
                  // inter macroblock: motion compensation
3335             hl_motion(h, dest_y, dest_cb, dest_cr,
3336                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3337                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3338                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3339         }
3340
3341
              /* luma residual for everything except intra4x4, whose residual was
               * already added block by block above */
3342         if(!IS_INTRA4x4(mb_type)){
3343             if(is_h264){
3344                 if(IS_INTRA16x16(mb_type)){
3345                     for(i=0; i<16; i++){
3346                         if(h->non_zero_count_cache[ scan8[i] ])
3347                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3348                         else if(h->mb[i*16])
3349                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3350                     }
3351                 }else{
                          // step 4 visits one block per 8x8 transform, step 1 every 4x4 block
3352                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3353                     for(i=0; i<16; i+=di){
3354                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3355                         if(nnz){
3356                             if(nnz==1 && h->mb[i*16])
3357                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3358                             else
3359                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3360                         }
3361                     }
3362                 }
3363             }else{
3364                 for(i=0; i<16; i++){
3365                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3366                         uint8_t * const ptr= dest_y + block_offset[i];
3367                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3368                     }
3369                 }
3370             }
3371         }
3372
              /* chroma residual: blocks 16..19 go to cb, 20..23 to cr, which is
               * what dest[(i&4)>>2] selects */
3373         if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3374             uint8_t *dest[2] = {dest_cb, dest_cr};
3375             if(transform_bypass){
3376                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3377             }else{
3378                 idct_add = s->dsp.h264_idct_add;
3379                 idct_dc_add = s->dsp.h264_idct_dc_add;
3380                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
3381                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
3382             }
3383             if(is_h264){
3384                 for(i=16; i<16+8; i++){
3385                     if(h->non_zero_count_cache[ scan8[i] ])
3386                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3387                     else if(h->mb[i*16])
3388                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3389                 }
3390             }else{
3391                 for(i=16; i<16+8; i++){
3392                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3393                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3394                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3395                     }
3396                 }
3397             }
3398         }
3399     }
3400     if(h->deblocking_filter) {
3401         if (!simple && FRAME_MBAFF) {
3402             //FIXME try deblocking one mb at a time?
3403             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                  // these shadow the outer mb_y / mb_xy: past the !bottom return
                  // below they refer to the row above, i.e. the top MB of the pair
3404             const int mb_y = s->mb_y - 1;
3405             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3406             const int mb_xy= mb_x + mb_y*s->mb_stride;
3407             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3408             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                  // a pair is filtered as a whole, once its odd-row macroblock is done
3409             if (!bottom) return;
3410             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3411             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3412             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3413
3414             if(IS_INTRA(mb_type_top | mb_type_bottom))
3415                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3416
3417             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3418             // deblock a pair
3419             // top
                  // s->mb_y is temporarily moved to the top macroblock while its
                  // caches and chroma qp values are set up, then restored
3420             s->mb_y--;
3421             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3422             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3423             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
3424             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
3425             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3426             // bottom
3427             s->mb_y++;
3428             tprintf(h->s.avctx, "call mbaff filter_mb\n");
3429             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3430             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3431             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3432             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3433         } else {
                  // non-MBAFF: save this macroblock's unfiltered border samples,
                  // then filter it
3434             tprintf(h->s.avctx, "call filter_mb\n");
3435             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
3436             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3437             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3438         }
3439     }
3440 }
3441
3442 /**
3443  * Process a macroblock; this case avoids checks for expensive uncommon cases.
3444  */
3445 static void hl_decode_mb_simple(H264Context *h){
3446     hl_decode_mb_internal(h, 1);
3447 }
3448
3449 /**
3450  * Process a macroblock; this handles edge cases, such as interlacing.
3451  */
3452 static void av_noinline hl_decode_mb_complex(H264Context *h){
3453     hl_decode_mb_internal(h, 0);
3454 }
3455
3456 static void hl_decode_mb(H264Context *h){
3457     MpegEncContext * const s = &h->s;
3458     const int mb_x= s->mb_x;
3459     const int mb_y= s->mb_y;
3460     const int mb_xy= mb_x + mb_y*s->mb_stride;
3461     const int mb_type= s->current_picture.mb_type[mb_xy];
3462     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding;
3463
3464     if(!s->decode)
3465         return;
3466
3467     if (is_complex)
3468         hl_decode_mb_complex(h);
3469     else hl_decode_mb_simple(h);
3470 }
3471
3472 /**
3473  * fills the default_ref_list.
3474  */
3475 static int fill_default_ref_list(H264Context *h){
3476     MpegEncContext * const s = &h->s;
3477     int i;
3478     int smallest_poc_greater_than_current = -1;
3479     Picture sorted_short_ref[32];
3480
3481     if(h->slice_type==B_TYPE){
3482         int out_i;
3483         int limit= INT_MIN;
3484
3485         /* sort frame according to poc in B slice */
3486         for(out_i=0; out_i<h->short_ref_count; out_i++){
3487             int best_i=INT_MIN;
3488             int best_poc=INT_MAX;
3489
3490             for(i=0; i<h->short_ref_count; i++){
3491                 const int poc= h->short_ref[i]->poc;
3492                 if(poc > limit && poc < best_poc){
3493                     best_poc= poc;
3494                     best_i= i;
3495                 }
3496             }
3497
3498             assert(best_i != INT_MIN);
3499
3500             limit= best_poc;
3501             sorted_short_ref[out_i]= *h->short_ref[best_i];
3502             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3503             if (-1 == smallest_poc_greater_than_current) {
3504                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3505                     smallest_poc_greater_than_current = out_i;
3506                 }
3507             }
3508         }
3509     }
3510
3511     if(s->picture_structure == PICT_FRAME){
3512         if(h->slice_type==B_TYPE){
3513             int list;
3514             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3515
3516             // find the largest poc
3517             for(list=0; list<2; list++){
3518                 int index = 0;
3519                 int j= -99;
3520                 int step= list ? -1 : 1;
3521
3522                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3523                     while(j<0 || j>= h->short_ref_count){
3524                         if(j != -99 && step == (list ? -1 : 1))
3525                             return -1;
3526                         step = -step;
3527                         j= smallest_poc_greater_than_current + (step>>1);
3528                     }
3529                     if(sorted_short_ref[j].reference != 3) continue;
3530                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3531                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3532                 }
3533
3534                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3535                     if(h->long_ref[i] == NULL) continue;
3536                     if(h->long_ref[i]->reference != 3) continue;
3537
3538                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3539                     h->default_ref_list[ list ][index++].pic_id= i;;
3540                 }
3541
3542                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3543                     // swap the two first elements of L1 when
3544                     // L0 and L1 are identical
3545                     Picture temp= h->default_ref_list[1][0];
3546                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3547                     h->default_ref_list[1][1] = temp;
3548                 }
3549
3550                 if(index < h->ref_count[ list ])
3551                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3552             }
3553         }else{
3554             int index=0;
3555             for(i=0; i<h->short_ref_count; i++){
3556                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3557                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3558                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3559             }
3560             for(i = 0; i < 16; i++){
3561                 if(h->long_ref[i] == NULL) continue;
3562                 if(h->long_ref[i]->reference != 3) continue;
3563                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3564                 h->default_ref_list[0][index++].pic_id= i;;
3565             }
3566             if(index < h->ref_count[0])
3567                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3568         }
3569     }else{ //FIELD
3570         if(h->slice_type==B_TYPE){
3571         }else{
3572             //FIXME second field balh
3573         }
3574     }
3575 #ifdef TRACE
3576     for (i=0; i<h->ref_count[0]; i++) {
3577         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3578     }
3579     if(h->slice_type==B_TYPE){
3580         for (i=0; i<h->ref_count[1]; i++) {
3581             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3582         }
3583     }
3584 #endif
3585     return 0;
3586 }
3587
3588 static void print_short_term(H264Context *h);
3589 static void print_long_term(H264Context *h);
3590
3591 static int decode_ref_pic_list_reordering(H264Context *h){
3592     MpegEncContext * const s = &h->s;
3593     int list, index;
3594
3595     print_short_term(h);
3596     print_long_term(h);
3597     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3598
3599     for(list=0; list<h->list_count; list++){
3600         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3601
3602         if(get_bits1(&s->gb)){
3603             int pred= h->curr_pic_num;
3604
3605             for(index=0; ; index++){
3606                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3607                 unsigned int pic_id;
3608                 int i;
3609                 Picture *ref = NULL;
3610
3611                 if(reordering_of_pic_nums_idc==3)
3612                     break;
3613
3614                 if(index >= h->ref_count[list]){
3615                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3616                     return -1;
3617                 }
3618
3619                 if(reordering_of_pic_nums_idc<3){
3620                     if(reordering_of_pic_nums_idc<2){
3621                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3622
3623                         if(abs_diff_pic_num >= h->max_pic_num){
3624                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3625                             return -1;
3626                         }
3627
3628                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3629                         else                                pred+= abs_diff_pic_num;
3630                         pred &= h->max_pic_num - 1;
3631
3632                         for(i= h->short_ref_count-1; i>=0; i--){
3633                             ref = h->short_ref[i];
3634                             assert(ref->reference == 3);
3635                             assert(!ref->long_ref);
3636                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3637                                 break;
3638                         }
3639                         if(i>=0)
3640                             ref->pic_id= ref->frame_num;
3641                     }else{
3642                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3643                         if(pic_id>31){
3644                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3645                             return -1;
3646                         }
3647                         ref = h->long_ref[pic_id];
3648                         if(ref){
3649                             ref->pic_id= pic_id;
3650                             assert(ref->reference == 3);
3651                             assert(ref->long_ref);
3652                             i=0;
3653                         }else{
3654                             i=-1;
3655                         }
3656                     }
3657
3658                     if (i < 0) {
3659                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3660                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3661                     } else {
3662                         for(i=index; i+1<h->ref_count[list]; i++){
3663                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3664                                 break;
3665                         }
3666                         for(; i > index; i--){
3667                             h->ref_list[list][i]= h->ref_list[list][i-1];
3668                         }
3669                         h->ref_list[list][index]= *ref;
3670                     }
3671                 }else{
3672                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3673                     return -1;
3674                 }
3675             }
3676         }
3677     }
3678     for(list=0; list<h->list_count; list++){
3679         for(index= 0; index < h->ref_count[list]; index++){
3680             if(!h->ref_list[list][index].data[0])
3681                 h->ref_list[list][index]= s->current_picture;
3682         }
3683     }
3684
3685     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3686         direct_dist_scale_factor(h);
3687     direct_ref_list_init(h);
3688     return 0;
3689 }
3690
3691 static void fill_mbaff_ref_list(H264Context *h){
3692     int list, i, j;
3693     for(list=0; list<2; list++){ //FIXME try list_count
3694         for(i=0; i<h->ref_count[list]; i++){
3695             Picture *frame = &h->ref_list[list][i];
3696             Picture *field = &h->ref_list[list][16+2*i];
3697             field[0] = *frame;
3698             for(j=0; j<3; j++)
3699                 field[0].linesize[j] <<= 1;
3700             field[1] = field[0];
3701             for(j=0; j<3; j++)
3702                 field[1].data[j] += frame->linesize[j];
3703
3704             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3705             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3706             for(j=0; j<2; j++){
3707                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3708                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3709             }
3710         }
3711     }
3712     for(j=0; j<h->ref_count[1]; j++){
3713         for(i=0; i<h->ref_count[0]; i++)
3714             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3715         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3716         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3717     }
3718 }
3719
3720 static int pred_weight_table(H264Context *h){
3721     MpegEncContext * const s = &h->s;
3722     int list, i;
3723     int luma_def, chroma_def;
3724
3725     h->use_weight= 0;
3726     h->use_weight_chroma= 0;
3727     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3728     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3729     luma_def = 1<<h->luma_log2_weight_denom;
3730     chroma_def = 1<<h->chroma_log2_weight_denom;
3731
3732     for(list=0; list<2; list++){
3733         for(i=0; i<h->ref_count[list]; i++){
3734             int luma_weight_flag, chroma_weight_flag;
3735
3736             luma_weight_flag= get_bits1(&s->gb);
3737             if(luma_weight_flag){
3738                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3739                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3740                 if(   h->luma_weight[list][i] != luma_def
3741                    || h->luma_offset[list][i] != 0)
3742                     h->use_weight= 1;
3743             }else{
3744                 h->luma_weight[list][i]= luma_def;
3745                 h->luma_offset[list][i]= 0;
3746             }
3747
3748             chroma_weight_flag= get_bits1(&s->gb);
3749             if(chroma_weight_flag){
3750                 int j;
3751                 for(j=0; j<2; j++){
3752                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3753                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3754                     if(   h->chroma_weight[list][i][j] != chroma_def
3755                        || h->chroma_offset[list][i][j] != 0)
3756                         h->use_weight_chroma= 1;
3757                 }
3758             }else{
3759                 int j;
3760                 for(j=0; j<2; j++){
3761                     h->chroma_weight[list][i][j]= chroma_def;
3762                     h->chroma_offset[list][i][j]= 0;
3763                 }
3764             }
3765         }
3766         if(h->slice_type != B_TYPE) break;
3767     }
3768     h->use_weight= h->use_weight || h->use_weight_chroma;
3769     return 0;
3770 }
3771
3772 static void implicit_weight_table(H264Context *h){
3773     MpegEncContext * const s = &h->s;
3774     int ref0, ref1;
3775     int cur_poc = s->current_picture_ptr->poc;
3776
3777     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3778        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3779         h->use_weight= 0;
3780         h->use_weight_chroma= 0;
3781         return;
3782     }
3783
3784     h->use_weight= 2;
3785     h->use_weight_chroma= 2;
3786     h->luma_log2_weight_denom= 5;
3787     h->chroma_log2_weight_denom= 5;
3788
3789     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3790         int poc0 = h->ref_list[0][ref0].poc;
3791         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3792             int poc1 = h->ref_list[1][ref1].poc;
3793             int td = av_clip(poc1 - poc0, -128, 127);
3794             if(td){
3795                 int tb = av_clip(cur_poc - poc0, -128, 127);
3796                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3797                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3798                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3799                     h->implicit_weight[ref0][ref1] = 32;
3800                 else
3801                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3802             }else
3803                 h->implicit_weight[ref0][ref1] = 32;
3804         }
3805     }
3806 }
3807
3808 static inline void unreference_pic(H264Context *h, Picture *pic){
3809     int i;
3810     pic->reference=0;
3811     if(pic == h->delayed_output_pic)
3812         pic->reference=1;
3813     else{
3814         for(i = 0; h->delayed_pic[i]; i++)
3815             if(pic == h->delayed_pic[i]){
3816                 pic->reference=1;
3817                 break;
3818             }
3819     }
3820 }
3821
3822 /**
3823  * instantaneous decoder refresh.
3824  */
3825 static void idr(H264Context *h){
3826     int i;
3827
3828     for(i=0; i<16; i++){
3829         if (h->long_ref[i] != NULL) {
3830             unreference_pic(h, h->long_ref[i]);
3831             h->long_ref[i]= NULL;
3832         }
3833     }
3834     h->long_ref_count=0;
3835
3836     for(i=0; i<h->short_ref_count; i++){
3837         unreference_pic(h, h->short_ref[i]);
3838         h->short_ref[i]= NULL;
3839     }
3840     h->short_ref_count=0;
3841 }
3842
3843 /* forget old pics after a seek */
3844 static void flush_dpb(AVCodecContext *avctx){
3845     H264Context *h= avctx->priv_data;
3846     int i;
3847     for(i=0; i<16; i++) {
3848         if(h->delayed_pic[i])
3849             h->delayed_pic[i]->reference= 0;
3850         h->delayed_pic[i]= NULL;
3851     }
3852     if(h->delayed_output_pic)
3853         h->delayed_output_pic->reference= 0;
3854     h->delayed_output_pic= NULL;
3855     idr(h);
3856     if(h->s.current_picture_ptr)
3857         h->s.current_picture_ptr->reference= 0;
3858 }
3859
3860 /**
3861  *
3862  * @return the removed picture or NULL if an error occurs
3863  */
3864 static Picture * remove_short(H264Context *h, int frame_num){
3865     MpegEncContext * const s = &h->s;
3866     int i;
3867
3868     if(s->avctx->debug&FF_DEBUG_MMCO)
3869         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3870
3871     for(i=0; i<h->short_ref_count; i++){
3872         Picture *pic= h->short_ref[i];
3873         if(s->avctx->debug&FF_DEBUG_MMCO)
3874             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3875         if(pic->frame_num == frame_num){
3876             h->short_ref[i]= NULL;
3877             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3878             h->short_ref_count--;
3879             return pic;
3880         }
3881     }
3882     return NULL;
3883 }
3884
3885 /**
3886  *
3887  * @return the removed picture or NULL if an error occurs
3888  */
3889 static Picture * remove_long(H264Context *h, int i){
3890     Picture *pic;
3891
3892     pic= h->long_ref[i];
3893     h->long_ref[i]= NULL;
3894     if(pic) h->long_ref_count--;
3895
3896     return pic;
3897 }
3898
3899 /**
3900  * print short term list
3901  */
3902 static void print_short_term(H264Context *h) {
3903     uint32_t i;
3904     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3905         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3906         for(i=0; i<h->short_ref_count; i++){
3907             Picture *pic= h->short_ref[i];
3908             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3909         }
3910     }
3911 }
3912
3913 /**
3914  * print long term list
3915  */
3916 static void print_long_term(H264Context *h) {
3917     uint32_t i;
3918     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3919         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3920         for(i = 0; i < 16; i++){
3921             Picture *pic= h->long_ref[i];
3922             if (pic) {
3923                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3924             }
3925         }
3926     }
3927 }
3928
3929 /**
3930  * Executes the reference picture marking (memory management control operations).
3931  */
3932 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3933     MpegEncContext * const s = &h->s;
3934     int i, j;
3935     int current_is_long=0;
3936     Picture *pic;
3937
3938     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3939         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3940
3941     for(i=0; i<mmco_count; i++){
3942         if(s->avctx->debug&FF_DEBUG_MMCO)
3943             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3944
3945         switch(mmco[i].opcode){
3946         case MMCO_SHORT2UNUSED:
3947             pic= remove_short(h, mmco[i].short_frame_num);
3948             if(pic)
3949                 unreference_pic(h, pic);
3950             else if(s->avctx->debug&FF_DEBUG_MMCO)
3951                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3952             break;
3953         case MMCO_SHORT2LONG:
3954             pic= remove_long(h, mmco[i].long_index);
3955             if(pic) unreference_pic(h, pic);
3956
3957             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3958             if (h->long_ref[ mmco[i].long_index ]){
3959                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3960                 h->long_ref_count++;
3961             }
3962             break;
3963         case MMCO_LONG2UNUSED:
3964             pic= remove_long(h, mmco[i].long_index);
3965             if(pic)
3966                 unreference_pic(h, pic);
3967             else if(s->avctx->debug&FF_DEBUG_MMCO)
3968                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3969             break;
3970         case MMCO_LONG:
3971             pic= remove_long(h, mmco[i].long_index);
3972             if(pic) unreference_pic(h, pic);
3973
3974             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3975             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3976             h->long_ref_count++;
3977
3978             current_is_long=1;
3979             break;
3980         case MMCO_SET_MAX_LONG:
3981             assert(mmco[i].long_index <= 16);
3982             // just remove the long term which index is greater than new max
3983             for(j = mmco[i].long_index; j<16; j++){
3984                 pic = remove_long(h, j);
3985                 if (pic) unreference_pic(h, pic);
3986             }
3987             break;
3988         case MMCO_RESET:
3989             while(h->short_ref_count){
3990                 pic= remove_short(h, h->short_ref[0]->frame_num);
3991                 if(pic) unreference_pic(h, pic);
3992             }
3993             for(j = 0; j < 16; j++) {
3994                 pic= remove_long(h, j);
3995                 if(pic) unreference_pic(h, pic);
3996             }
3997             break;
3998         default: assert(0);
3999         }
4000     }
4001
4002     if(!current_is_long){
4003         pic= remove_short(h, s->current_picture_ptr->frame_num);
4004         if(pic){
4005             unreference_pic(h, pic);
4006             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4007         }
4008
4009         if(h->short_ref_count)
4010             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4011
4012         h->short_ref[0]= s->current_picture_ptr;
4013         h->short_ref[0]->long_ref=0;
4014         h->short_ref_count++;
4015     }
4016
4017     print_short_term(h);
4018     print_long_term(h);
4019     return 0;
4020 }
4021
4022 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
4023     MpegEncContext * const s = &h->s;
4024     int i;
4025
4026     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4027         s->broken_link= get_bits1(gb) -1;
4028         h->mmco[0].long_index= get_bits1(gb) - 1; // current_long_term_idx
4029         if(h->mmco[0].long_index == -1)
4030             h->mmco_index= 0;
4031         else{
4032             h->mmco[0].opcode= MMCO_LONG;
4033             h->mmco_index= 1;
4034         }
4035     }else{
4036         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
4037             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4038                 MMCOOpcode opcode= get_ue_golomb(gb);
4039
4040                 h->mmco[i].opcode= opcode;
4041                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4042                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4043 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4044                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4045                         return -1;
4046                     }*/
4047                 }
4048                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4049                     unsigned int long_index= get_ue_golomb(gb);
4050                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4051                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4052                         return -1;
4053                     }
4054                     h->mmco[i].long_index= long_index;
4055                 }
4056
4057                 if(opcode > (unsigned)MMCO_LONG){
4058                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4059                     return -1;
4060                 }
4061                 if(opcode == MMCO_END)
4062                     break;
4063             }
4064             h->mmco_index= i;
4065         }else{
4066             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4067
4068             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4069                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4070                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4071                 h->mmco_index= 1;
4072             }else
4073                 h->mmco_index= 0;
4074         }
4075     }
4076
4077     return 0;
4078 }
4079
4080 static int init_poc(H264Context *h){
4081     MpegEncContext * const s = &h->s;
4082     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4083     int field_poc[2];
4084
4085     if(h->nal_unit_type == NAL_IDR_SLICE){
4086         h->frame_num_offset= 0;
4087     }else{
4088         if(h->frame_num < h->prev_frame_num)
4089             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4090         else
4091             h->frame_num_offset= h->prev_frame_num_offset;
4092     }
4093
4094     if(h->sps.poc_type==0){
4095         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4096
4097         if(h->nal_unit_type == NAL_IDR_SLICE){
4098              h->prev_poc_msb=
4099              h->prev_poc_lsb= 0;
4100         }
4101
4102         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4103             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4104         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4105             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4106         else
4107             h->poc_msb = h->prev_poc_msb;
4108 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4109         field_poc[0] =
4110         field_poc[1] = h->poc_msb + h->poc_lsb;
4111         if(s->picture_structure == PICT_FRAME)
4112             field_poc[1] += h->delta_poc_bottom;
4113     }else if(h->sps.poc_type==1){
4114         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4115         int i;
4116
4117         if(h->sps.poc_cycle_length != 0)
4118             abs_frame_num = h->frame_num_offset + h->frame_num;
4119         else
4120             abs_frame_num = 0;
4121
4122         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4123             abs_frame_num--;
4124
4125         expected_delta_per_poc_cycle = 0;
4126         for(i=0; i < h->sps.poc_cycle_length; i++)
4127             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4128
4129         if(abs_frame_num > 0){
4130             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4131             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4132
4133             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4134             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4135                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4136         } else
4137             expectedpoc = 0;
4138
4139         if(h->nal_ref_idc == 0)
4140             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4141
4142         field_poc[0] = expectedpoc + h->delta_poc[0];
4143         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4144
4145         if(s->picture_structure == PICT_FRAME)
4146             field_poc[1] += h->delta_poc[1];
4147     }else{
4148         int poc;
4149         if(h->nal_unit_type == NAL_IDR_SLICE){
4150             poc= 0;
4151         }else{
4152             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4153             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4154         }
4155         field_poc[0]= poc;
4156         field_poc[1]= poc;
4157     }
4158
4159     if(s->picture_structure != PICT_BOTTOM_FIELD)
4160         s->current_picture_ptr->field_poc[0]= field_poc[0];
4161     if(s->picture_structure != PICT_TOP_FIELD)
4162         s->current_picture_ptr->field_poc[1]= field_poc[1];
4163     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4164         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4165
4166     return 0;
4167 }
4168
4169
4170 /**
4171  * initialize scan tables
4172  */
4173 static void init_scan_tables(H264Context *h){
4174     MpegEncContext * const s = &h->s;
4175     int i;
4176     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4177         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4178         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4179     }else{
4180         for(i=0; i<16; i++){
4181 #define T(x) (x>>2) | ((x<<2) & 0xF)
4182             h->zigzag_scan[i] = T(zigzag_scan[i]);
4183             h-> field_scan[i] = T( field_scan[i]);
4184 #undef T
4185         }
4186     }
4187     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4188         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4189         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4190         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4191         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4192     }else{
4193         for(i=0; i<64; i++){
4194 #define T(x) (x>>3) | ((x&7)<<3)
4195             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4196             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4197             h->field_scan8x8[i]        = T(field_scan8x8[i]);
4198             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4199 #undef T
4200         }
4201     }
4202     if(h->sps.transform_bypass){ //FIXME same ugly
4203         h->zigzag_scan_q0          = zigzag_scan;
4204         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4205         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4206         h->field_scan_q0           = field_scan;
4207         h->field_scan8x8_q0        = field_scan8x8;
4208         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4209     }else{
4210         h->zigzag_scan_q0          = h->zigzag_scan;
4211         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4212         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4213         h->field_scan_q0           = h->field_scan;
4214         h->field_scan8x8_q0        = h->field_scan8x8;
4215         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4216     }
4217 }
/**
 * decodes a slice header.
 * this will also call MPV_common_init() and frame_start() as needed
 */
4222 static int decode_slice_header(H264Context *h){
4223     MpegEncContext * const s = &h->s;
4224     unsigned int first_mb_in_slice;
4225     unsigned int pps_id;
4226     int num_ref_idx_active_override_flag;
4227     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4228     unsigned int slice_type, tmp;
4229     int default_ref_list_done = 0;
4230
4231     s->current_picture.reference= h->nal_ref_idc != 0;
4232     s->dropable= h->nal_ref_idc == 0;
4233
4234     first_mb_in_slice= get_ue_golomb(&s->gb);
4235
4236     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
4237         h->slice_num = 0;
4238         s->current_picture_ptr= NULL;
4239     }
4240
4241     slice_type= get_ue_golomb(&s->gb);
4242     if(slice_type > 9){
4243         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4244         return -1;
4245     }
4246     if(slice_type > 4){
4247         slice_type -= 5;
4248         h->slice_type_fixed=1;
4249     }else
4250         h->slice_type_fixed=0;
4251
4252     slice_type= slice_type_map[ slice_type ];
4253     if (slice_type == I_TYPE
4254         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4255         default_ref_list_done = 1;
4256     }
4257     h->slice_type= slice_type;
4258
4259     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4260
4261     pps_id= get_ue_golomb(&s->gb);
4262     if(pps_id>=MAX_PPS_COUNT){
4263         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4264         return -1;
4265     }
4266     if(!h->pps_buffers[pps_id]) {
4267         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4268         return -1;
4269     }
4270     h->pps= *h->pps_buffers[pps_id];
4271
4272     if(!h->sps_buffers[h->pps.sps_id]) {
4273         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4274         return -1;
4275     }
4276     h->sps = *h->sps_buffers[h->pps.sps_id];
4277
4278     if(h->dequant_coeff_pps != pps_id){
4279         h->dequant_coeff_pps = pps_id;
4280         init_dequant_tables(h);
4281     }
4282
4283     s->mb_width= h->sps.mb_width;
4284     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4285
4286     h->b_stride=  s->mb_width*4;
4287     h->b8_stride= s->mb_width*2;
4288
4289     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4290     if(h->sps.frame_mbs_only_flag)
4291         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4292     else
4293         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4294
4295     if (s->context_initialized
4296         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4297         free_tables(h);
4298         MPV_common_end(s);
4299     }
4300     if (!s->context_initialized) {
4301         if (MPV_common_init(s) < 0)
4302             return -1;
4303
4304         init_scan_tables(h);
4305         alloc_tables(h);
4306
4307         s->avctx->width = s->width;
4308         s->avctx->height = s->height;
4309         s->avctx->sample_aspect_ratio= h->sps.sar;
4310         if(!s->avctx->sample_aspect_ratio.den)
4311             s->avctx->sample_aspect_ratio.den = 1;
4312
4313         if(h->sps.timing_info_present_flag){
4314             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4315             if(h->x264_build > 0 && h->x264_build < 44)
4316                 s->avctx->time_base.den *= 2;
4317             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4318                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4319         }
4320     }
4321
4322     if(h->slice_num == 0){
4323         if(frame_start(h) < 0)
4324             return -1;
4325     }
4326
4327     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4328     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4329
4330     h->mb_mbaff = 0;
4331     h->mb_aff_frame = 0;
4332     if(h->sps.frame_mbs_only_flag){
4333         s->picture_structure= PICT_FRAME;
4334     }else{
4335         if(get_bits1(&s->gb)) { //field_pic_flag
4336             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4337             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4338         } else {
4339             s->picture_structure= PICT_FRAME;
4340             h->mb_aff_frame = h->sps.mb_aff;
4341         }
4342     }
4343     assert(s->mb_num == s->mb_width * s->mb_height);
4344     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4345        first_mb_in_slice                    >= s->mb_num){
4346         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4347         return -1;
4348     }
4349     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4350     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4351     assert(s->mb_y < s->mb_height);
4352
4353     if(s->picture_structure==PICT_FRAME){
4354         h->curr_pic_num=   h->frame_num;
4355         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4356     }else{
4357         h->curr_pic_num= 2*h->frame_num;
4358         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4359     }
4360
4361     if(h->nal_unit_type == NAL_IDR_SLICE){
4362         get_ue_golomb(&s->gb); /* idr_pic_id */
4363     }
4364
4365     if(h->sps.poc_type==0){
4366         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4367
4368         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4369             h->delta_poc_bottom= get_se_golomb(&s->gb);
4370         }
4371     }
4372
4373     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4374         h->delta_poc[0]= get_se_golomb(&s->gb);
4375
4376         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4377             h->delta_poc[1]= get_se_golomb(&s->gb);
4378     }
4379
4380     init_poc(h);
4381
4382     if(h->pps.redundant_pic_cnt_present){
4383         h->redundant_pic_count= get_ue_golomb(&s->gb);
4384     }
4385
4386     //set defaults, might be overridden a few lines later
4387     h->ref_count[0]= h->pps.ref_count[0];
4388     h->ref_count[1]= h->pps.ref_count[1];
4389
4390     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4391         if(h->slice_type == B_TYPE){
4392             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4393             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4394                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4395         }
4396         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4397
4398         if(num_ref_idx_active_override_flag){
4399             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4400             if(h->slice_type==B_TYPE)
4401                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4402
4403             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4404                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4405                 h->ref_count[0]= h->ref_count[1]= 1;
4406                 return -1;
4407             }
4408         }
4409         if(h->slice_type == B_TYPE)
4410             h->list_count= 2;
4411         else
4412             h->list_count= 1;
4413     }else
4414         h->list_count= 0;
4415
4416     if(!default_ref_list_done){
4417         fill_default_ref_list(h);
4418     }
4419
4420     if(decode_ref_pic_list_reordering(h) < 0)
4421         return -1;
4422
4423     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4424        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4425         pred_weight_table(h);
4426     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4427         implicit_weight_table(h);
4428     else
4429         h->use_weight = 0;
4430
4431     if(s->current_picture.reference)
4432         decode_ref_pic_marking(h, &s->gb);
4433
4434     if(FRAME_MBAFF)
4435         fill_mbaff_ref_list(h);
4436
4437     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4438         tmp = get_ue_golomb(&s->gb);
4439         if(tmp > 2){
4440             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4441             return -1;
4442         }
4443         h->cabac_init_idc= tmp;
4444     }
4445
4446     h->last_qscale_diff = 0;
4447     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4448     if(tmp>51){
4449         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4450         return -1;
4451     }
4452     s->qscale= tmp;
4453     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4454     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4455     //FIXME qscale / qp ... stuff
4456     if(h->slice_type == SP_TYPE){
4457         get_bits1(&s->gb); /* sp_for_switch_flag */
4458     }
4459     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4460         get_se_golomb(&s->gb); /* slice_qs_delta */
4461     }
4462
4463     h->deblocking_filter = 1;
4464     h->slice_alpha_c0_offset = 0;
4465     h->slice_beta_offset = 0;
4466     if( h->pps.deblocking_filter_parameters_present ) {
4467         tmp= get_ue_golomb(&s->gb);
4468         if(tmp > 2){
4469             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4470             return -1;
4471         }
4472         h->deblocking_filter= tmp;
4473         if(h->deblocking_filter < 2)
4474             h->deblocking_filter^= 1; // 1<->0
4475
4476         if( h->deblocking_filter ) {
4477             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4478             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4479         }
4480     }
4481     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4482        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4483        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4484        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4485         h->deblocking_filter= 0;
4486
4487 #if 0 //FMO
4488     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4489         slice_group_change_cycle= get_bits(&s->gb, ?);
4490 #endif
4491
4492     h->slice_num++;
4493
4494     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4495     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4496
4497     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4498         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4499                h->slice_num,
4500                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4501                first_mb_in_slice,
4502                av_get_pict_type_char(h->slice_type),
4503                pps_id, h->frame_num,
4504                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4505                h->ref_count[0], h->ref_count[1],
4506                s->qscale,
4507                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4508                h->use_weight,
4509                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4510                );
4511     }
4512
4513     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4514         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4515         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4516     }else{
4517         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4518         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4519     }
4520
4521     return 0;
4522 }
4523
4524 /**
4525  *
4526  */
4527 static inline int get_level_prefix(GetBitContext *gb){
4528     unsigned int buf;
4529     int log;
4530
4531     OPEN_READER(re, gb);
4532     UPDATE_CACHE(re, gb);
4533     buf=GET_CACHE(re, gb);
4534
4535     log= 32 - av_log2(buf);
4536 #ifdef TRACE
4537     print_bin(buf>>(32-log), log);
4538     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4539 #endif
4540
4541     LAST_SKIP_BITS(re, gb, log);
4542     CLOSE_READER(re, gb);
4543
4544     return log-1;
4545 }
4546
4547 static inline int get_dct8x8_allowed(H264Context *h){
4548     int i;
4549     for(i=0; i<4; i++){
4550         if(!IS_SUB_8X8(h->sub_mb_type[i])
4551            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4552             return 0;
4553     }
4554     return 1;
4555 }
4556
4557 /**
4558  * decodes a residual block.
4559  * @param n block index
4560  * @param scantable scantable
4561  * @param max_coeff number of coefficients in the block
4562  * @return <0 if an error occured
4563  */
4564 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4565     MpegEncContext * const s = &h->s;
4566     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4567     int level[16];
4568     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4569
4570     //FIXME put trailing_onex into the context
4571
4572     if(n == CHROMA_DC_BLOCK_INDEX){
4573         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4574         total_coeff= coeff_token>>2;
4575     }else{
4576         if(n == LUMA_DC_BLOCK_INDEX){
4577             total_coeff= pred_non_zero_count(h, 0);
4578             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4579             total_coeff= coeff_token>>2;
4580         }else{
4581             total_coeff= pred_non_zero_count(h, n);
4582             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4583             total_coeff= coeff_token>>2;
4584             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4585         }
4586     }
4587
4588     //FIXME set last_non_zero?
4589
4590     if(total_coeff==0)
4591         return 0;
4592     if(total_coeff > (unsigned)max_coeff) {
4593         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4594         return -1;
4595     }
4596
4597     trailing_ones= coeff_token&3;
4598     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4599     assert(total_coeff<=16);
4600
4601     for(i=0; i<trailing_ones; i++){
4602         level[i]= 1 - 2*get_bits1(gb);
4603     }
4604
4605     if(i<total_coeff) {
4606         int level_code, mask;
4607         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4608         int prefix= get_level_prefix(gb);
4609
4610         //first coefficient has suffix_length equal to 0 or 1
4611         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4612             if(suffix_length)
4613                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4614             else
4615                 level_code= (prefix<<suffix_length); //part
4616         }else if(prefix==14){
4617             if(suffix_length)
4618                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4619             else
4620                 level_code= prefix + get_bits(gb, 4); //part
4621         }else if(prefix==15){
4622             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4623             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4624         }else{
4625             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4626             return -1;
4627         }
4628
4629         if(trailing_ones < 3) level_code += 2;
4630
4631         suffix_length = 1;
4632         if(level_code > 5)
4633             suffix_length++;
4634         mask= -(level_code&1);
4635         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4636         i++;
4637
4638         //remaining coefficients have suffix_length > 0
4639         for(;i<total_coeff;i++) {
4640             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4641             prefix = get_level_prefix(gb);
4642             if(prefix<15){
4643                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4644             }else if(prefix==15){
4645                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4646             }else{
4647                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4648                 return -1;
4649             }
4650             mask= -(level_code&1);
4651             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4652             if(level_code > suffix_limit[suffix_length])
4653                 suffix_length++;
4654         }
4655     }
4656
4657     if(total_coeff == max_coeff)
4658         zeros_left=0;
4659     else{
4660         if(n == CHROMA_DC_BLOCK_INDEX)
4661             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4662         else
4663             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4664     }
4665
4666     coeff_num = zeros_left + total_coeff - 1;
4667     j = scantable[coeff_num];
4668     if(n > 24){
4669         block[j] = level[0];
4670         for(i=1;i<total_coeff;i++) {
4671             if(zeros_left <= 0)
4672                 run_before = 0;
4673             else if(zeros_left < 7){
4674                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4675             }else{
4676                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4677             }
4678             zeros_left -= run_before;
4679             coeff_num -= 1 + run_before;
4680             j= scantable[ coeff_num ];
4681
4682             block[j]= level[i];
4683         }
4684     }else{
4685         block[j] = (level[0] * qmul[j] + 32)>>6;
4686         for(i=1;i<total_coeff;i++) {
4687             if(zeros_left <= 0)
4688                 run_before = 0;
4689             else if(zeros_left < 7){
4690                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4691             }else{
4692                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4693             }
4694             zeros_left -= run_before;
4695             coeff_num -= 1 + run_before;
4696             j= scantable[ coeff_num ];
4697
4698             block[j]= (level[i] * qmul[j] + 32)>>6;
4699         }
4700     }
4701
4702     if(zeros_left<0){
4703         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4704         return -1;
4705     }
4706
4707     return 0;
4708 }
4709
4710 static void predict_field_decoding_flag(H264Context *h){
4711     MpegEncContext * const s = &h->s;
4712     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4713     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4714                 ? s->current_picture.mb_type[mb_xy-1]
4715                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4716                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4717                 : 0;
4718     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4719 }
4720
4721 /**
4722  * decodes a P_SKIP or B_SKIP macroblock
4723  */
4724 static void decode_mb_skip(H264Context *h){
4725     MpegEncContext * const s = &h->s;
4726     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4727     int mb_type=0;
4728
4729     memset(h->non_zero_count[mb_xy], 0, 16);
4730     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4731
4732     if(MB_FIELD)
4733         mb_type|= MB_TYPE_INTERLACED;
4734
4735     if( h->slice_type == B_TYPE )
4736     {
4737         // just for fill_caches. pred_direct_motion will set the real mb_type
4738         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4739
4740         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4741         pred_direct_motion(h, &mb_type);
4742         mb_type|= MB_TYPE_SKIP;
4743     }
4744     else
4745     {
4746         int mx, my;
4747         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4748
4749         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4750         pred_pskip_motion(h, &mx, &my);
4751         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4752         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4753     }
4754
4755     write_back_motion(h, mb_type);
4756     s->current_picture.mb_type[mb_xy]= mb_type;
4757     s->current_picture.qscale_table[mb_xy]= s->qscale;
4758     h->slice_table[ mb_xy ]= h->slice_num;
4759     h->prev_mb_skipped= 1;
4760 }
4761
4762 /**
4763  * decodes a macroblock
4764  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4765  */
4766 static int decode_mb_cavlc(H264Context *h){
4767     MpegEncContext * const s = &h->s;
4768     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4769     int partition_count;
4770     unsigned int mb_type, cbp;
4771     int dct8x8_allowed= h->pps.transform_8x8_mode;
4772
4773     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4774
4775     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4776     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4777                 down the code */
4778     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4779         if(s->mb_skip_run==-1)
4780             s->mb_skip_run= get_ue_golomb(&s->gb);
4781
4782         if (s->mb_skip_run--) {
4783             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4784                 if(s->mb_skip_run==0)
4785                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4786                 else
4787                     predict_field_decoding_flag(h);
4788             }
4789             decode_mb_skip(h);
4790             return 0;
4791         }
4792     }
4793     if(FRAME_MBAFF){
4794         if( (s->mb_y&1) == 0 )
4795             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4796     }else
4797         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4798
4799     h->prev_mb_skipped= 0;
4800
4801     mb_type= get_ue_golomb(&s->gb);
4802     if(h->slice_type == B_TYPE){
4803         if(mb_type < 23){
4804             partition_count= b_mb_type_info[mb_type].partition_count;
4805             mb_type=         b_mb_type_info[mb_type].type;
4806         }else{
4807             mb_type -= 23;
4808             goto decode_intra_mb;
4809         }
4810     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4811         if(mb_type < 5){
4812             partition_count= p_mb_type_info[mb_type].partition_count;
4813             mb_type=         p_mb_type_info[mb_type].type;
4814         }else{
4815             mb_type -= 5;
4816             goto decode_intra_mb;
4817         }
4818     }else{
4819        assert(h->slice_type == I_TYPE);
4820 decode_intra_mb:
4821         if(mb_type > 25){
4822             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4823             return -1;
4824         }
4825         partition_count=0;
4826         cbp= i_mb_type_info[mb_type].cbp;
4827         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4828         mb_type= i_mb_type_info[mb_type].type;
4829     }
4830
4831     if(MB_FIELD)
4832         mb_type |= MB_TYPE_INTERLACED;
4833
4834     h->slice_table[ mb_xy ]= h->slice_num;
4835
4836     if(IS_INTRA_PCM(mb_type)){
4837         unsigned int x, y;
4838
4839         // We assume these blocks are very rare so we do not optimize it.
4840         align_get_bits(&s->gb);
4841
4842         // The pixels are stored in the same order as levels in h->mb array.
4843         for(y=0; y<16; y++){
4844             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4845             for(x=0; x<16; x++){
4846                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4847                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4848             }
4849         }
4850         for(y=0; y<8; y++){
4851             const int index= 256 + 4*(y&3) + 32*(y>>2);
4852             for(x=0; x<8; x++){
4853                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4854                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4855             }
4856         }
4857         for(y=0; y<8; y++){
4858             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4859             for(x=0; x<8; x++){
4860                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4861                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4862             }
4863         }
4864
4865         // In deblocking, the quantizer is 0
4866         s->current_picture.qscale_table[mb_xy]= 0;
4867         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4868         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4869         // All coeffs are present
4870         memset(h->non_zero_count[mb_xy], 16, 16);
4871
4872         s->current_picture.mb_type[mb_xy]= mb_type;
4873         return 0;
4874     }
4875
4876     if(MB_MBAFF){
4877         h->ref_count[0] <<= 1;
4878         h->ref_count[1] <<= 1;
4879     }
4880
4881     fill_caches(h, mb_type, 0);
4882
4883     //mb_pred
4884     if(IS_INTRA(mb_type)){
4885             int pred_mode;
4886 //            init_top_left_availability(h);
4887             if(IS_INTRA4x4(mb_type)){
4888                 int i;
4889                 int di = 1;
4890                 if(dct8x8_allowed && get_bits1(&s->gb)){
4891                     mb_type |= MB_TYPE_8x8DCT;
4892                     di = 4;
4893                 }
4894
4895 //                fill_intra4x4_pred_table(h);
4896                 for(i=0; i<16; i+=di){
4897                     int mode= pred_intra_mode(h, i);
4898
4899                     if(!get_bits1(&s->gb)){
4900                         const int rem_mode= get_bits(&s->gb, 3);
4901                         mode = rem_mode + (rem_mode >= mode);
4902                     }
4903
4904                     if(di==4)
4905                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4906                     else
4907                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4908                 }
4909                 write_back_intra_pred_mode(h);
4910                 if( check_intra4x4_pred_mode(h) < 0)
4911                     return -1;
4912             }else{
4913                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4914                 if(h->intra16x16_pred_mode < 0)
4915                     return -1;
4916             }
4917
4918             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4919             if(pred_mode < 0)
4920                 return -1;
4921             h->chroma_pred_mode= pred_mode;
4922     }else if(partition_count==4){
4923         int i, j, sub_partition_count[4], list, ref[2][4];
4924
4925         if(h->slice_type == B_TYPE){
4926             for(i=0; i<4; i++){
4927                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4928                 if(h->sub_mb_type[i] >=13){
4929                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4930                     return -1;
4931                 }
4932                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4933                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4934             }
4935             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4936                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4937                 pred_direct_motion(h, &mb_type);
4938                 h->ref_cache[0][scan8[4]] =
4939                 h->ref_cache[1][scan8[4]] =
4940                 h->ref_cache[0][scan8[12]] =
4941                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4942             }
4943         }else{
4944             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4945             for(i=0; i<4; i++){
4946                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4947                 if(h->sub_mb_type[i] >=4){
4948                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4949                     return -1;
4950                 }
4951                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4952                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4953             }
4954         }
4955
4956         for(list=0; list<h->list_count; list++){
4957             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4958             for(i=0; i<4; i++){
4959                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4960                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4961                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4962                     if(tmp>=ref_count){
4963                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4964                         return -1;
4965                     }
4966                     ref[list][i]= tmp;
4967                 }else{
4968                  //FIXME
4969                     ref[list][i] = -1;
4970                 }
4971             }
4972         }
4973
4974         if(dct8x8_allowed)
4975             dct8x8_allowed = get_dct8x8_allowed(h);
4976
4977         for(list=0; list<h->list_count; list++){
4978             for(i=0; i<4; i++){
4979                 if(IS_DIRECT(h->sub_mb_type[i])) {
4980                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4981                     continue;
4982                 }
4983                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4984                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4985
4986                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4987                     const int sub_mb_type= h->sub_mb_type[i];
4988                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4989                     for(j=0; j<sub_partition_count[i]; j++){
4990                         int mx, my;
4991                         const int index= 4*i + block_width*j;
4992                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4993                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4994                         mx += get_se_golomb(&s->gb);
4995                         my += get_se_golomb(&s->gb);
4996                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4997
4998                         if(IS_SUB_8X8(sub_mb_type)){
4999                             mv_cache[ 1 ][0]=
5000                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5001                             mv_cache[ 1 ][1]=
5002                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5003                         }else if(IS_SUB_8X4(sub_mb_type)){
5004                             mv_cache[ 1 ][0]= mx;
5005                             mv_cache[ 1 ][1]= my;
5006                         }else if(IS_SUB_4X8(sub_mb_type)){
5007                             mv_cache[ 8 ][0]= mx;
5008                             mv_cache[ 8 ][1]= my;
5009                         }
5010                         mv_cache[ 0 ][0]= mx;
5011                         mv_cache[ 0 ][1]= my;
5012                     }
5013                 }else{
5014                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5015                     p[0] = p[1]=
5016                     p[8] = p[9]= 0;
5017                 }
5018             }
5019         }
5020     }else if(IS_DIRECT(mb_type)){
5021         pred_direct_motion(h, &mb_type);
5022         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5023     }else{
5024         int list, mx, my, i;
5025          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5026         if(IS_16X16(mb_type)){
5027             for(list=0; list<h->list_count; list++){
5028                     unsigned int val;
5029                     if(IS_DIR(mb_type, 0, list)){
5030                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
5031                         if(val >= h->ref_count[list]){
5032                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5033                             return -1;
5034                         }
5035                     }else
5036                         val= LIST_NOT_USED&0xFF;
5037                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5038             }
5039             for(list=0; list<h->list_count; list++){
5040                 unsigned int val;
5041                 if(IS_DIR(mb_type, 0, list)){
5042                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5043                     mx += get_se_golomb(&s->gb);
5044                     my += get_se_golomb(&s->gb);
5045                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5046
5047                     val= pack16to32(mx,my);
5048                 }else
5049                     val=0;
5050                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
5051             }
5052         }
5053         else if(IS_16X8(mb_type)){
5054             for(list=0; list<h->list_count; list++){
5055                     for(i=0; i<2; i++){
5056                         unsigned int val;
5057                         if(IS_DIR(mb_type, i, list)){
5058                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5059                             if(val >= h->ref_count[list]){
5060                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5061                                 return -1;
5062                             }
5063                         }else
5064                             val= LIST_NOT_USED&0xFF;
5065                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5066                     }
5067             }
5068             for(list=0; list<h->list_count; list++){
5069                 for(i=0; i<2; i++){
5070                     unsigned int val;
5071                     if(IS_DIR(mb_type, i, list)){
5072                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5073                         mx += get_se_golomb(&s->gb);
5074                         my += get_se_golomb(&s->gb);
5075                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5076
5077                         val= pack16to32(mx,my);
5078                     }else
5079                         val=0;
5080                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
5081                 }
5082             }
5083         }else{
5084             assert(IS_8X16(mb_type));
5085             for(list=0; list<h->list_count; list++){
5086                     for(i=0; i<2; i++){
5087                         unsigned int val;
5088                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5089                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5090                             if(val >= h->ref_count[list]){
5091                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5092                                 return -1;
5093                             }
5094                         }else
5095                             val= LIST_NOT_USED&0xFF;
5096                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5097                     }
5098             }
5099             for(list=0; list<h->list_count; list++){
5100                 for(i=0; i<2; i++){
5101                     unsigned int val;
5102                     if(IS_DIR(mb_type, i, list)){
5103                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5104                         mx += get_se_golomb(&s->gb);
5105                         my += get_se_golomb(&s->gb);
5106                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5107
5108                         val= pack16to32(mx,my);
5109                     }else
5110                         val=0;
5111                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
5112                 }
5113             }
5114         }
5115     }
5116
5117     if(IS_INTER(mb_type))
5118         write_back_motion(h, mb_type);
5119
5120     if(!IS_INTRA16x16(mb_type)){
5121         cbp= get_ue_golomb(&s->gb);
5122         if(cbp > 47){
5123             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5124             return -1;
5125         }
5126
5127         if(IS_INTRA4x4(mb_type))
5128             cbp= golomb_to_intra4x4_cbp[cbp];
5129         else
5130             cbp= golomb_to_inter_cbp[cbp];
5131     }
5132     h->cbp = cbp;
5133
5134     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5135         if(get_bits1(&s->gb))
5136             mb_type |= MB_TYPE_8x8DCT;
5137     }
5138     s->current_picture.mb_type[mb_xy]= mb_type;
5139
5140     if(cbp || IS_INTRA16x16(mb_type)){
5141         int i8x8, i4x4, chroma_idx;
5142         int dquant;
5143         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5144         const uint8_t *scan, *scan8x8, *dc_scan;
5145
5146 //        fill_non_zero_count_cache(h);
5147
5148         if(IS_INTERLACED(mb_type)){
5149             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5150             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5151             dc_scan= luma_dc_field_scan;
5152         }else{
5153             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5154             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5155             dc_scan= luma_dc_zigzag_scan;
5156         }
5157
5158         dquant= get_se_golomb(&s->gb);
5159
5160         if( dquant > 25 || dquant < -26 ){
5161             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5162             return -1;
5163         }
5164
5165         s->qscale += dquant;
5166         if(((unsigned)s->qscale) > 51){
5167             if(s->qscale<0) s->qscale+= 52;
5168             else            s->qscale-= 52;
5169         }
5170
5171         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
5172         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
5173         if(IS_INTRA16x16(mb_type)){
5174             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5175                 return -1; //FIXME continue if partitioned and other return -1 too
5176             }
5177
5178             assert((cbp&15) == 0 || (cbp&15) == 15);
5179
5180             if(cbp&15){
5181                 for(i8x8=0; i8x8<4; i8x8++){
5182                     for(i4x4=0; i4x4<4; i4x4++){
5183                         const int index= i4x4 + 4*i8x8;
5184                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5185                             return -1;
5186                         }
5187                     }
5188                 }
5189             }else{
5190                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5191             }
5192         }else{
5193             for(i8x8=0; i8x8<4; i8x8++){
5194                 if(cbp & (1<<i8x8)){
5195                     if(IS_8x8DCT(mb_type)){
5196                         DCTELEM *buf = &h->mb[64*i8x8];
5197                         uint8_t *nnz;
5198                         for(i4x4=0; i4x4<4; i4x4++){
5199                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5200                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5201                                 return -1;
5202                         }
5203                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5204                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5205                     }else{
5206                         for(i4x4=0; i4x4<4; i4x4++){
5207                             const int index= i4x4 + 4*i8x8;
5208
5209                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5210                                 return -1;
5211                             }
5212                         }
5213                     }
5214                 }else{
5215                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5216                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5217                 }
5218             }
5219         }
5220
5221         if(cbp&0x30){
5222             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5223                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5224                     return -1;
5225                 }
5226         }
5227
5228         if(cbp&0x20){
5229             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5230                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
5231                 for(i4x4=0; i4x4<4; i4x4++){
5232                     const int index= 16 + 4*chroma_idx + i4x4;
5233                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
5234                         return -1;
5235                     }
5236                 }
5237             }
5238         }else{
5239             uint8_t * const nnz= &h->non_zero_count_cache[0];
5240             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5241             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5242         }
5243     }else{
5244         uint8_t * const nnz= &h->non_zero_count_cache[0];
5245         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5246         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5247         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5248     }
5249     s->current_picture.qscale_table[mb_xy]= s->qscale;
5250     write_back_non_zero_count(h);
5251
5252     if(MB_MBAFF){
5253         h->ref_count[0] >>= 1;
5254         h->ref_count[1] >>= 1;
5255     }
5256
5257     return 0;
5258 }
5259
5260 static int decode_cabac_field_decoding_flag(H264Context *h) {
5261     MpegEncContext * const s = &h->s;
5262     const int mb_x = s->mb_x;
5263     const int mb_y = s->mb_y & ~1;
5264     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5265     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5266
5267     unsigned int ctx = 0;
5268
5269     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5270         ctx += 1;
5271     }
5272     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5273         ctx += 1;
5274     }
5275
5276     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5277 }
5278
5279 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5280     uint8_t *state= &h->cabac_state[ctx_base];
5281     int mb_type;
5282
5283     if(intra_slice){
5284         MpegEncContext * const s = &h->s;
5285         const int mba_xy = h->left_mb_xy[0];
5286         const int mbb_xy = h->top_mb_xy;
5287         int ctx=0;
5288         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5289             ctx++;
5290         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5291             ctx++;
5292         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5293             return 0;   /* I4x4 */
5294         state += 2;
5295     }else{
5296         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5297             return 0;   /* I4x4 */
5298     }
5299
5300     if( get_cabac_terminate( &h->cabac ) )
5301         return 25;  /* PCM */
5302
5303     mb_type = 1; /* I16x16 */
5304     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5305     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5306         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5307     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5308     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5309     return mb_type;
5310 }
5311
5312 static int decode_cabac_mb_type( H264Context *h ) {
5313     MpegEncContext * const s = &h->s;
5314
5315     if( h->slice_type == I_TYPE ) {
5316         return decode_cabac_intra_mb_type(h, 3, 1);
5317     } else if( h->slice_type == P_TYPE ) {
5318         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5319             /* P-type */
5320             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5321                 /* P_L0_D16x16, P_8x8 */
5322                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5323             } else {
5324                 /* P_L0_D8x16, P_L0_D16x8 */
5325                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5326             }
5327         } else {
5328             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5329         }
5330     } else if( h->slice_type == B_TYPE ) {
5331         const int mba_xy = h->left_mb_xy[0];
5332         const int mbb_xy = h->top_mb_xy;
5333         int ctx = 0;
5334         int bits;
5335
5336         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5337             ctx++;
5338         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5339             ctx++;
5340
5341         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5342             return 0; /* B_Direct_16x16 */
5343
5344         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5345             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5346         }
5347
5348         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5349         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5350         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5351         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5352         if( bits < 8 )
5353             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5354         else if( bits == 13 ) {
5355             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5356         } else if( bits == 14 )
5357             return 11; /* B_L1_L0_8x16 */
5358         else if( bits == 15 )
5359             return 22; /* B_8x8 */
5360
5361         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5362         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5363     } else {
5364         /* TODO SI/SP frames? */
5365         return -1;
5366     }
5367 }
5368
5369 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5370     MpegEncContext * const s = &h->s;
5371     int mba_xy, mbb_xy;
5372     int ctx = 0;
5373
5374     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5375         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5376         mba_xy = mb_xy - 1;
5377         if( (mb_y&1)
5378             && h->slice_table[mba_xy] == h->slice_num
5379             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5380             mba_xy += s->mb_stride;
5381         if( MB_FIELD ){
5382             mbb_xy = mb_xy - s->mb_stride;
5383             if( !(mb_y&1)
5384                 && h->slice_table[mbb_xy] == h->slice_num
5385                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5386                 mbb_xy -= s->mb_stride;
5387         }else
5388             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5389     }else{
5390         int mb_xy = mb_x + mb_y*s->mb_stride;
5391         mba_xy = mb_xy - 1;
5392         mbb_xy = mb_xy - s->mb_stride;
5393     }
5394
5395     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5396         ctx++;
5397     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5398         ctx++;
5399
5400     if( h->slice_type == B_TYPE )
5401         ctx += 13;
5402     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5403 }
5404
5405 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5406     int mode = 0;
5407
5408     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5409         return pred_mode;
5410
5411     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5412     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5413     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5414
5415     if( mode >= pred_mode )
5416         return mode + 1;
5417     else
5418         return mode;
5419 }
5420
5421 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5422     const int mba_xy = h->left_mb_xy[0];
5423     const int mbb_xy = h->top_mb_xy;
5424
5425     int ctx = 0;
5426
5427     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5428     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5429         ctx++;
5430
5431     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5432         ctx++;
5433
5434     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5435         return 0;
5436
5437     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5438         return 1;
5439     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5440         return 2;
5441     else
5442         return 3;
5443 }
5444
/**
 * x coordinate of each of the 16 block indices; each row below covers 4
 * consecutive indices.
 * block_idx_xy[ block_idx_x[i] ][ block_idx_y[i] ] == i for every i.
 */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3,
};
/** y coordinate of each of the 16 block indices, laid out like block_idx_x. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3,
};
/** block index at the coordinate [x][y]; the inverse of the two tables above. */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 },
};
5457
5458 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5459     int cbp = 0;
5460     int cbp_b = -1;
5461     int i8x8;
5462
5463     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5464         cbp_b = h->top_cbp;
5465         tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
5466     }
5467
5468     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5469         int cbp_a = -1;
5470         int x, y;
5471         int ctx = 0;
5472
5473         x = block_idx_x[4*i8x8];
5474         y = block_idx_y[4*i8x8];
5475
5476         if( x > 0 )
5477             cbp_a = cbp;
5478         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5479             cbp_a = h->left_cbp;
5480             tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
5481         }
5482
5483         if( y > 0 )
5484             cbp_b = cbp;
5485
5486         /* No need to test for skip as we put 0 for skip block */
5487         /* No need to test for IPCM as we put 1 for IPCM block */
5488         if( cbp_a >= 0 ) {
5489             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5490             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5491                 ctx++;
5492         }
5493
5494         if( cbp_b >= 0 ) {
5495             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5496             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5497                 ctx += 2;
5498         }
5499
5500         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5501             cbp |= 1 << i8x8;
5502         }
5503     }
5504     return cbp;
5505 }
5506 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5507     int ctx;
5508     int cbp_a, cbp_b;
5509
5510     cbp_a = (h->left_cbp>>4)&0x03;
5511     cbp_b = (h-> top_cbp>>4)&0x03;
5512
5513     ctx = 0;
5514     if( cbp_a > 0 ) ctx++;
5515     if( cbp_b > 0 ) ctx += 2;
5516     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5517         return 0;
5518
5519     ctx = 4;
5520     if( cbp_a == 2 ) ctx++;
5521     if( cbp_b == 2 ) ctx += 2;
5522     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5523 }
5524 static int decode_cabac_mb_dqp( H264Context *h) {
5525     MpegEncContext * const s = &h->s;
5526     int mbn_xy;
5527     int   ctx = 0;
5528     int   val = 0;
5529
5530     if( s->mb_x > 0 )
5531         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5532     else
5533         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5534
5535     if( h->last_qscale_diff != 0 )
5536         ctx++;
5537
5538     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5539         if( ctx < 2 )
5540             ctx = 2;
5541         else
5542             ctx = 3;
5543         val++;
5544         if(val > 102) //prevent infinite loop
5545             return INT_MIN;
5546     }
5547
5548     if( val&0x01 )
5549         return (val + 1)/2;
5550     else
5551         return -(val + 1)/2;
5552 }
5553 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5554     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5555         return 0;   /* 8x8 */
5556     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5557         return 1;   /* 8x4 */
5558     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5559         return 2;   /* 4x8 */
5560     return 3;       /* 4x4 */
5561 }
5562 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5563     int type;
5564     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5565         return 0;   /* B_Direct_8x8 */
5566     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5567         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5568     type = 3;
5569     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5570         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5571             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5572         type += 4;
5573     }
5574     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5575     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5576     return type;
5577 }
5578
5579 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5580     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5581 }
5582
5583 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5584     int refa = h->ref_cache[list][scan8[n] - 1];
5585     int refb = h->ref_cache[list][scan8[n] - 8];
5586     int ref  = 0;
5587     int ctx  = 0;
5588
5589     if( h->slice_type == B_TYPE) {
5590         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5591             ctx++;
5592         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5593             ctx += 2;
5594     } else {
5595         if( refa > 0 )
5596             ctx++;
5597         if( refb > 0 )
5598             ctx += 2;
5599     }
5600
5601     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5602         ref++;
5603         if( ctx < 4 )
5604             ctx = 4;
5605         else
5606             ctx = 5;
5607         if(ref >= 32 /*h->ref_list[list]*/){
5608             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5609             return 0; //FIXME we should return -1 and check the return everywhere
5610         }
5611     }
5612     return ref;
5613 }
5614
5615 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5616     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5617                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5618     int ctxbase = (l == 0) ? 40 : 47;
5619     int ctx, mvd;
5620
5621     if( amvd < 3 )
5622         ctx = 0;
5623     else if( amvd > 32 )
5624         ctx = 2;
5625     else
5626         ctx = 1;
5627
5628     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5629         return 0;
5630
5631     mvd= 1;
5632     ctx= 3;
5633     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5634         mvd++;
5635         if( ctx < 6 )
5636             ctx++;
5637     }
5638
5639     if( mvd >= 9 ) {
5640         int k = 3;
5641         while( get_cabac_bypass( &h->cabac ) ) {
5642             mvd += 1 << k;
5643             k++;
5644             if(k>24){
5645                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5646                 return INT_MIN;
5647             }
5648         }
5649         while( k-- ) {
5650             if( get_cabac_bypass( &h->cabac ) )
5651                 mvd += 1 << k;
5652         }
5653     }
5654     return get_cabac_bypass_sign( &h->cabac, -mvd );
5655 }
5656
5657 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5658     int nza, nzb;
5659     int ctx = 0;
5660
5661     if( cat == 0 ) {
5662         nza = h->left_cbp&0x100;
5663         nzb = h-> top_cbp&0x100;
5664     } else if( cat == 1 || cat == 2 ) {
5665         nza = h->non_zero_count_cache[scan8[idx] - 1];
5666         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5667     } else if( cat == 3 ) {
5668         nza = (h->left_cbp>>(6+idx))&0x01;
5669         nzb = (h-> top_cbp>>(6+idx))&0x01;
5670     } else {
5671         assert(cat == 4);
5672         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5673         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5674     }
5675
5676     if( nza > 0 )
5677         ctx++;
5678
5679     if( nzb > 0 )
5680         ctx += 2;
5681
5682     return ctx + 4 * cat;
5683 }
5684
5685 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5686     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5687     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5688     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5689     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5690 };
5691
5692 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5693     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5694     static const int significant_coeff_flag_offset[2][6] = {
5695       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5696       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5697     };
5698     static const int last_coeff_flag_offset[2][6] = {
5699       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5700       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5701     };
5702     static const int coeff_abs_level_m1_offset[6] = {
5703         227+0, 227+10, 227+20, 227+30, 227+39, 426
5704     };
5705     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5706       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5707         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5708         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5709        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5710       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5711         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5712         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5713         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5714     };
5715
5716     int index[64];
5717
5718     int av_unused last;
5719     int coeff_count = 0;
5720
5721     int abslevel1 = 1;
5722     int abslevelgt1 = 0;
5723
5724     uint8_t *significant_coeff_ctx_base;
5725     uint8_t *last_coeff_ctx_base;
5726     uint8_t *abs_level_m1_ctx_base;
5727
5728 #ifndef ARCH_X86
5729 #define CABAC_ON_STACK
5730 #endif
5731 #ifdef CABAC_ON_STACK
5732 #define CC &cc
5733     CABACContext cc;
5734     cc.range     = h->cabac.range;
5735     cc.low       = h->cabac.low;
5736     cc.bytestream= h->cabac.bytestream;
5737 #else
5738 #define CC &h->cabac
5739 #endif
5740
5741
5742     /* cat: 0-> DC 16x16  n = 0
5743      *      1-> AC 16x16  n = luma4x4idx
5744      *      2-> Luma4x4   n = luma4x4idx
5745      *      3-> DC Chroma n = iCbCr
5746      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5747      *      5-> Luma8x8   n = 4 * luma8x8idx
5748      */
5749
5750     /* read coded block flag */
5751     if( cat != 5 ) {
5752         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5753             if( cat == 1 || cat == 2 )
5754                 h->non_zero_count_cache[scan8[n]] = 0;
5755             else if( cat == 4 )
5756                 h->non_zero_count_cache[scan8[16+n]] = 0;
5757 #ifdef CABAC_ON_STACK
5758             h->cabac.range     = cc.range     ;
5759             h->cabac.low       = cc.low       ;
5760             h->cabac.bytestream= cc.bytestream;
5761 #endif
5762             return 0;
5763         }
5764     }
5765
5766     significant_coeff_ctx_base = h->cabac_state
5767         + significant_coeff_flag_offset[MB_FIELD][cat];
5768     last_coeff_ctx_base = h->cabac_state
5769         + last_coeff_flag_offset[MB_FIELD][cat];
5770     abs_level_m1_ctx_base = h->cabac_state
5771         + coeff_abs_level_m1_offset[cat];
5772
5773     if( cat == 5 ) {
5774 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5775         for(last= 0; last < coefs; last++) { \
5776             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5777             if( get_cabac( CC, sig_ctx )) { \
5778                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5779                 index[coeff_count++] = last; \
5780                 if( get_cabac( CC, last_ctx ) ) { \
5781                     last= max_coeff; \
5782                     break; \
5783                 } \
5784             } \
5785         }\
5786         if( last == max_coeff -1 ) {\
5787             index[coeff_count++] = last;\
5788         }
5789         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5790 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5791         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5792     } else {
5793         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5794 #else
5795         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5796     } else {
5797         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5798 #endif
5799     }
5800     assert(coeff_count > 0);
5801
5802     if( cat == 0 )
5803         h->cbp_table[mb_xy] |= 0x100;
5804     else if( cat == 1 || cat == 2 )
5805         h->non_zero_count_cache[scan8[n]] = coeff_count;
5806     else if( cat == 3 )
5807         h->cbp_table[mb_xy] |= 0x40 << n;
5808     else if( cat == 4 )
5809         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5810     else {
5811         assert( cat == 5 );
5812         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5813     }
5814
5815     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5816         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5817         int j= scantable[index[coeff_count]];
5818
5819         if( get_cabac( CC, ctx ) == 0 ) {
5820             if( !qmul ) {
5821                 block[j] = get_cabac_bypass_sign( CC, -1);
5822             }else{
5823                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5824             }
5825
5826             abslevel1++;
5827         } else {
5828             int coeff_abs = 2;
5829             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5830             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5831                 coeff_abs++;
5832             }
5833
5834             if( coeff_abs >= 15 ) {
5835                 int j = 0;
5836                 while( get_cabac_bypass( CC ) ) {
5837                     j++;
5838                 }
5839
5840                 coeff_abs=1;
5841                 while( j-- ) {
5842                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5843                 }
5844                 coeff_abs+= 14;
5845             }
5846
5847             if( !qmul ) {
5848                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5849                 else                                block[j] =  coeff_abs;
5850             }else{
5851                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5852                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5853             }
5854
5855             abslevelgt1++;
5856         }
5857     }
5858 #ifdef CABAC_ON_STACK
5859             h->cabac.range     = cc.range     ;
5860             h->cabac.low       = cc.low       ;
5861             h->cabac.bytestream= cc.bytestream;
5862 #endif
5863     return 0;
5864 }
5865
5866 static inline void compute_mb_neighbors(H264Context *h)
5867 {
5868     MpegEncContext * const s = &h->s;
5869     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5870     h->top_mb_xy     = mb_xy - s->mb_stride;
5871     h->left_mb_xy[0] = mb_xy - 1;
5872     if(FRAME_MBAFF){
5873         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5874         const int top_pair_xy      = pair_xy     - s->mb_stride;
5875         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5876         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5877         const int curr_mb_frame_flag = !MB_FIELD;
5878         const int bottom = (s->mb_y & 1);
5879         if (bottom
5880                 ? !curr_mb_frame_flag // bottom macroblock
5881                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5882                 ) {
5883             h->top_mb_xy -= s->mb_stride;
5884         }
5885         if (left_mb_frame_flag != curr_mb_frame_flag) {
5886             h->left_mb_xy[0] = pair_xy - 1;
5887         }
5888     }
5889     return;
5890 }
5891
5892 /**
5893  * Decodes one macroblock from CABAC coded slice data: skip flag, MBAFF field flag, mb type, prediction info, cbp, qscale delta and residual.
5894  * @returns 0 if ok, -1 if a syntax element, an intra prediction mode check or a residual block fails
5895  */
5896 static int decode_mb_cabac(H264Context *h) {
5897     MpegEncContext * const s = &h->s;
5898     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5899     int mb_type, partition_count, cbp = 0;
5900     int dct8x8_allowed= h->pps.transform_8x8_mode;
5901     /* h->mb is cleared before it is handed to the residual decoding calls further down */
5902     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
5903
5904     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5905     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) { // skip flags are only read in non-I/SI slices
5906         int skip;
5907         /* a skipped mb needs the aff flag from the following mb */
5908         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5909             predict_field_decoding_flag(h);
5910         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped ) // odd row: reuse the flag read ahead below (h->next_mb_skipped)
5911             skip = h->next_mb_skipped;
5912         else
5913             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5914         /* read skip flags */
5915         if( skip ) {
5916             if( FRAME_MBAFF && (s->mb_y&1)==0 ){ // even row: also read the skip flag of the MB one row below
5917                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5918                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5919                 if(h->next_mb_skipped)
5920                     predict_field_decoding_flag(h);
5921                 else
5922                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5923             }
5924
5925             decode_mb_skip(h);
5926             /* a skipped macroblock stores cbp 0, chroma pred mode 0 and a zero qscale delta */
5927             h->cbp_table[mb_xy] = 0;
5928             h->chroma_pred_mode_table[mb_xy] = 0;
5929             h->last_qscale_diff = 0;
5930
5931             return 0;
5932
5933         }
5934     }
5935     if(FRAME_MBAFF){
5936         if( (s->mb_y&1) == 0 ) // the field decoding flag is only read on even rows
5937             h->mb_mbaff =
5938             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5939     }else
5940         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5941
5942     h->prev_mb_skipped = 0;
5943
5944     compute_mb_neighbors(h);
5945     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5946         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5947         return -1;
5948     }
5949     /* map the raw mb_type index to type flags; B indices >= 23 and P indices >= 5 fall through to the intra table */
5950     if( h->slice_type == B_TYPE ) {
5951         if( mb_type < 23 ){
5952             partition_count= b_mb_type_info[mb_type].partition_count;
5953             mb_type=         b_mb_type_info[mb_type].type;
5954         }else{
5955             mb_type -= 23;
5956             goto decode_intra_mb;
5957         }
5958     } else if( h->slice_type == P_TYPE ) {
5959         if( mb_type < 5) {
5960             partition_count= p_mb_type_info[mb_type].partition_count;
5961             mb_type=         p_mb_type_info[mb_type].type;
5962         } else {
5963             mb_type -= 5;
5964             goto decode_intra_mb;
5965         }
5966     } else {
5967        assert(h->slice_type == I_TYPE);
5968 decode_intra_mb:
5969         partition_count = 0;
5970         cbp= i_mb_type_info[mb_type].cbp; // only kept for intra16x16; every other type overwrites cbp below
5971         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5972         mb_type= i_mb_type_info[mb_type].type;
5973     }
5974     if(MB_FIELD)
5975         mb_type |= MB_TYPE_INTERLACED;
5976
5977     h->slice_table[ mb_xy ]= h->slice_num;
5978     /* I_PCM: 384 sample bytes are read directly from the bytestream, then the CABAC decoder is restarted */
5979     if(IS_INTRA_PCM(mb_type)) {
5980         const uint8_t *ptr;
5981         unsigned int x, y;
5982
5983         // We assume these blocks are very rare so we do not optimize it.
5984         // FIXME The two following lines get the bitstream position in the cabac
5985         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5986         ptr= h->cabac.bytestream;
5987         if(h->cabac.low&0x1) ptr--;
5988         if(CABAC_BITS==16){
5989             if(h->cabac.low&0x1FF) ptr--;
5990         }
5991
5992         // The pixels are stored in the same order as levels in h->mb array.
5993         for(y=0; y<16; y++){
5994             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5995             for(x=0; x<16; x++){
5996                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5997                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5998             }
5999         }
6000         for(y=0; y<8; y++){
6001             const int index= 256 + 4*(y&3) + 32*(y>>2);
6002             for(x=0; x<8; x++){
6003                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6004                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6005             }
6006         }
6007         for(y=0; y<8; y++){
6008             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6009             for(x=0; x<8; x++){
6010                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6011                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6012             }
6013         }
6014         /* ptr now points just past the raw samples; CABAC decoding restarts from there */
6015         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6016
6017         // All blocks are present
6018         h->cbp_table[mb_xy] = 0x1ef;
6019         h->chroma_pred_mode_table[mb_xy] = 0;
6020         // In deblocking, the quantizer is 0
6021         s->current_picture.qscale_table[mb_xy]= 0;
6022         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
6023         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
6024         // All coeffs are present
6025         memset(h->non_zero_count[mb_xy], 16, 16);
6026         s->current_picture.mb_type[mb_xy]= mb_type;
6027         return 0;
6028     }
6029     /* ref_count is doubled here and halved again before the final return. NOTE(review): the "return -1" paths below skip that halving */
6030     if(MB_MBAFF){
6031         h->ref_count[0] <<= 1;
6032         h->ref_count[1] <<= 1;
6033     }
6034
6035     fill_caches(h, mb_type, 0);
6036
6037     if( IS_INTRA( mb_type ) ) {
6038         int i, pred_mode;
6039         if( IS_INTRA4x4( mb_type ) ) {
6040             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6041                 mb_type |= MB_TYPE_8x8DCT;
6042                 for( i = 0; i < 16; i+=4 ) { // one prediction mode per 8x8 block, replicated over its 2x2 cache entries
6043                     int pred = pred_intra_mode( h, i );
6044                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6045                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6046                 }
6047             } else {
6048                 for( i = 0; i < 16; i++ ) { // one prediction mode per 4x4 block
6049                     int pred = pred_intra_mode( h, i );
6050                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6051
6052                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6053                 }
6054             }
6055             write_back_intra_pred_mode(h);
6056             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6057         } else {
6058             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6059             if( h->intra16x16_pred_mode < 0 ) return -1;
6060         }
6061         h->chroma_pred_mode_table[mb_xy] =
6062         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6063         /* the table keeps the value as decoded; h->chroma_pred_mode gets the result of check_intra_pred_mode */
6064         pred_mode= check_intra_pred_mode( h, pred_mode );
6065         if( pred_mode < 0 ) return -1;
6066         h->chroma_pred_mode= pred_mode;
6067     } else if( partition_count == 4 ) { // four 8x8 blocks, each with its own sub type
6068         int i, j, sub_partition_count[4], list, ref[2][4];
6069
6070         if( h->slice_type == B_TYPE ) {
6071             for( i = 0; i < 4; i++ ) {
6072                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6073                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6074                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6075             }
6076             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6077                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6078                 pred_direct_motion(h, &mb_type);
6079                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6080                     for( i = 0; i < 4; i++ )
6081                         if( IS_DIRECT(h->sub_mb_type[i]) )
6082                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6083                 }
6084             }
6085         } else {
6086             for( i = 0; i < 4; i++ ) {
6087                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6088                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6089                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6090             }
6091         }
6092         /* reference indices for every 8x8 block, one list at a time; direct blocks are left alone */
6093         for( list = 0; list < h->list_count; list++ ) {
6094                 for( i = 0; i < 4; i++ ) {
6095                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6096                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6097                         if( h->ref_count[list] > 1 )
6098                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6099                         else
6100                             ref[list][i] = 0;
6101                     } else {
6102                         ref[list][i] = -1;
6103                     }
6104                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6105                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6106                 }
6107         }
6108
6109         if(dct8x8_allowed)
6110             dct8x8_allowed = get_dct8x8_allowed(h);
6111         /* motion vectors: each sub partition gets predictor + decoded difference, stored in mv_cache and mvd_cache */
6112         for(list=0; list<h->list_count; list++){
6113             for(i=0; i<4; i++){
6114                 if(IS_DIRECT(h->sub_mb_type[i])){
6115                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6116                     continue;
6117                 }
6118                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]; // first entry of the 8x8 block was not written by the ref loop above
6119
6120                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6121                     const int sub_mb_type= h->sub_mb_type[i];
6122                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6123                     for(j=0; j<sub_partition_count[i]; j++){
6124                         int mpx, mpy;
6125                         int mx, my;
6126                         const int index= 4*i + block_width*j;
6127                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6128                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6129                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6130
6131                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6132                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6133                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6134                         /* replicate the vector over the cache entries covered by this sub partition shape */
6135                         if(IS_SUB_8X8(sub_mb_type)){
6136                             mv_cache[ 1 ][0]=
6137                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6138                             mv_cache[ 1 ][1]=
6139                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6140
6141                             mvd_cache[ 1 ][0]=
6142                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6143                             mvd_cache[ 1 ][1]=
6144                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6145                         }else if(IS_SUB_8X4(sub_mb_type)){
6146                             mv_cache[ 1 ][0]= mx;
6147                             mv_cache[ 1 ][1]= my;
6148
6149                             mvd_cache[ 1 ][0]= mx - mpx;
6150                             mvd_cache[ 1 ][1]= my - mpy;
6151                         }else if(IS_SUB_4X8(sub_mb_type)){
6152                             mv_cache[ 8 ][0]= mx;
6153                             mv_cache[ 8 ][1]= my;
6154
6155                             mvd_cache[ 8 ][0]= mx - mpx;
6156                             mvd_cache[ 8 ][1]= my - mpy;
6157                         }
6158                         mv_cache[ 0 ][0]= mx;
6159                         mv_cache[ 0 ][1]= my;
6160
6161                         mvd_cache[ 0 ][0]= mx - mpx;
6162                         mvd_cache[ 0 ][1]= my - mpy;
6163                     }
6164                 }else{
6165                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6166                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6167                     p[0] = p[1] = p[8] = p[9] = 0;
6168                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6169                 }
6170             }
6171         }
6172     } else if( IS_DIRECT(mb_type) ) { // direct MB: no mvd is read, both mvd caches are zeroed
6173         pred_direct_motion(h, &mb_type);
6174         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6175         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6176         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6177     } else { // 16x16, 16x8 or 8x16: all reference indices first, then the motion vectors
6178         int list, mx, my, i, mpx, mpy;
6179         if(IS_16X16(mb_type)){
6180             for(list=0; list<h->list_count; list++){
6181                 if(IS_DIR(mb_type, 0, list)){
6182                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6183                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6184                 }else
6185                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
6186             }
6187             for(list=0; list<h->list_count; list++){
6188                 if(IS_DIR(mb_type, 0, list)){
6189                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6190
6191                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6192                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6193                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6194
6195                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6196                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6197                 }else
6198                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6199             }
6200         }
6201         else if(IS_16X8(mb_type)){
6202             for(list=0; list<h->list_count; list++){
6203                     for(i=0; i<2; i++){
6204                         if(IS_DIR(mb_type, i, list)){
6205                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6206                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6207                         }else
6208                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6209                     }
6210             }
6211             for(list=0; list<h->list_count; list++){
6212                 for(i=0; i<2; i++){
6213                     if(IS_DIR(mb_type, i, list)){
6214                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6215                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6216                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6217                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6218
6219                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6220                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6221                     }else{
6222                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6223                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6224                     }
6225                 }
6226             }
6227         }else{
6228             assert(IS_8X16(mb_type));
6229             for(list=0; list<h->list_count; list++){
6230                     for(i=0; i<2; i++){
6231                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6232                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6233                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6234                         }else
6235                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6236                     }
6237             }
6238             for(list=0; list<h->list_count; list++){
6239                 for(i=0; i<2; i++){
6240                     if(IS_DIR(mb_type, i, list)){
6241                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6242                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6243                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6244
6245                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6246                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6247                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6248                     }else{
6249                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6250                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6251                     }
6252                 }
6253             }
6254         }
6255     }
6256
6257    if( IS_INTER( mb_type ) ) {
6258         h->chroma_pred_mode_table[mb_xy] = 0;
6259         write_back_motion( h, mb_type );
6260    }
6261     /* intra16x16 keeps the cbp taken from i_mb_type_info above; every other type reads it from the bitstream */
6262     if( !IS_INTRA16x16( mb_type ) ) {
6263         cbp  = decode_cabac_mb_cbp_luma( h );
6264         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6265     }
6266
6267     h->cbp_table[mb_xy] = h->cbp = cbp;
6268     /* for non-intra MBs the 8x8 transform flag is only read when some luma cbp bit is set */
6269     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6270         if( decode_cabac_mb_transform_size( h ) )
6271             mb_type |= MB_TYPE_8x8DCT;
6272     }
6273     s->current_picture.mb_type[mb_xy]= mb_type;
6274     /* the qscale delta and residual are only read when cbp is nonzero or the MB is intra16x16 */
6275     if( cbp || IS_INTRA16x16( mb_type ) ) {
6276         const uint8_t *scan, *scan8x8, *dc_scan;
6277         int dqp;
6278         /* field or zigzag scan tables, with the _q0 variants when s->qscale is 0 (tested before dqp is applied) */
6279         if(IS_INTERLACED(mb_type)){
6280             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6281             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6282             dc_scan= luma_dc_field_scan;
6283         }else{
6284             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6285             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6286             dc_scan= luma_dc_zigzag_scan;
6287         }
6288
6289         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6290         if( dqp == INT_MIN ){
6291             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6292             return -1;
6293         }
6294         s->qscale += dqp;
6295         if(((unsigned)s->qscale) > 51){ // outside 0..51: wrap around by 52
6296             if(s->qscale<0) s->qscale+= 52;
6297             else            s->qscale-= 52;
6298         }
6299         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6300         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6301
6302         if( IS_INTRA16x16( mb_type ) ) {
6303             int i;
6304             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6305             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6306                 return -1;
6307             if( cbp&15 ) {
6308                 for( i = 0; i < 16; i++ ) {
6309                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6310                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6311                         return -1;
6312                 }
6313             } else {
6314                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6315             }
6316         } else {
6317             int i8x8, i4x4;
6318             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6319                 if( cbp & (1<<i8x8) ) { // one cbp bit per 8x8 luma block
6320                     if( IS_8x8DCT(mb_type) ) {
6321                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6322                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6323                             return -1;
6324                     } else
6325                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6326                         const int index = 4*i8x8 + i4x4;
6327                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6328 //START_TIMER
6329                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6330                             return -1;
6331 //STOP_TIMER("decode_residual")
6332                     }
6333                 } else {
6334                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6335                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6336                 }
6337             }
6338         }
6339         /* chroma DC blocks for both planes are read when either chroma cbp bit (0x30) is set */
6340         if( cbp&0x30 ){
6341             int c;
6342             for( c = 0; c < 2; c++ ) {
6343                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6344                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6345                     return -1;
6346             }
6347         }
6348         /* chroma AC blocks are read only when bit 0x20 is set; otherwise the chroma nnz cache entries are zeroed */
6349         if( cbp&0x20 ) {
6350             int c, i;
6351             for( c = 0; c < 2; c++ ) {
6352                 const uint32_t *qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6353                 for( i = 0; i < 4; i++ ) {
6354                     const int index = 16 + 4 * c + i;
6355                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6356                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15) < 0)
6357                         return -1;
6358                 }
6359             }
6360         } else {
6361             uint8_t * const nnz= &h->non_zero_count_cache[0];
6362             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6363             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6364         }
6365     } else { // no residual at all: zero the luma and chroma nnz cache entries and the qscale delta
6366         uint8_t * const nnz= &h->non_zero_count_cache[0];
6367         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6368         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6369         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6370         h->last_qscale_diff = 0;
6371     }
6372
6373     s->current_picture.qscale_table[mb_xy]= s->qscale;
6374     write_back_non_zero_count(h);
6375     /* undo the ref_count doubling applied before fill_caches */
6376     if(MB_MBAFF){
6377         h->ref_count[0] >>= 1;
6378         h->ref_count[1] >>= 1;
6379     }
6380
6381     return 0;
6382 }
6383
6384 /* Deblocks one vertical luma edge of 16 rows at pix; bS[0] == 4 selects the in-place strong filter for all rows. */
6385 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6386     int i, d;
6387     const int index_a = qp + h->slice_alpha_c0_offset; /* indexes alpha_table and tc0_table */
6388     const int alpha = (alpha_table+52)[index_a];
6389     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6390
6391     if( bS[0] < 4 ) {
6392         int8_t tc[4];
6393         for(i=0; i<4; i++) /* one tc per bS entry; entries with bS == 0 are passed as -1 */
6394             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6395         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6396     } else {
6397         /* 16px edge length, because bS=4 is triggered by being at
6398          * the edge of an intra MB, so all 4 bS are the same */
6399             for( d = 0; d < 16; d++ ) {
6400                 const int p0 = pix[-1];
6401                 const int p1 = pix[-2];
6402                 const int p2 = pix[-3];
6403
6404                 const int q0 = pix[0];
6405                 const int q1 = pix[1];
6406                 const int q2 = pix[2];
6407
6408                 if( FFABS( p0 - q0 ) < alpha &&
6409                     FFABS( p1 - p0 ) < beta &&
6410                     FFABS( q1 - q0 ) < beta ) {
6411
6412                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6413                         if( FFABS( p2 - p0 ) < beta)
6414                         {
6415                             const int p3 = pix[-4];
6416                             /* p0', p1', p2' */
6417                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6418                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6419                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6420                         } else {
6421                             /* p0' */
6422                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6423                         }
6424                         if( FFABS( q2 - q0 ) < beta)
6425                         {
6426                             const int q3 = pix[3];
6427                             /* q0', q1', q2' */
6428                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6429                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6430                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6431                         } else {
6432                             /* q0' */
6433                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6434                         }
6435                     }else{
6436                         /* p0', q0' */
6437                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6438                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6439                     } /* i is never assigned on this path, so the trace reports the 4-row segment d>>2 in its place */
6440                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", d >> 2, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6441                 }
6442                 pix += stride;
6443             }
6444     }
6445 }
6446 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6447     /* Filters one vertical chroma edge through the dsp h-loop-filter hooks; bS[0] picks the variant. */
6448     const int idx_alpha = qp + h->slice_alpha_c0_offset;
6449     const int alpha = (alpha_table+52)[idx_alpha];
6450     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6451     int8_t tc[4];
6452     int k;
6453     if( bS[0] >= 4 ) { /* strength 4: intra variant, takes no tc array */
6454         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6455         return;
6456     }
6457     for( k = 0; k < 4; k++ ) /* bS 0 gives tc 0, otherwise the table value plus one */
6458         tc[k] = bS[k] ? (tc0_table+52)[idx_alpha][bS[k] - 1] + 1 : 0;
6459     h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6460 }
6461
6462 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6463     /* Filters a vertical edge of 16 rows one row at a time. Each row selects its own
6464      * bS[] entry and one of the two qp[] values; rows with bS 0 are skipped. */
6465     int i;
6466
6467     for( i = 0; i < 16; i++, pix += stride) {
6468         /* field MBs: rows 2k and 2k+1 share bS[k]; otherwise the low bit of the
6469          * row number replaces the low bit of that index */
6470         const int bS_index = MB_FIELD ? (i >> 1) : (((i >> 1) & ~1) | (i & 1));
6471         int qp_index;
6472         int index_a;
6473         int alpha;
6474         int beta;
6475         int tc0;
6476         int p0, p1, p2;
6477         int q0, q1, q2;
6478
6479         if( !bS[bS_index] )
6480             continue;
6481
6482         /* field MBs switch to qp[1] after 8 rows, otherwise qp alternates per row */
6483         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6484         index_a  = qp[qp_index] + h->slice_alpha_c0_offset;
6485         alpha    = (alpha_table+52)[index_a];
6486         beta     = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6487         /* tc0 is only looked up for bS below 4; the bS >= 4 path never uses it */
6488         tc0      = bS[bS_index] < 4 ? (tc0_table+52)[index_a][bS[bS_index] - 1] : 0;
6489
6490         /* three samples on each side of the edge */
6491         p0 = pix[-1];
6492         p1 = pix[-2];
6493         p2 = pix[-3];
6494         q0 = pix[0];
6495         q1 = pix[1];
6496         q2 = pix[2];
6497
6498         /* leave the row alone unless all three differences are below their thresholds */
6499         if( FFABS( p0 - q0 ) >= alpha ||
6500             FFABS( p1 - p0 ) >= beta ||
6501             FFABS( q1 - q0 ) >= beta )
6502             continue;
6503
6504         /* bS below 4: clipped delta filter; bS 4 and up: averaging filter */
6505         if( bS[bS_index] < 4 ) {
6506             const int avg = ( p0 + q0 + 1 ) >> 1;
6507             int tc = tc0;
6508             int i_delta;
6509
6510             /* each inner sample that gets adjusted widens the clip range for p0/q0 by one */
6511             if( FFABS( p2 - p0 ) < beta ) {
6512                 pix[-2] = p1 + av_clip( ( p2 + avg - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6513                 tc++;
6514             }
6515             if( FFABS( q2 - q0 ) < beta ) {
6516                 pix[1] = q1 + av_clip( ( q2 + avg - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6517                 tc++;
6518             }
6519
6520             i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6521             pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6522             pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6523
6524             tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n",
6525                     i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index],
6526                     pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6527         }else{
6528             /* the three-sample update of a side also requires this tighter p0/q0 test */
6529             const int strong = FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 );
6530
6531             if( strong && FFABS( p2 - p0 ) < beta ) {
6532                 const int p3 = pix[-4];
6533                 /* p0', p1', p2' */
6534                 pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6535                 pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6536                 pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6537             } else {
6538                 /* p0' */
6539                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6540             }
6541
6542             if( strong && FFABS( q2 - q0 ) < beta ) {
6543                 const int q3 = pix[3];
6544                 /* q0', q1', q2' */
6545                 pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6546                 pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6547                 pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6548             } else {
6549                 /* q0' */
6550                 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6551             }
6552
6553             tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n",
6554                     i, qp[qp_index], index_a, alpha, beta,
6555                     p2, p1, p0, q0, q1, q2,
6556                     pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6557         }
6558     }
6559 }
6560 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6561     /* Filters a vertical edge of 8 rows one row at a time. bS[] is indexed directly by
6562      * the row; only the two samples adjacent to the edge (p0, q0) are rewritten. */
6563     int i;
6564
6565     for( i = 0; i < 8; i++, pix += stride) {
6566         const int strength = bS[i];
6567         int qp_index;
6568         int index_a;
6569         int alpha;
6570         int beta;
6571         int tc;
6572         int p0, p1;
6573         int q0, q1;
6574
6575         if( !strength )
6576             continue;
6577
6578         /* field MBs switch to qp[1] after 4 rows, otherwise qp alternates per row */
6579         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6580         index_a  = qp[qp_index] + h->slice_alpha_c0_offset;
6581         alpha    = (alpha_table+52)[index_a];
6582         beta     = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6583         /* clip bound for bS below 4 is the table value plus one; unused for bS >= 4 */
6584         tc       = strength < 4 ? (tc0_table+52)[index_a][strength - 1] + 1 : 0;
6585
6586         p0 = pix[-1];
6587         p1 = pix[-2];
6588         q0 = pix[0];
6589         q1 = pix[1];
6590
6591         /* leave the row alone unless all three differences are below their thresholds */
6592         if( FFABS( p0 - q0 ) >= alpha ||
6593             FFABS( p1 - p0 ) >= beta ||
6594             FFABS( q1 - q0 ) >= beta )
6595             continue;
6596
6597         if( strength < 4 ) {
6598             const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6599             pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6600             pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6601             tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n",
6602                     i, qp[qp_index], index_a, alpha, beta, tc, strength,
6603                     pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6604         }else{
6605             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6606             pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6607             tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n",
6608                     i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6609         }
6610     }
6611 }
6612
6613 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6614     int i, d;
6615     const int index_a = qp + h->slice_alpha_c0_offset;
6616     const int alpha = (alpha_table+52)[index_a];
6617     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6618     const int pix_next  = stride;
6619
6620     if( bS[0] < 4 ) {
6621         int8_t tc[4];
6622         for(i=0; i<4; i++)
6623             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6624         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6625     } else {
6626         /* 16px edge length, see filter_mb_edgev */
6627             for( d = 0; d < 16; d++ ) {
6628                 const int p0 = pix[-1*pix_next];
6629                 const int p1 = pix[-2*pix_next];
6630                 const int p2 = pix[-3*pix_next];
6631                 const int q0 = pix[0];
6632                 const int q1 = pix[1*pix_next];
6633                 const int q2 = pix[2*pix_next];
6634
6635                 if( FFABS( p0 - q0 ) < alpha &&
6636                     FFABS( p1 - p0 ) < beta &&
6637                     FFABS( q1 - q0 ) < beta ) {
6638
6639                     const int p3 = pix[-4*pix_next];
6640                     const int q3 = pix[ 3*pix_next];
6641
6642                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6643                         if( FFABS( p2 - p0 ) < beta) {
6644                             /* p0', p1', p2' */
6645                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6646                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6647                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6648                         } else {
6649                             /* p0' */
6650                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6651                         }
6652                         if( FFABS( q2 - q0 ) < beta) {
6653                             /* q0', q1', q2' */
6654                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6655                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6656                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6657                         } else {
6658                             /* q0' */
6659                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6660                         }
6661                     }else{
6662                         /* p0', q0' */
6663                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6664                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6665                     }
6666                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6667                 }
6668                 pix++;
6669             }
6670     }
6671 }
6672
6673 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6674     int i;
6675     const int index_a = qp + h->slice_alpha_c0_offset;
6676     const int alpha = (alpha_table+52)[index_a];
6677     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6678
6679     if( bS[0] < 4 ) {
6680         int8_t tc[4];
6681         for(i=0; i<4; i++)
6682             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6683         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6684     } else {
6685         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6686     }
6687 }
6688
6689 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6690     MpegEncContext * const s = &h->s;
6691     int mb_xy, mb_type;
6692     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6693
6694     mb_xy = mb_x + mb_y*s->mb_stride;
6695
6696     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6697        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6698                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6699         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6700         return;
6701     }
6702     assert(!FRAME_MBAFF);
6703
6704     mb_type = s->current_picture.mb_type[mb_xy];
6705     qp = s->current_picture.qscale_table[mb_xy];
6706     qp0 = s->current_picture.qscale_table[mb_xy-1];
6707     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6708     qpc = get_chroma_qp( h, 0, qp );
6709     qpc0 = get_chroma_qp( h, 0, qp0 );
6710     qpc1 = get_chroma_qp( h, 0, qp1 );
6711     qp0 = (qp + qp0 + 1) >> 1;
6712     qp1 = (qp + qp1 + 1) >> 1;
6713     qpc0 = (qpc + qpc0 + 1) >> 1;
6714     qpc1 = (qpc + qpc1 + 1) >> 1;
6715     qp_thresh = 15 - h->slice_alpha_c0_offset;
6716     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6717        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6718         return;
6719
6720     if( IS_INTRA(mb_type) ) {
6721         int16_t bS4[4] = {4,4,4,4};
6722         int16_t bS3[4] = {3,3,3,3};
6723         if( IS_8x8DCT(mb_type) ) {
6724             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6725             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6726             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6727             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6728         } else {
6729             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6730             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6731             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6732             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6733             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6734             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6735             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6736             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6737         }
6738         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6739         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6740         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6741         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6742         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6743         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6744         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6745         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6746         return;
6747     } else {
6748         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6749         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6750         int edges;
6751         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6752             edges = 4;
6753             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6754         } else {
6755             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6756                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6757             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6758                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6759                              ? 3 : 0;
6760             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6761             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6762             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6763                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6764         }
6765         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6766             bSv[0][0] = 0x0004000400040004ULL;
6767         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6768             bSv[1][0] = 0x0004000400040004ULL;
6769
6770 #define FILTER(hv,dir,edge)\
6771         if(bSv[dir][edge]) {\
6772             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6773             if(!(edge&1)) {\
6774                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6775                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6776             }\
6777         }
6778         if( edges == 1 ) {
6779             FILTER(v,0,0);
6780             FILTER(h,1,0);
6781         } else if( IS_8x8DCT(mb_type) ) {
6782             FILTER(v,0,0);
6783             FILTER(v,0,2);
6784             FILTER(h,1,0);
6785             FILTER(h,1,2);
6786         } else {
6787             FILTER(v,0,0);
6788             FILTER(v,0,1);
6789             FILTER(v,0,2);
6790             FILTER(v,0,3);
6791             FILTER(h,1,0);
6792             FILTER(h,1,1);
6793             FILTER(h,1,2);
6794             FILTER(h,1,3);
6795         }
6796 #undef FILTER
6797     }
6798 }
6799
6800 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6801     MpegEncContext * const s = &h->s;
6802     const int mb_xy= mb_x + mb_y*s->mb_stride;
6803     const int mb_type = s->current_picture.mb_type[mb_xy];
6804     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6805     int first_vertical_edge_done = 0;
6806     int dir;
6807     /* FIXME: A given frame may occupy more than one position in
6808      * the reference list. So ref2frm should be populated with
6809      * frame numbers, not indices. */
6810     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6811                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6812
6813     //for sufficiently low qp, filtering wouldn't do anything
6814     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6815     if(!FRAME_MBAFF){
6816         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6817         int qp = s->current_picture.qscale_table[mb_xy];
6818         if(qp <= qp_thresh
6819            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6820            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6821             return;
6822         }
6823     }
6824
6825     if (FRAME_MBAFF
6826             // left mb is in picture
6827             && h->slice_table[mb_xy-1] != 255
6828             // and current and left pair do not have the same interlaced type
6829             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6830             // and left mb is in the same slice if deblocking_filter == 2
6831             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6832         /* First vertical edge is different in MBAFF frames
6833          * There are 8 different bS to compute and 2 different Qp
6834          */
6835         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6836         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6837         int16_t bS[8];
6838         int qp[2];
6839         int bqp[2];
6840         int rqp[2];
6841         int mb_qp, mbn0_qp, mbn1_qp;
6842         int i;
6843         first_vertical_edge_done = 1;
6844
6845         if( IS_INTRA(mb_type) )
6846             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6847         else {
6848             for( i = 0; i < 8; i++ ) {
6849                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6850
6851                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6852                     bS[i] = 4;
6853                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6854                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6855                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6856                     bS[i] = 2;
6857                 else
6858                     bS[i] = 1;
6859             }
6860         }
6861
6862         mb_qp = s->current_picture.qscale_table[mb_xy];
6863         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6864         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6865         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6866         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6867                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6868         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6869                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6870         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6871         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6872                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6873         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6874                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6875
6876         /* Filter edge */
6877         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6878         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6879         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6880         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6881         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6882     }
6883     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6884     for( dir = 0; dir < 2; dir++ )
6885     {
6886         int edge;
6887         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6888         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6889         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6890
6891         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6892                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6893         // how often to recheck mv-based bS when iterating between edges
6894         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6895                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6896         // how often to recheck mv-based bS when iterating along each edge
6897         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6898
6899         if (first_vertical_edge_done) {
6900             start = 1;
6901             first_vertical_edge_done = 0;
6902         }
6903
6904         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6905             start = 1;
6906
6907         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6908             && !IS_INTERLACED(mb_type)
6909             && IS_INTERLACED(mbm_type)
6910             ) {
6911             // This is a special case in the norm where the filtering must
6912             // be done twice (one each of the field) even if we are in a
6913             // frame macroblock.
6914             //
6915             static const int nnz_idx[4] = {4,5,6,3};
6916             unsigned int tmp_linesize   = 2 *   linesize;
6917             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6918             int mbn_xy = mb_xy - 2 * s->mb_stride;
6919             int qp;
6920             int i, j;
6921             int16_t bS[4];
6922
6923             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6924                 if( IS_INTRA(mb_type) ||
6925                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6926                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6927                 } else {
6928                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6929                     for( i = 0; i < 4; i++ ) {
6930                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6931                             mbn_nnz[nnz_idx[i]] != 0 )
6932                             bS[i] = 2;
6933                         else
6934                             bS[i] = 1;
6935                     }
6936                 }
6937                 // Do not use s->qscale as luma quantizer because it has not the same
6938                 // value in IPCM macroblocks.
6939                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6940                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6941                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6942                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6943                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6944                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6945                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6946                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6947             }
6948
6949             start = 1;
6950         }
6951
6952         /* Calculate bS */
6953         for( edge = start; edge < edges; edge++ ) {
6954             /* mbn_xy: neighbor macroblock */
6955             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6956             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6957             int16_t bS[4];
6958             int qp;
6959
6960             if( (edge&1) && IS_8x8DCT(mb_type) )
6961                 continue;
6962
6963             if( IS_INTRA(mb_type) ||
6964                 IS_INTRA(mbn_type) ) {
6965                 int value;
6966                 if (edge == 0) {
6967                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6968                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6969                     ) {
6970                         value = 4;
6971                     } else {
6972                         value = 3;
6973                     }
6974                 } else {
6975                     value = 3;
6976                 }
6977                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6978             } else {
6979                 int i, l;
6980                 int mv_done;
6981
6982                 if( edge & mask_edge ) {
6983                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6984                     mv_done = 1;
6985                 }
6986                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6987                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6988                     mv_done = 1;
6989                 }
6990                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6991                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6992                     int bn_idx= b_idx - (dir ? 8:1);
6993                     int v = 0;
6994                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6995                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6996                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6997                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6998                     }
6999                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7000                     mv_done = 1;
7001                 }
7002                 else
7003                     mv_done = 0;
7004
7005                 for( i = 0; i < 4; i++ ) {
7006                     int x = dir == 0 ? edge : i;
7007                     int y = dir == 0 ? i    : edge;
7008                     int b_idx= 8 + 4 + x + 8*y;
7009                     int bn_idx= b_idx - (dir ? 8:1);
7010
7011                     if( h->non_zero_count_cache[b_idx] != 0 ||
7012                         h->non_zero_count_cache[bn_idx] != 0 ) {
7013                         bS[i] = 2;
7014                     }
7015                     else if(!mv_done)
7016                     {
7017                         bS[i] = 0;
7018                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7019                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7020                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7021                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7022                                 bS[i] = 1;
7023                                 break;
7024                             }
7025                         }
7026                     }
7027                 }
7028
7029                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7030                     continue;
7031             }
7032
7033             /* Filter edge */
7034             // Do not use s->qscale as luma quantizer because it has not the same
7035             // value in IPCM macroblocks.
7036             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7037             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7038             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7039             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7040             if( dir == 0 ) {
7041                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7042                 if( (edge&1) == 0 ) {
7043                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
7044                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7045                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
7046                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7047                 }
7048             } else {
7049                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7050                 if( (edge&1) == 0 ) {
7051                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
7052                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7053                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
7054                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7055                 }
7056             }
7057         }
7058     }
7059 }
7060
7061 static int decode_slice(H264Context *h){
7062     MpegEncContext * const s = &h->s;
7063     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7064
7065     s->mb_skip_run= -1;
7066
7067     if( h->pps.cabac ) {
7068         int i;
7069
7070         /* realign */
7071         align_get_bits( &s->gb );
7072
7073         /* init cabac */
7074         ff_init_cabac_states( &h->cabac);
7075         ff_init_cabac_decoder( &h->cabac,
7076                                s->gb.buffer + get_bits_count(&s->gb)/8,
7077                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7078         /* calculate pre-state */
7079         for( i= 0; i < 460; i++ ) {
7080             int pre;
7081             if( h->slice_type == I_TYPE )
7082                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7083             else
7084                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7085
7086             if( pre <= 63 )
7087                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7088             else
7089                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7090         }
7091
7092         for(;;){
7093 //START_TIMER
7094             int ret = decode_mb_cabac(h);
7095             int eos;
7096 //STOP_TIMER("decode_mb_cabac")
7097
7098             if(ret>=0) hl_decode_mb(h);
7099
7100             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7101                 s->mb_y++;
7102
7103                 if(ret>=0) ret = decode_mb_cabac(h);
7104
7105                 if(ret>=0) hl_decode_mb(h);
7106                 s->mb_y--;
7107             }
7108             eos = get_cabac_terminate( &h->cabac );
7109
7110             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7111                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7112                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7113                 return -1;
7114             }
7115
7116             if( ++s->mb_x >= s->mb_width ) {
7117                 s->mb_x = 0;
7118                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7119                 ++s->mb_y;
7120                 if(FRAME_MBAFF) {
7121                     ++s->mb_y;
7122                 }
7123             }
7124
7125             if( eos || s->mb_y >= s->mb_height ) {
7126                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7127                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7128                 return 0;
7129             }
7130         }
7131
7132     } else {
7133         for(;;){
7134             int ret = decode_mb_cavlc(h);
7135
7136             if(ret>=0) hl_decode_mb(h);
7137
7138             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7139                 s->mb_y++;
7140                 ret = decode_mb_cavlc(h);
7141
7142                 if(ret>=0) hl_decode_mb(h);
7143                 s->mb_y--;
7144             }
7145
7146             if(ret<0){
7147                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7148                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7149
7150                 return -1;
7151             }
7152
7153             if(++s->mb_x >= s->mb_width){
7154                 s->mb_x=0;
7155                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7156                 ++s->mb_y;
7157                 if(FRAME_MBAFF) {
7158                     ++s->mb_y;
7159                 }
7160                 if(s->mb_y >= s->mb_height){
7161                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7162
7163                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7164                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7165
7166                         return 0;
7167                     }else{
7168                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7169
7170                         return -1;
7171                     }
7172                 }
7173             }
7174
7175             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7176                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7177                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7178                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7179
7180                     return 0;
7181                 }else{
7182                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7183
7184                     return -1;
7185                 }
7186             }
7187         }
7188     }
7189
7190 #if 0
7191     for(;s->mb_y < s->mb_height; s->mb_y++){
7192         for(;s->mb_x < s->mb_width; s->mb_x++){
7193             int ret= decode_mb(h);
7194
7195             hl_decode_mb(h);
7196
7197             if(ret<0){
7198                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7199                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7200
7201                 return -1;
7202             }
7203
7204             if(++s->mb_x >= s->mb_width){
7205                 s->mb_x=0;
7206                 if(++s->mb_y >= s->mb_height){
7207                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7208                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7209
7210                         return 0;
7211                     }else{
7212                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7213
7214                         return -1;
7215                     }
7216                 }
7217             }
7218
7219             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7220                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7221                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7222
7223                     return 0;
7224                 }else{
7225                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7226
7227                     return -1;
7228                 }
7229             }
7230         }
7231         s->mb_x=0;
7232         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7233     }
7234 #endif
7235     return -1; //not reached
7236 }
7237
7238 static int decode_unregistered_user_data(H264Context *h, int size){
7239     MpegEncContext * const s = &h->s;
7240     uint8_t user_data[16+256];
7241     int e, build, i;
7242
7243     if(size<16)
7244         return -1;
7245
7246     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7247         user_data[i]= get_bits(&s->gb, 8);
7248     }
7249
7250     user_data[i]= 0;
7251     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7252     if(e==1 && build>=0)
7253         h->x264_build= build;
7254
7255     if(s->avctx->debug & FF_DEBUG_BUGS)
7256         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7257
7258     for(; i<size; i++)
7259         skip_bits(&s->gb, 8);
7260
7261     return 0;
7262 }
7263
7264 static int decode_sei(H264Context *h){
7265     MpegEncContext * const s = &h->s;
7266
7267     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7268         int size, type;
7269
7270         type=0;
7271         do{
7272             type+= show_bits(&s->gb, 8);
7273         }while(get_bits(&s->gb, 8) == 255);
7274
7275         size=0;
7276         do{
7277             size+= show_bits(&s->gb, 8);
7278         }while(get_bits(&s->gb, 8) == 255);
7279
7280         switch(type){
7281         case 5:
7282             if(decode_unregistered_user_data(h, size) < 0)
7283                 return -1;
7284             break;
7285         default:
7286             skip_bits(&s->gb, 8*size);
7287         }
7288
7289         //FIXME check bits here
7290         align_get_bits(&s->gb);
7291     }
7292
7293     return 0;
7294 }
7295
7296 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7297     MpegEncContext * const s = &h->s;
7298     int cpb_count, i;
7299     cpb_count = get_ue_golomb(&s->gb) + 1;
7300     get_bits(&s->gb, 4); /* bit_rate_scale */
7301     get_bits(&s->gb, 4); /* cpb_size_scale */
7302     for(i=0; i<cpb_count; i++){
7303         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7304         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7305         get_bits1(&s->gb);     /* cbr_flag */
7306     }
7307     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7308     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7309     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7310     get_bits(&s->gb, 5); /* time_offset_length */
7311 }
7312
7313 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7314     MpegEncContext * const s = &h->s;
7315     int aspect_ratio_info_present_flag;
7316     unsigned int aspect_ratio_idc;
7317     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7318
7319     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7320
7321     if( aspect_ratio_info_present_flag ) {
7322         aspect_ratio_idc= get_bits(&s->gb, 8);
7323         if( aspect_ratio_idc == EXTENDED_SAR ) {
7324             sps->sar.num= get_bits(&s->gb, 16);
7325             sps->sar.den= get_bits(&s->gb, 16);
7326         }else if(aspect_ratio_idc < 14){
7327             sps->sar=  pixel_aspect[aspect_ratio_idc];
7328         }else{
7329             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7330             return -1;
7331         }
7332     }else{
7333         sps->sar.num=
7334         sps->sar.den= 0;
7335     }
7336 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7337
7338     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7339         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7340     }
7341
7342     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7343         get_bits(&s->gb, 3);    /* video_format */
7344         get_bits1(&s->gb);      /* video_full_range_flag */
7345         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7346             get_bits(&s->gb, 8); /* colour_primaries */
7347             get_bits(&s->gb, 8); /* transfer_characteristics */
7348             get_bits(&s->gb, 8); /* matrix_coefficients */
7349         }
7350     }
7351
7352     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7353         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7354         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7355     }
7356
7357     sps->timing_info_present_flag = get_bits1(&s->gb);
7358     if(sps->timing_info_present_flag){
7359         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7360         sps->time_scale = get_bits_long(&s->gb, 32);
7361         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7362     }
7363
7364     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7365     if(nal_hrd_parameters_present_flag)
7366         decode_hrd_parameters(h, sps);
7367     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7368     if(vcl_hrd_parameters_present_flag)
7369         decode_hrd_parameters(h, sps);
7370     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7371         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7372     get_bits1(&s->gb);         /* pic_struct_present_flag */
7373
7374     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7375     if(sps->bitstream_restriction_flag){
7376         unsigned int num_reorder_frames;
7377         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7378         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7379         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7380         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7381         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7382         num_reorder_frames= get_ue_golomb(&s->gb);
7383         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7384
7385         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7386             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7387             return -1;
7388         }
7389
7390         sps->num_reorder_frames= num_reorder_frames;
7391     }
7392
7393     return 0;
7394 }
7395
7396 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7397                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7398     MpegEncContext * const s = &h->s;
7399     int i, last = 8, next = 8;
7400     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7401     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7402         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7403     else
7404     for(i=0;i<size;i++){
7405         if(next)
7406             next = (last + get_se_golomb(&s->gb)) & 0xff;
7407         if(!i && !next){ /* matrix not written, we use the preset one */
7408             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7409             break;
7410         }
7411         last = factors[scan[i]] = next ? next : last;
7412     }
7413 }
7414
7415 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7416                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7417     MpegEncContext * const s = &h->s;
7418     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7419     const uint8_t *fallback[4] = {
7420         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7421         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7422         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7423         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7424     };
7425     if(get_bits1(&s->gb)){
7426         sps->scaling_matrix_present |= is_sps;
7427         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7428         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7429         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7430         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7431         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7432         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7433         if(is_sps || pps->transform_8x8_mode){
7434             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7435             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7436         }
7437     } else if(fallback_sps) {
7438         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7439         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7440     }
7441 }
7442
7443 /**
7444  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7445  */
7446 static void *
7447 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7448                     const size_t size, const char *name)
7449 {
7450     if(id>=max) {
7451         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7452         return NULL;
7453     }
7454
7455     if(!vec[id]) {
7456         vec[id] = av_mallocz(size);
7457         if(vec[id] == NULL)
7458             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7459     }
7460     return vec[id];
7461 }
7462
7463 static inline int decode_seq_parameter_set(H264Context *h){
7464     MpegEncContext * const s = &h->s;
7465     int profile_idc, level_idc;
7466     unsigned int sps_id, tmp, mb_width, mb_height;
7467     int i;
7468     SPS *sps;
7469
7470     profile_idc= get_bits(&s->gb, 8);
7471     get_bits1(&s->gb);   //constraint_set0_flag
7472     get_bits1(&s->gb);   //constraint_set1_flag
7473     get_bits1(&s->gb);   //constraint_set2_flag
7474     get_bits1(&s->gb);   //constraint_set3_flag
7475     get_bits(&s->gb, 4); // reserved
7476     level_idc= get_bits(&s->gb, 8);
7477     sps_id= get_ue_golomb(&s->gb);
7478
7479     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7480     if(sps == NULL)
7481         return -1;
7482
7483     sps->profile_idc= profile_idc;
7484     sps->level_idc= level_idc;
7485
7486     if(sps->profile_idc >= 100){ //high profile
7487         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7488             get_bits1(&s->gb);  //residual_color_transform_flag
7489         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7490         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7491         sps->transform_bypass = get_bits1(&s->gb);
7492         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7493     }else
7494         sps->scaling_matrix_present = 0;
7495
7496     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7497     sps->poc_type= get_ue_golomb(&s->gb);
7498
7499     if(sps->poc_type == 0){ //FIXME #define
7500         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7501     } else if(sps->poc_type == 1){//FIXME #define
7502         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7503         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7504         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7505         tmp= get_ue_golomb(&s->gb);
7506
7507         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7508             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7509             return -1;
7510         }
7511         sps->poc_cycle_length= tmp;
7512
7513         for(i=0; i<sps->poc_cycle_length; i++)
7514             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7515     }else if(sps->poc_type != 2){
7516         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7517         return -1;
7518     }
7519
7520     tmp= get_ue_golomb(&s->gb);
7521     if(tmp > MAX_PICTURE_COUNT-2){
7522         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7523     }
7524     sps->ref_frame_count= tmp;
7525     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7526     mb_width= get_ue_golomb(&s->gb) + 1;
7527     mb_height= get_ue_golomb(&s->gb) + 1;
7528     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7529        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7530         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7531         return -1;
7532     }
7533     sps->mb_width = mb_width;
7534     sps->mb_height= mb_height;
7535
7536     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7537     if(!sps->frame_mbs_only_flag)
7538         sps->mb_aff= get_bits1(&s->gb);
7539     else
7540         sps->mb_aff= 0;
7541
7542     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7543
7544 #ifndef ALLOW_INTERLACE
7545     if(sps->mb_aff)
7546         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7547 #endif
7548     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7549         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7550
7551     sps->crop= get_bits1(&s->gb);
7552     if(sps->crop){
7553         sps->crop_left  = get_ue_golomb(&s->gb);
7554         sps->crop_right = get_ue_golomb(&s->gb);
7555         sps->crop_top   = get_ue_golomb(&s->gb);
7556         sps->crop_bottom= get_ue_golomb(&s->gb);
7557         if(sps->crop_left || sps->crop_top){
7558             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7559         }
7560     }else{
7561         sps->crop_left  =
7562         sps->crop_right =
7563         sps->crop_top   =
7564         sps->crop_bottom= 0;
7565     }
7566
7567     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7568     if( sps->vui_parameters_present_flag )
7569         decode_vui_parameters(h, sps);
7570
7571     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7572         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7573                sps_id, sps->profile_idc, sps->level_idc,
7574                sps->poc_type,
7575                sps->ref_frame_count,
7576                sps->mb_width, sps->mb_height,
7577                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7578                sps->direct_8x8_inference_flag ? "8B8" : "",
7579                sps->crop_left, sps->crop_right,
7580                sps->crop_top, sps->crop_bottom,
7581                sps->vui_parameters_present_flag ? "VUI" : ""
7582                );
7583     }
7584     return 0;
7585 }
7586
7587 static void
7588 build_qp_table(PPS *pps, int t, int index)
7589 {
7590     int i;
7591     for(i = 0; i < 255; i++)
7592         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7593 }
7594
7595 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7596     MpegEncContext * const s = &h->s;
7597     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7598     PPS *pps;
7599
7600     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7601     if(pps == NULL)
7602         return -1;
7603
7604     tmp= get_ue_golomb(&s->gb);
7605     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7606         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7607         return -1;
7608     }
7609     pps->sps_id= tmp;
7610
7611     pps->cabac= get_bits1(&s->gb);
7612     pps->pic_order_present= get_bits1(&s->gb);
7613     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7614     if(pps->slice_group_count > 1 ){
7615         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7616         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7617         switch(pps->mb_slice_group_map_type){
7618         case 0:
7619 #if 0
7620 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7621 |    run_length[ i ]                                |1  |ue(v)   |
7622 #endif
7623             break;
7624         case 2:
7625 #if 0
7626 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7627 |{                                                  |   |        |
7628 |    top_left_mb[ i ]                               |1  |ue(v)   |
7629 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7630 |   }                                               |   |        |
7631 #endif
7632             break;
7633         case 3:
7634         case 4:
7635         case 5:
7636 #if 0
7637 |   slice_group_change_direction_flag               |1  |u(1)    |
7638 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7639 #endif
7640             break;
7641         case 6:
7642 #if 0
7643 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7644 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7645 |)                                                  |   |        |
7646 |    slice_group_id[ i ]                            |1  |u(v)    |
7647 #endif
7648             break;
7649         }
7650     }
7651     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7652     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7653     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7654         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7655         pps->ref_count[0]= pps->ref_count[1]= 1;
7656         return -1;
7657     }
7658
7659     pps->weighted_pred= get_bits1(&s->gb);
7660     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7661     pps->init_qp= get_se_golomb(&s->gb) + 26;
7662     pps->init_qs= get_se_golomb(&s->gb) + 26;
7663     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7664     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7665     pps->constrained_intra_pred= get_bits1(&s->gb);
7666     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7667
7668     pps->transform_8x8_mode= 0;
7669     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7670     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7671     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7672
7673     if(get_bits_count(&s->gb) < bit_length){
7674         pps->transform_8x8_mode= get_bits1(&s->gb);
7675         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7676         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7677     } else {
7678         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7679     }
7680
7681     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7682     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7683         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7684         h->pps.chroma_qp_diff= 1;
7685     } else
7686         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7687
7688     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7689         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7690                pps_id, pps->sps_id,
7691                pps->cabac ? "CABAC" : "CAVLC",
7692                pps->slice_group_count,
7693                pps->ref_count[0], pps->ref_count[1],
7694                pps->weighted_pred ? "weighted" : "",
7695                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7696                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7697                pps->constrained_intra_pred ? "CONSTR" : "",
7698                pps->redundant_pic_cnt_present ? "REDU" : "",
7699                pps->transform_8x8_mode ? "8x8DCT" : ""
7700                );
7701     }
7702
7703     return 0;
7704 }
7705
7706 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7707     MpegEncContext * const s = &h->s;
7708     AVCodecContext * const avctx= s->avctx;
7709     int buf_index=0;
7710 #if 0
7711     int i;
7712     for(i=0; i<50; i++){
7713         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7714     }
7715 #endif
7716     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7717         h->slice_num = 0;
7718         s->current_picture_ptr= NULL;
7719     }
7720
7721     for(;;){
7722         int consumed;
7723         int dst_length;
7724         int bit_length;
7725         uint8_t *ptr;
7726         int i, nalsize = 0;
7727
7728         if(h->is_avc) {
7729             if(buf_index >= buf_size) break;
7730             nalsize = 0;
7731             for(i = 0; i < h->nal_length_size; i++)
7732                 nalsize = (nalsize << 8) | buf[buf_index++];
7733             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7734                 if(nalsize == 1){
7735                     buf_index++;
7736                     continue;
7737                 }else{
7738                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7739                     break;
7740                 }
7741             }
7742         } else {
7743             // start code prefix search
7744             for(; buf_index + 3 < buf_size; buf_index++){
7745                 // This should always succeed in the first iteration.
7746                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7747                     break;
7748             }
7749
7750             if(buf_index+3 >= buf_size) break;
7751
7752             buf_index+=3;
7753         }
7754
7755         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7756         if (ptr==NULL || dst_length < 0){
7757             return -1;
7758         }
7759         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7760             dst_length--;
7761         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7762
7763         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7764             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7765         }
7766
7767         if (h->is_avc && (nalsize != consumed))
7768             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7769
7770         buf_index += consumed;
7771
7772         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7773            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7774             continue;
7775
7776         switch(h->nal_unit_type){
7777         case NAL_IDR_SLICE:
7778             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7779         case NAL_SLICE:
7780             init_get_bits(&s->gb, ptr, bit_length);
7781             h->intra_gb_ptr=
7782             h->inter_gb_ptr= &s->gb;
7783             s->data_partitioning = 0;
7784
7785             if(decode_slice_header(h) < 0){
7786                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7787                 break;
7788             }
7789             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7790             if(h->redundant_pic_count==0 && s->hurry_up < 5
7791                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7792                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7793                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7794                && avctx->skip_frame < AVDISCARD_ALL)
7795                 decode_slice(h);
7796             break;
7797         case NAL_DPA:
7798             init_get_bits(&s->gb, ptr, bit_length);
7799             h->intra_gb_ptr=
7800             h->inter_gb_ptr= NULL;
7801             s->data_partitioning = 1;
7802
7803             if(decode_slice_header(h) < 0){
7804                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7805             }
7806             break;
7807         case NAL_DPB:
7808             init_get_bits(&h->intra_gb, ptr, bit_length);
7809             h->intra_gb_ptr= &h->intra_gb;
7810             break;
7811         case NAL_DPC:
7812             init_get_bits(&h->inter_gb, ptr, bit_length);
7813             h->inter_gb_ptr= &h->inter_gb;
7814
7815             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7816                && s->context_initialized
7817                && s->hurry_up < 5
7818                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7819                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7820                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7821                && avctx->skip_frame < AVDISCARD_ALL)
7822                 decode_slice(h);
7823             break;
7824         case NAL_SEI:
7825             init_get_bits(&s->gb, ptr, bit_length);
7826             decode_sei(h);
7827             break;
7828         case NAL_SPS:
7829             init_get_bits(&s->gb, ptr, bit_length);
7830             decode_seq_parameter_set(h);
7831
7832             if(s->flags& CODEC_FLAG_LOW_DELAY)
7833                 s->low_delay=1;
7834
7835             if(avctx->has_b_frames < 2)
7836                 avctx->has_b_frames= !s->low_delay;
7837             break;
7838         case NAL_PPS:
7839             init_get_bits(&s->gb, ptr, bit_length);
7840
7841             decode_picture_parameter_set(h, bit_length);
7842
7843             break;
7844         case NAL_AUD:
7845         case NAL_END_SEQUENCE:
7846         case NAL_END_STREAM:
7847         case NAL_FILLER_DATA:
7848         case NAL_SPS_EXT:
7849         case NAL_AUXILIARY_SLICE:
7850             break;
7851         default:
7852             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7853         }
7854     }
7855
7856     return buf_index;
7857 }
7858
7859 /**
7860  * returns the number of bytes consumed for building the current frame
7861  */
7862 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7863     if(s->flags&CODEC_FLAG_TRUNCATED){
7864         pos -= s->parse_context.last_index;
7865         if(pos<0) pos=0; // FIXME remove (unneeded?)
7866
7867         return pos;
7868     }else{
7869         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7870         if(pos+10>buf_size) pos=buf_size; // oops ;)
7871
7872         return pos;
7873     }
7874 }
7875
7876 static int decode_frame(AVCodecContext *avctx,
7877                              void *data, int *data_size,
7878                              uint8_t *buf, int buf_size)
7879 {
7880     H264Context *h = avctx->priv_data;
7881     MpegEncContext *s = &h->s;
7882     AVFrame *pict = data;
7883     int buf_index;
7884
7885     s->flags= avctx->flags;
7886     s->flags2= avctx->flags2;
7887
7888    /* no supplementary picture */
7889     if (buf_size == 0) {
7890         Picture *out;
7891         int i, out_idx;
7892
7893 //FIXME factorize this with the output code below
7894         out = h->delayed_pic[0];
7895         out_idx = 0;
7896         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7897             if(h->delayed_pic[i]->poc < out->poc){
7898                 out = h->delayed_pic[i];
7899                 out_idx = i;
7900             }
7901
7902         for(i=out_idx; h->delayed_pic[i]; i++)
7903             h->delayed_pic[i] = h->delayed_pic[i+1];
7904
7905         if(out){
7906             *data_size = sizeof(AVFrame);
7907             *pict= *(AVFrame*)out;
7908         }
7909
7910         return 0;
7911     }
7912
7913     if(s->flags&CODEC_FLAG_TRUNCATED){
7914         int next= ff_h264_find_frame_end(h, buf, buf_size);
7915
7916         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7917             return buf_size;
7918 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7919     }
7920
7921     if(h->is_avc && !h->got_avcC) {
7922         int i, cnt, nalsize;
7923         unsigned char *p = avctx->extradata;
7924         if(avctx->extradata_size < 7) {
7925             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7926             return -1;
7927         }
7928         if(*p != 1) {
7929             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7930             return -1;
7931         }
7932         /* sps and pps in the avcC always have length coded with 2 bytes,
7933            so put a fake nal_length_size = 2 while parsing them */
7934         h->nal_length_size = 2;
7935         // Decode sps from avcC
7936         cnt = *(p+5) & 0x1f; // Number of sps
7937         p += 6;
7938         for (i = 0; i < cnt; i++) {
7939             nalsize = AV_RB16(p) + 2;
7940             if(decode_nal_units(h, p, nalsize) < 0) {
7941                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7942                 return -1;
7943             }
7944             p += nalsize;
7945         }
7946         // Decode pps from avcC
7947         cnt = *(p++); // Number of pps
7948         for (i = 0; i < cnt; i++) {
7949             nalsize = AV_RB16(p) + 2;
7950             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7951                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7952                 return -1;
7953             }
7954             p += nalsize;
7955         }
7956         // Now store right nal length size, that will be use to parse all other nals
7957         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7958         // Do not reparse avcC
7959         h->got_avcC = 1;
7960     }
7961
7962     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7963         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7964             return -1;
7965     }
7966
7967     buf_index=decode_nal_units(h, buf, buf_size);
7968     if(buf_index < 0)
7969         return -1;
7970
7971     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7972         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7973         return -1;
7974     }
7975
7976     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7977         Picture *out = s->current_picture_ptr;
7978         Picture *cur = s->current_picture_ptr;
7979         Picture *prev = h->delayed_output_pic;
7980         int i, pics, cross_idr, out_of_order, out_idx;
7981
7982         s->mb_y= 0;
7983
7984         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7985         s->current_picture_ptr->pict_type= s->pict_type;
7986
7987         h->prev_frame_num_offset= h->frame_num_offset;
7988         h->prev_frame_num= h->frame_num;
7989         if(s->current_picture_ptr->reference){
7990             h->prev_poc_msb= h->poc_msb;
7991             h->prev_poc_lsb= h->poc_lsb;
7992         }
7993         if(s->current_picture_ptr->reference)
7994             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7995
7996         ff_er_frame_end(s);
7997
7998         MPV_frame_end(s);
7999
8000     //FIXME do something with unavailable reference frames
8001
8002 #if 0 //decode order
8003         *data_size = sizeof(AVFrame);
8004 #else
8005         /* Sort B-frames into display order */
8006
8007         if(h->sps.bitstream_restriction_flag
8008            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8009             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8010             s->low_delay = 0;
8011         }
8012
8013         pics = 0;
8014         while(h->delayed_pic[pics]) pics++;
8015
8016         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
8017
8018         h->delayed_pic[pics++] = cur;
8019         if(cur->reference == 0)
8020             cur->reference = 1;
8021
8022         cross_idr = 0;
8023         for(i=0; h->delayed_pic[i]; i++)
8024             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8025                 cross_idr = 1;
8026
8027         out = h->delayed_pic[0];
8028         out_idx = 0;
8029         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8030             if(h->delayed_pic[i]->poc < out->poc){
8031                 out = h->delayed_pic[i];
8032                 out_idx = i;
8033             }
8034
8035         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8036         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8037             { }
8038         else if(prev && pics <= s->avctx->has_b_frames)
8039             out = prev;
8040         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8041            || (s->low_delay &&
8042             ((!cross_idr && prev && out->poc > prev->poc + 2)
8043              || cur->pict_type == B_TYPE)))
8044         {
8045             s->low_delay = 0;
8046             s->avctx->has_b_frames++;
8047             out = prev;
8048         }
8049         else if(out_of_order)
8050             out = prev;
8051
8052         if(out_of_order || pics > s->avctx->has_b_frames){
8053             for(i=out_idx; h->delayed_pic[i]; i++)
8054                 h->delayed_pic[i] = h->delayed_pic[i+1];
8055         }
8056
8057         if(prev == out)
8058             *data_size = 0;
8059         else
8060             *data_size = sizeof(AVFrame);
8061         if(prev && prev != out && prev->reference == 1)
8062             prev->reference = 0;
8063         h->delayed_output_pic = out;
8064 #endif
8065
8066         if(out)
8067             *pict= *(AVFrame*)out;
8068         else
8069             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8070     }
8071
8072     assert(pict->data[0] || !*data_size);
8073     ff_print_debug_info(s, pict);
8074 //printf("out %d\n", (int)pict->data[0]);
8075 #if 0 //?
8076
8077     /* Return the Picture timestamp as the frame number */
8078     /* we substract 1 because it is added on utils.c    */
8079     avctx->frame_number = s->picture_number - 1;
8080 #endif
8081     return get_consumed_bytes(s, buf_index, buf_size);
8082 }
8083 #if 0
8084 static inline void fill_mb_avail(H264Context *h){
8085     MpegEncContext * const s = &h->s;
8086     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8087
8088     if(s->mb_y){
8089         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8090         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
8091         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8092     }else{
8093         h->mb_avail[0]=
8094         h->mb_avail[1]=
8095         h->mb_avail[2]= 0;
8096     }
8097     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8098     h->mb_avail[4]= 1; //FIXME move out
8099     h->mb_avail[5]= 0; //FIXME move out
8100 }
8101 #endif
8102
8103 #if 0 //selftest
8104 #define COUNT 8000
8105 #define SIZE (COUNT*40)
8106 int main(){
8107     int i;
8108     uint8_t temp[SIZE];
8109     PutBitContext pb;
8110     GetBitContext gb;
8111 //    int int_temp[10000];
8112     DSPContext dsp;
8113     AVCodecContext avctx;
8114
8115     dsputil_init(&dsp, &avctx);
8116
8117     init_put_bits(&pb, temp, SIZE);
8118     printf("testing unsigned exp golomb\n");
8119     for(i=0; i<COUNT; i++){
8120         START_TIMER
8121         set_ue_golomb(&pb, i);
8122         STOP_TIMER("set_ue_golomb");
8123     }
8124     flush_put_bits(&pb);
8125
8126     init_get_bits(&gb, temp, 8*SIZE);
8127     for(i=0; i<COUNT; i++){
8128         int j, s;
8129
8130         s= show_bits(&gb, 24);
8131
8132         START_TIMER
8133         j= get_ue_golomb(&gb);
8134         if(j != i){
8135             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8136 //            return -1;
8137         }
8138         STOP_TIMER("get_ue_golomb");
8139     }
8140
8141
8142     init_put_bits(&pb, temp, SIZE);
8143     printf("testing signed exp golomb\n");
8144     for(i=0; i<COUNT; i++){
8145         START_TIMER
8146         set_se_golomb(&pb, i - COUNT/2);
8147         STOP_TIMER("set_se_golomb");
8148     }
8149     flush_put_bits(&pb);
8150
8151     init_get_bits(&gb, temp, 8*SIZE);
8152     for(i=0; i<COUNT; i++){
8153         int j, s;
8154
8155         s= show_bits(&gb, 24);
8156
8157         START_TIMER
8158         j= get_se_golomb(&gb);
8159         if(j != i - COUNT/2){
8160             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8161 //            return -1;
8162         }
8163         STOP_TIMER("get_se_golomb");
8164     }
8165
8166     printf("testing 4x4 (I)DCT\n");
8167
8168     DCTELEM block[16];
8169     uint8_t src[16], ref[16];
8170     uint64_t error= 0, max_error=0;
8171
8172     for(i=0; i<COUNT; i++){
8173         int j;
8174 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8175         for(j=0; j<16; j++){
8176             ref[j]= random()%255;
8177             src[j]= random()%255;
8178         }
8179
8180         h264_diff_dct_c(block, src, ref, 4);
8181
8182         //normalize
8183         for(j=0; j<16; j++){
8184 //            printf("%d ", block[j]);
8185             block[j]= block[j]*4;
8186             if(j&1) block[j]= (block[j]*4 + 2)/5;
8187             if(j&4) block[j]= (block[j]*4 + 2)/5;
8188         }
8189 //        printf("\n");
8190
8191         s->dsp.h264_idct_add(ref, block, 4);
8192 /*        for(j=0; j<16; j++){
8193             printf("%d ", ref[j]);
8194         }
8195         printf("\n");*/
8196
8197         for(j=0; j<16; j++){
8198             int diff= FFABS(src[j] - ref[j]);
8199
8200             error+= diff*diff;
8201             max_error= FFMAX(max_error, diff);
8202         }
8203     }
8204     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8205 #if 0
8206     printf("testing quantizer\n");
8207     for(qp=0; qp<52; qp++){
8208         for(i=0; i<16; i++)
8209             src1_block[i]= src2_block[i]= random()%255;
8210
8211     }
8212 #endif
8213     printf("Testing NAL layer\n");
8214
8215     uint8_t bitstream[COUNT];
8216     uint8_t nal[COUNT*2];
8217     H264Context h;
8218     memset(&h, 0, sizeof(H264Context));
8219
8220     for(i=0; i<COUNT; i++){
8221         int zeros= i;
8222         int nal_length;
8223         int consumed;
8224         int out_length;
8225         uint8_t *out;
8226         int j;
8227
8228         for(j=0; j<COUNT; j++){
8229             bitstream[j]= (random() % 255) + 1;
8230         }
8231
8232         for(j=0; j<zeros; j++){
8233             int pos= random() % COUNT;
8234             while(bitstream[pos] == 0){
8235                 pos++;
8236                 pos %= COUNT;
8237             }
8238             bitstream[pos]=0;
8239         }
8240
8241         START_TIMER
8242
8243         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8244         if(nal_length<0){
8245             printf("encoding failed\n");
8246             return -1;
8247         }
8248
8249         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8250
8251         STOP_TIMER("NAL")
8252
8253         if(out_length != COUNT){
8254             printf("incorrect length %d %d\n", out_length, COUNT);
8255             return -1;
8256         }
8257
8258         if(consumed != nal_length){
8259             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8260             return -1;
8261         }
8262
8263         if(memcmp(bitstream, out, COUNT)){
8264             printf("mismatch\n");
8265             return -1;
8266         }
8267     }
8268
8269     printf("Testing RBSP\n");
8270
8271
8272     return 0;
8273 }
8274 #endif
8275
8276
8277 static int decode_end(AVCodecContext *avctx)
8278 {
8279     H264Context *h = avctx->priv_data;
8280     MpegEncContext *s = &h->s;
8281
8282     av_freep(&h->rbsp_buffer[0]);
8283     av_freep(&h->rbsp_buffer[1]);
8284     free_tables(h); //FIXME cleanup init stuff perhaps
8285     MPV_common_end(s);
8286
8287 //    memset(h, 0, sizeof(H264Context));
8288
8289     return 0;
8290 }
8291
8292
8293 AVCodec h264_decoder = {
8294     "h264",
8295     CODEC_TYPE_VIDEO,
8296     CODEC_ID_H264,
8297     sizeof(H264Context),
8298     decode_init,
8299     NULL,
8300     decode_end,
8301     decode_frame,
8302     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8303     .flush= flush_dpb,
8304 };
8305
8306 #include "svq3.c"