git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  /* File-local VLC tables. They are only declared here; their initialisation
   * and use are outside this part of the file.
   * NOTE(review): the names suggest the coefficient-token, total-zeros and
   * run-before codes of the CAVLC residual syntax -- confirm against the
   * code that initialises them. */
  41 static VLC coeff_token_vlc[4];
  42 static VLC chroma_dc_coeff_token_vlc;
  43
  44 static VLC total_zeros_vlc[15];
  45 static VLC chroma_dc_total_zeros_vlc[3];
  46
  47 static VLC run_vlc[6];
  48 static VLC run7_vlc;
  49
  /* Forward declarations of static functions whose definitions are not in
   * this part of the file (presumably further down in the translation
   * unit -- confirm). filter_mb() and filter_mb_fast() share one signature. */
  50 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  51 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  52 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  53 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  54
  55 static av_always_inline uint32_t pack16to32(int a, int b){
  56 #ifdef WORDS_BIGENDIAN
  57    return (b&0xFFFF) + (a<<16);
  58 #else
  59    return (a&0xFFFF) + (b<<16);
  60 #endif
  61 }
  62
  63 const uint8_t ff_rem6[52]={
  64 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  65 };
  66
  67 const uint8_t ff_div6[52]={
  68 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  69 };
  70
  71
  72 /**
  73  * fill a rectangle.
  74  * @param h height of the rectangle, should be a constant
  75  * @param w width of the rectangle, should be a constant
  76  * @param size the size of val (1 or 4), should be a constant
  77  */
  78 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
  79     uint8_t *p= (uint8_t*)vp;
  80     assert(size==1 || size==4);
  81     assert(w<=4);
  82
  83     w      *= size;
  84     stride *= size;
  85
  86     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
  87     assert((stride&(w-1))==0);
  88     if(w==2){
  89         const uint16_t v= size==4 ? val : val*0x0101;
  90         *(uint16_t*)(p + 0*stride)= v;
  91         if(h==1) return;
  92         *(uint16_t*)(p + 1*stride)= v;
  93         if(h==2) return;
  94         *(uint16_t*)(p + 2*stride)=
  95         *(uint16_t*)(p + 3*stride)= v;
  96     }else if(w==4){
  97         const uint32_t v= size==4 ? val : val*0x01010101;
  98         *(uint32_t*)(p + 0*stride)= v;
  99         if(h==1) return;
 100         *(uint32_t*)(p + 1*stride)= v;
 101         if(h==2) return;
 102         *(uint32_t*)(p + 2*stride)=
 103         *(uint32_t*)(p + 3*stride)= v;
 104     }else if(w==8){
 105     //gcc can't optimize 64bit math on x86_32
 106 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 107         const uint64_t v= val*0x0100000001ULL;
 108         *(uint64_t*)(p + 0*stride)= v;
 109         if(h==1) return;
 110         *(uint64_t*)(p + 1*stride)= v;
 111         if(h==2) return;
 112         *(uint64_t*)(p + 2*stride)=
 113         *(uint64_t*)(p + 3*stride)= v;
 114     }else if(w==16){
 115         const uint64_t v= val*0x0100000001ULL;
 116         *(uint64_t*)(p + 0+0*stride)=
 117         *(uint64_t*)(p + 8+0*stride)=
 118         *(uint64_t*)(p + 0+1*stride)=
 119         *(uint64_t*)(p + 8+1*stride)= v;
 120         if(h==2) return;
 121         *(uint64_t*)(p + 0+2*stride)=
 122         *(uint64_t*)(p + 8+2*stride)=
 123         *(uint64_t*)(p + 0+3*stride)=
 124         *(uint64_t*)(p + 8+3*stride)= v;
 125 #else
 126         *(uint32_t*)(p + 0+0*stride)=
 127         *(uint32_t*)(p + 4+0*stride)= val;
 128         if(h==1) return;
 129         *(uint32_t*)(p + 0+1*stride)=
 130         *(uint32_t*)(p + 4+1*stride)= val;
 131         if(h==2) return;
 132         *(uint32_t*)(p + 0+2*stride)=
 133         *(uint32_t*)(p + 4+2*stride)=
 134         *(uint32_t*)(p + 0+3*stride)=
 135         *(uint32_t*)(p + 4+3*stride)= val;
 136     }else if(w==16){
 137         *(uint32_t*)(p + 0+0*stride)=
 138         *(uint32_t*)(p + 4+0*stride)=
 139         *(uint32_t*)(p + 8+0*stride)=
 140         *(uint32_t*)(p +12+0*stride)=
 141         *(uint32_t*)(p + 0+1*stride)=
 142         *(uint32_t*)(p + 4+1*stride)=
 143         *(uint32_t*)(p + 8+1*stride)=
 144         *(uint32_t*)(p +12+1*stride)= val;
 145         if(h==2) return;
 146         *(uint32_t*)(p + 0+2*stride)=
 147         *(uint32_t*)(p + 4+2*stride)=
 148         *(uint32_t*)(p + 8+2*stride)=
 149         *(uint32_t*)(p +12+2*stride)=
 150         *(uint32_t*)(p + 0+3*stride)=
 151         *(uint32_t*)(p + 4+3*stride)=
 152         *(uint32_t*)(p + 8+3*stride)=
 153         *(uint32_t*)(p +12+3*stride)= val;
 154 #endif
 155     }else
 156         assert(0);
 157     assert(h==4);
 158 }
 159
 /**
  * Load the neighbour information of the current macroblock into the
  * per-macroblock caches of h.
  *
  * Determines the indices of the top-left, top, top-right and the two left
  * neighbour macroblocks (with the MBAFF adjustments when FRAME_MBAFF is
  * set), stores the top/left indices in h->top_mb_xy and h->left_mb_xy[],
  * and fills the border cells of intra4x4_pred_mode_cache,
  * non_zero_count_cache, mv_cache and ref_cache from them. With CABAC it
  * also sets h->top_cbp / h->left_cbp and loads mvd_cache; in B slices
  * (with CABAC) direct_cache as well. Unless it returned early it finally
  * sets h->neighbor_transform_size.
  *
  * @param h decoder context; the macroblock position is read from h->s.mb_x / h->s.mb_y
  * @param mb_type type flags of the current macroblock
  * @param for_deblock if non-zero, neighbours are accepted from any slice
  *        (slice_table entry < 255) instead of only the current slice, the
  *        diagonal neighbour types are forced to 0, and the function may
  *        return without loading anything (see the first check below)
  */
 160 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 161     MpegEncContext * const s = &h->s;
 162     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 163     int topleft_xy, top_xy, topright_xy, left_xy[2];
 164     int topleft_type, top_type, topright_type, left_type[2];
 165     int left_block[8];
 166     int i;
 167
         // Deblocking call without MBAFF: nothing is loaded when slice_num is 1 or
         // the macroblock one row up has the same slice_table entry as this one.
         // NOTE(review): presumably the caches filled while decoding are still
         // usable in that case -- confirm against the caller.
 168     //FIXME deblocking could skip the intra and nnz parts.
 169     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 170         return;
 171
 172     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 173
         // Neighbour indices for the plain (non-MBAFF) case: the row above and
         // the macroblock to the left, used for both left_xy entries.
 174     top_xy     = mb_xy  - s->mb_stride;
 175     topleft_xy = top_xy - 1;
 176     topright_xy= top_xy + 1;
 177     left_xy[1] = left_xy[0] = mb_xy-1;
         // left_block[0..3] select the entries/rows of the left neighbours that
         // feed the four left-border cache rows (intra modes, nnz, mv/ref/mvd);
         // left_block[4..7] are only used further down to index non_zero_count
         // for the cache cells in column 0.
 178     left_block[0]= 0;
 179     left_block[1]= 1;
 180     left_block[2]= 2;
 181     left_block[3]= 3;
 182     left_block[4]= 7;
 183     left_block[5]= 10;
 184     left_block[6]= 8;
 185     left_block[7]= 11;
 186     if(FRAME_MBAFF){
             // pair_xy is the top macroblock of the current macroblock pair
             // (mb_y rounded down to even); the *_pair_xy are the pairs around it.
 187         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 188         const int top_pair_xy      = pair_xy     - s->mb_stride;
 189         const int topleft_pair_xy  = top_pair_xy - 1;
 190         const int topright_pair_xy = top_pair_xy + 1;
             // *_mb_frame_flag is 1 when the respective macroblock type is not interlaced
 191         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 192         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 193         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 194         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 195         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
             // 1 for the bottom macroblock of the pair (odd mb_y)
 196         const int bottom = (s->mb_y & 1);
 197         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             // For an interlaced current macroblock the top neighbour lies one more
             // row up if this is the bottom macroblock of its pair, or if it is the
             // top macroblock and the pair above is interlaced too. The same rule is
             // applied to the top-left and top-right neighbours below.
 198         if (bottom
 199                 ? !curr_mb_frame_flag // bottom macroblock
 200                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 201                 ) {
 202             top_xy -= s->mb_stride;
 203         }
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 207                 ) {
 208             topleft_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 213                 ) {
 214             topright_xy -= s->mb_stride;
 215         }
             // The left pair differs from the current macroblock in its interlaced
             // flag: both left neighbours are taken relative to the top macroblock
             // of the left pair and left_block[] is remapped.
 216         if (left_mb_frame_flag != curr_mb_frame_flag) {
 217             left_xy[1] = left_xy[0] = pair_xy - 1;
 218             if (curr_mb_frame_flag) {
                     // current macroblock not interlaced, left pair interlaced:
                     // both left_xy entries stay on the top macroblock of the left
                     // pair and each selected entry is used twice
 219                 if (bottom) {
 220                     left_block[0]= 2;
 221                     left_block[1]= 2;
 222                     left_block[2]= 3;
 223                     left_block[3]= 3;
 224                     left_block[4]= 8;
 225                     left_block[5]= 11;
 226                     left_block[6]= 8;
 227                     left_block[7]= 11;
 228                 } else {
 229                     left_block[0]= 0;
 230                     left_block[1]= 0;
 231                     left_block[2]= 1;
 232                     left_block[3]= 1;
 233                     left_block[4]= 7;
 234                     left_block[5]= 10;
 235                     left_block[6]= 7;
 236                     left_block[7]= 10;
 237                 }
 238             } else {
                     // current macroblock interlaced, left pair not: left_xy[1]
                     // becomes the bottom macroblock of the left pair; entries 0
                     // and 2 are read from each of the two left macroblocks
                     // (left_block[0] and [4] keep their defaults)
 239                 left_xy[1] += s->mb_stride;
 240                 //left_block[0]= 0;
 241                 left_block[1]= 2;
 242                 left_block[2]= 0;
 243                 left_block[3]= 2;
 244                 //left_block[4]= 7;
 245                 left_block[5]= 10;
 246                 left_block[6]= 7;
 247                 left_block[7]= 10;
 248             }
 249         }
 250     }
 251
         // remember the neighbour indices in the context
 252     h->top_mb_xy = top_xy;
 253     h->left_mb_xy[0] = left_xy[0];
 254     h->left_mb_xy[1] = left_xy[1];
 255     if(for_deblock){
             // Deblocking: the diagonal neighbours are not used (type 0). A top/left
             // neighbour counts as present when its slice_table entry is below 255,
             // whichever slice it belongs to.
             // NOTE(review): 255 presumably marks a macroblock that has not been
             // decoded -- confirm where slice_table is initialised.
 256         topleft_type = 0;
 257         topright_type = 0;
 258         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 259         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 260         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 261
             // MBAFF, non-intra macroblock: also reload the cache cells of the
             // current macroblock itself from the frame-wide tables.
 262         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 263             int list;
                 // the 16 bits stored at non_zero_count[mb_xy][14..15] are expanded
                 // into one 0/1 entry per block index 0..15
 264             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 265             for(i=0; i<16; i++)
 266                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 267             for(list=0; list<h->list_count; list++){
 268                 if(USES_LIST(mb_type,list)){
 269                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 270                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 271                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                         // copy 4 rows of 4 motion vectors (32 bits each); a cache row is 8 entries wide
 272                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 273                         dst[0] = src[0];
 274                         dst[1] = src[1];
 275                         dst[2] = src[2];
 276                         dst[3] = src[3];
 277                     }
                         // each store writes 4 ref_cache bytes built from two
                         // reference indices; the first pair of ref_index entries
                         // goes to scan8[0] and scan8[2], the pair one b8_stride
                         // further to scan8[8] and scan8[10]
 278                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 279                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 280                     ref += h->b8_stride;
 281                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 282                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 283                 }else{
                         // list not used by this macroblock: zero mvs, LIST_NOT_USED refs
 284                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 285                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 286                 }
 287             }
 288         }
 289     }else{
             // Decoding: a neighbour only counts if it belongs to the current slice
 290         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 291         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 292         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 293         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 294         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 295     }
 296
         // Intra macroblocks: compute which neighbouring samples may be used for
         // prediction. Start from "everything available" and clear bits for each
         // neighbour that is not intra and is either missing (type 0) or must not
         // be used because constrained_intra_pred is set.
         // NOTE(review): the bit layout of the *_samples_available masks is not
         // visible in this part of the file.
 297     if(IS_INTRA(mb_type)){
 298         h->topleft_samples_available=
 299         h->top_samples_available=
 300         h->left_samples_available= 0xFFFF;
 301         h->topright_samples_available= 0xEEEA;
 302
 303         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 304             h->topleft_samples_available= 0xB3FF;
 305             h->top_samples_available= 0x33FF;
 306             h->topright_samples_available= 0x26EA;
 307         }
 308         for(i=0; i<2; i++){
 309             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 310                 h->topleft_samples_available&= 0xDF5F;
 311                 h->left_samples_available&= 0x5F5F;
 312             }
 313         }
 314
 315         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 316             h->topleft_samples_available&= 0x7FFF;
 317
 318         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 319             h->topright_samples_available&= 0xFBFF;
 320
             // Intra 4x4: load the prediction modes of the blocks bordering the
             // current macroblock. For a neighbour that is not intra 4x4 the mode
             // is -1 when it is missing or inter with constrained_intra_pred,
             // otherwise 2.
 321         if(IS_INTRA4x4(mb_type)){
 322             if(IS_INTRA4x4(top_type)){
                     // cache cells 4..7 of row 0 come from entries 4,5,6,3 of the top neighbour
 323                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 324                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 325                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 326                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 327             }else{
 328                 int pred;
 329                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 330                     pred= -1;
 331                 else{
 332                     pred= 2;
 333                 }
 334                 h->intra4x4_pred_mode_cache[4+8*0]=
 335                 h->intra4x4_pred_mode_cache[5+8*0]=
 336                 h->intra4x4_pred_mode_cache[6+8*0]=
 337                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 338             }
                 // left column (cache column 3): two cache rows per left neighbour
 339             for(i=0; i<2; i++){
 340                 if(IS_INTRA4x4(left_type[i])){
 341                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 342                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 343                 }else{
 344                     int pred;
 345                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 346                         pred= -1;
 347                     else{
 348                         pred= 2;
 349                     }
 350                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 351                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 352                 }
 353             }
 354         }
 355     }
 356
 357
 358 /*
 359 0 . T T. T T T T
 360 1 L . .L . . . .
 361 2 L . .L . . . .
 362 3 . T TL . . . .
 363 4 L . .L . . . .
 364 5 L . .. . . . .
 365 */
         // Non-zero coefficient counts of the neighbouring blocks (T and L cells in
         // the 8-wide cache layout sketched above). Where a neighbour is missing
         // the cells get 64, or 0 for a non-intra macroblock in a CABAC stream.
 366 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 367     if(top_type){
 368         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 369         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 370         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 371         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 372
 373         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 374         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 375
 376         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 377         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 378
 379     }else{
 380         h->non_zero_count_cache[4+8*0]=
 381         h->non_zero_count_cache[5+8*0]=
 382         h->non_zero_count_cache[6+8*0]=
 383         h->non_zero_count_cache[7+8*0]=
 384
 385         h->non_zero_count_cache[1+8*0]=
 386         h->non_zero_count_cache[2+8*0]=
 387
 388         h->non_zero_count_cache[1+8*3]=
 389         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 390
 391     }
 392
         // per left neighbour: two cells in cache column 3 and two in column 0
 393     for (i=0; i<2; i++) {
 394         if(left_type[i]){
 395             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 396             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 397             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 398             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 399         }else{
 400             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 401             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 402             h->non_zero_count_cache[0+8*1 +   8*i]=
 403             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 404         }
 405     }
 406
         // CABAC only: coded block patterns of the top and left neighbours. A
         // missing neighbour yields 0x1C0 for an intra macroblock and 0 otherwise.
 407     if( h->pps.cabac ) {
 408         // top_cbp
 409         if(top_type) {
 410             h->top_cbp = h->cbp_table[top_xy];
 411         } else if(IS_INTRA(mb_type)) {
 412             h->top_cbp = 0x1C0;
 413         } else {
 414             h->top_cbp = 0;
 415         }
 416         // left_cbp
             // bits 0x1f0 are taken from the first left neighbour; bits 1 and 3 are
             // then set from one cbp_table bit of left_xy[0] / left_xy[1] each,
             // selected through left_block[0] / left_block[2]
 417         if (left_type[0]) {
 418             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 419         } else if(IS_INTRA(mb_type)) {
 420             h->left_cbp = 0x1C0;
 421         } else {
 422             h->left_cbp = 0;
 423         }
 424         if (left_type[0]) {
 425             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 426         }
 427         if (left_type[1]) {
 428             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 429         }
 430     }
 431
 432 #if 1
         // Inter / direct macroblocks: load motion vectors and reference indices
         // of the neighbouring blocks for every reference list.
 433     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 434         int list;
 435         for(list=0; list<h->list_count; list++){
                 // a list the macroblock does not use is skipped, except for direct
                 // macroblocks or when h->deblocking_filter is non-zero
 436             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 437                 /*if(!h->mv_cache_clean[list]){
 438                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 439                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 440                     h->mv_cache_clean[list]= 1;
 441                 }*/
 442                 continue;
 443             }
 444             h->mv_cache_clean[list]= 0;
 445
                 // top neighbour: the 4 motion vectors of its row 3 and the 2
                 // reference indices of its second ref_index row go into the cache
                 // row above the current macroblock
 446             if(USES_LIST(top_type, list)){
 447                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 448                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 449                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 450                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 451                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 452                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 453                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 454                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 455                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 456                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 457             }else{
                     // zero mvs; all 4 ref bytes become LIST_NOT_USED if the neighbour
                     // exists but does not use this list, else PART_NOT_AVAILABLE
 458                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 459                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 460                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 461                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 462                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 463             }
 464
                 // left neighbours: column 3 of each, rows chosen by left_block[];
                 // two cache rows per neighbour in the column left of the macroblock
 465             for(i=0; i<2; i++){
 466                 int cache_idx = scan8[0] - 1 + i*2*8;
 467                 if(USES_LIST(left_type[i], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 469                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 470                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 471                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 472                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 473                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 474                 }else{
 475                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 476                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 477                     h->ref_cache[list][cache_idx  ]=
 478                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 479                 }
 480             }
 481
                 // Without MBAFF the rest of this iteration (diagonal neighbours, mvd,
                 // direct flags) is skipped for deblocking calls and for direct
                 // macroblocks when direct_spatial_mv_pred is not set.
 482             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 483                 continue;
 484
                 // top-left neighbour: its block at column 3, row 3
 485             if(USES_LIST(topleft_type, list)){
 486                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 487                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 488                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 489                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 490             }else{
 491                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 492                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 493             }
 494
                 // top-right neighbour: its block at column 0, row 3
 495             if(USES_LIST(topright_type, list)){
 496                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 497                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 498                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 499                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 500             }else{
 501                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 502                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 503             }
 504
                 // skipped and direct macroblocks need nothing further, unless MBAFF
 505             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 506                 continue;
 507
                 // mark the listed cache cells as not available, with zero motion vectors
 508             h->ref_cache[list][scan8[5 ]+1] =
 509             h->ref_cache[list][scan8[7 ]+1] =
 510             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 511             h->ref_cache[list][scan8[4 ]] =
 512             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 513             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 514             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 515             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 516             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 517             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 518
                 // CABAC: the same top/left border cells are loaded for mvd_cache
                 // from mvd_table (0 where the neighbour does not use this list)
 519             if( h->pps.cabac ) {
 520                 /* XXX beurk, Load mvd */
 521                 if(USES_LIST(top_type, list)){
 522                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 523                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 524                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 525                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 526                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 527                 }else{
 528                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 529                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 530                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 531                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 532                 }
 533                 if(USES_LIST(left_type[0], list)){
 534                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 535                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 536                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 537                 }else{
 538                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 539                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 540                 }
 541                 if(USES_LIST(left_type[1], list)){
 542                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 543                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 544                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 545                 }else{
 546                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 547                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 548                 }
 549                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 550                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 551                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 552                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 553                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 554
                     // B slices: direct flags. The 4x4 area of the current macroblock
                     // is cleared; a border cell becomes 1 for a direct neighbour, is
                     // copied from direct_table for an 8x8-partitioned neighbour, and
                     // is 0 otherwise.
 555                 if(h->slice_type == B_TYPE){
 556                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 557
 558                     if(IS_DIRECT(top_type)){
 559                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 560                     }else if(IS_8X8(top_type)){
 561                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 562                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 563                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 564                     }else{
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 566                     }
 567
 568                     if(IS_DIRECT(left_type[0]))
 569                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 570                     else if(IS_8X8(left_type[0]))
 571                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 572                     else
 573                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 574
 575                     if(IS_DIRECT(left_type[1]))
 576                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 577                     else if(IS_8X8(left_type[1]))
 578                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 579                     else
 580                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 581                 }
 582             }
 583
                 // MBAFF: adapt the loaded neighbour entries to the current macroblock.
                 // MAP_MVS applies MAP_F2F to the ten border cells (top-left, four
                 // top, top-right, four left) together with the type of the
                 // neighbour each cell came from. Cells with a negative ref_cache
                 // entry are left untouched.
                 // If MB_FIELD is set, entries from non-interlaced neighbours get
                 // the reference index doubled and the second mv/mvd component
                 // halved; otherwise entries from interlaced neighbours get the
                 // reference index halved and that component doubled.
 584             if(FRAME_MBAFF){
 585 #define MAP_MVS\
 586                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 587                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 588                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 589                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 590                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 591                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 592                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 593                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 594                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 595                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 596                 if(MB_FIELD){
 597 #define MAP_F2F(idx, mb_type)\
 598                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 599                         h->ref_cache[list][idx] <<= 1;\
 600                         h->mv_cache[list][idx][1] /= 2;\
 601                         h->mvd_cache[list][idx][1] /= 2;\
 602                     }
 603                     MAP_MVS
 604 #undef MAP_F2F
 605                 }else{
 606 #define MAP_F2F(idx, mb_type)\
 607                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 608                         h->ref_cache[list][idx] >>= 1;\
 609                         h->mv_cache[list][idx][1] <<= 1;\
 610                         h->mvd_cache[list][idx][1] <<= 1;\
 611                     }
 612                     MAP_MVS
 613 #undef MAP_F2F
 614                 }
 615             }
 616         }
 617     }
 618 #endif
 619
         // number (0..2) of the top and first left neighbours that use the 8x8 transform
 620     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 621 }
 622
 623 static inline void write_back_intra_pred_mode(H264Context *h){
 624     MpegEncContext * const s = &h->s;
 625     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 626
 627     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 628     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 629     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 630     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 631     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 632     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 633     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 634 }
 635
 636 /**
 637  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 638  */
 639 static inline int check_intra4x4_pred_mode(H264Context *h){
 640     MpegEncContext * const s = &h->s;
 641     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 642     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 643     int i;
 644
 645     if(!(h->top_samples_available&0x8000)){
 646         for(i=0; i<4; i++){
 647             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 648             if(status<0){
 649                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 650                 return -1;
 651             } else if(status){
 652                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 653             }
 654         }
 655     }
 656
 657     if(!(h->left_samples_available&0x8000)){
 658         for(i=0; i<4; i++){
 659             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 660             if(status<0){
 661                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 662                 return -1;
 663             } else if(status){
 664                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 665             }
 666         }
 667     }
 668
 669     return 0;
 670 } //FIXME cleanup like next
 671
 672 /**
 673  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 674  */
 675 static inline int check_intra_pred_mode(H264Context *h, int mode){
 676     MpegEncContext * const s = &h->s;
 677     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 678     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 679
 680     if(mode > 6U) {
 681         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 682         return -1;
 683     }
 684
 685     if(!(h->top_samples_available&0x8000)){
 686         mode= top[ mode ];
 687         if(mode<0){
 688             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 689             return -1;
 690         }
 691     }
 692
 693     if(!(h->left_samples_available&0x8000)){
 694         mode= left[ mode ];
 695         if(mode<0){
 696             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 697             return -1;
 698         }
 699     }
 700
 701     return mode;
 702 }
 703
 704 /**
 705  * gets the predicted intra4x4 prediction mode.
 706  */
 707 static inline int pred_intra_mode(H264Context *h, int n){
 708     const int index8= scan8[n];
 709     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 710     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 711     const int min= FFMIN(left, top);
 712
 713     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 714
 715     if(min<0) return DC_PRED;
 716     else      return min;
 717 }
 718
 719 static inline void write_back_non_zero_count(H264Context *h){
 720     MpegEncContext * const s = &h->s;
 721     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 722
 723     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 724     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 725     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 726     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 727     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 728     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 729     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 730
 731     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 732     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 733     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 734
 735     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 736     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 737     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 738
 739     if(FRAME_MBAFF){
 740         // store all luma nnzs, for deblocking
 741         int v = 0, i;
 742         for(i=0; i<16; i++)
 743             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 744         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 745     }
 746 }
 747
 748 /**
 749  * gets the predicted number of non zero coefficients.
 750  * @param n block index
 751  */
 752 static inline int pred_non_zero_count(H264Context *h, int n){
 753     const int index8= scan8[n];
 754     const int left= h->non_zero_count_cache[index8 - 1];
 755     const int top = h->non_zero_count_cache[index8 - 8];
 756     int i= left + top;
 757
 758     if(i<64) i= (i+1)>>1;
 759
 760     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 761
 762     return i&31;
 763 }
 764
     /**
      * Fetches the diagonal neighbour (C) used for motion vector prediction.
      * Normally this is the top-right entry (cache index i - 8 + part_width);
      * if that entry is PART_NOT_AVAILABLE the top-left entry (i - 8 - 1) is
      * used instead. In MBAFF frames some neighbours are read directly from the
      * current picture and rescaled (see SET_DIAG_MV below).
      * @param C receives a pointer to the selected motion vector
      * @param i index of the block in the caches (a scan8[] value)
      * @param list reference list
      * @param part_width width of the partition in 4x4 block units
      * @return reference index belonging to *C
      */
 765 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 766     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 767     MpegEncContext *s = &h->s;
 768
 769     /* there is no consistent mapping of mvs to neighboring locations that will
 770      * make mbaff happy, so we can't move all this logic to fill_caches */
 771     if(FRAME_MBAFF){
 772         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 773         const int16_t *mv;
             /* scratch cache entry scan8[0]-2: zeroed here, *C points at it
              * until the common code at the end of the function overrides it */
 774         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 775         *C = h->mv_cache[list][scan8[0]-2];
 776
 777         if(!MB_FIELD
 778            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 779             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 780             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV always returns from this function:
                      * LIST_NOT_USED if the macroblock at 4x4 position (X4,Y4)
                      * does not use the list and is not 8x8, otherwise it copies
                      * that position's mv into the scratch entry (MV_OP applied
                      * to the vertical component) and returns its reference
                      * index with REF_OP applied. */
 781 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 782                 const int x4 = X4, y4 = Y4;\
 783                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 784                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 785                     return LIST_NOT_USED;\
 786                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 787                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 788                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 789                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 790
 791                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 792             }
 793         }
 794         if(topright_ref == PART_NOT_AVAILABLE
 795            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 796            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
                 /* left neighbour has a different frame/field type than the
                  * current macroblock: read its mv from the picture and rescale */
 797             if(!MB_FIELD
 798                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 799                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 800             }
 801             if(MB_FIELD
 802                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 803                && i >= scan8[0]+8){
 804                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 805                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 806             }
 807         }
 808 #undef SET_DIAG_MV
 809     }
 810
         /* common case: top-right from the cache, else fall back to top-left */
 811     if(topright_ref != PART_NOT_AVAILABLE){
 812         *C= h->mv_cache[list][ i - 8 + part_width ];
 813         return topright_ref;
 814     }else{
 815         tprintf(s->avctx, "topright MV not available\n");
 816
 817         *C= h->mv_cache[list][ i - 8 - 1 ];
 818         return h->ref_cache[list][ i - 8 - 1 ];
 819     }
 820 }
 821
 822 /**
 823  * gets the predicted MV.
 824  * @param n the block index
 825  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 826  * @param mx the x component of the predicted motion vector
 827  * @param my the y component of the predicted motion vector
 828  */
 829 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 830     const int index8= scan8[n];
 831     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 832     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 833     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 834     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 835     const int16_t * C;
 836     int diagonal_ref, match_count;
 837
 838     assert(part_width==1 || part_width==2 || part_width==4);
 839
 840 /* mv_cache
 841   B . . A T T T T
 842   U . . L . . , .
 843   U . . L . . . .
 844   U . . L . . , .
 845   . . . L . . . .
 846 */
 847
 848     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 849     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 850     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 851     if(match_count > 1){ //most common
 852         *mx= mid_pred(A[0], B[0], C[0]);
 853         *my= mid_pred(A[1], B[1], C[1]);
 854     }else if(match_count==1){
 855         if(left_ref==ref){
 856             *mx= A[0];
 857             *my= A[1];
 858         }else if(top_ref==ref){
 859             *mx= B[0];
 860             *my= B[1];
 861         }else{
 862             *mx= C[0];
 863             *my= C[1];
 864         }
 865     }else{
 866         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 867             *mx= A[0];
 868             *my= A[1];
 869         }else{
 870             *mx= mid_pred(A[0], B[0], C[0]);
 871             *my= mid_pred(A[1], B[1], C[1]);
 872         }
 873     }
 874
 875     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 876 }
 877
 878 /**
 879  * gets the directionally predicted 16x8 MV.
 880  * @param n the block index
 881  * @param mx the x component of the predicted motion vector
 882  * @param my the y component of the predicted motion vector
 883  */
 884 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 885     if(n==0){
 886         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 887         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 888
 889         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 890
 891         if(top_ref == ref){
 892             *mx= B[0];
 893             *my= B[1];
 894             return;
 895         }
 896     }else{
 897         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 898         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 899
 900         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 901
 902         if(left_ref == ref){
 903             *mx= A[0];
 904             *my= A[1];
 905             return;
 906         }
 907     }
 908
 909     //RARE
 910     pred_motion(h, n, 4, list, ref, mx, my);
 911 }
 912
 913 /**
 914  * gets the directionally predicted 8x16 MV.
 915  * @param n the block index
 916  * @param mx the x component of the predicted motion vector
 917  * @param my the y component of the predicted motion vector
 918  */
 919 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 920     if(n==0){
 921         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 922         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 923
 924         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 925
 926         if(left_ref == ref){
 927             *mx= A[0];
 928             *my= A[1];
 929             return;
 930         }
 931     }else{
 932         const int16_t * C;
 933         int diagonal_ref;
 934
 935         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 936
 937         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 938
 939         if(diagonal_ref == ref){
 940             *mx= C[0];
 941             *my= C[1];
 942             return;
 943         }
 944     }
 945
 946     //RARE
 947     pred_motion(h, n, 2, list, ref, mx, my);
 948 }
 949
 950 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 951     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 952     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 953
 954     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 955
 956     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 957        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 958        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 959
 960         *mx = *my = 0;
 961         return;
 962     }
 963
 964     pred_motion(h, 0, 4, 0, 0, mx, my);
 965
 966     return;
 967 }
 968
 969 static inline void direct_dist_scale_factor(H264Context * const h){
 970     const int poc = h->s.current_picture_ptr->poc;
 971     const int poc1 = h->ref_list[1][0].poc;
 972     int i;
 973     for(i=0; i<h->ref_count[0]; i++){
 974         int poc0 = h->ref_list[0][i].poc;
 975         int td = av_clip(poc1 - poc0, -128, 127);
 976         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 977             h->dist_scale_factor[i] = 256;
 978         }else{
 979             int tb = av_clip(poc - poc0, -128, 127);
 980             int tx = (16384 + (FFABS(td) >> 1)) / td;
 981             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 982         }
 983     }
 984     if(FRAME_MBAFF){
 985         for(i=0; i<h->ref_count[0]; i++){
 986             h->dist_scale_factor_field[2*i] =
 987             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 988         }
 989     }
 990 }
 991 static inline void direct_ref_list_init(H264Context * const h){
 992     MpegEncContext * const s = &h->s;
 993     Picture * const ref1 = &h->ref_list[1][0];
 994     Picture * const cur = s->current_picture_ptr;
 995     int list, i, j;
 996     if(cur->pict_type == I_TYPE)
 997         cur->ref_count[0] = 0;
 998     if(cur->pict_type != B_TYPE)
 999         cur->ref_count[1] = 0;
1000     for(list=0; list<2; list++){
1001         cur->ref_count[list] = h->ref_count[list];
1002         for(j=0; j<h->ref_count[list]; j++)
1003             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1004     }
1005     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1006         return;
1007     for(list=0; list<2; list++){
1008         for(i=0; i<ref1->ref_count[list]; i++){
1009             const int poc = ref1->ref_poc[list][i];
1010             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1011             for(j=0; j<h->ref_count[list]; j++)
1012                 if(h->ref_list[list][j].poc == poc){
1013                     h->map_col_to_list0[list][i] = j;
1014                     break;
1015                 }
1016         }
1017     }
1018     if(FRAME_MBAFF){
1019         for(list=0; list<2; list++){
1020             for(i=0; i<ref1->ref_count[list]; i++){
1021                 j = h->map_col_to_list0[list][i];
1022                 h->map_col_to_list0_field[list][2*i] = 2*j;
1023                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1024             }
1025         }
1026     }
1027 }
1028
     /**
      * Derives the motion vectors and reference indices of a direct predicted
      * macroblock (or of its direct predicted 8x8 sub blocks) and stores them in
      * h->mv_cache and h->ref_cache. Co-located data is read from the first
      * list 1 reference picture (h->ref_list[1][0]).
      * Spatial prediction is used when h->direct_spatial_mv_pred is set,
      * temporal prediction otherwise.
      * @param mb_type in: type of the current macroblock; out: the type updated
      *                with the partitioning and lists chosen here.
      *                h->sub_mb_type[] is updated for the processed sub blocks.
      */
1029 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1030     MpegEncContext * const s = &h->s;
1031     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1032     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1033     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1034     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1035     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1036     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1037     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1038     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1039     const int is_b8x8 = IS_8X8(*mb_type);
1040     unsigned int sub_mb_type;
1041     int i8, i4;
1042
         /* choose the partitioning of the direct block from the type of the
          * co-located macroblock */
1043 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1044     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1045         /* FIXME save sub mb types from previous frames (or derive from MVs)
1046          * so we know exactly what block size to use */
1047         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1048         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1049     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1050         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1051         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1052     }else{
1053         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1054         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1055     }
1056     if(!is_b8x8)
1057         *mb_type |= MB_TYPE_DIRECT2;
1058     if(MB_FIELD)
1059         *mb_type |= MB_TYPE_INTERLACED;
1060
1061     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1062
1063     if(h->direct_spatial_mv_pred){
1064         int ref[2];
1065         int mv[2][2];
1066         int list;
1067
1068         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1069
1070         /* ref = min(neighbors) */
             /* only non-negative neighbour references take part in the minimum;
              * the result is -1 when no neighbour has one */
1071         for(list=0; list<2; list++){
1072             int refa = h->ref_cache[list][scan8[0] - 1];
1073             int refb = h->ref_cache[list][scan8[0] - 8];
1074             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1075             if(refc == -2)
1076                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1077             ref[list] = refa;
1078             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1079                 ref[list] = refb;
1080             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1081                 ref[list] = refc;
1082             if(ref[list] < 0)
1083                 ref[list] = -1;
1084         }
1085
             /* no usable reference in either list: reference 0 with zero
              * vectors in both lists; otherwise predict a vector for each list
              * that has a reference */
1086         if(ref[0] < 0 && ref[1] < 0){
1087             ref[0] = ref[1] = 0;
1088             mv[0][0] = mv[0][1] =
1089             mv[1][0] = mv[1][1] = 0;
1090         }else{
1091             for(list=0; list<2; list++){
1092                 if(ref[list] >= 0)
1093                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1094                 else
1095                     mv[list][0] = mv[list][1] = 0;
1096             }
1097         }
1098
             /* drop the list without a reference from the block types */
1099         if(ref[1] < 0){
1100             *mb_type &= ~MB_TYPE_P0L1;
1101             sub_mb_type &= ~MB_TYPE_P0L1;
1102         }else if(ref[0] < 0){
1103             *mb_type &= ~MB_TYPE_P0L0;
1104             sub_mb_type &= ~MB_TYPE_P0L0;
1105         }
1106
1107         if(IS_16X16(*mb_type)){
1108             int a=0, b=0;
1109
1110             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1111             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* when the co-located block is (nearly) static, a list only
                  * keeps its predicted vector if its reference is > 0 */
1112             if(!IS_INTRA(mb_type_col)
1113                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1114                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1115                        && (h->x264_build>33 || !h->x264_build)))){
1116                 if(ref[0] > 0)
1117                     a= pack16to32(mv[0][0],mv[0][1]);
1118                 if(ref[1] > 0)
1119                     b= pack16to32(mv[1][0],mv[1][1]);
1120             }else{
1121                 a= pack16to32(mv[0][0],mv[0][1]);
1122                 b= pack16to32(mv[1][0],mv[1][1]);
1123             }
1124             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1125             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1126         }else{
1127             for(i8=0; i8<4; i8++){
1128                 const int x8 = i8&1;
1129                 const int y8 = i8>>1;
1130
                     /* in a B_8x8 macroblock only the direct sub blocks are handled */
1131                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1132                     continue;
1133                 h->sub_mb_type[i8] = sub_mb_type;
1134
1135                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1136                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1137                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1138                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1139
1140                 /* col_zero_flag */
1141                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1142                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1143                                                   && (h->x264_build>33 || !h->x264_build)))){
1144                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1145                     if(IS_SUB_8X8(sub_mb_type)){
1146                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1147                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1148                             if(ref[0] == 0)
1149                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1150                             if(ref[1] == 0)
1151                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1152                         }
1153                     }else
1154                     for(i4=0; i4<4; i4++){
1155                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1156                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1157                             if(ref[0] == 0)
1158                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1159                             if(ref[1] == 0)
1160                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1161                         }
1162                     }
1163                 }
1164             }
1165         }
1166     }else{ /* direct temporal mv pred */
1167         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1168         const int *dist_scale_factor = h->dist_scale_factor;
1169
1170         if(FRAME_MBAFF){
                 /* field macroblocks use the field variants of the tables */
1171             if(IS_INTERLACED(*mb_type)){
1172                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1173                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1174                 dist_scale_factor = h->dist_scale_factor_field;
1175             }
                 /* current and co-located macroblock differ in frame/field
                  * type: this branch handles the whole macroblock and returns */
1176             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1177                 /* FIXME assumes direct_8x8_inference == 1 */
1178                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1179                 int mb_types_col[2];
1180                 int y_shift;
1181
1182                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1183                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1184                          | (*mb_type & MB_TYPE_INTERLACED);
1185                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1186
1187                 if(IS_INTERLACED(*mb_type)){
1188                     /* frame to field scaling */
1189                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1190                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1191                     if(s->mb_y&1){
1192                         l1ref0 -= 2*h->b8_stride;
1193                         l1ref1 -= 2*h->b8_stride;
1194                         l1mv0 -= 4*h->b_stride;
1195                         l1mv1 -= 4*h->b_stride;
1196                     }
1197                     y_shift = 0;
1198
1199                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1200                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1201                        && !is_b8x8)
1202                         *mb_type |= MB_TYPE_16x8;
1203                     else
1204                         *mb_type |= MB_TYPE_8x8;
1205                 }else{
1206                     /* field to frame scaling */
1207                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1208                      * but in MBAFF, top and bottom POC are equal */
1209                     int dy = (s->mb_y&1) ? 1 : 2;
1210                     mb_types_col[0] =
1211                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1212                     l1ref0 += dy*h->b8_stride;
1213                     l1ref1 += dy*h->b8_stride;
1214                     l1mv0 += 2*dy*h->b_stride;
1215                     l1mv1 += 2*dy*h->b_stride;
1216                     y_shift = 2;
1217
1218                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1219                        && !is_b8x8)
1220                         *mb_type |= MB_TYPE_16x16;
1221                     else
1222                         *mb_type |= MB_TYPE_8x8;
1223                 }
1224
1225                 for(i8=0; i8<4; i8++){
1226                     const int x8 = i8&1;
1227                     const int y8 = i8>>1;
1228                     int ref0, scale;
1229                     const int16_t (*l1mv)[2]= l1mv0;
1230
1231                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1232                         continue;
1233                     h->sub_mb_type[i8] = sub_mb_type;
1234
1235                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located block: reference 0 and zero vectors */
1236                     if(IS_INTRA(mb_types_col[y8])){
1237                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1238                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1239                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1240                         continue;
1241                     }
1242
                         /* use the co-located list 0 reference if there is one,
                          * else the list 1 reference and list 1 vectors */
1243                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1244                     if(ref0 >= 0)
1245                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1246                     else{
1247                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1248                         l1mv= l1mv1;
1249                     }
1250                     scale = dist_scale_factor[ref0];
1251                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1252
1253                     {
1254                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1255                         int my_col = (mv_col[1]<<y_shift)/2;
1256                         int mx = (scale * mv_col[0] + 128) >> 8;
1257                         int my = (scale * my_col + 128) >> 8;
1258                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1259                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1260                     }
1261                 }
1262                 return;
1263             }
1264         }
1265
1266         /* one-to-one mv scaling */
             /* list 0 vector = (scale * mv_col + 128) >> 8,
              * list 1 vector = list 0 vector - mv_col */
1267
1268         if(IS_16X16(*mb_type)){
1269             int ref, mv0, mv1;
1270
1271             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1272             if(IS_INTRA(mb_type_col)){
1273                 ref=mv0=mv1=0;
1274             }else{
1275                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1276                                                 : map_col_to_list0[1][l1ref1[0]];
1277                 const int scale = dist_scale_factor[ref0];
1278                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1279                 int mv_l0[2];
1280                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1281                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1282                 ref= ref0;
1283                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1284                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1285             }
1286             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1287             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1288             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1289         }else{
1290             for(i8=0; i8<4; i8++){
1291                 const int x8 = i8&1;
1292                 const int y8 = i8>>1;
1293                 int ref0, scale;
1294                 const int16_t (*l1mv)[2]= l1mv0;
1295
1296                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1297                     continue;
1298                 h->sub_mb_type[i8] = sub_mb_type;
1299                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1300                 if(IS_INTRA(mb_type_col)){
1301                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1302                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1303                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1304                     continue;
1305                 }
1306
1307                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1308                 if(ref0 >= 0)
1309                     ref0 = map_col_to_list0[0][ref0];
1310                 else{
1311                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1312                     l1mv= l1mv1;
1313                 }
1314                 scale = dist_scale_factor[ref0];
1315
1316                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* 8x8 sub block: one co-located vector for the whole block;
                      * otherwise each 4x4 block is scaled separately */
1317                 if(IS_SUB_8X8(sub_mb_type)){
1318                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1319                     int mx = (scale * mv_col[0] + 128) >> 8;
1320                     int my = (scale * mv_col[1] + 128) >> 8;
1321                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1322                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1323                 }else
1324                 for(i4=0; i4<4; i4++){
1325                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1326                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1327                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1328                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1329                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1330                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1331                 }
1332             }
1333         }
1334     }
1335 }
1336
1337 static inline void write_back_motion(H264Context *h, int mb_type){
1338     MpegEncContext * const s = &h->s;
1339     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1340     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1341     int list;
1342
1343     if(!USES_LIST(mb_type, 0))
1344         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1345
1346     for(list=0; list<h->list_count; list++){
1347         int y;
1348         if(!USES_LIST(mb_type, list))
1349             continue;
1350
1351         for(y=0; y<4; y++){
1352             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1353             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1354         }
1355         if( h->pps.cabac ) {
1356             if(IS_SKIP(mb_type))
1357                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1358             else
1359             for(y=0; y<4; y++){
1360                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1361                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1362             }
1363         }
1364
1365         {
1366             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1367             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1368             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1369             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1370             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1371         }
1372     }
1373
1374     if(h->slice_type == B_TYPE && h->pps.cabac){
1375         if(IS_8X8(mb_type)){
1376             uint8_t *direct_table = &h->direct_table[b8_xy];
1377             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1378             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1379             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1380         }
1381     }
1382 }
1383
1384 /**
1385  * Decodes a network abstraction layer unit.
1386  * @param consumed is the number of bytes used as input
1387  * @param length is the length of the array
1388  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1389  * @returns decoded bytes, might be src+1 if no escapes
1390  */
1391 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1392     int i, si, di;
1393     uint8_t *dst;
1394     int bufidx;
1395
1396 //    src[0]&0x80;                //forbidden bit
1397     h->nal_ref_idc= src[0]>>5;
1398     h->nal_unit_type= src[0]&0x1F;
1399
1400     src++; length--;
1401 #if 0
1402     for(i=0; i<length; i++)
1403         printf("%2X ", src[i]);
1404 #endif
1405     for(i=0; i+1<length; i+=2){
1406         if(src[i]) continue;
1407         if(i>0 && src[i-1]==0) i--;
1408         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1409             if(src[i+2]!=3){
1410                 /* startcode, so we must be past the end */
1411                 length=i;
1412             }
1413             break;
1414         }
1415     }
1416
1417     if(i>=length-1){ //no escaped 0
1418         *dst_length= length;
1419         *consumed= length+1; //+1 for the header
1420         return src;
1421     }
1422
1423     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1424     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1425     dst= h->rbsp_buffer[bufidx];
1426
1427     if (dst == NULL){
1428         return NULL;
1429     }
1430
1431 //printf("decoding esc\n");
1432     si=di=0;
1433     while(si<length){
1434         //remove escapes (very rare 1:2^22)
1435         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 break;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447
1448     *dst_length= di;
1449     *consumed= si + 1;//+1 for the header
1450 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1451     return dst;
1452 }
1453
1454 /**
1455  * identifies the exact end of the bitstream
1456  * @return the length of the trailing, or 0 if damaged
1457  */
1458 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1459     int v= *src;
1460     int r;
1461
1462     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1463
1464     for(r=1; r<9; r++){
1465         if(v&1) return r;
1466         v>>=1;
1467     }
1468     return 0;
1469 }
1470
1471 /**
1472  * idct tranforms the 16 dc values and dequantize them.
1473  * @param qp quantization parameter
1474  */
1475 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1476 #define stride 16
1477     int i;
1478     int temp[16]; //FIXME check if this is a good idea
1479     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1480     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1481
1482 //memset(block, 64, 2*256);
1483 //return;
1484     for(i=0; i<4; i++){
1485         const int offset= y_offset[i];
1486         const int z0= block[offset+stride*0] + block[offset+stride*4];
1487         const int z1= block[offset+stride*0] - block[offset+stride*4];
1488         const int z2= block[offset+stride*1] - block[offset+stride*5];
1489         const int z3= block[offset+stride*1] + block[offset+stride*5];
1490
1491         temp[4*i+0]= z0+z3;
1492         temp[4*i+1]= z1+z2;
1493         temp[4*i+2]= z1-z2;
1494         temp[4*i+3]= z0-z3;
1495     }
1496
1497     for(i=0; i<4; i++){
1498         const int offset= x_offset[i];
1499         const int z0= temp[4*0+i] + temp[4*2+i];
1500         const int z1= temp[4*0+i] - temp[4*2+i];
1501         const int z2= temp[4*1+i] - temp[4*3+i];
1502         const int z3= temp[4*1+i] + temp[4*3+i];
1503
1504         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1505         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1506         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1507         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1508     }
1509 }
1510
#if 0
/**
 * DCT transforms the 16 dc values (disabled code, not compiled).
 * Same two butterfly passes as the dequantizing inverse transform, but the
 * results are halved instead of being scaled by a quantizer factor.
 * Relies on the stride macro still being defined at this point.
 * @param qp quantization parameter ??? FIXME (parameter is commented out)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: results go to temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: results are written back to block, divided by 2 */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1550
1551 #undef xStride
1552 #undef stride
1553
1554 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1555     const int stride= 16*2;
1556     const int xStride= 16;
1557     int a,b,c,d,e;
1558
1559     a= block[stride*0 + xStride*0];
1560     b= block[stride*0 + xStride*1];
1561     c= block[stride*1 + xStride*0];
1562     d= block[stride*1 + xStride*1];
1563
1564     e= a-b;
1565     a= a+b;
1566     b= c-d;
1567     c= c+d;
1568
1569     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1570     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1571     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1572     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1573 }
1574
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code, not compiled).
 * Same butterfly as chroma_dc_dequant_idct_c but without any scaling.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    /* load the four coefficients */
    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* row sums and differences; e holds a-b because a is reused for a+b */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1597
1598 /**
1599  * gets the chroma qp.
1600  */
1601 static inline int get_chroma_qp(H264Context *h, int qscale){
1602     return h->pps.chroma_qp_table[qscale & 0xff];
1603 }
1604
1605 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1606 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1607 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1608     int i;
1609     const int * const quant_table= quant_coeff[qscale];
1610     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1611     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1612     const unsigned int threshold2= (threshold1<<1);
1613     int last_non_zero;
1614
1615     if(separate_dc){
1616         if(qscale<=18){
1617             //avoid overflows
1618             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1619             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1620             const unsigned int dc_threshold2= (dc_threshold1<<1);
1621
1622             int level= block[0]*quant_coeff[qscale+18][0];
1623             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1624                 if(level>0){
1625                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1626                     block[0]= level;
1627                 }else{
1628                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1629                     block[0]= -level;
1630                 }
1631 //                last_non_zero = i;
1632             }else{
1633                 block[0]=0;
1634             }
1635         }else{
1636             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1637             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1638             const unsigned int dc_threshold2= (dc_threshold1<<1);
1639
1640             int level= block[0]*quant_table[0];
1641             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1642                 if(level>0){
1643                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1644                     block[0]= level;
1645                 }else{
1646                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1647                     block[0]= -level;
1648                 }
1649 //                last_non_zero = i;
1650             }else{
1651                 block[0]=0;
1652             }
1653         }
1654         last_non_zero= 0;
1655         i=1;
1656     }else{
1657         last_non_zero= -1;
1658         i=0;
1659     }
1660
1661     for(; i<16; i++){
1662         const int j= scantable[i];
1663         int level= block[j]*quant_table[j];
1664
1665 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1666 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1667         if(((unsigned)(level+threshold1))>threshold2){
1668             if(level>0){
1669                 level= (bias + level)>>QUANT_SHIFT;
1670                 block[j]= level;
1671             }else{
1672                 level= (bias - level)>>QUANT_SHIFT;
1673                 block[j]= -level;
1674             }
1675             last_non_zero = i;
1676         }else{
1677             block[j]=0;
1678         }
1679     }
1680
1681     return last_non_zero;
1682 }
1683
/**
 * 4x4 vertical prediction: every row becomes a copy of the 4 pixels
 * directly above the block. Rows are accessed as 32 bit words.
 * @param topright not used
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src-stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= top_row;
}
1691
/**
 * 4x4 horizontal prediction: every row is filled with the pixel to its left.
 * Rows are written as 32 bit words.
 * @param topright not used
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    /* unsigned constant: with an int multiply pixel values >= 128 would
       overflow (undefined behaviour) */
    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= src[-1+y*stride]*0x01010101U;
}
1698
/**
 * 4x4 DC prediction: the block is filled with the rounded mean of the
 * 4 pixels above and the 4 pixels to the left.
 * @param topright not used
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* unsigned multiply: dc >= 128 would overflow a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1708
/**
 * 4x4 DC prediction from the left edge only: the block is filled with the
 * rounded mean of the 4 pixels to its left.
 * @param topright not used
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* unsigned multiply: dc >= 128 would overflow a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1717
/**
 * 4x4 DC prediction from the top edge only: the block is filled with the
 * rounded mean of the 4 pixels above it.
 * @param topright not used
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* unsigned multiply: dc >= 128 would overflow a signed int */
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1726
/**
 * 4x4 DC prediction without any neighbours: the block is filled with 128.
 * @param topright not used
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t mid_grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= mid_grey;
}
1733
1734
/* The three macros below declare local constants holding the neighbouring
 * pixels of a 4x4 block. They expect src, stride and (for the top right
 * edge) topright to be in scope. The av_unused marking presumably silences
 * warnings for constants a particular predictor does not read. */

/* t4..t7: the 4 pixels read through the topright pointer */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];\

/* l0..l3: the 4 pixels in the column left of the block, top to bottom */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];\

/* t0..t3: the 4 pixels in the row above the block, left to right */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];\
1752
1753 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
1754     const int lt= src[-1-1*stride];
1755     LOAD_TOP_EDGE
1756     LOAD_LEFT_EDGE
1757
1758     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
1759     src[0+2*stride]=
1760     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
1761     src[0+1*stride]=
1762     src[1+2*stride]=
1763     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
1764     src[0+0*stride]=
1765     src[1+1*stride]=
1766     src[2+2*stride]=
1767     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1768     src[1+0*stride]=
1769     src[2+1*stride]=
1770     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
1771     src[2+0*stride]=
1772     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1773     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1774 }
1775
1776 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
1777     LOAD_TOP_EDGE
1778     LOAD_TOP_RIGHT_EDGE
1779 //    LOAD_LEFT_EDGE
1780
1781     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
1782     src[1+0*stride]=
1783     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
1784     src[2+0*stride]=
1785     src[1+1*stride]=
1786     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
1787     src[3+0*stride]=
1788     src[2+1*stride]=
1789     src[1+2*stride]=
1790     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
1791     src[3+1*stride]=
1792     src[2+2*stride]=
1793     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
1794     src[3+2*stride]=
1795     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
1796     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
1797 }
1798
1799 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
1800     const int lt= src[-1-1*stride];
1801     LOAD_TOP_EDGE
1802     LOAD_LEFT_EDGE
1803
1804     src[0+0*stride]=
1805     src[1+2*stride]=(lt + t0 + 1)>>1;
1806     src[1+0*stride]=
1807     src[2+2*stride]=(t0 + t1 + 1)>>1;
1808     src[2+0*stride]=
1809     src[3+2*stride]=(t1 + t2 + 1)>>1;
1810     src[3+0*stride]=(t2 + t3 + 1)>>1;
1811     src[0+1*stride]=
1812     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1813     src[1+1*stride]=
1814     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
1815     src[2+1*stride]=
1816     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1817     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1818     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1819     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1820 }
1821
1822 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
1823     LOAD_TOP_EDGE
1824     LOAD_TOP_RIGHT_EDGE
1825
1826     src[0+0*stride]=(t0 + t1 + 1)>>1;
1827     src[1+0*stride]=
1828     src[0+2*stride]=(t1 + t2 + 1)>>1;
1829     src[2+0*stride]=
1830     src[1+2*stride]=(t2 + t3 + 1)>>1;
1831     src[3+0*stride]=
1832     src[2+2*stride]=(t3 + t4+ 1)>>1;
1833     src[3+2*stride]=(t4 + t5+ 1)>>1;
1834     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1835     src[1+1*stride]=
1836     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1837     src[2+1*stride]=
1838     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
1839     src[3+1*stride]=
1840     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
1841     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
1842 }
1843
1844 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
1845     LOAD_LEFT_EDGE
1846
1847     src[0+0*stride]=(l0 + l1 + 1)>>1;
1848     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1849     src[2+0*stride]=
1850     src[0+1*stride]=(l1 + l2 + 1)>>1;
1851     src[3+0*stride]=
1852     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1853     src[2+1*stride]=
1854     src[0+2*stride]=(l2 + l3 + 1)>>1;
1855     src[3+1*stride]=
1856     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
1857     src[3+2*stride]=
1858     src[1+3*stride]=
1859     src[0+3*stride]=
1860     src[2+2*stride]=
1861     src[2+3*stride]=
1862     src[3+3*stride]=l3;
1863 }
1864
1865 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
1866     const int lt= src[-1-1*stride];
1867     LOAD_TOP_EDGE
1868     LOAD_LEFT_EDGE
1869
1870     src[0+0*stride]=
1871     src[2+1*stride]=(lt + l0 + 1)>>1;
1872     src[1+0*stride]=
1873     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
1874     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
1875     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1876     src[0+1*stride]=
1877     src[2+2*stride]=(l0 + l1 + 1)>>1;
1878     src[1+1*stride]=
1879     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1880     src[0+2*stride]=
1881     src[2+3*stride]=(l1 + l2+ 1)>>1;
1882     src[1+2*stride]=
1883     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1884     src[0+3*stride]=(l2 + l3 + 1)>>1;
1885     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1886 }
1887
/**
 * 16x16 vertical prediction: every row becomes a copy of the 16 pixels
 * directly above the block. Rows are accessed as four 32 bit words.
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t *row= (uint32_t*)(src+y*stride);

        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
    }
}
1902
/**
 * 16x16 horizontal prediction: every row is filled with the pixel to its
 * left. Rows are written as four 32 bit words.
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        /* unsigned multiply: pixel values >= 128 would overflow a signed int */
        const uint32_t fill= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1913
/**
 * 16x16 DC prediction: the block is filled with the rounded mean of the
 * 16 pixels to the left and the 16 pixels above.
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* unsigned multiply: a mean >= 128 would overflow a signed int */
    fill= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1934
/**
 * 16x16 DC prediction from the left edge only: the block is filled with
 * the rounded mean of the 16 pixels to its left.
 */
void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned multiply: a mean >= 128 would overflow a signed int */
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1951
/**
 * 16x16 DC prediction from the top edge only: the block is filled with
 * the rounded mean of the 16 pixels above it.
 */
void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* unsigned multiply: a mean >= 128 would overflow a signed int */
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1967
/**
 * 16x16 DC prediction without any neighbours: the block is filled with 128.
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t mid_grey= 0x01010101U*128U;
    int y, w;

    for(y=0; y<16; y++){
        uint32_t *row= (uint32_t*)(src+y*stride);

        for(w=0; w<4; w++)
            row[w]= mid_grey;
    }
}
1978
1979 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
1980   int i, j, k;
1981   int a;
1982   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1983   const uint8_t * const src0 = src+7-stride;
1984   const uint8_t *src1 = src+8*stride-1;
1985   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
1986   int H = src0[1] - src0[-1];
1987   int V = src1[0] - src2[ 0];
1988   for(k=2; k<=8; ++k) {
1989     src1 += stride; src2 -= stride;
1990     H += k*(src0[k] - src0[-k]);
1991     V += k*(src1[0] - src2[ 0]);
1992   }
1993   if(svq3){
1994     H = ( 5*(H/4) ) / 16;
1995     V = ( 5*(V/4) ) / 16;
1996
1997     /* required for 100% accuracy */
1998     i = H; H = V; V = i;
1999   }else{
2000     H = ( 5*H+32 ) >> 6;
2001     V = ( 5*V+32 ) >> 6;
2002   }
2003
2004   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2005   for(j=16; j>0; --j) {
2006     int b = a;
2007     a += V;
2008     for(i=-16; i<0; i+=4) {
2009       src[16+i] = cm[ (b    ) >> 5 ];
2010       src[17+i] = cm[ (b+  H) >> 5 ];
2011       src[18+i] = cm[ (b+2*H) >> 5 ];
2012       src[19+i] = cm[ (b+3*H) >> 5 ];
2013       b += 4*H;
2014     }
2015     src += stride;
2016   }
2017 }
2018
/**
 * 16x16 plane prediction; calls pred16x16_plane_compat_c with the svq3
 * flag cleared.
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
2022
/**
 * 8x8 vertical prediction: every row becomes a copy of the 8 pixels
 * directly above the block. Rows are accessed as two 32 bit words.
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (const uint32_t*)(src-stride);
    const uint32_t left_half = above[0];
    const uint32_t right_half= above[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t *row= (uint32_t*)(src+y*stride);

        row[0]= left_half;
        row[1]= right_half;
    }
}
2033
/**
 * 8x8 horizontal prediction: every row is filled with the pixel to its
 * left. Rows are written as two 32 bit words.
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        /* unsigned multiply: pixel values >= 128 would overflow a signed int */
        const uint32_t fill= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
2042
/**
 * 8x8 DC prediction without any neighbours: the block is filled with 128.
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t mid_grey= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t *row= (uint32_t*)(src+y*stride);

        row[0]= mid_grey;
        row[1]= mid_grey;
    }
}
2051
/**
 * 8x8 DC prediction from the left edge only.
 * The upper four rows are filled with the rounded mean of the upper four
 * left pixels, the lower four rows with that of the lower four left pixels.
 */
void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t dc0, dc2;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiply: a mean >= 128 would overflow a signed int */
    dc0= 0x01010101U*((sum_upper + 2)>>2);
    dc2= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2073
/**
 * 8x8 DC prediction from the top edge only.
 * The left four columns are filled with the rounded mean of the left four
 * top pixels, the right four columns with that of the right four top pixels.
 */
void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t dc0, dc1;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned multiply: a mean >= 128 would overflow a signed int */
    dc0= 0x01010101U*((sum_left  + 2)>>2);
    dc1= 0x01010101U*((sum_right + 2)>>2);

    /* all eight rows get the same two words */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2095
2096
/**
 * 8x8 DC prediction with one DC value per 4x4 quadrant:
 * top left uses the upper left and left top neighbours, top right only the
 * right half of the top row, bottom left only the lower half of the left
 * column, bottom right both of the latter two.
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_tl=0, sum_tr=0, sum_bl=0;
    uint32_t dc0, dc1, dc2, dc3;

    for(i=0;i<4; i++){
        sum_tl+= src[-1+i*stride] + src[i-stride];
        sum_tr+= src[4+i-stride];
        sum_bl+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: a mean >= 128 would overflow a signed int */
    dc3= 0x01010101U*((sum_tr + sum_bl + 4)>>3);
    dc0= 0x01010101U*((sum_tl + 4)>>3);
    dc1= 0x01010101U*((sum_tr + 2)>>2);
    dc2= 0x01010101U*((sum_bl + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2121
2122 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2123   int j, k;
2124   int a;
2125   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2126   const uint8_t * const src0 = src+3-stride;
2127   const uint8_t *src1 = src+4*stride-1;
2128   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2129   int H = src0[1] - src0[-1];
2130   int V = src1[0] - src2[ 0];
2131   for(k=2; k<=4; ++k) {
2132     src1 += stride; src2 -= stride;
2133     H += k*(src0[k] - src0[-k]);
2134     V += k*(src1[0] - src2[ 0]);
2135   }
2136   H = ( 17*H+16 ) >> 5;
2137   V = ( 17*V+16 ) >> 5;
2138
2139   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2140   for(j=8; j>0; --j) {
2141     int b = a;
2142     a += V;
2143     src[0] = cm[ (b    ) >> 5 ];
2144     src[1] = cm[ (b+  H) >> 5 ];
2145     src[2] = cm[ (b+2*H) >> 5 ];
2146     src[3] = cm[ (b+3*H) >> 5 ];
2147     src[4] = cm[ (b+4*H) >> 5 ];
2148     src[5] = cm[ (b+5*H) >> 5 ];
2149     src[6] = cm[ (b+6*H) >> 5 ];
2150     src[7] = cm[ (b+7*H) >> 5 ];
2151     src += stride;
2152   }
2153 }
2154
/* Helpers for the pred8x8l_* functions. They expect src, stride,
 * has_topleft and has_topright to be in scope. */

/* pixel at column x, row y relative to the top left pixel of the block */
#define SRC(x,y) src[(x)+(y)*stride]
/* l<y>: left neighbour of row y, (1,2,1) filtered along the left column */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* declares l0..l7; l0 uses the top left corner only if has_topleft is set
 * (else SRC(-1,0) again), l7 uses SRC(-1,7) with weight 3 */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: top neighbour of column x, (1,2,1) filtered along the top row */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* declares t0..t7; t0 uses the top left corner only if has_topleft is set,
 * t7 uses SRC(8,-1) only if has_topright is set (else SRC(7,-1) again) */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* like PT() but assigns to an already declared variable */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* declares t8..t15: filtered top right neighbours if has_topright is set,
 * otherwise all of them are the unfiltered pixel SRC(7,-1) */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* declares lt: the (1,2,1) filtered top left corner */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* fills all 8 rows with the 32 bit word v written twice; declares y and
 * advances src past the block */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2192
/**
 * 8x8 luma DC prediction without any neighbours: the block is filled
 * with 128. has_topleft and has_topright are not used.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_DC(0x01010101U*128U);
}
2197 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2198 {
2199     PREDICT_8x8_LOAD_LEFT;
2200     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2201     PREDICT_8x8_DC(dc);
2202 }
2203 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2204 {
2205     PREDICT_8x8_LOAD_TOP;
2206     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2207     PREDICT_8x8_DC(dc);
2208 }
2209 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2210 {
2211     PREDICT_8x8_LOAD_LEFT;
2212     PREDICT_8x8_LOAD_TOP;
2213     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2214                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2215     PREDICT_8x8_DC(dc);
2216 }
2217 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2218 {
2219     PREDICT_8x8_LOAD_LEFT;
2220 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2221                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2222     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2223 #undef ROW
2224 }
2225 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2226 {
2227     int y;
2228     PREDICT_8x8_LOAD_TOP;
2229     src[0] = t0;
2230     src[1] = t1;
2231     src[2] = t2;
2232     src[3] = t3;
2233     src[4] = t4;
2234     src[5] = t5;
2235     src[6] = t6;
2236     src[7] = t7;
2237     for( y = 1; y < 8; y++ )
2238         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2239 }
/**
 * 8x8 luma intra prediction, "down left" mode (plain C version).
 *
 * Works on the sixteen values t0..t15 set up by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined before this excerpt; presumably
 * the filtered top and top-right neighbour samples -- confirm against the
 * macros).  All SRC(x,y) targets with the same x+y receive the same rounded
 * (a + 2*b + c + 2) >> 2 value; only SRC(7,7) uses the end-of-row form
 * (t14 + 3*t15 + 2) >> 2.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2240 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2241 {
2242     PREDICT_8x8_LOAD_TOP;
2243     PREDICT_8x8_LOAD_TOPRIGHT;
2244     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2245     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2246     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2247     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2248     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2249     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2250     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2251     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2252     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2253     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2254     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2255     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2256     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2257     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2258     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2259 }
/**
 * 8x8 luma intra prediction, "down right" mode (plain C version).
 *
 * Works on t0..t7, l0..l7 and lt, set up by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined before
 * this excerpt; presumably the filtered top, left and top-left neighbour
 * samples -- confirm against the macros).  All SRC(x,y) targets with the same
 * x-y receive the same rounded (a + 2*b + c + 2) >> 2 value; the inputs walk
 * from l7 up through lt and on to t7.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2260 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2261 {
2262     PREDICT_8x8_LOAD_TOP;
2263     PREDICT_8x8_LOAD_LEFT;
2264     PREDICT_8x8_LOAD_TOPLEFT;
2265     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2266     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2267     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2268     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2269     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2270     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2271     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2272     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2273     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2274     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2275     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2276     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2277     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2278     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2279     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2280
2281 }
/**
 * 8x8 luma intra prediction, "vertical right" mode (plain C version).
 *
 * Works on t0..t7, l0..l6 and lt, set up by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined before
 * this excerpt; presumably the filtered neighbour samples -- confirm against
 * the macros).  SRC(x,y) and SRC(x+1,y+2) always share a value.  The values
 * are either 2-tap averages (a + b + 1) >> 1 or 3-tap averages
 * (a + 2*b + c + 2) >> 2 of adjacent neighbours.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2282 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2283 {
2284     PREDICT_8x8_LOAD_TOP;
2285     PREDICT_8x8_LOAD_LEFT;
2286     PREDICT_8x8_LOAD_TOPLEFT;
2287     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2288     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2289     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2290     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2291     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2292     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2293     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2294     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2295     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2296     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2297     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2298     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2299     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2300     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2301     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2302     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2303     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2304     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2305     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2306     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2307     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2308     SRC(7,0)= (t6 + t7 + 1) >> 1;
2309 }
/**
 * 8x8 luma intra prediction, "horizontal down" mode (plain C version).
 *
 * Works on t0..t6, l0..l7 and lt, set up by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined before
 * this excerpt; presumably the filtered neighbour samples -- confirm against
 * the macros).  SRC(x,y) and SRC(x+2,y+1) always share a value.  The values
 * are either 2-tap averages (a + b + 1) >> 1 or 3-tap averages
 * (a + 2*b + c + 2) >> 2 of adjacent neighbours.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2310 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2311 {
2312     PREDICT_8x8_LOAD_TOP;
2313     PREDICT_8x8_LOAD_LEFT;
2314     PREDICT_8x8_LOAD_TOPLEFT;
2315     SRC(0,7)= (l6 + l7 + 1) >> 1;
2316     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2317     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2318     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2319     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2320     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2321     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2322     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2323     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2324     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2325     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2326     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2327     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2328     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2329     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2330     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2331     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2332     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2333     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2334     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2335     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2336     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2337 }
/**
 * 8x8 luma intra prediction, "vertical left" mode (plain C version).
 *
 * Works on t0..t12, set up by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined before this excerpt; presumably
 * the filtered top and top-right neighbour samples -- confirm against the
 * macros).  SRC(x,y) and SRC(x+1,y-2) always share a value.  The values are
 * either 2-tap averages (a + b + 1) >> 1 or 3-tap averages
 * (a + 2*b + c + 2) >> 2 of adjacent neighbours.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2338 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2339 {
2340     PREDICT_8x8_LOAD_TOP;
2341     PREDICT_8x8_LOAD_TOPRIGHT;
2342     SRC(0,0)= (t0 + t1 + 1) >> 1;
2343     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2344     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2345     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2346     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2347     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2348     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2349     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2350     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2351     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2352     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2353     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2354     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2355     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2356     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2357     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2358     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2359     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2360     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2361     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2362     SRC(7,6)= (t10 + t11 + 1) >> 1;
2363     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2364 }
/**
 * 8x8 luma intra prediction, "horizontal up" mode (plain C version).
 *
 * Works only on l0..l7, set up by PREDICT_8x8_LOAD_LEFT (macro defined before
 * this excerpt; presumably the filtered left-column samples -- confirm against
 * the macro).  SRC(x,y) and SRC(x+2,y-1) always share a value.  The values
 * are 2-tap averages (a + b + 1) >> 1 or 3-tap averages
 * (a + 2*b + c + 2) >> 2 of adjacent left values, ending with
 * (l6 + 3*l7 + 2) >> 2.  All remaining targets are set to l7 itself.
 *
 * has_topleft, has_topright and stride are not referenced directly in this
 * body; they are presumably consumed by the macros.
 */
2365 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2366 {
2367     PREDICT_8x8_LOAD_LEFT;
2368     SRC(0,0)= (l0 + l1 + 1) >> 1;
2369     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2370     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2371     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2372     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2373     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2374     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2375     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2376     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2377     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2378     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2379     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2380     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2381     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything beyond the last left sample is flat l7 */
2382     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2383     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2384     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2385     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2386 }
2387 #undef PREDICT_8x8_LOAD_LEFT
2388 #undef PREDICT_8x8_LOAD_TOP
2389 #undef PREDICT_8x8_LOAD_TOPLEFT
2390 #undef PREDICT_8x8_LOAD_TOPRIGHT
2391 #undef PREDICT_8x8_DC
2392 #undef PTR
2393 #undef PT
2394 #undef PL
2395 #undef SRC
2396
2397 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2398                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2399                            int src_x_offset, int src_y_offset,
2400                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2401     MpegEncContext * const s = &h->s;
2402     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2403     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2404     const int luma_xy= (mx&3) + ((my&3)<<2);
2405     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2406     uint8_t * src_cb, * src_cr;
2407     int extra_width= h->emu_edge_width;
2408     int extra_height= h->emu_edge_height;
2409     int emu=0;
2410     const int full_mx= mx>>2;
2411     const int full_my= my>>2;
2412     const int pic_width  = 16*s->mb_width;
2413     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2414
2415     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2416         return;
2417
2418     if(mx&7) extra_width -= 3;
2419     if(my&7) extra_height -= 3;
2420
2421     if(   full_mx < 0-extra_width
2422        || full_my < 0-extra_height
2423        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2424        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2425         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2426             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2427         emu=1;
2428     }
2429
2430     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2431     if(!square){
2432         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2433     }
2434
2435     if(s->flags&CODEC_FLAG_GRAY) return;
2436
2437     if(MB_MBAFF){
2438         // chroma offset when predicting from a field of opposite parity
2439         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2440         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2441     }
2442     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2443     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2444
2445     if(emu){
2446         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2447             src_cb= s->edge_emu_buffer;
2448     }
2449     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2450
2451     if(emu){
2452         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2453             src_cr= s->edge_emu_buffer;
2454     }
2455     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2456 }
2457
2458 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2459                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2460                            int x_offset, int y_offset,
2461                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2462                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2463                            int list0, int list1){
2464     MpegEncContext * const s = &h->s;
2465     qpel_mc_func *qpix_op=  qpix_put;
2466     h264_chroma_mc_func chroma_op= chroma_put;
2467
2468     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2469     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2470     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2471     x_offset += 8*s->mb_x;
2472     y_offset += 8*(s->mb_y >> MB_MBAFF);
2473
2474     if(list0){
2475         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2476         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2477                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2478                            qpix_op, chroma_op);
2479
2480         qpix_op=  qpix_avg;
2481         chroma_op= chroma_avg;
2482     }
2483
2484     if(list1){
2485         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2486         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2487                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2488                            qpix_op, chroma_op);
2489     }
2490 }
2491
2492 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2493                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2494                            int x_offset, int y_offset,
2495                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2496                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2497                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2498                            int list0, int list1){
2499     MpegEncContext * const s = &h->s;
2500
2501     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2502     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2503     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2504     x_offset += 8*s->mb_x;
2505     y_offset += 8*(s->mb_y >> MB_MBAFF);
2506
2507     if(list0 && list1){
2508         /* don't optimize for luma-only case, since B-frames usually
2509          * use implicit weights => chroma too. */
2510         uint8_t *tmp_cb = s->obmc_scratchpad;
2511         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2512         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2513         int refn0 = h->ref_cache[0][ scan8[n] ];
2514         int refn1 = h->ref_cache[1][ scan8[n] ];
2515
2516         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2517                     dest_y, dest_cb, dest_cr,
2518                     x_offset, y_offset, qpix_put, chroma_put);
2519         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2520                     tmp_y, tmp_cb, tmp_cr,
2521                     x_offset, y_offset, qpix_put, chroma_put);
2522
2523         if(h->use_weight == 2){
2524             int weight0 = h->implicit_weight[refn0][refn1];
2525             int weight1 = 64 - weight0;
2526             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2527             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2528             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2529         }else{
2530             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2531                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2532                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2533             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2534                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2535                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2536             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2537                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2538                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2539         }
2540     }else{
2541         int list = list1 ? 1 : 0;
2542         int refn = h->ref_cache[list][ scan8[n] ];
2543         Picture *ref= &h->ref_list[list][refn];
2544         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2545                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2546                     qpix_put, chroma_put);
2547
2548         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2549                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2550         if(h->use_weight_chroma){
2551             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2552                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2553             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2554                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2555         }
2556     }
2557 }
2558
2559 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2560                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2561                            int x_offset, int y_offset,
2562                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2563                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2564                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2565                            int list0, int list1){
2566     if((h->use_weight==2 && list0 && list1
2567         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2568        || h->use_weight==1)
2569         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2570                          x_offset, y_offset, qpix_put, chroma_put,
2571                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2572     else
2573         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2574                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2575 }
2576
2577 static inline void prefetch_motion(H264Context *h, int list){
2578     /* fetch pixels for estimated mv 4 macroblocks ahead
2579      * optimized for 64byte cache lines */
2580     MpegEncContext * const s = &h->s;
2581     const int refn = h->ref_cache[list][scan8[0]];
2582     if(refn >= 0){
2583         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2584         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2585         uint8_t **src= h->ref_list[list][refn].data;
2586         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2587         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2588         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2589         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2590     }
2591 }
2592
/**
 * Performs inter prediction for the current macroblock by dispatching on its
 * partitioning.
 *
 * The macroblock type is read from s->current_picture.mb_type at the current
 * mb_x/mb_y.  One mc_part() call is made per partition (16x16, 16x8, 8x16) or
 * per sub-partition of each 8x8 block (8x8, 8x4, 4x8, 4x4).  Each call selects
 * the function-table entries that match the partition size and passes the
 * list-usage flags obtained from IS_DIR().
 *
 * The positional arguments of mc_part() after h are: block index n, square
 * flag, chroma height, delta (offset of the second luma call for non-square
 * partitions), the three destination planes, and x/y offsets inside the
 * macroblock.
 *
 * prefetch_motion() is called for list 0 before and for list 1 after the
 * prediction.
 */
2593 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2594                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2595                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2596                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2597     MpegEncContext * const s = &h->s;
2598     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2599     const int mb_type= s->current_picture.mb_type[mb_xy];
2600
2601     assert(IS_INTER(mb_type));
2602
2603     prefetch_motion(h, 0);
2604
2605     if(IS_16X16(mb_type)){
    /* one square partition covering the whole macroblock */
2606         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2607                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2608                 &weight_op[0], &weight_avg[0],
2609                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2610     }else if(IS_16X8(mb_type)){
    /* two partitions, the second at y_offset 4; delta 8 for the second luma call */
2611         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2612                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2613                 &weight_op[1], &weight_avg[1],
2614                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2615         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2616                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2617                 &weight_op[1], &weight_avg[1],
2618                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2619     }else if(IS_8X16(mb_type)){
    /* two partitions, the second at x_offset 4; delta is 8 luma lines */
2620         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2621                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2622                 &weight_op[2], &weight_avg[2],
2623                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2624         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2625                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2626                 &weight_op[2], &weight_avg[2],
2627                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2628     }else{
2629         int i;
2630
2631         assert(IS_8X8(mb_type));
2632
    /* four 8x8 blocks, each with its own sub-partitioning from h->sub_mb_type[] */
2633         for(i=0; i<4; i++){
2634             const int sub_mb_type= h->sub_mb_type[i];
2635             const int n= 4*i;
2636             int x_offset= (i&1)<<2;
2637             int y_offset= (i&2)<<1;
2638
2639             if(IS_SUB_8X8(sub_mb_type)){
2640                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2641                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2642                     &weight_op[3], &weight_avg[3],
2643                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2644             }else if(IS_SUB_8X4(sub_mb_type)){
2645                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2646                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2647                     &weight_op[4], &weight_avg[4],
2648                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2649                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2650                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2651                     &weight_op[4], &weight_avg[4],
2652                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2653             }else if(IS_SUB_4X8(sub_mb_type)){
2654                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2655                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2656                     &weight_op[5], &weight_avg[5],
2657                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2658                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2659                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2660                     &weight_op[5], &weight_avg[5],
2661                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2662             }else{
2663                 int j;
2664                 assert(IS_SUB_4X4(sub_mb_type));
2665                 for(j=0; j<4; j++){
2666                     int sub_x_offset= x_offset + 2*(j&1);
2667                     int sub_y_offset= y_offset +   (j&2);
2668                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2669                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2670                         &weight_op[6], &weight_avg[6],
2671                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2672                 }
2673             }
2674         }
2675     }
2676
2677     prefetch_motion(h, 1);
2678 }
2679
2680 static void decode_init_vlc(void){
2681     static int done = 0;
2682
2683     if (!done) {
2684         int i;
2685         done = 1;
2686
2687         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2688                  &chroma_dc_coeff_token_len [0], 1, 1,
2689                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2690
2691         for(i=0; i<4; i++){
2692             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2693                      &coeff_token_len [i][0], 1, 1,
2694                      &coeff_token_bits[i][0], 1, 1, 1);
2695         }
2696
2697         for(i=0; i<3; i++){
2698             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2699                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2700                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2701         }
2702         for(i=0; i<15; i++){
2703             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2704                      &total_zeros_len [i][0], 1, 1,
2705                      &total_zeros_bits[i][0], 1, 1, 1);
2706         }
2707
2708         for(i=0; i<6; i++){
2709             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2710                      &run_len [i][0], 1, 1,
2711                      &run_bits[i][0], 1, 1, 1);
2712         }
2713         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2714                  &run_len [6][0], 1, 1,
2715                  &run_bits[6][0], 1, 1, 1);
2716     }
2717 }
2718
2719 /**
2720  * Sets the intra prediction function pointers.
2721  */
2722 static void init_pred_ptrs(H264Context *h){
2723 //    MpegEncContext * const s = &h->s;
2724
2725     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2726     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2727     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2728     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2729     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2730     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2731     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2732     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2733     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2734     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2735     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2736     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2737
2738     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2739     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2740     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2741     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2742     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2743     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2744     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2745     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2746     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2747     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2748     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2749     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2750
2751     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
2752     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
2753     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
2754     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
2755     h->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
2756     h->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
2757     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
2758
2759     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
2760     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
2761     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
2762     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
2763     h->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
2764     h->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
2765     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
2766 }
2767
2768 static void free_tables(H264Context *h){
2769     int i;
2770     av_freep(&h->intra4x4_pred_mode);
2771     av_freep(&h->chroma_pred_mode_table);
2772     av_freep(&h->cbp_table);
2773     av_freep(&h->mvd_table[0]);
2774     av_freep(&h->mvd_table[1]);
2775     av_freep(&h->direct_table);
2776     av_freep(&h->non_zero_count);
2777     av_freep(&h->slice_table_base);
2778     av_freep(&h->top_borders[1]);
2779     av_freep(&h->top_borders[0]);
2780     h->slice_table= NULL;
2781
2782     av_freep(&h->mb2b_xy);
2783     av_freep(&h->mb2b8_xy);
2784
2785     av_freep(&h->s.obmc_scratchpad);
2786
2787     for(i = 0; i < MAX_SPS_COUNT; i++)
2788         av_freep(h->sps_buffers + i);
2789
2790     for(i = 0; i < MAX_PPS_COUNT; i++)
2791         av_freep(h->pps_buffers + i);
2792 }
2793
2794 static void init_dequant8_coeff_table(H264Context *h){
2795     int i,q,x;
2796     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2797     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2798     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2799
2800     for(i=0; i<2; i++ ){
2801         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2802             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2803             break;
2804         }
2805
2806         for(q=0; q<52; q++){
2807             int shift = ff_div6[q];
2808             int idx = ff_rem6[q];
2809             for(x=0; x<64; x++)
2810                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2811                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2812                     h->pps.scaling_matrix8[i][x]) << shift;
2813         }
2814     }
2815 }
2816
2817 static void init_dequant4_coeff_table(H264Context *h){
2818     int i,j,q,x;
2819     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2820     for(i=0; i<6; i++ ){
2821         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2822         for(j=0; j<i; j++){
2823             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2824                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2825                 break;
2826             }
2827         }
2828         if(j<i)
2829             continue;
2830
2831         for(q=0; q<52; q++){
2832             int shift = ff_div6[q] + 2;
2833             int idx = ff_rem6[q];
2834             for(x=0; x<16; x++)
2835                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2836                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2837                     h->pps.scaling_matrix4[i][x]) << shift;
2838         }
2839     }
2840 }
2841
2842 static void init_dequant_tables(H264Context *h){
2843     int i,x;
2844     init_dequant4_coeff_table(h);
2845     if(h->pps.transform_8x8_mode)
2846         init_dequant8_coeff_table(h);
2847     if(h->sps.transform_bypass){
2848         for(i=0; i<6; i++)
2849             for(x=0; x<16; x++)
2850                 h->dequant4_coeff[i][0][x] = 1<<6;
2851         if(h->pps.transform_8x8_mode)
2852             for(i=0; i<2; i++)
2853                 for(x=0; x<64; x++)
2854                     h->dequant8_coeff[i][0][x] = 1<<6;
2855     }
2856 }
2857
2858
2859 /**
2860  * allocates tables.
2861  * needs width/height
2862  */
2863 static int alloc_tables(H264Context *h){
2864     MpegEncContext * const s = &h->s;
2865     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2866     int x,y;
2867
2868     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2869
2870     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2871     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2872     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2873     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2874     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2875
2876     if( h->pps.cabac ) {
2877         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2878         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2879         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2880         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2881     }
2882
2883     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2884     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2885
2886     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2887     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2888     for(y=0; y<s->mb_height; y++){
2889         for(x=0; x<s->mb_width; x++){
2890             const int mb_xy= x + y*s->mb_stride;
2891             const int b_xy = 4*x + 4*y*h->b_stride;
2892             const int b8_xy= 2*x + 2*y*h->b8_stride;
2893
2894             h->mb2b_xy [mb_xy]= b_xy;
2895             h->mb2b8_xy[mb_xy]= b8_xy;
2896         }
2897     }
2898
2899     s->obmc_scratchpad = NULL;
2900
2901     if(!h->dequant4_coeff[0])
2902         init_dequant_tables(h);
2903
2904     return 0;
2905 fail:
2906     free_tables(h);
2907     return -1;
2908 }
2909
2910 static void common_init(H264Context *h){
2911     MpegEncContext * const s = &h->s;
2912
2913     s->width = s->avctx->width;
2914     s->height = s->avctx->height;
2915     s->codec_id= s->avctx->codec->id;
2916
2917     init_pred_ptrs(h);
2918
2919     h->dequant_coeff_pps= -1;
2920     s->unrestricted_mv=1;
2921     s->decode=1; //FIXME
2922
2923     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2924     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2925 }
2926
2927 static int decode_init(AVCodecContext *avctx){
2928     H264Context *h= avctx->priv_data;
2929     MpegEncContext * const s = &h->s;
2930
2931     MPV_decode_defaults(s);
2932
2933     s->avctx = avctx;
2934     common_init(h);
2935
2936     s->out_format = FMT_H264;
2937     s->workaround_bugs= avctx->workaround_bugs;
2938
2939     // set defaults
2940 //    s->decode_mb= ff_h263_decode_mb;
2941     s->low_delay= 1;
2942     avctx->pix_fmt= PIX_FMT_YUV420P;
2943
2944     decode_init_vlc();
2945
2946     if(avctx->extradata_size > 0 && avctx->extradata &&
2947        *(char *)avctx->extradata == 1){
2948         h->is_avc = 1;
2949         h->got_avcC = 0;
2950     } else {
2951         h->is_avc = 0;
2952     }
2953
2954     return 0;
2955 }
2956
2957 static int frame_start(H264Context *h){
2958     MpegEncContext * const s = &h->s;
2959     int i;
2960
2961     if(MPV_frame_start(s, s->avctx) < 0)
2962         return -1;
2963     ff_er_frame_start(s);
2964
2965     assert(s->linesize && s->uvlinesize);
2966
2967     for(i=0; i<16; i++){
2968         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2969         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2970     }
2971     for(i=0; i<4; i++){
2972         h->block_offset[16+i]=
2973         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2974         h->block_offset[24+16+i]=
2975         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2976     }
2977
2978     /* can't be in alloc_tables because linesize isn't known there.
2979      * FIXME: redo bipred weight to not require extra buffer? */
2980     if(!s->obmc_scratchpad)
2981         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2982
2983     /* some macroblocks will be accessed before they're available */
2984     if(FRAME_MBAFF)
2985         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2986
2987 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2988     return 0;
2989 }
2990
2991 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2992     MpegEncContext * const s = &h->s;
2993     int i;
2994
2995     src_y  -=   linesize;
2996     src_cb -= uvlinesize;
2997     src_cr -= uvlinesize;
2998
2999     // There are two lines saved, the line above the the top macroblock of a pair,
3000     // and the line above the bottom macroblock
3001     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3002     for(i=1; i<17; i++){
3003         h->left_border[i]= src_y[15+i*  linesize];
3004     }
3005
3006     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3007     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3008
3009     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3010         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3011         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3012         for(i=1; i<9; i++){
3013             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3014             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3015         }
3016         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3017         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3018     }
3019 }
3020
3021 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
3022     MpegEncContext * const s = &h->s;
3023     int temp8, i;
3024     uint64_t temp64;
3025     int deblock_left;
3026     int deblock_top;
3027     int mb_xy;
3028
3029     if(h->deblocking_filter == 2) {
3030         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
3031         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
3032         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
3033     } else {
3034         deblock_left = (s->mb_x > 0);
3035         deblock_top =  (s->mb_y > 0);
3036     }
3037
3038     src_y  -=   linesize + 1;
3039     src_cb -= uvlinesize + 1;
3040     src_cr -= uvlinesize + 1;
3041
3042 #define XCHG(a,b,t,xchg)\
3043 t= a;\
3044 if(xchg)\
3045     a= b;\
3046 b= t;
3047
3048     if(deblock_left){
3049         for(i = !deblock_top; i<17; i++){
3050             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3051         }
3052     }
3053
3054     if(deblock_top){
3055         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3056         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3057         if(s->mb_x+1 < s->mb_width){
3058             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3059         }
3060     }
3061
3062     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3063         if(deblock_left){
3064             for(i = !deblock_top; i<9; i++){
3065                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3066                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3067             }
3068         }
3069         if(deblock_top){
3070             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3071             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3072         }
3073     }
3074 }
3075
3076 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3077     MpegEncContext * const s = &h->s;
3078     int i;
3079
3080     src_y  -= 2 *   linesize;
3081     src_cb -= 2 * uvlinesize;
3082     src_cr -= 2 * uvlinesize;
3083
3084     // There are two lines saved, the line above the the top macroblock of a pair,
3085     // and the line above the bottom macroblock
3086     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3087     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3088     for(i=2; i<34; i++){
3089         h->left_border[i]= src_y[15+i*  linesize];
3090     }
3091
3092     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3093     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3094     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3095     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3096
3097     if(!(s->flags&CODEC_FLAG_GRAY)){
3098         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3099         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3100         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3101         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3102         for(i=2; i<18; i++){
3103             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3104             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3105         }
3106         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3107         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3108         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3109         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3110     }
3111 }
3112
3113 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3114     MpegEncContext * const s = &h->s;
3115     int temp8, i;
3116     uint64_t temp64;
3117     int deblock_left = (s->mb_x > 0);
3118     int deblock_top  = (s->mb_y > 1);
3119
3120     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3121
3122     src_y  -= 2 *   linesize + 1;
3123     src_cb -= 2 * uvlinesize + 1;
3124     src_cr -= 2 * uvlinesize + 1;
3125
3126 #define XCHG(a,b,t,xchg)\
3127 t= a;\
3128 if(xchg)\
3129     a= b;\
3130 b= t;
3131
3132     if(deblock_left){
3133         for(i = (!deblock_top)<<1; i<34; i++){
3134             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3135         }
3136     }
3137
3138     if(deblock_top){
3139         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3140         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3141         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3142         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3143         if(s->mb_x+1 < s->mb_width){
3144             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3145             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3146         }
3147     }
3148
3149     if(!(s->flags&CODEC_FLAG_GRAY)){
3150         if(deblock_left){
3151             for(i = (!deblock_top) << 1; i<18; i++){
3152                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3153                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3154             }
3155         }
3156         if(deblock_top){
3157             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3158             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3159             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3160             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3161         }
3162     }
3163 }
3164
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture: intra prediction or hl_motion(), followed by adding
 * the residual through the selected idct functions and, if
 * h->deblocking_filter is set, border backup and loop filtering.
 * @param simple when non-zero, every branch guarded by !simple (field/MBAFF
 *               handling, intra PCM) is skipped, and the CODEC_FLAG_GRAY,
 *               s->encoding and codec id tests are short-circuited
 */
3165 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
3166     MpegEncContext * const s = &h->s;
3167     const int mb_x= s->mb_x;
3168     const int mb_y= s->mb_y;
3169     const int mb_xy= mb_x + mb_y*s->mb_stride;
3170     const int mb_type= s->current_picture.mb_type[mb_xy];
3171     uint8_t  *dest_y, *dest_cb, *dest_cr;
3172     int linesize, uvlinesize /*dct_offset*/;
3173     int i;
3174     int *block_offset = &h->block_offset[0];
3175     const unsigned int bottom = mb_y & 1;
3176     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
3177     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3178     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3179
    /* destination of this macroblock in the three picture planes */
3180     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3181     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3182     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3183
3184     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3185     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3186
    /* field macroblock: doubled line sizes and the second half of
     * block_offset[]; for odd mb_y the destination is moved up by 15 luma
     * and 7 chroma lines */
3187     if (!simple && MB_FIELD) {
3188         linesize   = h->mb_linesize   = s->linesize * 2;
3189         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3190         block_offset = &h->block_offset[24];
3191         if(mb_y&1){ //FIXME move out of this func?
3192             dest_y -= s->linesize*15;
3193             dest_cb-= s->uvlinesize*7;
3194             dest_cr-= s->uvlinesize*7;
3195         }
3196         if(FRAME_MBAFF) {
3197             int list;
3198             for(list=0; list<h->list_count; list++){
3199                 if(!USES_LIST(mb_type, list))
3200                     continue;
3201                 if(IS_16X16(mb_type)){
3202                     int8_t *ref = &h->ref_cache[list][scan8[0]];
                    /* note: parses as (16 + *ref) ^ (s->mb_y&1), + binds tighter than ^ */
3203                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3204                 }else{
3205                     for(i=0; i<16; i+=4){
3206                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3207                         int ref = h->ref_cache[list][scan8[i]];
3208                         if(ref >= 0)
3209                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3210                     }
3211                 }
3212             }
3213         }
3214     } else {
3215         linesize   = h->mb_linesize   = s->linesize;
3216         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3217 //        dct_offset = s->linesize * 16;
3218     }
3219
    /* pick the functions that add the luma residual: plain pixel adds for
     * transform bypass, otherwise the 8x8 or 4x4 idct variants */
3220     if(transform_bypass){
3221         idct_dc_add =
3222         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3223     }else if(IS_8x8DCT(mb_type)){
3224         idct_dc_add = s->dsp.h264_idct8_dc_add;
3225         idct_add = s->dsp.h264_idct8_add;
3226     }else{
3227         idct_dc_add = s->dsp.h264_idct_dc_add;
3228         idct_add = s->dsp.h264_idct_add;
3229     }
3230
    /* MBAFF with deblocking: exchange the pair borders before an intra
     * macroblock is predicted; for the bottom macroblock only if the one
     * above it is not intra */
3231     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3232        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3233         int mbt_y = mb_y&~1;
3234         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3235         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3236         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3237         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3238     }
3239
3240     if (!simple && IS_INTRA_PCM(mb_type)) {
3241         unsigned int x, y;
3242
3243         // The pixels are stored in h->mb array in the same order as levels,
3244         // copy them in output in the correct order.
3245         for(i=0; i<16; i++) {
3246             for (y=0; y<4; y++) {
3247                 for (x=0; x<4; x++) {
3248                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3249                 }
3250             }
3251         }
3252         for(i=16; i<16+4; i++) {
3253             for (y=0; y<4; y++) {
3254                 for (x=0; x<4; x++) {
3255                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3256                 }
3257             }
3258         }
3259         for(i=20; i<20+4; i++) {
3260             for (y=0; y<4; y++) {
3261                 for (x=0; x<4; x++) {
3262                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3263                 }
3264             }
3265         }
3266     } else {
3267         if(IS_INTRA(mb_type)){
            /* non-MBAFF: borders are exchanged with xchg=1 here and with
             * xchg=0 again once the intra prediction below is done */
3268             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3269                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
3270
3271             if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3272                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3273                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3274             }
3275
3276             if(IS_INTRA4x4(mb_type)){
3277                 if(simple || !s->encoding){
3278                     if(IS_8x8DCT(mb_type)){
3279                         for(i=0; i<16; i+=4){
3280                             uint8_t * const ptr= dest_y + block_offset[i];
3281                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3282                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3283                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3284                                                    (h->topright_samples_available<<i)&0x4000, linesize);
3285                             if(nnz){
3286                                 if(nnz == 1 && h->mb[i*16])
3287                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3288                                 else
3289                                     idct_add(ptr, h->mb + i*16, linesize);
3290                             }
3291                         }
3292                     }else
3293                     for(i=0; i<16; i++){
3294                         uint8_t * const ptr= dest_y + block_offset[i];
3295                         uint8_t *topright;
3296                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3297                         int nnz, tr;
3298
3299                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3300                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3301                             assert(mb_y || linesize <= block_offset[i]);
3302                             if(!topright_avail){
                                /* no top-right samples: use the sample at ptr[3 - linesize] replicated 4 times */
3303                                 tr= ptr[3 - linesize]*0x01010101;
3304                                 topright= (uint8_t*) &tr;
3305                             }else
3306                                 topright= ptr + 4 - linesize;
3307                         }else
3308                             topright= NULL;
3309
3310                         h->pred4x4[ dir ](ptr, topright, linesize);
3311                         nnz = h->non_zero_count_cache[ scan8[i] ];
3312                         if(nnz){
3313                             if(is_h264){
3314                                 if(nnz == 1 && h->mb[i*16])
3315                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3316                                 else
3317                                     idct_add(ptr, h->mb + i*16, linesize);
3318                             }else
3319                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3320                         }
3321                     }
3322                 }
3323             }else{
3324                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3325                 if(is_h264){
3326                     if(!transform_bypass)
3327                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3328                 }else
3329                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3330             }
3331             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3332                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
3333         }else if(is_h264){
3334             hl_motion(h, dest_y, dest_cb, dest_cr,
3335                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3336                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3337                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3338         }
3339
3340
        /* luma residual of everything but intra 4x4, which was already
         * added block by block during prediction above */
3341         if(!IS_INTRA4x4(mb_type)){
3342             if(is_h264){
3343                 if(IS_INTRA16x16(mb_type)){
3344                     for(i=0; i<16; i++){
3345                         if(h->non_zero_count_cache[ scan8[i] ])
3346                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3347                         else if(h->mb[i*16])
3348                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3349                     }
3350                 }else{
3351                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3352                     for(i=0; i<16; i+=di){
3353                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3354                         if(nnz){
3355                             if(nnz==1 && h->mb[i*16])
3356                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3357                             else
3358                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3359                         }
3360                     }
3361                 }
3362             }else{
3363                 for(i=0; i<16; i++){
3364                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3365                         uint8_t * const ptr= dest_y + block_offset[i];
3366                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3367                     }
3368                 }
3369             }
3370         }
3371
        /* chroma residual: blocks 16..19 go to dest_cb, 20..23 to dest_cr */
3372         if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3373             uint8_t *dest[2] = {dest_cb, dest_cr};
3374             if(transform_bypass){
3375                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3376             }else{
3377                 idct_add = s->dsp.h264_idct_add;
3378                 idct_dc_add = s->dsp.h264_idct_dc_add;
3379                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3380                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3381             }
3382             if(is_h264){
3383                 for(i=16; i<16+8; i++){
3384                     if(h->non_zero_count_cache[ scan8[i] ])
3385                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3386                     else if(h->mb[i*16])
3387                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3388                 }
3389             }else{
3390                 for(i=16; i<16+8; i++){
3391                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3392                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3393                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3394                     }
3395                 }
3396             }
3397         }
3398     }
3399     if(h->deblocking_filter) {
3400         if (!simple && FRAME_MBAFF) {
3401             //FIXME try deblocking one mb at a time?
3402             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            /* these mb_y/mb_xy shadow the outer ones; once the early return
             * for the top macroblock has passed they refer to the top
             * macroblock of the pair */
3403             const int mb_y = s->mb_y - 1;
3404             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3405             const int mb_xy= mb_x + mb_y*s->mb_stride;
3406             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3407             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            /* the pair is only filtered after its bottom macroblock (odd mb_y) */
3408             if (!bottom) return;
3409             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3410             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3411             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3412
3413             if(IS_INTRA(mb_type_top | mb_type_bottom))
3414                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3415
3416             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3417             // deblock a pair
3418             // top
            /* s->mb_y is temporarily decremented while the top macroblock is filtered */
3419             s->mb_y--;
3420             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3421             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3422             h->chroma_qp = get_chroma_qp(h, s->current_picture.qscale_table[mb_xy]);
3423             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3424             // bottom
3425             s->mb_y++;
3426             tprintf(h->s.avctx, "call mbaff filter_mb\n");
3427             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3428             h->chroma_qp = get_chroma_qp(h, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3429             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3430         } else {
3431             tprintf(h->s.avctx, "call filter_mb\n");
3432             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
3433             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3434             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3435         }
3436     }
3437 }
3438
3439 /**
3440  * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * Calls hl_decode_mb_internal() with simple=1; hl_decode_mb() picks this
 * variant when none of its "complex" conditions hold.
3441  */
3442 static void hl_decode_mb_simple(H264Context *h){
3443     hl_decode_mb_internal(h, 1);
3444 }
3445
3446 /**
3447  * Process a macroblock; this handles edge cases, such as interlacing.
 * Calls hl_decode_mb_internal() with simple=0, which enables the branches
 * guarded by !simple (MB_FIELD/MBAFF, intra PCM, gray, non-H.264 codec id).
3448  */
3449 static void av_noinline hl_decode_mb_complex(H264Context *h){
3450     hl_decode_mb_internal(h, 0);
3451 }
3452
3453 static void hl_decode_mb(H264Context *h){
3454     MpegEncContext * const s = &h->s;
3455     const int mb_x= s->mb_x;
3456     const int mb_y= s->mb_y;
3457     const int mb_xy= mb_x + mb_y*s->mb_stride;
3458     const int mb_type= s->current_picture.mb_type[mb_xy];
3459     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding;
3460
3461     if(!s->decode)
3462         return;
3463
3464     if (is_complex)
3465         hl_decode_mb_complex(h);
3466     else hl_decode_mb_simple(h);
3467 }
3468
3469 /**
3470  * fills the default_ref_list.
3471  */
3472 static int fill_default_ref_list(H264Context *h){
3473     MpegEncContext * const s = &h->s;
3474     int i;
3475     int smallest_poc_greater_than_current = -1;
3476     Picture sorted_short_ref[32];
3477
3478     if(h->slice_type==B_TYPE){
3479         int out_i;
3480         int limit= INT_MIN;
3481
3482         /* sort frame according to poc in B slice */
3483         for(out_i=0; out_i<h->short_ref_count; out_i++){
3484             int best_i=INT_MIN;
3485             int best_poc=INT_MAX;
3486
3487             for(i=0; i<h->short_ref_count; i++){
3488                 const int poc= h->short_ref[i]->poc;
3489                 if(poc > limit && poc < best_poc){
3490                     best_poc= poc;
3491                     best_i= i;
3492                 }
3493             }
3494
3495             assert(best_i != INT_MIN);
3496
3497             limit= best_poc;
3498             sorted_short_ref[out_i]= *h->short_ref[best_i];
3499             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3500             if (-1 == smallest_poc_greater_than_current) {
3501                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3502                     smallest_poc_greater_than_current = out_i;
3503                 }
3504             }
3505         }
3506     }
3507
3508     if(s->picture_structure == PICT_FRAME){
3509         if(h->slice_type==B_TYPE){
3510             int list;
3511             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3512
3513             // find the largest poc
3514             for(list=0; list<2; list++){
3515                 int index = 0;
3516                 int j= -99;
3517                 int step= list ? -1 : 1;
3518
3519                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3520                     while(j<0 || j>= h->short_ref_count){
3521                         if(j != -99 && step == (list ? -1 : 1))
3522                             return -1;
3523                         step = -step;
3524                         j= smallest_poc_greater_than_current + (step>>1);
3525                     }
3526                     if(sorted_short_ref[j].reference != 3) continue;
3527                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3528                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3529                 }
3530
3531                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3532                     if(h->long_ref[i] == NULL) continue;
3533                     if(h->long_ref[i]->reference != 3) continue;
3534
3535                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3536                     h->default_ref_list[ list ][index++].pic_id= i;;
3537                 }
3538
3539                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3540                     // swap the two first elements of L1 when
3541                     // L0 and L1 are identical
3542                     Picture temp= h->default_ref_list[1][0];
3543                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3544                     h->default_ref_list[1][1] = temp;
3545                 }
3546
3547                 if(index < h->ref_count[ list ])
3548                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3549             }
3550         }else{
3551             int index=0;
3552             for(i=0; i<h->short_ref_count; i++){
3553                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3554                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3555                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3556             }
3557             for(i = 0; i < 16; i++){
3558                 if(h->long_ref[i] == NULL) continue;
3559                 if(h->long_ref[i]->reference != 3) continue;
3560                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3561                 h->default_ref_list[0][index++].pic_id= i;;
3562             }
3563             if(index < h->ref_count[0])
3564                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3565         }
3566     }else{ //FIELD
3567         if(h->slice_type==B_TYPE){
3568         }else{
3569             //FIXME second field balh
3570         }
3571     }
3572 #ifdef TRACE
3573     for (i=0; i<h->ref_count[0]; i++) {
3574         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3575     }
3576     if(h->slice_type==B_TYPE){
3577         for (i=0; i<h->ref_count[1]; i++) {
3578             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3579         }
3580     }
3581 #endif
3582     return 0;
3583 }
3584
3585 static void print_short_term(H264Context *h);
3586 static void print_long_term(H264Context *h);
3587
3588 static int decode_ref_pic_list_reordering(H264Context *h){
3589     MpegEncContext * const s = &h->s;
3590     int list, index;
3591
3592     print_short_term(h);
3593     print_long_term(h);
3594     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3595
3596     for(list=0; list<h->list_count; list++){
3597         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3598
3599         if(get_bits1(&s->gb)){
3600             int pred= h->curr_pic_num;
3601
3602             for(index=0; ; index++){
3603                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3604                 unsigned int pic_id;
3605                 int i;
3606                 Picture *ref = NULL;
3607
3608                 if(reordering_of_pic_nums_idc==3)
3609                     break;
3610
3611                 if(index >= h->ref_count[list]){
3612                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3613                     return -1;
3614                 }
3615
3616                 if(reordering_of_pic_nums_idc<3){
3617                     if(reordering_of_pic_nums_idc<2){
3618                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3619
3620                         if(abs_diff_pic_num >= h->max_pic_num){
3621                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3622                             return -1;
3623                         }
3624
3625                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3626                         else                                pred+= abs_diff_pic_num;
3627                         pred &= h->max_pic_num - 1;
3628
3629                         for(i= h->short_ref_count-1; i>=0; i--){
3630                             ref = h->short_ref[i];
3631                             assert(ref->reference == 3);
3632                             assert(!ref->long_ref);
3633                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3634                                 break;
3635                         }
3636                         if(i>=0)
3637                             ref->pic_id= ref->frame_num;
3638                     }else{
3639                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3640                         if(pic_id>31){
3641                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3642                             return -1;
3643                         }
3644                         ref = h->long_ref[pic_id];
3645                         if(ref){
3646                             ref->pic_id= pic_id;
3647                             assert(ref->reference == 3);
3648                             assert(ref->long_ref);
3649                             i=0;
3650                         }else{
3651                             i=-1;
3652                         }
3653                     }
3654
3655                     if (i < 0) {
3656                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3657                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3658                     } else {
3659                         for(i=index; i+1<h->ref_count[list]; i++){
3660                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3661                                 break;
3662                         }
3663                         for(; i > index; i--){
3664                             h->ref_list[list][i]= h->ref_list[list][i-1];
3665                         }
3666                         h->ref_list[list][index]= *ref;
3667                     }
3668                 }else{
3669                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3670                     return -1;
3671                 }
3672             }
3673         }
3674     }
3675     for(list=0; list<h->list_count; list++){
3676         for(index= 0; index < h->ref_count[list]; index++){
3677             if(!h->ref_list[list][index].data[0])
3678                 h->ref_list[list][index]= s->current_picture;
3679         }
3680     }
3681
3682     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3683         direct_dist_scale_factor(h);
3684     direct_ref_list_init(h);
3685     return 0;
3686 }
3687
3688 static void fill_mbaff_ref_list(H264Context *h){
3689     int list, i, j;
3690     for(list=0; list<2; list++){ //FIXME try list_count
3691         for(i=0; i<h->ref_count[list]; i++){
3692             Picture *frame = &h->ref_list[list][i];
3693             Picture *field = &h->ref_list[list][16+2*i];
3694             field[0] = *frame;
3695             for(j=0; j<3; j++)
3696                 field[0].linesize[j] <<= 1;
3697             field[1] = field[0];
3698             for(j=0; j<3; j++)
3699                 field[1].data[j] += frame->linesize[j];
3700
3701             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3702             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3703             for(j=0; j<2; j++){
3704                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3705                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3706             }
3707         }
3708     }
3709     for(j=0; j<h->ref_count[1]; j++){
3710         for(i=0; i<h->ref_count[0]; i++)
3711             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3712         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3713         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3714     }
3715 }
3716
3717 static int pred_weight_table(H264Context *h){
3718     MpegEncContext * const s = &h->s;
3719     int list, i;
3720     int luma_def, chroma_def;
3721
3722     h->use_weight= 0;
3723     h->use_weight_chroma= 0;
3724     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3725     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3726     luma_def = 1<<h->luma_log2_weight_denom;
3727     chroma_def = 1<<h->chroma_log2_weight_denom;
3728
3729     for(list=0; list<2; list++){
3730         for(i=0; i<h->ref_count[list]; i++){
3731             int luma_weight_flag, chroma_weight_flag;
3732
3733             luma_weight_flag= get_bits1(&s->gb);
3734             if(luma_weight_flag){
3735                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3736                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3737                 if(   h->luma_weight[list][i] != luma_def
3738                    || h->luma_offset[list][i] != 0)
3739                     h->use_weight= 1;
3740             }else{
3741                 h->luma_weight[list][i]= luma_def;
3742                 h->luma_offset[list][i]= 0;
3743             }
3744
3745             chroma_weight_flag= get_bits1(&s->gb);
3746             if(chroma_weight_flag){
3747                 int j;
3748                 for(j=0; j<2; j++){
3749                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3750                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3751                     if(   h->chroma_weight[list][i][j] != chroma_def
3752                        || h->chroma_offset[list][i][j] != 0)
3753                         h->use_weight_chroma= 1;
3754                 }
3755             }else{
3756                 int j;
3757                 for(j=0; j<2; j++){
3758                     h->chroma_weight[list][i][j]= chroma_def;
3759                     h->chroma_offset[list][i][j]= 0;
3760                 }
3761             }
3762         }
3763         if(h->slice_type != B_TYPE) break;
3764     }
3765     h->use_weight= h->use_weight || h->use_weight_chroma;
3766     return 0;
3767 }
3768
3769 static void implicit_weight_table(H264Context *h){
3770     MpegEncContext * const s = &h->s;
3771     int ref0, ref1;
3772     int cur_poc = s->current_picture_ptr->poc;
3773
3774     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3775        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3776         h->use_weight= 0;
3777         h->use_weight_chroma= 0;
3778         return;
3779     }
3780
3781     h->use_weight= 2;
3782     h->use_weight_chroma= 2;
3783     h->luma_log2_weight_denom= 5;
3784     h->chroma_log2_weight_denom= 5;
3785
3786     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3787         int poc0 = h->ref_list[0][ref0].poc;
3788         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3789             int poc1 = h->ref_list[1][ref1].poc;
3790             int td = av_clip(poc1 - poc0, -128, 127);
3791             if(td){
3792                 int tb = av_clip(cur_poc - poc0, -128, 127);
3793                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3794                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3795                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3796                     h->implicit_weight[ref0][ref1] = 32;
3797                 else
3798                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3799             }else
3800                 h->implicit_weight[ref0][ref1] = 32;
3801         }
3802     }
3803 }
3804
3805 static inline void unreference_pic(H264Context *h, Picture *pic){
3806     int i;
3807     pic->reference=0;
3808     if(pic == h->delayed_output_pic)
3809         pic->reference=1;
3810     else{
3811         for(i = 0; h->delayed_pic[i]; i++)
3812             if(pic == h->delayed_pic[i]){
3813                 pic->reference=1;
3814                 break;
3815             }
3816     }
3817 }
3818
3819 /**
3820  * instantaneous decoder refresh.
3821  */
3822 static void idr(H264Context *h){
3823     int i;
3824
3825     for(i=0; i<16; i++){
3826         if (h->long_ref[i] != NULL) {
3827             unreference_pic(h, h->long_ref[i]);
3828             h->long_ref[i]= NULL;
3829         }
3830     }
3831     h->long_ref_count=0;
3832
3833     for(i=0; i<h->short_ref_count; i++){
3834         unreference_pic(h, h->short_ref[i]);
3835         h->short_ref[i]= NULL;
3836     }
3837     h->short_ref_count=0;
3838 }
3839
3840 /* forget old pics after a seek */
3841 static void flush_dpb(AVCodecContext *avctx){
3842     H264Context *h= avctx->priv_data;
3843     int i;
3844     for(i=0; i<16; i++) {
3845         if(h->delayed_pic[i])
3846             h->delayed_pic[i]->reference= 0;
3847         h->delayed_pic[i]= NULL;
3848     }
3849     if(h->delayed_output_pic)
3850         h->delayed_output_pic->reference= 0;
3851     h->delayed_output_pic= NULL;
3852     idr(h);
3853     if(h->s.current_picture_ptr)
3854         h->s.current_picture_ptr->reference= 0;
3855 }
3856
3857 /**
3858  *
3859  * @return the removed picture or NULL if an error occurs
3860  */
3861 static Picture * remove_short(H264Context *h, int frame_num){
3862     MpegEncContext * const s = &h->s;
3863     int i;
3864
3865     if(s->avctx->debug&FF_DEBUG_MMCO)
3866         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3867
3868     for(i=0; i<h->short_ref_count; i++){
3869         Picture *pic= h->short_ref[i];
3870         if(s->avctx->debug&FF_DEBUG_MMCO)
3871             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3872         if(pic->frame_num == frame_num){
3873             h->short_ref[i]= NULL;
3874             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3875             h->short_ref_count--;
3876             return pic;
3877         }
3878     }
3879     return NULL;
3880 }
3881
3882 /**
3883  *
3884  * @return the removed picture or NULL if an error occurs
3885  */
3886 static Picture * remove_long(H264Context *h, int i){
3887     Picture *pic;
3888
3889     pic= h->long_ref[i];
3890     h->long_ref[i]= NULL;
3891     if(pic) h->long_ref_count--;
3892
3893     return pic;
3894 }
3895
3896 /**
3897  * print short term list
3898  */
3899 static void print_short_term(H264Context *h) {
3900     uint32_t i;
3901     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3902         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3903         for(i=0; i<h->short_ref_count; i++){
3904             Picture *pic= h->short_ref[i];
3905             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3906         }
3907     }
3908 }
3909
3910 /**
3911  * print long term list
3912  */
3913 static void print_long_term(H264Context *h) {
3914     uint32_t i;
3915     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3916         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3917         for(i = 0; i < 16; i++){
3918             Picture *pic= h->long_ref[i];
3919             if (pic) {
3920                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3921             }
3922         }
3923     }
3924 }
3925
3926 /**
3927  * Executes the reference picture marking (memory management control operations).
3928  */
3929 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3930     MpegEncContext * const s = &h->s;
3931     int i, j;
3932     int current_is_long=0;
3933     Picture *pic;
3934
3935     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3936         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3937
3938     for(i=0; i<mmco_count; i++){
3939         if(s->avctx->debug&FF_DEBUG_MMCO)
3940             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3941
3942         switch(mmco[i].opcode){
3943         case MMCO_SHORT2UNUSED:
3944             pic= remove_short(h, mmco[i].short_frame_num);
3945             if(pic)
3946                 unreference_pic(h, pic);
3947             else if(s->avctx->debug&FF_DEBUG_MMCO)
3948                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3949             break;
3950         case MMCO_SHORT2LONG:
3951             pic= remove_long(h, mmco[i].long_index);
3952             if(pic) unreference_pic(h, pic);
3953
3954             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3955             if (h->long_ref[ mmco[i].long_index ]){
3956                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3957                 h->long_ref_count++;
3958             }
3959             break;
3960         case MMCO_LONG2UNUSED:
3961             pic= remove_long(h, mmco[i].long_index);
3962             if(pic)
3963                 unreference_pic(h, pic);
3964             else if(s->avctx->debug&FF_DEBUG_MMCO)
3965                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3966             break;
3967         case MMCO_LONG:
3968             pic= remove_long(h, mmco[i].long_index);
3969             if(pic) unreference_pic(h, pic);
3970
3971             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3972             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3973             h->long_ref_count++;
3974
3975             current_is_long=1;
3976             break;
3977         case MMCO_SET_MAX_LONG:
3978             assert(mmco[i].long_index <= 16);
3979             // just remove the long term which index is greater than new max
3980             for(j = mmco[i].long_index; j<16; j++){
3981                 pic = remove_long(h, j);
3982                 if (pic) unreference_pic(h, pic);
3983             }
3984             break;
3985         case MMCO_RESET:
3986             while(h->short_ref_count){
3987                 pic= remove_short(h, h->short_ref[0]->frame_num);
3988                 if(pic) unreference_pic(h, pic);
3989             }
3990             for(j = 0; j < 16; j++) {
3991                 pic= remove_long(h, j);
3992                 if(pic) unreference_pic(h, pic);
3993             }
3994             break;
3995         default: assert(0);
3996         }
3997     }
3998
3999     if(!current_is_long){
4000         pic= remove_short(h, s->current_picture_ptr->frame_num);
4001         if(pic){
4002             unreference_pic(h, pic);
4003             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4004         }
4005
4006         if(h->short_ref_count)
4007             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4008
4009         h->short_ref[0]= s->current_picture_ptr;
4010         h->short_ref[0]->long_ref=0;
4011         h->short_ref_count++;
4012     }
4013
4014     print_short_term(h);
4015     print_long_term(h);
4016     return 0;
4017 }
4018
4019 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
4020     MpegEncContext * const s = &h->s;
4021     int i;
4022
4023     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4024         s->broken_link= get_bits1(gb) -1;
4025         h->mmco[0].long_index= get_bits1(gb) - 1; // current_long_term_idx
4026         if(h->mmco[0].long_index == -1)
4027             h->mmco_index= 0;
4028         else{
4029             h->mmco[0].opcode= MMCO_LONG;
4030             h->mmco_index= 1;
4031         }
4032     }else{
4033         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
4034             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4035                 MMCOOpcode opcode= get_ue_golomb(gb);
4036
4037                 h->mmco[i].opcode= opcode;
4038                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4039                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4040 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4041                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4042                         return -1;
4043                     }*/
4044                 }
4045                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4046                     unsigned int long_index= get_ue_golomb(gb);
4047                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4048                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4049                         return -1;
4050                     }
4051                     h->mmco[i].long_index= long_index;
4052                 }
4053
4054                 if(opcode > (unsigned)MMCO_LONG){
4055                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4056                     return -1;
4057                 }
4058                 if(opcode == MMCO_END)
4059                     break;
4060             }
4061             h->mmco_index= i;
4062         }else{
4063             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4064
4065             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4066                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4067                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4068                 h->mmco_index= 1;
4069             }else
4070                 h->mmco_index= 0;
4071         }
4072     }
4073
4074     return 0;
4075 }
4076
4077 static int init_poc(H264Context *h){
4078     MpegEncContext * const s = &h->s;
4079     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4080     int field_poc[2];
4081
4082     if(h->nal_unit_type == NAL_IDR_SLICE){
4083         h->frame_num_offset= 0;
4084     }else{
4085         if(h->frame_num < h->prev_frame_num)
4086             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4087         else
4088             h->frame_num_offset= h->prev_frame_num_offset;
4089     }
4090
4091     if(h->sps.poc_type==0){
4092         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4093
4094         if(h->nal_unit_type == NAL_IDR_SLICE){
4095              h->prev_poc_msb=
4096              h->prev_poc_lsb= 0;
4097         }
4098
4099         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4100             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4101         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4102             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4103         else
4104             h->poc_msb = h->prev_poc_msb;
4105 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4106         field_poc[0] =
4107         field_poc[1] = h->poc_msb + h->poc_lsb;
4108         if(s->picture_structure == PICT_FRAME)
4109             field_poc[1] += h->delta_poc_bottom;
4110     }else if(h->sps.poc_type==1){
4111         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4112         int i;
4113
4114         if(h->sps.poc_cycle_length != 0)
4115             abs_frame_num = h->frame_num_offset + h->frame_num;
4116         else
4117             abs_frame_num = 0;
4118
4119         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4120             abs_frame_num--;
4121
4122         expected_delta_per_poc_cycle = 0;
4123         for(i=0; i < h->sps.poc_cycle_length; i++)
4124             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4125
4126         if(abs_frame_num > 0){
4127             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4128             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4129
4130             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4131             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4132                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4133         } else
4134             expectedpoc = 0;
4135
4136         if(h->nal_ref_idc == 0)
4137             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4138
4139         field_poc[0] = expectedpoc + h->delta_poc[0];
4140         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4141
4142         if(s->picture_structure == PICT_FRAME)
4143             field_poc[1] += h->delta_poc[1];
4144     }else{
4145         int poc;
4146         if(h->nal_unit_type == NAL_IDR_SLICE){
4147             poc= 0;
4148         }else{
4149             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4150             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4151         }
4152         field_poc[0]= poc;
4153         field_poc[1]= poc;
4154     }
4155
4156     if(s->picture_structure != PICT_BOTTOM_FIELD)
4157         s->current_picture_ptr->field_poc[0]= field_poc[0];
4158     if(s->picture_structure != PICT_TOP_FIELD)
4159         s->current_picture_ptr->field_poc[1]= field_poc[1];
4160     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4161         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4162
4163     return 0;
4164 }
4165
4166
4167 /**
4168  * initialize scan tables
4169  */
4170 static void init_scan_tables(H264Context *h){
4171     MpegEncContext * const s = &h->s;
4172     int i;
4173     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4174         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4175         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4176     }else{
4177         for(i=0; i<16; i++){
4178 #define T(x) (x>>2) | ((x<<2) & 0xF)
4179             h->zigzag_scan[i] = T(zigzag_scan[i]);
4180             h-> field_scan[i] = T( field_scan[i]);
4181 #undef T
4182         }
4183     }
4184     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4185         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4186         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4187         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4188         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4189     }else{
4190         for(i=0; i<64; i++){
4191 #define T(x) (x>>3) | ((x&7)<<3)
4192             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4193             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4194             h->field_scan8x8[i]        = T(field_scan8x8[i]);
4195             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4196 #undef T
4197         }
4198     }
4199     if(h->sps.transform_bypass){ //FIXME same ugly
4200         h->zigzag_scan_q0          = zigzag_scan;
4201         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4202         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4203         h->field_scan_q0           = field_scan;
4204         h->field_scan8x8_q0        = field_scan8x8;
4205         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4206     }else{
4207         h->zigzag_scan_q0          = h->zigzag_scan;
4208         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4209         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4210         h->field_scan_q0           = h->field_scan;
4211         h->field_scan8x8_q0        = h->field_scan8x8;
4212         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4213     }
4214 }
4215 /**
4216  * decodes a slice header.
4217  * this will allso call MPV_common_init() and frame_start() as needed
4218  */
4219 static int decode_slice_header(H264Context *h){
4220     MpegEncContext * const s = &h->s;
4221     unsigned int first_mb_in_slice;
4222     unsigned int pps_id;
4223     int num_ref_idx_active_override_flag;
4224     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4225     unsigned int slice_type, tmp;
4226     int default_ref_list_done = 0;
4227
4228     s->current_picture.reference= h->nal_ref_idc != 0;
4229     s->dropable= h->nal_ref_idc == 0;
4230
4231     first_mb_in_slice= get_ue_golomb(&s->gb);
4232
4233     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
4234         h->slice_num = 0;
4235         s->current_picture_ptr= NULL;
4236     }
4237
4238     slice_type= get_ue_golomb(&s->gb);
4239     if(slice_type > 9){
4240         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4241         return -1;
4242     }
4243     if(slice_type > 4){
4244         slice_type -= 5;
4245         h->slice_type_fixed=1;
4246     }else
4247         h->slice_type_fixed=0;
4248
4249     slice_type= slice_type_map[ slice_type ];
4250     if (slice_type == I_TYPE
4251         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4252         default_ref_list_done = 1;
4253     }
4254     h->slice_type= slice_type;
4255
4256     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4257
4258     pps_id= get_ue_golomb(&s->gb);
4259     if(pps_id>=MAX_PPS_COUNT){
4260         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4261         return -1;
4262     }
4263     if(!h->pps_buffers[pps_id]) {
4264         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4265         return -1;
4266     }
4267     h->pps= *h->pps_buffers[pps_id];
4268
4269     if(!h->sps_buffers[h->pps.sps_id]) {
4270         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4271         return -1;
4272     }
4273     h->sps = *h->sps_buffers[h->pps.sps_id];
4274
4275     if(h->dequant_coeff_pps != pps_id){
4276         h->dequant_coeff_pps = pps_id;
4277         init_dequant_tables(h);
4278     }
4279
4280     s->mb_width= h->sps.mb_width;
4281     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4282
4283     h->b_stride=  s->mb_width*4;
4284     h->b8_stride= s->mb_width*2;
4285
4286     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4287     if(h->sps.frame_mbs_only_flag)
4288         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4289     else
4290         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4291
4292     if (s->context_initialized
4293         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4294         free_tables(h);
4295         MPV_common_end(s);
4296     }
4297     if (!s->context_initialized) {
4298         if (MPV_common_init(s) < 0)
4299             return -1;
4300
4301         init_scan_tables(h);
4302         alloc_tables(h);
4303
4304         s->avctx->width = s->width;
4305         s->avctx->height = s->height;
4306         s->avctx->sample_aspect_ratio= h->sps.sar;
4307         if(!s->avctx->sample_aspect_ratio.den)
4308             s->avctx->sample_aspect_ratio.den = 1;
4309
4310         if(h->sps.timing_info_present_flag){
4311             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4312             if(h->x264_build > 0 && h->x264_build < 44)
4313                 s->avctx->time_base.den *= 2;
4314             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4315                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4316         }
4317     }
4318
4319     if(h->slice_num == 0){
4320         if(frame_start(h) < 0)
4321             return -1;
4322     }
4323
4324     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4325     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4326
4327     h->mb_mbaff = 0;
4328     h->mb_aff_frame = 0;
4329     if(h->sps.frame_mbs_only_flag){
4330         s->picture_structure= PICT_FRAME;
4331     }else{
4332         if(get_bits1(&s->gb)) { //field_pic_flag
4333             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4334             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4335         } else {
4336             s->picture_structure= PICT_FRAME;
4337             h->mb_aff_frame = h->sps.mb_aff;
4338         }
4339     }
4340     assert(s->mb_num == s->mb_width * s->mb_height);
4341     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4342        first_mb_in_slice                    >= s->mb_num){
4343         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4344         return -1;
4345     }
4346     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4347     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4348     assert(s->mb_y < s->mb_height);
4349
4350     if(s->picture_structure==PICT_FRAME){
4351         h->curr_pic_num=   h->frame_num;
4352         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4353     }else{
4354         h->curr_pic_num= 2*h->frame_num;
4355         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4356     }
4357
4358     if(h->nal_unit_type == NAL_IDR_SLICE){
4359         get_ue_golomb(&s->gb); /* idr_pic_id */
4360     }
4361
4362     if(h->sps.poc_type==0){
4363         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4364
4365         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4366             h->delta_poc_bottom= get_se_golomb(&s->gb);
4367         }
4368     }
4369
4370     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4371         h->delta_poc[0]= get_se_golomb(&s->gb);
4372
4373         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4374             h->delta_poc[1]= get_se_golomb(&s->gb);
4375     }
4376
4377     init_poc(h);
4378
4379     if(h->pps.redundant_pic_cnt_present){
4380         h->redundant_pic_count= get_ue_golomb(&s->gb);
4381     }
4382
4383     //set defaults, might be overriden a few line later
4384     h->ref_count[0]= h->pps.ref_count[0];
4385     h->ref_count[1]= h->pps.ref_count[1];
4386
4387     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4388         if(h->slice_type == B_TYPE){
4389             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4390             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4391                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4392         }
4393         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4394
4395         if(num_ref_idx_active_override_flag){
4396             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4397             if(h->slice_type==B_TYPE)
4398                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4399
4400             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4401                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4402                 h->ref_count[0]= h->ref_count[1]= 1;
4403                 return -1;
4404             }
4405         }
4406         if(h->slice_type == B_TYPE)
4407             h->list_count= 2;
4408         else
4409             h->list_count= 1;
4410     }else
4411         h->list_count= 0;
4412
4413     if(!default_ref_list_done){
4414         fill_default_ref_list(h);
4415     }
4416
4417     if(decode_ref_pic_list_reordering(h) < 0)
4418         return -1;
4419
4420     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4421        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4422         pred_weight_table(h);
4423     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4424         implicit_weight_table(h);
4425     else
4426         h->use_weight = 0;
4427
4428     if(s->current_picture.reference)
4429         decode_ref_pic_marking(h, &s->gb);
4430
4431     if(FRAME_MBAFF)
4432         fill_mbaff_ref_list(h);
4433
4434     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4435         tmp = get_ue_golomb(&s->gb);
4436         if(tmp > 2){
4437             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4438             return -1;
4439         }
4440         h->cabac_init_idc= tmp;
4441     }
4442
4443     h->last_qscale_diff = 0;
4444     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4445     if(tmp>51){
4446         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4447         return -1;
4448     }
4449     s->qscale= tmp;
4450     h->chroma_qp = get_chroma_qp(h, s->qscale);
4451     //FIXME qscale / qp ... stuff
4452     if(h->slice_type == SP_TYPE){
4453         get_bits1(&s->gb); /* sp_for_switch_flag */
4454     }
4455     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4456         get_se_golomb(&s->gb); /* slice_qs_delta */
4457     }
4458
4459     h->deblocking_filter = 1;
4460     h->slice_alpha_c0_offset = 0;
4461     h->slice_beta_offset = 0;
4462     if( h->pps.deblocking_filter_parameters_present ) {
4463         tmp= get_ue_golomb(&s->gb);
4464         if(tmp > 2){
4465             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4466             return -1;
4467         }
4468         h->deblocking_filter= tmp;
4469         if(h->deblocking_filter < 2)
4470             h->deblocking_filter^= 1; // 1<->0
4471
4472         if( h->deblocking_filter ) {
4473             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4474             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4475         }
4476     }
4477     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4478        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4479        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4480        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4481         h->deblocking_filter= 0;
4482
4483 #if 0 //FMO
4484     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4485         slice_group_change_cycle= get_bits(&s->gb, ?);
4486 #endif
4487
4488     h->slice_num++;
4489
4490     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4491     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4492
4493     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4494         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4495                h->slice_num,
4496                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4497                first_mb_in_slice,
4498                av_get_pict_type_char(h->slice_type),
4499                pps_id, h->frame_num,
4500                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4501                h->ref_count[0], h->ref_count[1],
4502                s->qscale,
4503                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4504                h->use_weight,
4505                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4506                );
4507     }
4508
4509     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4510         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4511         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4512     }else{
4513         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4514         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4515     }
4516
4517     return 0;
4518 }
4519
4520 /**
4521  *
4522  */
4523 static inline int get_level_prefix(GetBitContext *gb){
4524     unsigned int buf;
4525     int log;
4526
4527     OPEN_READER(re, gb);
4528     UPDATE_CACHE(re, gb);
4529     buf=GET_CACHE(re, gb);
4530
4531     log= 32 - av_log2(buf);
4532 #ifdef TRACE
4533     print_bin(buf>>(32-log), log);
4534     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4535 #endif
4536
4537     LAST_SKIP_BITS(re, gb, log);
4538     CLOSE_READER(re, gb);
4539
4540     return log-1;
4541 }
4542
4543 static inline int get_dct8x8_allowed(H264Context *h){
4544     int i;
4545     for(i=0; i<4; i++){
4546         if(!IS_SUB_8X8(h->sub_mb_type[i])
4547            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4548             return 0;
4549     }
4550     return 1;
4551 }
4552
4553 /**
4554  * decodes a residual block.
4555  * @param n block index
4556  * @param scantable scantable
4557  * @param max_coeff number of coefficients in the block
4558  * @return <0 if an error occured
4559  */
4560 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4561     MpegEncContext * const s = &h->s;
4562     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4563     int level[16];
4564     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4565
4566     //FIXME put trailing_onex into the context
4567
4568     if(n == CHROMA_DC_BLOCK_INDEX){
4569         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4570         total_coeff= coeff_token>>2;
4571     }else{
4572         if(n == LUMA_DC_BLOCK_INDEX){
4573             total_coeff= pred_non_zero_count(h, 0);
4574             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4575             total_coeff= coeff_token>>2;
4576         }else{
4577             total_coeff= pred_non_zero_count(h, n);
4578             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4579             total_coeff= coeff_token>>2;
4580             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4581         }
4582     }
4583
4584     //FIXME set last_non_zero?
4585
4586     if(total_coeff==0)
4587         return 0;
4588     if(total_coeff > (unsigned)max_coeff) {
4589         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4590         return -1;
4591     }
4592
4593     trailing_ones= coeff_token&3;
4594     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4595     assert(total_coeff<=16);
4596
4597     for(i=0; i<trailing_ones; i++){
4598         level[i]= 1 - 2*get_bits1(gb);
4599     }
4600
4601     if(i<total_coeff) {
4602         int level_code, mask;
4603         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4604         int prefix= get_level_prefix(gb);
4605
4606         //first coefficient has suffix_length equal to 0 or 1
4607         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4608             if(suffix_length)
4609                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4610             else
4611                 level_code= (prefix<<suffix_length); //part
4612         }else if(prefix==14){
4613             if(suffix_length)
4614                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4615             else
4616                 level_code= prefix + get_bits(gb, 4); //part
4617         }else if(prefix==15){
4618             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4619             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4620         }else{
4621             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4622             return -1;
4623         }
4624
4625         if(trailing_ones < 3) level_code += 2;
4626
4627         suffix_length = 1;
4628         if(level_code > 5)
4629             suffix_length++;
4630         mask= -(level_code&1);
4631         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4632         i++;
4633
4634         //remaining coefficients have suffix_length > 0
4635         for(;i<total_coeff;i++) {
4636             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4637             prefix = get_level_prefix(gb);
4638             if(prefix<15){
4639                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4640             }else if(prefix==15){
4641                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4642             }else{
4643                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4644                 return -1;
4645             }
4646             mask= -(level_code&1);
4647             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4648             if(level_code > suffix_limit[suffix_length])
4649                 suffix_length++;
4650         }
4651     }
4652
4653     if(total_coeff == max_coeff)
4654         zeros_left=0;
4655     else{
4656         if(n == CHROMA_DC_BLOCK_INDEX)
4657             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4658         else
4659             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4660     }
4661
4662     coeff_num = zeros_left + total_coeff - 1;
4663     j = scantable[coeff_num];
4664     if(n > 24){
4665         block[j] = level[0];
4666         for(i=1;i<total_coeff;i++) {
4667             if(zeros_left <= 0)
4668                 run_before = 0;
4669             else if(zeros_left < 7){
4670                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4671             }else{
4672                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4673             }
4674             zeros_left -= run_before;
4675             coeff_num -= 1 + run_before;
4676             j= scantable[ coeff_num ];
4677
4678             block[j]= level[i];
4679         }
4680     }else{
4681         block[j] = (level[0] * qmul[j] + 32)>>6;
4682         for(i=1;i<total_coeff;i++) {
4683             if(zeros_left <= 0)
4684                 run_before = 0;
4685             else if(zeros_left < 7){
4686                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4687             }else{
4688                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4689             }
4690             zeros_left -= run_before;
4691             coeff_num -= 1 + run_before;
4692             j= scantable[ coeff_num ];
4693
4694             block[j]= (level[i] * qmul[j] + 32)>>6;
4695         }
4696     }
4697
4698     if(zeros_left<0){
4699         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4700         return -1;
4701     }
4702
4703     return 0;
4704 }
4705
4706 static void predict_field_decoding_flag(H264Context *h){
4707     MpegEncContext * const s = &h->s;
4708     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4709     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4710                 ? s->current_picture.mb_type[mb_xy-1]
4711                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4712                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4713                 : 0;
4714     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4715 }
4716
4717 /**
4718  * decodes a P_SKIP or B_SKIP macroblock
4719  */
4720 static void decode_mb_skip(H264Context *h){
4721     MpegEncContext * const s = &h->s;
4722     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4723     int mb_type=0;
4724
4725     memset(h->non_zero_count[mb_xy], 0, 16);
4726     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4727
4728     if(MB_FIELD)
4729         mb_type|= MB_TYPE_INTERLACED;
4730
4731     if( h->slice_type == B_TYPE )
4732     {
4733         // just for fill_caches. pred_direct_motion will set the real mb_type
4734         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4735
4736         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4737         pred_direct_motion(h, &mb_type);
4738         mb_type|= MB_TYPE_SKIP;
4739     }
4740     else
4741     {
4742         int mx, my;
4743         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4744
4745         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4746         pred_pskip_motion(h, &mx, &my);
4747         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4748         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4749     }
4750
4751     write_back_motion(h, mb_type);
4752     s->current_picture.mb_type[mb_xy]= mb_type;
4753     s->current_picture.qscale_table[mb_xy]= s->qscale;
4754     h->slice_table[ mb_xy ]= h->slice_num;
4755     h->prev_mb_skipped= 1;
4756 }
4757
4758 /**
4759  * decodes a macroblock
4760  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4761  */
4762 static int decode_mb_cavlc(H264Context *h){
4763     MpegEncContext * const s = &h->s;
4764     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4765     int partition_count;
4766     unsigned int mb_type, cbp;
4767     int dct8x8_allowed= h->pps.transform_8x8_mode;
4768
4769     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4770
4771     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4772     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4773                 down the code */
4774     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4775         if(s->mb_skip_run==-1)
4776             s->mb_skip_run= get_ue_golomb(&s->gb);
4777
4778         if (s->mb_skip_run--) {
4779             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4780                 if(s->mb_skip_run==0)
4781                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4782                 else
4783                     predict_field_decoding_flag(h);
4784             }
4785             decode_mb_skip(h);
4786             return 0;
4787         }
4788     }
4789     if(FRAME_MBAFF){
4790         if( (s->mb_y&1) == 0 )
4791             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4792     }else
4793         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4794
4795     h->prev_mb_skipped= 0;
4796
4797     mb_type= get_ue_golomb(&s->gb);
4798     if(h->slice_type == B_TYPE){
4799         if(mb_type < 23){
4800             partition_count= b_mb_type_info[mb_type].partition_count;
4801             mb_type=         b_mb_type_info[mb_type].type;
4802         }else{
4803             mb_type -= 23;
4804             goto decode_intra_mb;
4805         }
4806     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4807         if(mb_type < 5){
4808             partition_count= p_mb_type_info[mb_type].partition_count;
4809             mb_type=         p_mb_type_info[mb_type].type;
4810         }else{
4811             mb_type -= 5;
4812             goto decode_intra_mb;
4813         }
4814     }else{
4815        assert(h->slice_type == I_TYPE);
4816 decode_intra_mb:
4817         if(mb_type > 25){
4818             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4819             return -1;
4820         }
4821         partition_count=0;
4822         cbp= i_mb_type_info[mb_type].cbp;
4823         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4824         mb_type= i_mb_type_info[mb_type].type;
4825     }
4826
4827     if(MB_FIELD)
4828         mb_type |= MB_TYPE_INTERLACED;
4829
4830     h->slice_table[ mb_xy ]= h->slice_num;
4831
4832     if(IS_INTRA_PCM(mb_type)){
4833         unsigned int x, y;
4834
4835         // We assume these blocks are very rare so we do not optimize it.
4836         align_get_bits(&s->gb);
4837
4838         // The pixels are stored in the same order as levels in h->mb array.
4839         for(y=0; y<16; y++){
4840             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4841             for(x=0; x<16; x++){
4842                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4843                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4844             }
4845         }
4846         for(y=0; y<8; y++){
4847             const int index= 256 + 4*(y&3) + 32*(y>>2);
4848             for(x=0; x<8; x++){
4849                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4850                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4851             }
4852         }
4853         for(y=0; y<8; y++){
4854             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4855             for(x=0; x<8; x++){
4856                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4857                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4858             }
4859         }
4860
4861         // In deblocking, the quantizer is 0
4862         s->current_picture.qscale_table[mb_xy]= 0;
4863         h->chroma_qp = get_chroma_qp(h, 0);
4864         // All coeffs are present
4865         memset(h->non_zero_count[mb_xy], 16, 16);
4866
4867         s->current_picture.mb_type[mb_xy]= mb_type;
4868         return 0;
4869     }
4870
4871     if(MB_MBAFF){
4872         h->ref_count[0] <<= 1;
4873         h->ref_count[1] <<= 1;
4874     }
4875
4876     fill_caches(h, mb_type, 0);
4877
4878     //mb_pred
4879     if(IS_INTRA(mb_type)){
4880             int pred_mode;
4881 //            init_top_left_availability(h);
4882             if(IS_INTRA4x4(mb_type)){
4883                 int i;
4884                 int di = 1;
4885                 if(dct8x8_allowed && get_bits1(&s->gb)){
4886                     mb_type |= MB_TYPE_8x8DCT;
4887                     di = 4;
4888                 }
4889
4890 //                fill_intra4x4_pred_table(h);
4891                 for(i=0; i<16; i+=di){
4892                     int mode= pred_intra_mode(h, i);
4893
4894                     if(!get_bits1(&s->gb)){
4895                         const int rem_mode= get_bits(&s->gb, 3);
4896                         mode = rem_mode + (rem_mode >= mode);
4897                     }
4898
4899                     if(di==4)
4900                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4901                     else
4902                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4903                 }
4904                 write_back_intra_pred_mode(h);
4905                 if( check_intra4x4_pred_mode(h) < 0)
4906                     return -1;
4907             }else{
4908                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4909                 if(h->intra16x16_pred_mode < 0)
4910                     return -1;
4911             }
4912
4913             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4914             if(pred_mode < 0)
4915                 return -1;
4916             h->chroma_pred_mode= pred_mode;
4917     }else if(partition_count==4){
4918         int i, j, sub_partition_count[4], list, ref[2][4];
4919
4920         if(h->slice_type == B_TYPE){
4921             for(i=0; i<4; i++){
4922                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4923                 if(h->sub_mb_type[i] >=13){
4924                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4925                     return -1;
4926                 }
4927                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4928                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4929             }
4930             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4931                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4932                 pred_direct_motion(h, &mb_type);
4933                 h->ref_cache[0][scan8[4]] =
4934                 h->ref_cache[1][scan8[4]] =
4935                 h->ref_cache[0][scan8[12]] =
4936                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4937             }
4938         }else{
4939             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4940             for(i=0; i<4; i++){
4941                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4942                 if(h->sub_mb_type[i] >=4){
4943                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4944                     return -1;
4945                 }
4946                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4947                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4948             }
4949         }
4950
4951         for(list=0; list<h->list_count; list++){
4952             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4953             for(i=0; i<4; i++){
4954                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4955                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4956                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4957                     if(tmp>=ref_count){
4958                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4959                         return -1;
4960                     }
4961                     ref[list][i]= tmp;
4962                 }else{
4963                  //FIXME
4964                     ref[list][i] = -1;
4965                 }
4966             }
4967         }
4968
4969         if(dct8x8_allowed)
4970             dct8x8_allowed = get_dct8x8_allowed(h);
4971
4972         for(list=0; list<h->list_count; list++){
4973             for(i=0; i<4; i++){
4974                 if(IS_DIRECT(h->sub_mb_type[i])) {
4975                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4976                     continue;
4977                 }
4978                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4979                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4980
4981                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4982                     const int sub_mb_type= h->sub_mb_type[i];
4983                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4984                     for(j=0; j<sub_partition_count[i]; j++){
4985                         int mx, my;
4986                         const int index= 4*i + block_width*j;
4987                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4988                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4989                         mx += get_se_golomb(&s->gb);
4990                         my += get_se_golomb(&s->gb);
4991                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4992
4993                         if(IS_SUB_8X8(sub_mb_type)){
4994                             mv_cache[ 1 ][0]=
4995                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4996                             mv_cache[ 1 ][1]=
4997                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4998                         }else if(IS_SUB_8X4(sub_mb_type)){
4999                             mv_cache[ 1 ][0]= mx;
5000                             mv_cache[ 1 ][1]= my;
5001                         }else if(IS_SUB_4X8(sub_mb_type)){
5002                             mv_cache[ 8 ][0]= mx;
5003                             mv_cache[ 8 ][1]= my;
5004                         }
5005                         mv_cache[ 0 ][0]= mx;
5006                         mv_cache[ 0 ][1]= my;
5007                     }
5008                 }else{
5009                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5010                     p[0] = p[1]=
5011                     p[8] = p[9]= 0;
5012                 }
5013             }
5014         }
5015     }else if(IS_DIRECT(mb_type)){
5016         pred_direct_motion(h, &mb_type);
5017         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5018     }else{
5019         int list, mx, my, i;
5020          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5021         if(IS_16X16(mb_type)){
5022             for(list=0; list<h->list_count; list++){
5023                     unsigned int val;
5024                     if(IS_DIR(mb_type, 0, list)){
5025                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
5026                         if(val >= h->ref_count[list]){
5027                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5028                             return -1;
5029                         }
5030                     }else
5031                         val= LIST_NOT_USED&0xFF;
5032                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5033             }
5034             for(list=0; list<h->list_count; list++){
5035                 unsigned int val;
5036                 if(IS_DIR(mb_type, 0, list)){
5037                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5038                     mx += get_se_golomb(&s->gb);
5039                     my += get_se_golomb(&s->gb);
5040                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5041
5042                     val= pack16to32(mx,my);
5043                 }else
5044                     val=0;
5045                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
5046             }
5047         }
5048         else if(IS_16X8(mb_type)){
5049             for(list=0; list<h->list_count; list++){
5050                     for(i=0; i<2; i++){
5051                         unsigned int val;
5052                         if(IS_DIR(mb_type, i, list)){
5053                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5054                             if(val >= h->ref_count[list]){
5055                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5056                                 return -1;
5057                             }
5058                         }else
5059                             val= LIST_NOT_USED&0xFF;
5060                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5061                     }
5062             }
5063             for(list=0; list<h->list_count; list++){
5064                 for(i=0; i<2; i++){
5065                     unsigned int val;
5066                     if(IS_DIR(mb_type, i, list)){
5067                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5068                         mx += get_se_golomb(&s->gb);
5069                         my += get_se_golomb(&s->gb);
5070                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5071
5072                         val= pack16to32(mx,my);
5073                     }else
5074                         val=0;
5075                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
5076                 }
5077             }
5078         }else{
5079             assert(IS_8X16(mb_type));
5080             for(list=0; list<h->list_count; list++){
5081                     for(i=0; i<2; i++){
5082                         unsigned int val;
5083                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5084                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5085                             if(val >= h->ref_count[list]){
5086                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5087                                 return -1;
5088                             }
5089                         }else
5090                             val= LIST_NOT_USED&0xFF;
5091                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5092                     }
5093             }
5094             for(list=0; list<h->list_count; list++){
5095                 for(i=0; i<2; i++){
5096                     unsigned int val;
5097                     if(IS_DIR(mb_type, i, list)){
5098                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5099                         mx += get_se_golomb(&s->gb);
5100                         my += get_se_golomb(&s->gb);
5101                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5102
5103                         val= pack16to32(mx,my);
5104                     }else
5105                         val=0;
5106                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
5107                 }
5108             }
5109         }
5110     }
5111
5112     if(IS_INTER(mb_type))
5113         write_back_motion(h, mb_type);
5114
5115     if(!IS_INTRA16x16(mb_type)){
5116         cbp= get_ue_golomb(&s->gb);
5117         if(cbp > 47){
5118             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5119             return -1;
5120         }
5121
5122         if(IS_INTRA4x4(mb_type))
5123             cbp= golomb_to_intra4x4_cbp[cbp];
5124         else
5125             cbp= golomb_to_inter_cbp[cbp];
5126     }
5127     h->cbp = cbp;
5128
5129     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5130         if(get_bits1(&s->gb))
5131             mb_type |= MB_TYPE_8x8DCT;
5132     }
5133     s->current_picture.mb_type[mb_xy]= mb_type;
5134
5135     if(cbp || IS_INTRA16x16(mb_type)){
5136         int i8x8, i4x4, chroma_idx;
5137         int chroma_qp, dquant;
5138         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5139         const uint8_t *scan, *scan8x8, *dc_scan;
5140
5141 //        fill_non_zero_count_cache(h);
5142
5143         if(IS_INTERLACED(mb_type)){
5144             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5145             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5146             dc_scan= luma_dc_field_scan;
5147         }else{
5148             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5149             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5150             dc_scan= luma_dc_zigzag_scan;
5151         }
5152
5153         dquant= get_se_golomb(&s->gb);
5154
5155         if( dquant > 25 || dquant < -26 ){
5156             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5157             return -1;
5158         }
5159
5160         s->qscale += dquant;
5161         if(((unsigned)s->qscale) > 51){
5162             if(s->qscale<0) s->qscale+= 52;
5163             else            s->qscale-= 52;
5164         }
5165
5166         h->chroma_qp= chroma_qp= get_chroma_qp(h, s->qscale);
5167         if(IS_INTRA16x16(mb_type)){
5168             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5169                 return -1; //FIXME continue if partitioned and other return -1 too
5170             }
5171
5172             assert((cbp&15) == 0 || (cbp&15) == 15);
5173
5174             if(cbp&15){
5175                 for(i8x8=0; i8x8<4; i8x8++){
5176                     for(i4x4=0; i4x4<4; i4x4++){
5177                         const int index= i4x4 + 4*i8x8;
5178                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5179                             return -1;
5180                         }
5181                     }
5182                 }
5183             }else{
5184                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5185             }
5186         }else{
5187             for(i8x8=0; i8x8<4; i8x8++){
5188                 if(cbp & (1<<i8x8)){
5189                     if(IS_8x8DCT(mb_type)){
5190                         DCTELEM *buf = &h->mb[64*i8x8];
5191                         uint8_t *nnz;
5192                         for(i4x4=0; i4x4<4; i4x4++){
5193                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5194                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5195                                 return -1;
5196                         }
5197                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5198                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5199                     }else{
5200                         for(i4x4=0; i4x4<4; i4x4++){
5201                             const int index= i4x4 + 4*i8x8;
5202
5203                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5204                                 return -1;
5205                             }
5206                         }
5207                     }
5208                 }else{
5209                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5210                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5211                 }
5212             }
5213         }
5214
5215         if(cbp&0x30){
5216             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5217                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5218                     return -1;
5219                 }
5220         }
5221
5222         if(cbp&0x20){
5223             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5224                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp];
5225                 for(i4x4=0; i4x4<4; i4x4++){
5226                     const int index= 16 + 4*chroma_idx + i4x4;
5227                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
5228                         return -1;
5229                     }
5230                 }
5231             }
5232         }else{
5233             uint8_t * const nnz= &h->non_zero_count_cache[0];
5234             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5235             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5236         }
5237     }else{
5238         uint8_t * const nnz= &h->non_zero_count_cache[0];
5239         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5240         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5241         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5242     }
5243     s->current_picture.qscale_table[mb_xy]= s->qscale;
5244     write_back_non_zero_count(h);
5245
5246     if(MB_MBAFF){
5247         h->ref_count[0] >>= 1;
5248         h->ref_count[1] >>= 1;
5249     }
5250
5251     return 0;
5252 }
5253
5254 static int decode_cabac_field_decoding_flag(H264Context *h) {
5255     MpegEncContext * const s = &h->s;
5256     const int mb_x = s->mb_x;
5257     const int mb_y = s->mb_y & ~1;
5258     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5259     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5260
5261     unsigned int ctx = 0;
5262
5263     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5264         ctx += 1;
5265     }
5266     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5267         ctx += 1;
5268     }
5269
5270     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5271 }
5272
5273 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5274     uint8_t *state= &h->cabac_state[ctx_base];
5275     int mb_type;
5276
5277     if(intra_slice){
5278         MpegEncContext * const s = &h->s;
5279         const int mba_xy = h->left_mb_xy[0];
5280         const int mbb_xy = h->top_mb_xy;
5281         int ctx=0;
5282         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5283             ctx++;
5284         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5285             ctx++;
5286         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5287             return 0;   /* I4x4 */
5288         state += 2;
5289     }else{
5290         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5291             return 0;   /* I4x4 */
5292     }
5293
5294     if( get_cabac_terminate( &h->cabac ) )
5295         return 25;  /* PCM */
5296
5297     mb_type = 1; /* I16x16 */
5298     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5299     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5300         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5301     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5302     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5303     return mb_type;
5304 }
5305
5306 static int decode_cabac_mb_type( H264Context *h ) {
5307     MpegEncContext * const s = &h->s;
5308
5309     if( h->slice_type == I_TYPE ) {
5310         return decode_cabac_intra_mb_type(h, 3, 1);
5311     } else if( h->slice_type == P_TYPE ) {
5312         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5313             /* P-type */
5314             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5315                 /* P_L0_D16x16, P_8x8 */
5316                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5317             } else {
5318                 /* P_L0_D8x16, P_L0_D16x8 */
5319                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5320             }
5321         } else {
5322             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5323         }
5324     } else if( h->slice_type == B_TYPE ) {
5325         const int mba_xy = h->left_mb_xy[0];
5326         const int mbb_xy = h->top_mb_xy;
5327         int ctx = 0;
5328         int bits;
5329
5330         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5331             ctx++;
5332         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5333             ctx++;
5334
5335         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5336             return 0; /* B_Direct_16x16 */
5337
5338         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5339             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5340         }
5341
5342         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5343         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5344         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5345         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5346         if( bits < 8 )
5347             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5348         else if( bits == 13 ) {
5349             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5350         } else if( bits == 14 )
5351             return 11; /* B_L1_L0_8x16 */
5352         else if( bits == 15 )
5353             return 22; /* B_8x8 */
5354
5355         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5356         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5357     } else {
5358         /* TODO SI/SP frames? */
5359         return -1;
5360     }
5361 }
5362
5363 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5364     MpegEncContext * const s = &h->s;
5365     int mba_xy, mbb_xy;
5366     int ctx = 0;
5367
5368     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5369         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5370         mba_xy = mb_xy - 1;
5371         if( (mb_y&1)
5372             && h->slice_table[mba_xy] == h->slice_num
5373             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5374             mba_xy += s->mb_stride;
5375         if( MB_FIELD ){
5376             mbb_xy = mb_xy - s->mb_stride;
5377             if( !(mb_y&1)
5378                 && h->slice_table[mbb_xy] == h->slice_num
5379                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5380                 mbb_xy -= s->mb_stride;
5381         }else
5382             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5383     }else{
5384         int mb_xy = mb_x + mb_y*s->mb_stride;
5385         mba_xy = mb_xy - 1;
5386         mbb_xy = mb_xy - s->mb_stride;
5387     }
5388
5389     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5390         ctx++;
5391     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5392         ctx++;
5393
5394     if( h->slice_type == B_TYPE )
5395         ctx += 13;
5396     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5397 }
5398
5399 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5400     int mode = 0;
5401
5402     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5403         return pred_mode;
5404
5405     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5406     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5407     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5408
5409     if( mode >= pred_mode )
5410         return mode + 1;
5411     else
5412         return mode;
5413 }
5414
5415 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5416     const int mba_xy = h->left_mb_xy[0];
5417     const int mbb_xy = h->top_mb_xy;
5418
5419     int ctx = 0;
5420
5421     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5422     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5423         ctx++;
5424
5425     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5426         ctx++;
5427
5428     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5429         return 0;
5430
5431     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5432         return 1;
5433     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5434         return 2;
5435     else
5436         return 3;
5437 }
5438
/** Column of each of the 16 block indices; the inverse lookup is block_idx_xy. */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};
/** Row of each of the 16 block indices; the inverse lookup is block_idx_xy. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};
/** Block index at [column][row], i.e. block_idx_xy[block_idx_x[i]][block_idx_y[i]] == i. */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
5451
5452 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5453     int cbp = 0;
5454     int cbp_b = -1;
5455     int i8x8;
5456
5457     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5458         cbp_b = h->top_cbp;
5459         tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
5460     }
5461
5462     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5463         int cbp_a = -1;
5464         int x, y;
5465         int ctx = 0;
5466
5467         x = block_idx_x[4*i8x8];
5468         y = block_idx_y[4*i8x8];
5469
5470         if( x > 0 )
5471             cbp_a = cbp;
5472         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5473             cbp_a = h->left_cbp;
5474             tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
5475         }
5476
5477         if( y > 0 )
5478             cbp_b = cbp;
5479
5480         /* No need to test for skip as we put 0 for skip block */
5481         /* No need to test for IPCM as we put 1 for IPCM block */
5482         if( cbp_a >= 0 ) {
5483             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5484             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5485                 ctx++;
5486         }
5487
5488         if( cbp_b >= 0 ) {
5489             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5490             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5491                 ctx += 2;
5492         }
5493
5494         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5495             cbp |= 1 << i8x8;
5496         }
5497     }
5498     return cbp;
5499 }
5500 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5501     int ctx;
5502     int cbp_a, cbp_b;
5503
5504     cbp_a = (h->left_cbp>>4)&0x03;
5505     cbp_b = (h-> top_cbp>>4)&0x03;
5506
5507     ctx = 0;
5508     if( cbp_a > 0 ) ctx++;
5509     if( cbp_b > 0 ) ctx += 2;
5510     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5511         return 0;
5512
5513     ctx = 4;
5514     if( cbp_a == 2 ) ctx++;
5515     if( cbp_b == 2 ) ctx += 2;
5516     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5517 }
5518 static int decode_cabac_mb_dqp( H264Context *h) {
5519     MpegEncContext * const s = &h->s;
5520     int mbn_xy;
5521     int   ctx = 0;
5522     int   val = 0;
5523
5524     if( s->mb_x > 0 )
5525         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5526     else
5527         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5528
5529     if( h->last_qscale_diff != 0 )
5530         ctx++;
5531
5532     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5533         if( ctx < 2 )
5534             ctx = 2;
5535         else
5536             ctx = 3;
5537         val++;
5538         if(val > 102) //prevent infinite loop
5539             return INT_MIN;
5540     }
5541
5542     if( val&0x01 )
5543         return (val + 1)/2;
5544     else
5545         return -(val + 1)/2;
5546 }
5547 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5548     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5549         return 0;   /* 8x8 */
5550     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5551         return 1;   /* 8x4 */
5552     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5553         return 2;   /* 4x8 */
5554     return 3;       /* 4x4 */
5555 }
5556 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5557     int type;
5558     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5559         return 0;   /* B_Direct_8x8 */
5560     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5561         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5562     type = 3;
5563     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5564         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5565             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5566         type += 4;
5567     }
5568     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5569     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5570     return type;
5571 }
5572
5573 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5574     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5575 }
5576
5577 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5578     int refa = h->ref_cache[list][scan8[n] - 1];
5579     int refb = h->ref_cache[list][scan8[n] - 8];
5580     int ref  = 0;
5581     int ctx  = 0;
5582
5583     if( h->slice_type == B_TYPE) {
5584         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5585             ctx++;
5586         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5587             ctx += 2;
5588     } else {
5589         if( refa > 0 )
5590             ctx++;
5591         if( refb > 0 )
5592             ctx += 2;
5593     }
5594
5595     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5596         ref++;
5597         if( ctx < 4 )
5598             ctx = 4;
5599         else
5600             ctx = 5;
5601         if(ref >= 32 /*h->ref_list[list]*/){
5602             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5603             return 0; //FIXME we should return -1 and check the return everywhere
5604         }
5605     }
5606     return ref;
5607 }
5608
5609 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5610     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5611                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5612     int ctxbase = (l == 0) ? 40 : 47;
5613     int ctx, mvd;
5614
5615     if( amvd < 3 )
5616         ctx = 0;
5617     else if( amvd > 32 )
5618         ctx = 2;
5619     else
5620         ctx = 1;
5621
5622     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5623         return 0;
5624
5625     mvd= 1;
5626     ctx= 3;
5627     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5628         mvd++;
5629         if( ctx < 6 )
5630             ctx++;
5631     }
5632
5633     if( mvd >= 9 ) {
5634         int k = 3;
5635         while( get_cabac_bypass( &h->cabac ) ) {
5636             mvd += 1 << k;
5637             k++;
5638             if(k>24){
5639                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5640                 return INT_MIN;
5641             }
5642         }
5643         while( k-- ) {
5644             if( get_cabac_bypass( &h->cabac ) )
5645                 mvd += 1 << k;
5646         }
5647     }
5648     return get_cabac_bypass_sign( &h->cabac, -mvd );
5649 }
5650
5651 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5652     int nza, nzb;
5653     int ctx = 0;
5654
5655     if( cat == 0 ) {
5656         nza = h->left_cbp&0x100;
5657         nzb = h-> top_cbp&0x100;
5658     } else if( cat == 1 || cat == 2 ) {
5659         nza = h->non_zero_count_cache[scan8[idx] - 1];
5660         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5661     } else if( cat == 3 ) {
5662         nza = (h->left_cbp>>(6+idx))&0x01;
5663         nzb = (h-> top_cbp>>(6+idx))&0x01;
5664     } else {
5665         assert(cat == 4);
5666         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5667         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5668     }
5669
5670     if( nza > 0 )
5671         ctx++;
5672
5673     if( nzb > 0 )
5674         ctx += 2;
5675
5676     return ctx + 4 * cat;
5677 }
5678
5679 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5680     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5681     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5682     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5683     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5684 };
5685
5686 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5687     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5688     static const int significant_coeff_flag_offset[2][6] = {
5689       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5690       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5691     };
5692     static const int last_coeff_flag_offset[2][6] = {
5693       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5694       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5695     };
5696     static const int coeff_abs_level_m1_offset[6] = {
5697         227+0, 227+10, 227+20, 227+30, 227+39, 426
5698     };
5699     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5700       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5701         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5702         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5703        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5704       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5705         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5706         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5707         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5708     };
5709
5710     int index[64];
5711
5712     int last;
5713     int coeff_count = 0;
5714
5715     int abslevel1 = 1;
5716     int abslevelgt1 = 0;
5717
5718     uint8_t *significant_coeff_ctx_base;
5719     uint8_t *last_coeff_ctx_base;
5720     uint8_t *abs_level_m1_ctx_base;
5721
5722 #ifndef ARCH_X86
5723 #define CABAC_ON_STACK
5724 #endif
5725 #ifdef CABAC_ON_STACK
5726 #define CC &cc
5727     CABACContext cc;
5728     cc.range     = h->cabac.range;
5729     cc.low       = h->cabac.low;
5730     cc.bytestream= h->cabac.bytestream;
5731 #else
5732 #define CC &h->cabac
5733 #endif
5734
5735
5736     /* cat: 0-> DC 16x16  n = 0
5737      *      1-> AC 16x16  n = luma4x4idx
5738      *      2-> Luma4x4   n = luma4x4idx
5739      *      3-> DC Chroma n = iCbCr
5740      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5741      *      5-> Luma8x8   n = 4 * luma8x8idx
5742      */
5743
5744     /* read coded block flag */
5745     if( cat != 5 ) {
5746         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5747             if( cat == 1 || cat == 2 )
5748                 h->non_zero_count_cache[scan8[n]] = 0;
5749             else if( cat == 4 )
5750                 h->non_zero_count_cache[scan8[16+n]] = 0;
5751 #ifdef CABAC_ON_STACK
5752             h->cabac.range     = cc.range     ;
5753             h->cabac.low       = cc.low       ;
5754             h->cabac.bytestream= cc.bytestream;
5755 #endif
5756             return 0;
5757         }
5758     }
5759
5760     significant_coeff_ctx_base = h->cabac_state
5761         + significant_coeff_flag_offset[MB_FIELD][cat];
5762     last_coeff_ctx_base = h->cabac_state
5763         + last_coeff_flag_offset[MB_FIELD][cat];
5764     abs_level_m1_ctx_base = h->cabac_state
5765         + coeff_abs_level_m1_offset[cat];
5766
5767     if( cat == 5 ) {
5768 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5769         for(last= 0; last < coefs; last++) { \
5770             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5771             if( get_cabac( CC, sig_ctx )) { \
5772                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5773                 index[coeff_count++] = last; \
5774                 if( get_cabac( CC, last_ctx ) ) { \
5775                     last= max_coeff; \
5776                     break; \
5777                 } \
5778             } \
5779         }\
5780         if( last == max_coeff -1 ) {\
5781             index[coeff_count++] = last;\
5782         }
5783         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5784 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5785         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5786     } else {
5787         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5788 #else
5789         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5790     } else {
5791         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5792 #endif
5793     }
5794     assert(coeff_count > 0);
5795
5796     if( cat == 0 )
5797         h->cbp_table[mb_xy] |= 0x100;
5798     else if( cat == 1 || cat == 2 )
5799         h->non_zero_count_cache[scan8[n]] = coeff_count;
5800     else if( cat == 3 )
5801         h->cbp_table[mb_xy] |= 0x40 << n;
5802     else if( cat == 4 )
5803         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5804     else {
5805         assert( cat == 5 );
5806         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5807     }
5808
5809     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5810         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5811         int j= scantable[index[coeff_count]];
5812
5813         if( get_cabac( CC, ctx ) == 0 ) {
5814             if( !qmul ) {
5815                 block[j] = get_cabac_bypass_sign( CC, -1);
5816             }else{
5817                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5818             }
5819
5820             abslevel1++;
5821         } else {
5822             int coeff_abs = 2;
5823             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5824             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5825                 coeff_abs++;
5826             }
5827
5828             if( coeff_abs >= 15 ) {
5829                 int j = 0;
5830                 while( get_cabac_bypass( CC ) ) {
5831                     j++;
5832                 }
5833
5834                 coeff_abs=1;
5835                 while( j-- ) {
5836                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5837                 }
5838                 coeff_abs+= 14;
5839             }
5840
5841             if( !qmul ) {
5842                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5843                 else                                block[j] =  coeff_abs;
5844             }else{
5845                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5846                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5847             }
5848
5849             abslevelgt1++;
5850         }
5851     }
5852 #ifdef CABAC_ON_STACK
5853             h->cabac.range     = cc.range     ;
5854             h->cabac.low       = cc.low       ;
5855             h->cabac.bytestream= cc.bytestream;
5856 #endif
5857     return 0;
5858 }
5859
5860 static inline void compute_mb_neighbors(H264Context *h)
5861 {
5862     MpegEncContext * const s = &h->s;
5863     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5864     h->top_mb_xy     = mb_xy - s->mb_stride;
5865     h->left_mb_xy[0] = mb_xy - 1;
5866     if(FRAME_MBAFF){
5867         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5868         const int top_pair_xy      = pair_xy     - s->mb_stride;
5869         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5870         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5871         const int curr_mb_frame_flag = !MB_FIELD;
5872         const int bottom = (s->mb_y & 1);
5873         if (bottom
5874                 ? !curr_mb_frame_flag // bottom macroblock
5875                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5876                 ) {
5877             h->top_mb_xy -= s->mb_stride;
5878         }
5879         if (left_mb_frame_flag != curr_mb_frame_flag) {
5880             h->left_mb_xy[0] = pair_xy - 1;
5881         }
5882     }
5883     return;
5884 }
5885
5886 /**
5887  * decodes a macroblock
5888  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5889  */
5890 static int decode_mb_cabac(H264Context *h) {
5891     MpegEncContext * const s = &h->s;
5892     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5893     int mb_type, partition_count, cbp = 0;
5894     int dct8x8_allowed= h->pps.transform_8x8_mode;
5895
5896     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5897
5898     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5899     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5900         int skip;
5901         /* a skipped mb needs the aff flag from the following mb */
5902         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5903             predict_field_decoding_flag(h);
5904         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5905             skip = h->next_mb_skipped;
5906         else
5907             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5908         /* read skip flags */
5909         if( skip ) {
5910             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5911                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5912                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5913                 if(h->next_mb_skipped)
5914                     predict_field_decoding_flag(h);
5915                 else
5916                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5917             }
5918
5919             decode_mb_skip(h);
5920
5921             h->cbp_table[mb_xy] = 0;
5922             h->chroma_pred_mode_table[mb_xy] = 0;
5923             h->last_qscale_diff = 0;
5924
5925             return 0;
5926
5927         }
5928     }
5929     if(FRAME_MBAFF){
5930         if( (s->mb_y&1) == 0 )
5931             h->mb_mbaff =
5932             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5933     }else
5934         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5935
5936     h->prev_mb_skipped = 0;
5937
5938     compute_mb_neighbors(h);
5939     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5940         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5941         return -1;
5942     }
5943
5944     if( h->slice_type == B_TYPE ) {
5945         if( mb_type < 23 ){
5946             partition_count= b_mb_type_info[mb_type].partition_count;
5947             mb_type=         b_mb_type_info[mb_type].type;
5948         }else{
5949             mb_type -= 23;
5950             goto decode_intra_mb;
5951         }
5952     } else if( h->slice_type == P_TYPE ) {
5953         if( mb_type < 5) {
5954             partition_count= p_mb_type_info[mb_type].partition_count;
5955             mb_type=         p_mb_type_info[mb_type].type;
5956         } else {
5957             mb_type -= 5;
5958             goto decode_intra_mb;
5959         }
5960     } else {
5961        assert(h->slice_type == I_TYPE);
5962 decode_intra_mb:
5963         partition_count = 0;
5964         cbp= i_mb_type_info[mb_type].cbp;
5965         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5966         mb_type= i_mb_type_info[mb_type].type;
5967     }
5968     if(MB_FIELD)
5969         mb_type |= MB_TYPE_INTERLACED;
5970
5971     h->slice_table[ mb_xy ]= h->slice_num;
5972
5973     if(IS_INTRA_PCM(mb_type)) {
5974         const uint8_t *ptr;
5975         unsigned int x, y;
5976
5977         // We assume these blocks are very rare so we do not optimize it.
5978         // FIXME The two following lines get the bitstream position in the cabac
5979         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5980         ptr= h->cabac.bytestream;
5981         if(h->cabac.low&0x1) ptr--;
5982         if(CABAC_BITS==16){
5983             if(h->cabac.low&0x1FF) ptr--;
5984         }
5985
5986         // The pixels are stored in the same order as levels in h->mb array.
5987         for(y=0; y<16; y++){
5988             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5989             for(x=0; x<16; x++){
5990                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5991                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5992             }
5993         }
5994         for(y=0; y<8; y++){
5995             const int index= 256 + 4*(y&3) + 32*(y>>2);
5996             for(x=0; x<8; x++){
5997                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5998                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5999             }
6000         }
6001         for(y=0; y<8; y++){
6002             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6003             for(x=0; x<8; x++){
6004                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6005                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6006             }
6007         }
6008
6009         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6010
6011         // All blocks are present
6012         h->cbp_table[mb_xy] = 0x1ef;
6013         h->chroma_pred_mode_table[mb_xy] = 0;
6014         // In deblocking, the quantizer is 0
6015         s->current_picture.qscale_table[mb_xy]= 0;
6016         h->chroma_qp = get_chroma_qp(h, 0);
6017         // All coeffs are present
6018         memset(h->non_zero_count[mb_xy], 16, 16);
6019         s->current_picture.mb_type[mb_xy]= mb_type;
6020         return 0;
6021     }
6022
6023     if(MB_MBAFF){
6024         h->ref_count[0] <<= 1;
6025         h->ref_count[1] <<= 1;
6026     }
6027
6028     fill_caches(h, mb_type, 0);
6029
6030     if( IS_INTRA( mb_type ) ) {
6031         int i, pred_mode;
6032         if( IS_INTRA4x4( mb_type ) ) {
6033             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6034                 mb_type |= MB_TYPE_8x8DCT;
6035                 for( i = 0; i < 16; i+=4 ) {
6036                     int pred = pred_intra_mode( h, i );
6037                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6038                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6039                 }
6040             } else {
6041                 for( i = 0; i < 16; i++ ) {
6042                     int pred = pred_intra_mode( h, i );
6043                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6044
6045                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6046                 }
6047             }
6048             write_back_intra_pred_mode(h);
6049             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6050         } else {
6051             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6052             if( h->intra16x16_pred_mode < 0 ) return -1;
6053         }
6054         h->chroma_pred_mode_table[mb_xy] =
6055         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6056
6057         pred_mode= check_intra_pred_mode( h, pred_mode );
6058         if( pred_mode < 0 ) return -1;
6059         h->chroma_pred_mode= pred_mode;
6060     } else if( partition_count == 4 ) {
6061         int i, j, sub_partition_count[4], list, ref[2][4];
6062
6063         if( h->slice_type == B_TYPE ) {
6064             for( i = 0; i < 4; i++ ) {
6065                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6066                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6067                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6068             }
6069             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6070                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6071                 pred_direct_motion(h, &mb_type);
6072                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6073                     for( i = 0; i < 4; i++ )
6074                         if( IS_DIRECT(h->sub_mb_type[i]) )
6075                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6076                 }
6077             }
6078         } else {
6079             for( i = 0; i < 4; i++ ) {
6080                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6081                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6082                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6083             }
6084         }
6085
6086         for( list = 0; list < h->list_count; list++ ) {
6087                 for( i = 0; i < 4; i++ ) {
6088                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6089                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6090                         if( h->ref_count[list] > 1 )
6091                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6092                         else
6093                             ref[list][i] = 0;
6094                     } else {
6095                         ref[list][i] = -1;
6096                     }
6097                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6098                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6099                 }
6100         }
6101
6102         if(dct8x8_allowed)
6103             dct8x8_allowed = get_dct8x8_allowed(h);
6104
6105         for(list=0; list<h->list_count; list++){
6106             for(i=0; i<4; i++){
6107                 if(IS_DIRECT(h->sub_mb_type[i])){
6108                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6109                     continue;
6110                 }
6111                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6112
6113                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6114                     const int sub_mb_type= h->sub_mb_type[i];
6115                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6116                     for(j=0; j<sub_partition_count[i]; j++){
6117                         int mpx, mpy;
6118                         int mx, my;
6119                         const int index= 4*i + block_width*j;
6120                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6121                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6122                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6123
6124                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6125                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6126                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6127
6128                         if(IS_SUB_8X8(sub_mb_type)){
6129                             mv_cache[ 1 ][0]=
6130                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6131                             mv_cache[ 1 ][1]=
6132                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6133
6134                             mvd_cache[ 1 ][0]=
6135                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6136                             mvd_cache[ 1 ][1]=
6137                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6138                         }else if(IS_SUB_8X4(sub_mb_type)){
6139                             mv_cache[ 1 ][0]= mx;
6140                             mv_cache[ 1 ][1]= my;
6141
6142                             mvd_cache[ 1 ][0]= mx - mpx;
6143                             mvd_cache[ 1 ][1]= my - mpy;
6144                         }else if(IS_SUB_4X8(sub_mb_type)){
6145                             mv_cache[ 8 ][0]= mx;
6146                             mv_cache[ 8 ][1]= my;
6147
6148                             mvd_cache[ 8 ][0]= mx - mpx;
6149                             mvd_cache[ 8 ][1]= my - mpy;
6150                         }
6151                         mv_cache[ 0 ][0]= mx;
6152                         mv_cache[ 0 ][1]= my;
6153
6154                         mvd_cache[ 0 ][0]= mx - mpx;
6155                         mvd_cache[ 0 ][1]= my - mpy;
6156                     }
6157                 }else{
6158                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6159                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6160                     p[0] = p[1] = p[8] = p[9] = 0;
6161                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6162                 }
6163             }
6164         }
6165     } else if( IS_DIRECT(mb_type) ) {
6166         pred_direct_motion(h, &mb_type);
6167         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6168         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6169         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6170     } else {
6171         int list, mx, my, i, mpx, mpy;
6172         if(IS_16X16(mb_type)){
6173             for(list=0; list<h->list_count; list++){
6174                 if(IS_DIR(mb_type, 0, list)){
6175                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6176                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6177                 }else
6178                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
6179             }
6180             for(list=0; list<h->list_count; list++){
6181                 if(IS_DIR(mb_type, 0, list)){
6182                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6183
6184                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6185                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6186                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6187
6188                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6189                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6190                 }else
6191                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6192             }
6193         }
6194         else if(IS_16X8(mb_type)){
6195             for(list=0; list<h->list_count; list++){
6196                     for(i=0; i<2; i++){
6197                         if(IS_DIR(mb_type, i, list)){
6198                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6199                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6200                         }else
6201                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6202                     }
6203             }
6204             for(list=0; list<h->list_count; list++){
6205                 for(i=0; i<2; i++){
6206                     if(IS_DIR(mb_type, i, list)){
6207                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6208                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6209                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6210                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6211
6212                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6213                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6214                     }else{
6215                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6216                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6217                     }
6218                 }
6219             }
6220         }else{
6221             assert(IS_8X16(mb_type));
6222             for(list=0; list<h->list_count; list++){
6223                     for(i=0; i<2; i++){
6224                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6225                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6226                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6227                         }else
6228                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6229                     }
6230             }
6231             for(list=0; list<h->list_count; list++){
6232                 for(i=0; i<2; i++){
6233                     if(IS_DIR(mb_type, i, list)){
6234                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6235                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6236                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6237
6238                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6239                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6240                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6241                     }else{
6242                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6243                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6244                     }
6245                 }
6246             }
6247         }
6248     }
6249
6250    if( IS_INTER( mb_type ) ) {
6251         h->chroma_pred_mode_table[mb_xy] = 0;
6252         write_back_motion( h, mb_type );
6253    }
6254
6255     if( !IS_INTRA16x16( mb_type ) ) {
6256         cbp  = decode_cabac_mb_cbp_luma( h );
6257         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6258     }
6259
6260     h->cbp_table[mb_xy] = h->cbp = cbp;
6261
6262     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6263         if( decode_cabac_mb_transform_size( h ) )
6264             mb_type |= MB_TYPE_8x8DCT;
6265     }
6266     s->current_picture.mb_type[mb_xy]= mb_type;
6267
6268     if( cbp || IS_INTRA16x16( mb_type ) ) {
6269         const uint8_t *scan, *scan8x8, *dc_scan;
6270         int dqp;
6271
6272         if(IS_INTERLACED(mb_type)){
6273             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6274             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6275             dc_scan= luma_dc_field_scan;
6276         }else{
6277             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6278             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6279             dc_scan= luma_dc_zigzag_scan;
6280         }
6281
6282         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6283         if( dqp == INT_MIN ){
6284             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6285             return -1;
6286         }
6287         s->qscale += dqp;
6288         if(((unsigned)s->qscale) > 51){
6289             if(s->qscale<0) s->qscale+= 52;
6290             else            s->qscale-= 52;
6291         }
6292         h->chroma_qp = get_chroma_qp(h, s->qscale);
6293
6294         if( IS_INTRA16x16( mb_type ) ) {
6295             int i;
6296             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6297             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6298                 return -1;
6299             if( cbp&15 ) {
6300                 for( i = 0; i < 16; i++ ) {
6301                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6302                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6303                         return -1;
6304                 }
6305             } else {
6306                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6307             }
6308         } else {
6309             int i8x8, i4x4;
6310             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6311                 if( cbp & (1<<i8x8) ) {
6312                     if( IS_8x8DCT(mb_type) ) {
6313                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6314                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6315                             return -1;
6316                     } else
6317                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6318                         const int index = 4*i8x8 + i4x4;
6319                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6320 //START_TIMER
6321                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6322                             return -1;
6323 //STOP_TIMER("decode_residual")
6324                     }
6325                 } else {
6326                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6327                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6328                 }
6329             }
6330         }
6331
6332         if( cbp&0x30 ){
6333             int c;
6334             for( c = 0; c < 2; c++ ) {
6335                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6336                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6337                     return -1;
6338             }
6339         }
6340
6341         if( cbp&0x20 ) {
6342             int c, i;
6343             for( c = 0; c < 2; c++ ) {
6344                 const uint32_t *qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp];
6345                 for( i = 0; i < 4; i++ ) {
6346                     const int index = 16 + 4 * c + i;
6347                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6348                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15) < 0)
6349                         return -1;
6350                 }
6351             }
6352         } else {
6353             uint8_t * const nnz= &h->non_zero_count_cache[0];
6354             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6355             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6356         }
6357     } else {
6358         uint8_t * const nnz= &h->non_zero_count_cache[0];
6359         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6360         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6361         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6362         h->last_qscale_diff = 0;
6363     }
6364
6365     s->current_picture.qscale_table[mb_xy]= s->qscale;
6366     write_back_non_zero_count(h);
6367
6368     if(MB_MBAFF){
6369         h->ref_count[0] >>= 1;
6370         h->ref_count[1] >>= 1;
6371     }
6372
6373     return 0;
6374 }
6375
6376
6377 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6378     int i, d;
6379     const int index_a = qp + h->slice_alpha_c0_offset;
6380     const int alpha = (alpha_table+52)[index_a];
6381     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6382
6383     if( bS[0] < 4 ) {
6384         int8_t tc[4];
6385         for(i=0; i<4; i++)
6386             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6387         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6388     } else {
6389         /* 16px edge length, because bS=4 is triggered by being at
6390          * the edge of an intra MB, so all 4 bS are the same */
6391             for( d = 0; d < 16; d++ ) {
6392                 const int p0 = pix[-1];
6393                 const int p1 = pix[-2];
6394                 const int p2 = pix[-3];
6395
6396                 const int q0 = pix[0];
6397                 const int q1 = pix[1];
6398                 const int q2 = pix[2];
6399
6400                 if( FFABS( p0 - q0 ) < alpha &&
6401                     FFABS( p1 - p0 ) < beta &&
6402                     FFABS( q1 - q0 ) < beta ) {
6403
6404                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6405                         if( FFABS( p2 - p0 ) < beta)
6406                         {
6407                             const int p3 = pix[-4];
6408                             /* p0', p1', p2' */
6409                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6410                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6411                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6412                         } else {
6413                             /* p0' */
6414                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6415                         }
6416                         if( FFABS( q2 - q0 ) < beta)
6417                         {
6418                             const int q3 = pix[3];
6419                             /* q0', q1', q2' */
6420                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6421                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6422                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6423                         } else {
6424                             /* q0' */
6425                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6426                         }
6427                     }else{
6428                         /* p0', q0' */
6429                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6430                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6431                     }
6432                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6433                 }
6434                 pix += stride;
6435             }
6436     }
6437 }
6438 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6439     int i;
6440     const int index_a = qp + h->slice_alpha_c0_offset;
6441     const int alpha = (alpha_table+52)[index_a];
6442     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6443
6444     if( bS[0] < 4 ) {
6445         int8_t tc[4];
6446         for(i=0; i<4; i++)
6447             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6448         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6449     } else {
6450         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6451     }
6452 }
6453
6454 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6455     int i;
6456     for( i = 0; i < 16; i++, pix += stride) {
6457         int index_a;
6458         int alpha;
6459         int beta;
6460
6461         int qp_index;
6462         int bS_index = (i >> 1);
6463         if (!MB_FIELD) {
6464             bS_index &= ~1;
6465             bS_index |= (i & 1);
6466         }
6467
6468         if( bS[bS_index] == 0 ) {
6469             continue;
6470         }
6471
6472         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6473         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6474         alpha = (alpha_table+52)[index_a];
6475         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6476
6477         if( bS[bS_index] < 4 ) {
6478             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6479             const int p0 = pix[-1];
6480             const int p1 = pix[-2];
6481             const int p2 = pix[-3];
6482             const int q0 = pix[0];
6483             const int q1 = pix[1];
6484             const int q2 = pix[2];
6485
6486             if( FFABS( p0 - q0 ) < alpha &&
6487                 FFABS( p1 - p0 ) < beta &&
6488                 FFABS( q1 - q0 ) < beta ) {
6489                 int tc = tc0;
6490                 int i_delta;
6491
6492                 if( FFABS( p2 - p0 ) < beta ) {
6493                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6494                     tc++;
6495                 }
6496                 if( FFABS( q2 - q0 ) < beta ) {
6497                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6498                     tc++;
6499                 }
6500
6501                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6502                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6503                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6504                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6505             }
6506         }else{
6507             const int p0 = pix[-1];
6508             const int p1 = pix[-2];
6509             const int p2 = pix[-3];
6510
6511             const int q0 = pix[0];
6512             const int q1 = pix[1];
6513             const int q2 = pix[2];
6514
6515             if( FFABS( p0 - q0 ) < alpha &&
6516                 FFABS( p1 - p0 ) < beta &&
6517                 FFABS( q1 - q0 ) < beta ) {
6518
6519                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6520                     if( FFABS( p2 - p0 ) < beta)
6521                     {
6522                         const int p3 = pix[-4];
6523                         /* p0', p1', p2' */
6524                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6525                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6526                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6527                     } else {
6528                         /* p0' */
6529                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6530                     }
6531                     if( FFABS( q2 - q0 ) < beta)
6532                     {
6533                         const int q3 = pix[3];
6534                         /* q0', q1', q2' */
6535                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6536                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6537                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6538                     } else {
6539                         /* q0' */
6540                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6541                     }
6542                 }else{
6543                     /* p0', q0' */
6544                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6545                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6546                 }
6547                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6548             }
6549         }
6550     }
6551 }
6552 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6553     int i;
6554     for( i = 0; i < 8; i++, pix += stride) {
6555         int index_a;
6556         int alpha;
6557         int beta;
6558
6559         int qp_index;
6560         int bS_index = i;
6561
6562         if( bS[bS_index] == 0 ) {
6563             continue;
6564         }
6565
6566         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6567         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6568         alpha = (alpha_table+52)[index_a];
6569         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6570
6571         if( bS[bS_index] < 4 ) {
6572             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6573             const int p0 = pix[-1];
6574             const int p1 = pix[-2];
6575             const int q0 = pix[0];
6576             const int q1 = pix[1];
6577
6578             if( FFABS( p0 - q0 ) < alpha &&
6579                 FFABS( p1 - p0 ) < beta &&
6580                 FFABS( q1 - q0 ) < beta ) {
6581                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6582
6583                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6584                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6585                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6586             }
6587         }else{
6588             const int p0 = pix[-1];
6589             const int p1 = pix[-2];
6590             const int q0 = pix[0];
6591             const int q1 = pix[1];
6592
6593             if( FFABS( p0 - q0 ) < alpha &&
6594                 FFABS( p1 - p0 ) < beta &&
6595                 FFABS( q1 - q0 ) < beta ) {
6596
6597                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6598                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6599                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6600             }
6601         }
6602     }
6603 }
6604
6605 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6606     int i, d;
6607     const int index_a = qp + h->slice_alpha_c0_offset;
6608     const int alpha = (alpha_table+52)[index_a];
6609     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6610     const int pix_next  = stride;
6611
6612     if( bS[0] < 4 ) {
6613         int8_t tc[4];
6614         for(i=0; i<4; i++)
6615             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6616         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6617     } else {
6618         /* 16px edge length, see filter_mb_edgev */
6619             for( d = 0; d < 16; d++ ) {
6620                 const int p0 = pix[-1*pix_next];
6621                 const int p1 = pix[-2*pix_next];
6622                 const int p2 = pix[-3*pix_next];
6623                 const int q0 = pix[0];
6624                 const int q1 = pix[1*pix_next];
6625                 const int q2 = pix[2*pix_next];
6626
6627                 if( FFABS( p0 - q0 ) < alpha &&
6628                     FFABS( p1 - p0 ) < beta &&
6629                     FFABS( q1 - q0 ) < beta ) {
6630
6631                     const int p3 = pix[-4*pix_next];
6632                     const int q3 = pix[ 3*pix_next];
6633
6634                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6635                         if( FFABS( p2 - p0 ) < beta) {
6636                             /* p0', p1', p2' */
6637                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6638                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6639                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6640                         } else {
6641                             /* p0' */
6642                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6643                         }
6644                         if( FFABS( q2 - q0 ) < beta) {
6645                             /* q0', q1', q2' */
6646                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6647                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6648                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6649                         } else {
6650                             /* q0' */
6651                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6652                         }
6653                     }else{
6654                         /* p0', q0' */
6655                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6656                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6657                     }
6658                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6659                 }
6660                 pix++;
6661             }
6662     }
6663 }
6664
6665 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6666     int i;
6667     const int index_a = qp + h->slice_alpha_c0_offset;
6668     const int alpha = (alpha_table+52)[index_a];
6669     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6670
6671     if( bS[0] < 4 ) {
6672         int8_t tc[4];
6673         for(i=0; i<4; i++)
6674             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6675         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6676     } else {
6677         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6678     }
6679 }
6680
6681 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6682     MpegEncContext * const s = &h->s;
6683     int mb_xy, mb_type;
6684     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6685
6686     mb_xy = mb_x + mb_y*s->mb_stride;
6687
6688     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength ||
6689        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6690                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6691         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6692         return;
6693     }
6694     assert(!FRAME_MBAFF);
6695
6696     mb_type = s->current_picture.mb_type[mb_xy];
6697     qp = s->current_picture.qscale_table[mb_xy];
6698     qp0 = s->current_picture.qscale_table[mb_xy-1];
6699     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6700     qpc = get_chroma_qp( h, qp );
6701     qpc0 = get_chroma_qp( h, qp0 );
6702     qpc1 = get_chroma_qp( h, qp1 );
6703     qp0 = (qp + qp0 + 1) >> 1;
6704     qp1 = (qp + qp1 + 1) >> 1;
6705     qpc0 = (qpc + qpc0 + 1) >> 1;
6706     qpc1 = (qpc + qpc1 + 1) >> 1;
6707     qp_thresh = 15 - h->slice_alpha_c0_offset;
6708     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6709        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6710         return;
6711
6712     if( IS_INTRA(mb_type) ) {
6713         int16_t bS4[4] = {4,4,4,4};
6714         int16_t bS3[4] = {3,3,3,3};
6715         if( IS_8x8DCT(mb_type) ) {
6716             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6717             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6718             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6719             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6720         } else {
6721             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6722             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6723             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6724             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6725             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6726             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6727             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6728             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6729         }
6730         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6731         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6732         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6733         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6734         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6735         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6736         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6737         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6738         return;
6739     } else {
6740         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6741         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6742         int edges;
6743         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6744             edges = 4;
6745             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6746         } else {
6747             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6748                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6749             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6750                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6751                              ? 3 : 0;
6752             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6753             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6754             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6755                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6756         }
6757         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6758             bSv[0][0] = 0x0004000400040004ULL;
6759         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6760             bSv[1][0] = 0x0004000400040004ULL;
6761
6762 #define FILTER(hv,dir,edge)\
6763         if(bSv[dir][edge]) {\
6764             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6765             if(!(edge&1)) {\
6766                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6767                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6768             }\
6769         }
6770         if( edges == 1 ) {
6771             FILTER(v,0,0);
6772             FILTER(h,1,0);
6773         } else if( IS_8x8DCT(mb_type) ) {
6774             FILTER(v,0,0);
6775             FILTER(v,0,2);
6776             FILTER(h,1,0);
6777             FILTER(h,1,2);
6778         } else {
6779             FILTER(v,0,0);
6780             FILTER(v,0,1);
6781             FILTER(v,0,2);
6782             FILTER(v,0,3);
6783             FILTER(h,1,0);
6784             FILTER(h,1,1);
6785             FILTER(h,1,2);
6786             FILTER(h,1,3);
6787         }
6788 #undef FILTER
6789     }
6790 }
6791
6792 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6793     MpegEncContext * const s = &h->s;
6794     const int mb_xy= mb_x + mb_y*s->mb_stride;
6795     const int mb_type = s->current_picture.mb_type[mb_xy];
6796     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6797     int first_vertical_edge_done = 0;
6798     int dir;
6799     /* FIXME: A given frame may occupy more than one position in
6800      * the reference list. So ref2frm should be populated with
6801      * frame numbers, not indices. */
6802     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6803                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6804
6805     //for sufficiently low qp, filtering wouldn't do anything
6806     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6807     if(!FRAME_MBAFF){
6808         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
6809         int qp = s->current_picture.qscale_table[mb_xy];
6810         if(qp <= qp_thresh
6811            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6812            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6813             return;
6814         }
6815     }
6816
6817     if (FRAME_MBAFF
6818             // left mb is in picture
6819             && h->slice_table[mb_xy-1] != 255
6820             // and current and left pair do not have the same interlaced type
6821             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6822             // and left mb is in the same slice if deblocking_filter == 2
6823             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6824         /* First vertical edge is different in MBAFF frames
6825          * There are 8 different bS to compute and 2 different Qp
6826          */
6827         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6828         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6829         int16_t bS[8];
6830         int qp[2];
6831         int chroma_qp[2];
6832         int mb_qp, mbn0_qp, mbn1_qp;
6833         int i;
6834         first_vertical_edge_done = 1;
6835
6836         if( IS_INTRA(mb_type) )
6837             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6838         else {
6839             for( i = 0; i < 8; i++ ) {
6840                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6841
6842                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6843                     bS[i] = 4;
6844                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6845                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6846                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6847                     bS[i] = 2;
6848                 else
6849                     bS[i] = 1;
6850             }
6851         }
6852
6853         mb_qp = s->current_picture.qscale_table[mb_xy];
6854         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6855         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6856         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6857         chroma_qp[0] = ( get_chroma_qp( h, mb_qp ) +
6858                          get_chroma_qp( h, mbn0_qp ) + 1 ) >> 1;
6859         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6860         chroma_qp[1] = ( get_chroma_qp( h, mb_qp ) +
6861                          get_chroma_qp( h, mbn1_qp ) + 1 ) >> 1;
6862
6863         /* Filter edge */
6864         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6865         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6866         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6867         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6868         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6869     }
6870     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6871     for( dir = 0; dir < 2; dir++ )
6872     {
6873         int edge;
6874         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6875         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6876         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6877
6878         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6879                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6880         // how often to recheck mv-based bS when iterating between edges
6881         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6882                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6883         // how often to recheck mv-based bS when iterating along each edge
6884         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6885
6886         if (first_vertical_edge_done) {
6887             start = 1;
6888             first_vertical_edge_done = 0;
6889         }
6890
6891         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6892             start = 1;
6893
6894         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6895             && !IS_INTERLACED(mb_type)
6896             && IS_INTERLACED(mbm_type)
6897             ) {
6898             // This is a special case in the norm where the filtering must
6899             // be done twice (one each of the field) even if we are in a
6900             // frame macroblock.
6901             //
6902             static const int nnz_idx[4] = {4,5,6,3};
6903             unsigned int tmp_linesize   = 2 *   linesize;
6904             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6905             int mbn_xy = mb_xy - 2 * s->mb_stride;
6906             int qp, chroma_qp;
6907             int i, j;
6908             int16_t bS[4];
6909
6910             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6911                 if( IS_INTRA(mb_type) ||
6912                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6913                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6914                 } else {
6915                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6916                     for( i = 0; i < 4; i++ ) {
6917                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6918                             mbn_nnz[nnz_idx[i]] != 0 )
6919                             bS[i] = 2;
6920                         else
6921                             bS[i] = 1;
6922                     }
6923                 }
6924                 // Do not use s->qscale as luma quantizer because it has not the same
6925                 // value in IPCM macroblocks.
6926                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6927                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6928                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6929                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6930                 chroma_qp = ( h->chroma_qp +
6931                               get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6932                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6933                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6934             }
6935
6936             start = 1;
6937         }
6938
6939         /* Calculate bS */
6940         for( edge = start; edge < edges; edge++ ) {
6941             /* mbn_xy: neighbor macroblock */
6942             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6943             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6944             int16_t bS[4];
6945             int qp;
6946
6947             if( (edge&1) && IS_8x8DCT(mb_type) )
6948                 continue;
6949
6950             if( IS_INTRA(mb_type) ||
6951                 IS_INTRA(mbn_type) ) {
6952                 int value;
6953                 if (edge == 0) {
6954                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6955                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6956                     ) {
6957                         value = 4;
6958                     } else {
6959                         value = 3;
6960                     }
6961                 } else {
6962                     value = 3;
6963                 }
6964                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6965             } else {
6966                 int i, l;
6967                 int mv_done;
6968
6969                 if( edge & mask_edge ) {
6970                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6971                     mv_done = 1;
6972                 }
6973                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6974                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6975                     mv_done = 1;
6976                 }
6977                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6978                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6979                     int bn_idx= b_idx - (dir ? 8:1);
6980                     int v = 0;
6981                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6982                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6983                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6984                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6985                     }
6986                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6987                     mv_done = 1;
6988                 }
6989                 else
6990                     mv_done = 0;
6991
6992                 for( i = 0; i < 4; i++ ) {
6993                     int x = dir == 0 ? edge : i;
6994                     int y = dir == 0 ? i    : edge;
6995                     int b_idx= 8 + 4 + x + 8*y;
6996                     int bn_idx= b_idx - (dir ? 8:1);
6997
6998                     if( h->non_zero_count_cache[b_idx] != 0 ||
6999                         h->non_zero_count_cache[bn_idx] != 0 ) {
7000                         bS[i] = 2;
7001                     }
7002                     else if(!mv_done)
7003                     {
7004                         bS[i] = 0;
7005                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7006                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7007                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7008                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7009                                 bS[i] = 1;
7010                                 break;
7011                             }
7012                         }
7013                     }
7014                 }
7015
7016                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7017                     continue;
7018             }
7019
7020             /* Filter edge */
7021             // Do not use s->qscale as luma quantizer because it has not the same
7022             // value in IPCM macroblocks.
7023             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7024             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7025             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7026             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7027             if( dir == 0 ) {
7028                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7029                 if( (edge&1) == 0 ) {
7030                     int chroma_qp = ( h->chroma_qp +
7031                                       get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7032                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7033                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7034                 }
7035             } else {
7036                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7037                 if( (edge&1) == 0 ) {
7038                     int chroma_qp = ( h->chroma_qp +
7039                                       get_chroma_qp( h, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7040                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7041                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7042                 }
7043             }
7044         }
7045     }
7046 }
7047
7048 static int decode_slice(H264Context *h){
7049     MpegEncContext * const s = &h->s;
7050     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7051
7052     s->mb_skip_run= -1;
7053
7054     if( h->pps.cabac ) {
7055         int i;
7056
7057         /* realign */
7058         align_get_bits( &s->gb );
7059
7060         /* init cabac */
7061         ff_init_cabac_states( &h->cabac);
7062         ff_init_cabac_decoder( &h->cabac,
7063                                s->gb.buffer + get_bits_count(&s->gb)/8,
7064                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7065         /* calculate pre-state */
7066         for( i= 0; i < 460; i++ ) {
7067             int pre;
7068             if( h->slice_type == I_TYPE )
7069                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7070             else
7071                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7072
7073             if( pre <= 63 )
7074                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7075             else
7076                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7077         }
7078
7079         for(;;){
7080 //START_TIMER
7081             int ret = decode_mb_cabac(h);
7082             int eos;
7083 //STOP_TIMER("decode_mb_cabac")
7084
7085             if(ret>=0) hl_decode_mb(h);
7086
7087             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7088                 s->mb_y++;
7089
7090                 if(ret>=0) ret = decode_mb_cabac(h);
7091
7092                 if(ret>=0) hl_decode_mb(h);
7093                 s->mb_y--;
7094             }
7095             eos = get_cabac_terminate( &h->cabac );
7096
7097             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7098                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7099                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7100                 return -1;
7101             }
7102
7103             if( ++s->mb_x >= s->mb_width ) {
7104                 s->mb_x = 0;
7105                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7106                 ++s->mb_y;
7107                 if(FRAME_MBAFF) {
7108                     ++s->mb_y;
7109                 }
7110             }
7111
7112             if( eos || s->mb_y >= s->mb_height ) {
7113                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7114                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7115                 return 0;
7116             }
7117         }
7118
7119     } else {
7120         for(;;){
7121             int ret = decode_mb_cavlc(h);
7122
7123             if(ret>=0) hl_decode_mb(h);
7124
7125             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7126                 s->mb_y++;
7127                 ret = decode_mb_cavlc(h);
7128
7129                 if(ret>=0) hl_decode_mb(h);
7130                 s->mb_y--;
7131             }
7132
7133             if(ret<0){
7134                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7135                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7136
7137                 return -1;
7138             }
7139
7140             if(++s->mb_x >= s->mb_width){
7141                 s->mb_x=0;
7142                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7143                 ++s->mb_y;
7144                 if(FRAME_MBAFF) {
7145                     ++s->mb_y;
7146                 }
7147                 if(s->mb_y >= s->mb_height){
7148                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7149
7150                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7151                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7152
7153                         return 0;
7154                     }else{
7155                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7156
7157                         return -1;
7158                     }
7159                 }
7160             }
7161
7162             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7163                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7164                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7165                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7166
7167                     return 0;
7168                 }else{
7169                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7170
7171                     return -1;
7172                 }
7173             }
7174         }
7175     }
7176
7177 #if 0
7178     for(;s->mb_y < s->mb_height; s->mb_y++){
7179         for(;s->mb_x < s->mb_width; s->mb_x++){
7180             int ret= decode_mb(h);
7181
7182             hl_decode_mb(h);
7183
7184             if(ret<0){
7185                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7186                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7187
7188                 return -1;
7189             }
7190
7191             if(++s->mb_x >= s->mb_width){
7192                 s->mb_x=0;
7193                 if(++s->mb_y >= s->mb_height){
7194                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7195                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7196
7197                         return 0;
7198                     }else{
7199                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7200
7201                         return -1;
7202                     }
7203                 }
7204             }
7205
7206             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7207                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7208                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7209
7210                     return 0;
7211                 }else{
7212                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7213
7214                     return -1;
7215                 }
7216             }
7217         }
7218         s->mb_x=0;
7219         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7220     }
7221 #endif
7222     return -1; //not reached
7223 }
7224
7225 static int decode_unregistered_user_data(H264Context *h, int size){
7226     MpegEncContext * const s = &h->s;
7227     uint8_t user_data[16+256];
7228     int e, build, i;
7229
7230     if(size<16)
7231         return -1;
7232
7233     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7234         user_data[i]= get_bits(&s->gb, 8);
7235     }
7236
7237     user_data[i]= 0;
7238     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7239     if(e==1 && build>=0)
7240         h->x264_build= build;
7241
7242     if(s->avctx->debug & FF_DEBUG_BUGS)
7243         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7244
7245     for(; i<size; i++)
7246         skip_bits(&s->gb, 8);
7247
7248     return 0;
7249 }
7250
7251 static int decode_sei(H264Context *h){
7252     MpegEncContext * const s = &h->s;
7253
7254     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7255         int size, type;
7256
7257         type=0;
7258         do{
7259             type+= show_bits(&s->gb, 8);
7260         }while(get_bits(&s->gb, 8) == 255);
7261
7262         size=0;
7263         do{
7264             size+= show_bits(&s->gb, 8);
7265         }while(get_bits(&s->gb, 8) == 255);
7266
7267         switch(type){
7268         case 5:
7269             if(decode_unregistered_user_data(h, size) < 0)
7270                 return -1;
7271             break;
7272         default:
7273             skip_bits(&s->gb, 8*size);
7274         }
7275
7276         //FIXME check bits here
7277         align_get_bits(&s->gb);
7278     }
7279
7280     return 0;
7281 }
7282
7283 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7284     MpegEncContext * const s = &h->s;
7285     int cpb_count, i;
7286     cpb_count = get_ue_golomb(&s->gb) + 1;
7287     get_bits(&s->gb, 4); /* bit_rate_scale */
7288     get_bits(&s->gb, 4); /* cpb_size_scale */
7289     for(i=0; i<cpb_count; i++){
7290         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7291         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7292         get_bits1(&s->gb);     /* cbr_flag */
7293     }
7294     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7295     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7296     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7297     get_bits(&s->gb, 5); /* time_offset_length */
7298 }
7299
7300 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7301     MpegEncContext * const s = &h->s;
7302     int aspect_ratio_info_present_flag;
7303     unsigned int aspect_ratio_idc;
7304     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7305
7306     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7307
7308     if( aspect_ratio_info_present_flag ) {
7309         aspect_ratio_idc= get_bits(&s->gb, 8);
7310         if( aspect_ratio_idc == EXTENDED_SAR ) {
7311             sps->sar.num= get_bits(&s->gb, 16);
7312             sps->sar.den= get_bits(&s->gb, 16);
7313         }else if(aspect_ratio_idc < 14){
7314             sps->sar=  pixel_aspect[aspect_ratio_idc];
7315         }else{
7316             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7317             return -1;
7318         }
7319     }else{
7320         sps->sar.num=
7321         sps->sar.den= 0;
7322     }
7323 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7324
7325     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7326         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7327     }
7328
7329     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7330         get_bits(&s->gb, 3);    /* video_format */
7331         get_bits1(&s->gb);      /* video_full_range_flag */
7332         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7333             get_bits(&s->gb, 8); /* colour_primaries */
7334             get_bits(&s->gb, 8); /* transfer_characteristics */
7335             get_bits(&s->gb, 8); /* matrix_coefficients */
7336         }
7337     }
7338
7339     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7340         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7341         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7342     }
7343
7344     sps->timing_info_present_flag = get_bits1(&s->gb);
7345     if(sps->timing_info_present_flag){
7346         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7347         sps->time_scale = get_bits_long(&s->gb, 32);
7348         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7349     }
7350
7351     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7352     if(nal_hrd_parameters_present_flag)
7353         decode_hrd_parameters(h, sps);
7354     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7355     if(vcl_hrd_parameters_present_flag)
7356         decode_hrd_parameters(h, sps);
7357     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7358         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7359     get_bits1(&s->gb);         /* pic_struct_present_flag */
7360
7361     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7362     if(sps->bitstream_restriction_flag){
7363         unsigned int num_reorder_frames;
7364         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7365         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7366         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7367         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7368         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7369         num_reorder_frames= get_ue_golomb(&s->gb);
7370         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7371
7372         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7373             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7374             return -1;
7375         }
7376
7377         sps->num_reorder_frames= num_reorder_frames;
7378     }
7379
7380     return 0;
7381 }
7382
7383 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7384                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7385     MpegEncContext * const s = &h->s;
7386     int i, last = 8, next = 8;
7387     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7388     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7389         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7390     else
7391     for(i=0;i<size;i++){
7392         if(next)
7393             next = (last + get_se_golomb(&s->gb)) & 0xff;
7394         if(!i && !next){ /* matrix not written, we use the preset one */
7395             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7396             break;
7397         }
7398         last = factors[scan[i]] = next ? next : last;
7399     }
7400 }
7401
7402 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7403                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7404     MpegEncContext * const s = &h->s;
7405     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7406     const uint8_t *fallback[4] = {
7407         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7408         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7409         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7410         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7411     };
7412     if(get_bits1(&s->gb)){
7413         sps->scaling_matrix_present |= is_sps;
7414         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7415         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7416         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7417         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7418         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7419         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7420         if(is_sps || pps->transform_8x8_mode){
7421             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7422             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7423         }
7424     } else if(fallback_sps) {
7425         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7426         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7427     }
7428 }
7429
7430 /**
7431  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7432  */
7433 static void *
7434 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7435                     const size_t size, const char *name)
7436 {
7437     if(id>=max) {
7438         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7439         return NULL;
7440     }
7441
7442     if(!vec[id]) {
7443         vec[id] = av_mallocz(size);
7444         if(vec[id] == NULL)
7445             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7446     }
7447     return vec[id];
7448 }
7449
7450 static inline int decode_seq_parameter_set(H264Context *h){
7451     MpegEncContext * const s = &h->s;
7452     int profile_idc, level_idc;
7453     unsigned int sps_id, tmp, mb_width, mb_height;
7454     int i;
7455     SPS *sps;
7456
7457     profile_idc= get_bits(&s->gb, 8);
7458     get_bits1(&s->gb);   //constraint_set0_flag
7459     get_bits1(&s->gb);   //constraint_set1_flag
7460     get_bits1(&s->gb);   //constraint_set2_flag
7461     get_bits1(&s->gb);   //constraint_set3_flag
7462     get_bits(&s->gb, 4); // reserved
7463     level_idc= get_bits(&s->gb, 8);
7464     sps_id= get_ue_golomb(&s->gb);
7465
7466     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7467     if(sps == NULL)
7468         return -1;
7469
7470     sps->profile_idc= profile_idc;
7471     sps->level_idc= level_idc;
7472
7473     if(sps->profile_idc >= 100){ //high profile
7474         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7475             get_bits1(&s->gb);  //residual_color_transform_flag
7476         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7477         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7478         sps->transform_bypass = get_bits1(&s->gb);
7479         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7480     }else
7481         sps->scaling_matrix_present = 0;
7482
7483     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7484     sps->poc_type= get_ue_golomb(&s->gb);
7485
7486     if(sps->poc_type == 0){ //FIXME #define
7487         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7488     } else if(sps->poc_type == 1){//FIXME #define
7489         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7490         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7491         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7492         tmp= get_ue_golomb(&s->gb);
7493
7494         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7495             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7496             return -1;
7497         }
7498         sps->poc_cycle_length= tmp;
7499
7500         for(i=0; i<sps->poc_cycle_length; i++)
7501             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7502     }else if(sps->poc_type != 2){
7503         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7504         return -1;
7505     }
7506
7507     tmp= get_ue_golomb(&s->gb);
7508     if(tmp > MAX_PICTURE_COUNT-2){
7509         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7510     }
7511     sps->ref_frame_count= tmp;
7512     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7513     mb_width= get_ue_golomb(&s->gb) + 1;
7514     mb_height= get_ue_golomb(&s->gb) + 1;
7515     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7516        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7517         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7518         return -1;
7519     }
7520     sps->mb_width = mb_width;
7521     sps->mb_height= mb_height;
7522
7523     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7524     if(!sps->frame_mbs_only_flag)
7525         sps->mb_aff= get_bits1(&s->gb);
7526     else
7527         sps->mb_aff= 0;
7528
7529     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7530
7531 #ifndef ALLOW_INTERLACE
7532     if(sps->mb_aff)
7533         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7534 #endif
7535     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7536         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7537
7538     sps->crop= get_bits1(&s->gb);
7539     if(sps->crop){
7540         sps->crop_left  = get_ue_golomb(&s->gb);
7541         sps->crop_right = get_ue_golomb(&s->gb);
7542         sps->crop_top   = get_ue_golomb(&s->gb);
7543         sps->crop_bottom= get_ue_golomb(&s->gb);
7544         if(sps->crop_left || sps->crop_top){
7545             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7546         }
7547     }else{
7548         sps->crop_left  =
7549         sps->crop_right =
7550         sps->crop_top   =
7551         sps->crop_bottom= 0;
7552     }
7553
7554     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7555     if( sps->vui_parameters_present_flag )
7556         decode_vui_parameters(h, sps);
7557
7558     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7559         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7560                sps_id, sps->profile_idc, sps->level_idc,
7561                sps->poc_type,
7562                sps->ref_frame_count,
7563                sps->mb_width, sps->mb_height,
7564                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7565                sps->direct_8x8_inference_flag ? "8B8" : "",
7566                sps->crop_left, sps->crop_right,
7567                sps->crop_top, sps->crop_bottom,
7568                sps->vui_parameters_present_flag ? "VUI" : ""
7569                );
7570     }
7571     return 0;
7572 }
7573
7574 static void
7575 build_qp_table(PPS *pps, int index)
7576 {
7577     int i;
7578     for(i = 0; i < 255; i++)
7579         pps->chroma_qp_table[i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7580     pps->chroma_qp_index_offset = index;
7581 }
7582
7583 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7584     MpegEncContext * const s = &h->s;
7585     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7586     PPS *pps;
7587
7588     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7589     if(pps == NULL)
7590         return -1;
7591
7592     tmp= get_ue_golomb(&s->gb);
7593     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7594         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7595         return -1;
7596     }
7597     pps->sps_id= tmp;
7598
7599     pps->cabac= get_bits1(&s->gb);
7600     pps->pic_order_present= get_bits1(&s->gb);
7601     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7602     if(pps->slice_group_count > 1 ){
7603         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7604         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7605         switch(pps->mb_slice_group_map_type){
7606         case 0:
7607 #if 0
7608 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7609 |    run_length[ i ]                                |1  |ue(v)   |
7610 #endif
7611             break;
7612         case 2:
7613 #if 0
7614 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7615 |{                                                  |   |        |
7616 |    top_left_mb[ i ]                               |1  |ue(v)   |
7617 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7618 |   }                                               |   |        |
7619 #endif
7620             break;
7621         case 3:
7622         case 4:
7623         case 5:
7624 #if 0
7625 |   slice_group_change_direction_flag               |1  |u(1)    |
7626 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7627 #endif
7628             break;
7629         case 6:
7630 #if 0
7631 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7632 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7633 |)                                                  |   |        |
7634 |    slice_group_id[ i ]                            |1  |u(v)    |
7635 #endif
7636             break;
7637         }
7638     }
7639     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7640     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7641     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7642         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7643         pps->ref_count[0]= pps->ref_count[1]= 1;
7644         return -1;
7645     }
7646
7647     pps->weighted_pred= get_bits1(&s->gb);
7648     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7649     pps->init_qp= get_se_golomb(&s->gb) + 26;
7650     pps->init_qs= get_se_golomb(&s->gb) + 26;
7651     build_qp_table(pps, get_se_golomb(&s->gb));
7652     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7653     pps->constrained_intra_pred= get_bits1(&s->gb);
7654     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7655
7656     pps->transform_8x8_mode= 0;
7657     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7658     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7659     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7660
7661     if(get_bits_count(&s->gb) < bit_length){
7662         pps->transform_8x8_mode= get_bits1(&s->gb);
7663         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7664         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7665     }
7666
7667     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7668         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7669                pps_id, pps->sps_id,
7670                pps->cabac ? "CABAC" : "CAVLC",
7671                pps->slice_group_count,
7672                pps->ref_count[0], pps->ref_count[1],
7673                pps->weighted_pred ? "weighted" : "",
7674                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7675                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7676                pps->constrained_intra_pred ? "CONSTR" : "",
7677                pps->redundant_pic_cnt_present ? "REDU" : "",
7678                pps->transform_8x8_mode ? "8x8DCT" : ""
7679                );
7680     }
7681
7682     return 0;
7683 }
7684
7685 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7686     MpegEncContext * const s = &h->s;
7687     AVCodecContext * const avctx= s->avctx;
7688     int buf_index=0;
7689 #if 0
7690     int i;
7691     for(i=0; i<50; i++){
7692         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7693     }
7694 #endif
7695     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7696         h->slice_num = 0;
7697         s->current_picture_ptr= NULL;
7698     }
7699
7700     for(;;){
7701         int consumed;
7702         int dst_length;
7703         int bit_length;
7704         uint8_t *ptr;
7705         int i, nalsize = 0;
7706
7707         if(h->is_avc) {
7708             if(buf_index >= buf_size) break;
7709             nalsize = 0;
7710             for(i = 0; i < h->nal_length_size; i++)
7711                 nalsize = (nalsize << 8) | buf[buf_index++];
7712             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7713                 if(nalsize == 1){
7714                     buf_index++;
7715                     continue;
7716                 }else{
7717                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7718                     break;
7719                 }
7720             }
7721         } else {
7722             // start code prefix search
7723             for(; buf_index + 3 < buf_size; buf_index++){
7724                 // This should always succeed in the first iteration.
7725                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7726                     break;
7727             }
7728
7729             if(buf_index+3 >= buf_size) break;
7730
7731             buf_index+=3;
7732         }
7733
7734         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7735         if (ptr==NULL || dst_length < 0){
7736             return -1;
7737         }
7738         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7739             dst_length--;
7740         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7741
7742         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7743             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7744         }
7745
7746         if (h->is_avc && (nalsize != consumed))
7747             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7748
7749         buf_index += consumed;
7750
7751         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7752            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7753             continue;
7754
7755         switch(h->nal_unit_type){
7756         case NAL_IDR_SLICE:
7757             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7758         case NAL_SLICE:
7759             init_get_bits(&s->gb, ptr, bit_length);
7760             h->intra_gb_ptr=
7761             h->inter_gb_ptr= &s->gb;
7762             s->data_partitioning = 0;
7763
7764             if(decode_slice_header(h) < 0){
7765                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7766                 break;
7767             }
7768             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7769             if(h->redundant_pic_count==0 && s->hurry_up < 5
7770                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7771                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7772                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7773                && avctx->skip_frame < AVDISCARD_ALL)
7774                 decode_slice(h);
7775             break;
7776         case NAL_DPA:
7777             init_get_bits(&s->gb, ptr, bit_length);
7778             h->intra_gb_ptr=
7779             h->inter_gb_ptr= NULL;
7780             s->data_partitioning = 1;
7781
7782             if(decode_slice_header(h) < 0){
7783                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7784             }
7785             break;
7786         case NAL_DPB:
7787             init_get_bits(&h->intra_gb, ptr, bit_length);
7788             h->intra_gb_ptr= &h->intra_gb;
7789             break;
7790         case NAL_DPC:
7791             init_get_bits(&h->inter_gb, ptr, bit_length);
7792             h->inter_gb_ptr= &h->inter_gb;
7793
7794             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7795                && s->context_initialized
7796                && s->hurry_up < 5
7797                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7798                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7799                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7800                && avctx->skip_frame < AVDISCARD_ALL)
7801                 decode_slice(h);
7802             break;
7803         case NAL_SEI:
7804             init_get_bits(&s->gb, ptr, bit_length);
7805             decode_sei(h);
7806             break;
7807         case NAL_SPS:
7808             init_get_bits(&s->gb, ptr, bit_length);
7809             decode_seq_parameter_set(h);
7810
7811             if(s->flags& CODEC_FLAG_LOW_DELAY)
7812                 s->low_delay=1;
7813
7814             if(avctx->has_b_frames < 2)
7815                 avctx->has_b_frames= !s->low_delay;
7816             break;
7817         case NAL_PPS:
7818             init_get_bits(&s->gb, ptr, bit_length);
7819
7820             decode_picture_parameter_set(h, bit_length);
7821
7822             break;
7823         case NAL_AUD:
7824         case NAL_END_SEQUENCE:
7825         case NAL_END_STREAM:
7826         case NAL_FILLER_DATA:
7827         case NAL_SPS_EXT:
7828         case NAL_AUXILIARY_SLICE:
7829             break;
7830         default:
7831             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7832         }
7833     }
7834
7835     return buf_index;
7836 }
7837
7838 /**
7839  * returns the number of bytes consumed for building the current frame
7840  */
7841 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7842     if(s->flags&CODEC_FLAG_TRUNCATED){
7843         pos -= s->parse_context.last_index;
7844         if(pos<0) pos=0; // FIXME remove (unneeded?)
7845
7846         return pos;
7847     }else{
7848         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7849         if(pos+10>buf_size) pos=buf_size; // oops ;)
7850
7851         return pos;
7852     }
7853 }
7854
7855 static int decode_frame(AVCodecContext *avctx,
7856                              void *data, int *data_size,
7857                              uint8_t *buf, int buf_size)
7858 {
7859     H264Context *h = avctx->priv_data;
7860     MpegEncContext *s = &h->s;
7861     AVFrame *pict = data;
7862     int buf_index;
7863
7864     s->flags= avctx->flags;
7865     s->flags2= avctx->flags2;
7866
7867    /* no supplementary picture */
7868     if (buf_size == 0) {
7869         Picture *out;
7870         int i, out_idx;
7871
7872 //FIXME factorize this with the output code below
7873         out = h->delayed_pic[0];
7874         out_idx = 0;
7875         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7876             if(h->delayed_pic[i]->poc < out->poc){
7877                 out = h->delayed_pic[i];
7878                 out_idx = i;
7879             }
7880
7881         for(i=out_idx; h->delayed_pic[i]; i++)
7882             h->delayed_pic[i] = h->delayed_pic[i+1];
7883
7884         if(out){
7885             *data_size = sizeof(AVFrame);
7886             *pict= *(AVFrame*)out;
7887         }
7888
7889         return 0;
7890     }
7891
7892     if(s->flags&CODEC_FLAG_TRUNCATED){
7893         int next= ff_h264_find_frame_end(h, buf, buf_size);
7894
7895         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7896             return buf_size;
7897 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7898     }
7899
7900     if(h->is_avc && !h->got_avcC) {
7901         int i, cnt, nalsize;
7902         unsigned char *p = avctx->extradata;
7903         if(avctx->extradata_size < 7) {
7904             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7905             return -1;
7906         }
7907         if(*p != 1) {
7908             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7909             return -1;
7910         }
7911         /* sps and pps in the avcC always have length coded with 2 bytes,
7912            so put a fake nal_length_size = 2 while parsing them */
7913         h->nal_length_size = 2;
7914         // Decode sps from avcC
7915         cnt = *(p+5) & 0x1f; // Number of sps
7916         p += 6;
7917         for (i = 0; i < cnt; i++) {
7918             nalsize = AV_RB16(p) + 2;
7919             if(decode_nal_units(h, p, nalsize) < 0) {
7920                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7921                 return -1;
7922             }
7923             p += nalsize;
7924         }
7925         // Decode pps from avcC
7926         cnt = *(p++); // Number of pps
7927         for (i = 0; i < cnt; i++) {
7928             nalsize = AV_RB16(p) + 2;
7929             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7930                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7931                 return -1;
7932             }
7933             p += nalsize;
7934         }
7935         // Now store right nal length size, that will be use to parse all other nals
7936         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7937         // Do not reparse avcC
7938         h->got_avcC = 1;
7939     }
7940
7941     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7942         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7943             return -1;
7944     }
7945
7946     buf_index=decode_nal_units(h, buf, buf_size);
7947     if(buf_index < 0)
7948         return -1;
7949
7950     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7951         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7952         return -1;
7953     }
7954
7955     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7956         Picture *out = s->current_picture_ptr;
7957         Picture *cur = s->current_picture_ptr;
7958         Picture *prev = h->delayed_output_pic;
7959         int i, pics, cross_idr, out_of_order, out_idx;
7960
7961         s->mb_y= 0;
7962
7963         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7964         s->current_picture_ptr->pict_type= s->pict_type;
7965
7966         h->prev_frame_num_offset= h->frame_num_offset;
7967         h->prev_frame_num= h->frame_num;
7968         if(s->current_picture_ptr->reference){
7969             h->prev_poc_msb= h->poc_msb;
7970             h->prev_poc_lsb= h->poc_lsb;
7971         }
7972         if(s->current_picture_ptr->reference)
7973             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7974
7975         ff_er_frame_end(s);
7976
7977         MPV_frame_end(s);
7978
7979     //FIXME do something with unavailable reference frames
7980
7981 #if 0 //decode order
7982         *data_size = sizeof(AVFrame);
7983 #else
7984         /* Sort B-frames into display order */
7985
7986         if(h->sps.bitstream_restriction_flag
7987            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7988             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7989             s->low_delay = 0;
7990         }
7991
7992         pics = 0;
7993         while(h->delayed_pic[pics]) pics++;
7994
7995         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7996
7997         h->delayed_pic[pics++] = cur;
7998         if(cur->reference == 0)
7999             cur->reference = 1;
8000
8001         cross_idr = 0;
8002         for(i=0; h->delayed_pic[i]; i++)
8003             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8004                 cross_idr = 1;
8005
8006         out = h->delayed_pic[0];
8007         out_idx = 0;
8008         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8009             if(h->delayed_pic[i]->poc < out->poc){
8010                 out = h->delayed_pic[i];
8011                 out_idx = i;
8012             }
8013
8014         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8015         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8016             { }
8017         else if(prev && pics <= s->avctx->has_b_frames)
8018             out = prev;
8019         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8020            || (s->low_delay &&
8021             ((!cross_idr && prev && out->poc > prev->poc + 2)
8022              || cur->pict_type == B_TYPE)))
8023         {
8024             s->low_delay = 0;
8025             s->avctx->has_b_frames++;
8026             out = prev;
8027         }
8028         else if(out_of_order)
8029             out = prev;
8030
8031         if(out_of_order || pics > s->avctx->has_b_frames){
8032             for(i=out_idx; h->delayed_pic[i]; i++)
8033                 h->delayed_pic[i] = h->delayed_pic[i+1];
8034         }
8035
8036         if(prev == out)
8037             *data_size = 0;
8038         else
8039             *data_size = sizeof(AVFrame);
8040         if(prev && prev != out && prev->reference == 1)
8041             prev->reference = 0;
8042         h->delayed_output_pic = out;
8043 #endif
8044
8045         if(out)
8046             *pict= *(AVFrame*)out;
8047         else
8048             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8049     }
8050
8051     assert(pict->data[0] || !*data_size);
8052     ff_print_debug_info(s, pict);
8053 //printf("out %d\n", (int)pict->data[0]);
8054 #if 0 //?
8055
8056     /* Return the Picture timestamp as the frame number */
8057     /* we substract 1 because it is added on utils.c    */
8058     avctx->frame_number = s->picture_number - 1;
8059 #endif
8060     return get_consumed_bytes(s, buf_index, buf_size);
8061 }
#if 0
/**
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 * Entries 0, 1 and 2 describe the top-left, top and top-right neighbours and
 * entry 3 the left neighbour: each is 1 when that position lies inside the
 * picture and its slice_table entry equals h->slice_num, 0 otherwise.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int here  = s->mb_x + s->mb_y*s->mb_stride;
    const int above = here - s->mb_stride;
    int top_left = 0, top_mid = 0, top_right = 0, left = 0;

    /* slice_table is only consulted for positions known to be inside the picture */
    if(s->mb_y){
        if(s->mb_x)
            top_left = h->slice_table[above - 1] == h->slice_num;

        top_mid = h->slice_table[above] == h->slice_num;

        if(s->mb_x+1 < s->mb_width)
            top_right = h->slice_table[above + 1] == h->slice_num;
    }

    if(s->mb_x)
        left = h->slice_table[here - 1] == h->slice_num;

    h->mb_avail[0]= top_left;
    h->mb_avail[1]= top_mid;
    h->mb_avail[2]= top_right;
    h->mb_avail[3]= left;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
8081
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self-test. It
 *  - writes and reads back COUNT unsigned and COUNT signed exp-golomb codes,
 *  - runs random 4x4 blocks through h264_diff_dct_c() and the h264 IDCT and
 *    prints the mean squared and maximum reconstruction error,
 *  - round-trips random payloads through encode_nal() / decode_nal().
 *
 * @return 0 on completion, -1 as soon as a NAL round trip fails;
 *         exp-golomb mismatches are only printed.
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    /* zeroed so that dsputil_init() is not handed indeterminate fields */
    memset(&avctx, 0, sizeof(AVCodecContext));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, i.e. i - COUNT/2 */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the DSPContext initialised above; no 's' exists in this scope */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        /* after the IDCT add, ref should reproduce src; accumulate the deviation */
        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then clear exactly 'zeros' distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL");

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
8254
8255
8256 static int decode_end(AVCodecContext *avctx)
8257 {
8258     H264Context *h = avctx->priv_data;
8259     MpegEncContext *s = &h->s;
8260
8261     av_freep(&h->rbsp_buffer[0]);
8262     av_freep(&h->rbsp_buffer[1]);
8263     free_tables(h); //FIXME cleanup init stuff perhaps
8264     MPV_common_end(s);
8265
8266 //    memset(h, 0, sizeof(H264Context));
8267
8268     return 0;
8269 }
8270
8271
8272 AVCodec h264_decoder = {
8273     "h264",
8274     CODEC_TYPE_VIDEO,
8275     CODEC_ID_H264,
8276     sizeof(H264Context),
8277     decode_init,
8278     NULL,
8279     decode_end,
8280     decode_frame,
8281     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8282     .flush= flush_dpb,
8283 };
8284
8285 #include "svq3.c"