git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35
  36 #include "cabac.h"
  37
  38 //#undef NDEBUG
  39 #include <assert.h>
  40
  /* File-local VLC tables. Nothing in the reviewed portion of this file
   * initialises or reads them.
   * NOTE(review): presumably set up and used by the CAVLC residual decoder
   * further down in the file -- confirm. */
  41 static VLC coeff_token_vlc[4];
  42 static VLC chroma_dc_coeff_token_vlc;
  43
  44 static VLC total_zeros_vlc[15];
  45 static VLC chroma_dc_total_zeros_vlc[3];
  46
  47 static VLC run_vlc[6];
  48 static VLC run7_vlc;
  49
  /* Forward declarations of file-local functions; their definitions are not
   * in the reviewed portion of the file (presumably further down). */
  50 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  51 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  52 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  53 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  54
  55 static av_always_inline uint32_t pack16to32(int a, int b){
  56 #ifdef WORDS_BIGENDIAN
  57    return (b&0xFFFF) + (a<<16);
  58 #else
  59    return (a&0xFFFF) + (b<<16);
  60 #endif
  61 }
  62
  63 const uint8_t ff_rem6[52]={
  64 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  65 };
  66
  67 const uint8_t ff_div6[52]={
  68 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  69 };
  70
  71
  72 /**
  73  * fill a rectangle.
  74  * @param h height of the rectangle, should be a constant
  75  * @param w width of the rectangle, should be a constant
  76  * @param size the size of val (1 or 4), should be a constant
  77  */
  78 static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
  79     uint8_t *p= (uint8_t*)vp;
  80     assert(size==1 || size==4);
  81     assert(w<=4);
  82
  83     w      *= size;
  84     stride *= size;
  85
  86     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
  87     assert((stride&(w-1))==0);
  88     if(w==2){
  89         const uint16_t v= size==4 ? val : val*0x0101;
  90         *(uint16_t*)(p + 0*stride)= v;
  91         if(h==1) return;
  92         *(uint16_t*)(p + 1*stride)= v;
  93         if(h==2) return;
  94         *(uint16_t*)(p + 2*stride)=
  95         *(uint16_t*)(p + 3*stride)= v;
  96     }else if(w==4){
  97         const uint32_t v= size==4 ? val : val*0x01010101;
  98         *(uint32_t*)(p + 0*stride)= v;
  99         if(h==1) return;
 100         *(uint32_t*)(p + 1*stride)= v;
 101         if(h==2) return;
 102         *(uint32_t*)(p + 2*stride)=
 103         *(uint32_t*)(p + 3*stride)= v;
 104     }else if(w==8){
 105     //gcc can't optimize 64bit math on x86_32
 106 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 107         const uint64_t v= val*0x0100000001ULL;
 108         *(uint64_t*)(p + 0*stride)= v;
 109         if(h==1) return;
 110         *(uint64_t*)(p + 1*stride)= v;
 111         if(h==2) return;
 112         *(uint64_t*)(p + 2*stride)=
 113         *(uint64_t*)(p + 3*stride)= v;
 114     }else if(w==16){
 115         const uint64_t v= val*0x0100000001ULL;
 116         *(uint64_t*)(p + 0+0*stride)=
 117         *(uint64_t*)(p + 8+0*stride)=
 118         *(uint64_t*)(p + 0+1*stride)=
 119         *(uint64_t*)(p + 8+1*stride)= v;
 120         if(h==2) return;
 121         *(uint64_t*)(p + 0+2*stride)=
 122         *(uint64_t*)(p + 8+2*stride)=
 123         *(uint64_t*)(p + 0+3*stride)=
 124         *(uint64_t*)(p + 8+3*stride)= v;
 125 #else
 126         *(uint32_t*)(p + 0+0*stride)=
 127         *(uint32_t*)(p + 4+0*stride)= val;
 128         if(h==1) return;
 129         *(uint32_t*)(p + 0+1*stride)=
 130         *(uint32_t*)(p + 4+1*stride)= val;
 131         if(h==2) return;
 132         *(uint32_t*)(p + 0+2*stride)=
 133         *(uint32_t*)(p + 4+2*stride)=
 134         *(uint32_t*)(p + 0+3*stride)=
 135         *(uint32_t*)(p + 4+3*stride)= val;
 136     }else if(w==16){
 137         *(uint32_t*)(p + 0+0*stride)=
 138         *(uint32_t*)(p + 4+0*stride)=
 139         *(uint32_t*)(p + 8+0*stride)=
 140         *(uint32_t*)(p +12+0*stride)=
 141         *(uint32_t*)(p + 0+1*stride)=
 142         *(uint32_t*)(p + 4+1*stride)=
 143         *(uint32_t*)(p + 8+1*stride)=
 144         *(uint32_t*)(p +12+1*stride)= val;
 145         if(h==2) return;
 146         *(uint32_t*)(p + 0+2*stride)=
 147         *(uint32_t*)(p + 4+2*stride)=
 148         *(uint32_t*)(p + 8+2*stride)=
 149         *(uint32_t*)(p +12+2*stride)=
 150         *(uint32_t*)(p + 0+3*stride)=
 151         *(uint32_t*)(p + 4+3*stride)=
 152         *(uint32_t*)(p + 8+3*stride)=
 153         *(uint32_t*)(p +12+3*stride)= val;
 154 #endif
 155     }else
 156         assert(0);
 157     assert(h==4);
 158 }
 159
 /**
  * Loads data of the neighbouring macroblocks into the per-macroblock caches of h.
  *
  * In order, the function: computes the indices of the top, topleft, topright
  * and left neighbours (adjusted in MBAFF frames), stores them in top_mb_xy /
  * left_mb_xy, determines the neighbour types, and then fills the
  * *_samples_available masks, intra4x4_pred_mode_cache, non_zero_count_cache,
  * top_cbp / left_cbp (CABAC only), mv_cache / ref_cache, mvd_cache and
  * direct_cache (CABAC only) and finally neighbor_transform_size.
  *
  * @param h           decoder context; its caches are overwritten
  * @param mb_type     type flags of the current macroblock (tested with IS_INTRA,
  *                    IS_INTER, IS_DIRECT, ...)
  * @param for_deblock non-zero for the deblocking variant: top/left neighbours are
  *                    then used whenever their slice_table entry is < 255 instead of
  *                    only when it equals h->slice_num, and the function returns
  *                    immediately when FRAME_MBAFF is not set and either
  *                    slice_num == 1 or the macroblock above has the same
  *                    slice_table entry as the current one
  */
 160 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 161     MpegEncContext * const s = &h->s;
 162     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 163     int topleft_xy, top_xy, topright_xy, left_xy[2];
 164     int topleft_type, top_type, topright_type, left_type[2];
 165     int left_block[8];
 166     int i;
 167
 168     //FIXME deblocking could skip the intra and nnz parts.
 169     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 170         return;
 171
 172     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 173
         /* Default (non-MBAFF) neighbours: the row above and the macroblock to the left. */
 174     top_xy     = mb_xy  - s->mb_stride;
 175     topleft_xy = top_xy - 1;
 176     topright_xy= top_xy + 1;
 177     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[] holds indices into the left neighbours' per-macroblock arrays
          * (see the reads through left_block[...] below); entries 4..7 are only used
          * for non_zero_count. */
 178     left_block[0]= 0;
 179     left_block[1]= 1;
 180     left_block[2]= 2;
 181     left_block[3]= 3;
 182     left_block[4]= 7;
 183     left_block[5]= 10;
 184     left_block[6]= 8;
 185     left_block[7]= 11;
         /* MBAFF: neighbours are located relative to the macroblock pair (mb_y & ~1);
          * the indices and left_block[] are adjusted according to the interlaced
          * flags of the current macroblock and of the neighbouring pairs. */
 186     if(FRAME_MBAFF){
 187         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 188         const int top_pair_xy      = pair_xy     - s->mb_stride;
 189         const int topleft_pair_xy  = top_pair_xy - 1;
 190         const int topright_pair_xy = top_pair_xy + 1;
 191         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 192         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 193         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 194         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 195         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 196         const int bottom = (s->mb_y & 1);
 197         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 198         if (bottom
 199                 ? !curr_mb_frame_flag // bottom macroblock
 200                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 201                 ) {
 202             top_xy -= s->mb_stride;
 203         }
 204         if (bottom
 205                 ? !curr_mb_frame_flag // bottom macroblock
 206                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 207                 ) {
 208             topleft_xy -= s->mb_stride;
 209         }
 210         if (bottom
 211                 ? !curr_mb_frame_flag // bottom macroblock
 212                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 213                 ) {
 214             topright_xy -= s->mb_stride;
 215         }
             /* current and left pair differ in frame/field coding: remap the left neighbour */
 216         if (left_mb_frame_flag != curr_mb_frame_flag) {
 217             left_xy[1] = left_xy[0] = pair_xy - 1;
 218             if (curr_mb_frame_flag) {
 219                 if (bottom) {
 220                     left_block[0]= 2;
 221                     left_block[1]= 2;
 222                     left_block[2]= 3;
 223                     left_block[3]= 3;
 224                     left_block[4]= 8;
 225                     left_block[5]= 11;
 226                     left_block[6]= 8;
 227                     left_block[7]= 11;
 228                 } else {
 229                     left_block[0]= 0;
 230                     left_block[1]= 0;
 231                     left_block[2]= 1;
 232                     left_block[3]= 1;
 233                     left_block[4]= 7;
 234                     left_block[5]= 10;
 235                     left_block[6]= 7;
 236                     left_block[7]= 10;
 237                 }
 238             } else {
 239                 left_xy[1] += s->mb_stride;
 240                 //left_block[0]= 0;
 241                 left_block[1]= 2;
 242                 left_block[2]= 0;
 243                 left_block[3]= 2;
 244                 //left_block[4]= 7;
 245                 left_block[5]= 10;
 246                 left_block[6]= 7;
 247                 left_block[7]= 10;
 248             }
 249         }
 250     }
 251
 252     h->top_mb_xy = top_xy;
 253     h->left_mb_xy[0] = left_xy[0];
 254     h->left_mb_xy[1] = left_xy[1];
         /* Deblocking call: a top/left neighbour is used whenever its slice_table
          * entry is < 255; the topleft/topright types are forced to 0. */
 255     if(for_deblock){
 256         topleft_type = 0;
 257         topright_type = 0;
 258         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 259         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 260         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 261
             /* Non-intra macroblock in an MBAFF frame: reload the current macroblock's
              * own data into the caches (one nnz bit per block from the 16-bit word at
              * non_zero_count[mb_xy][14], plus motion vectors and reference indices). */
 262         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 263             int list;
 264             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 265             for(i=0; i<16; i++)
 266                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 267             for(list=0; list<h->list_count; list++){
 268                 if(USES_LIST(mb_type,list)){
 269                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 270                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 271                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 272                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 273                         dst[0] = src[0];
 274                         dst[1] = src[1];
 275                         dst[2] = src[2];
 276                         dst[3] = src[3];
 277                     }
 278                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 279                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 280                     ref += h->b8_stride;
 281                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 282                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 283                 }else{
 284                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 285                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 286                 }
 287             }
 288         }
 289     }else{
             /* Decoding call: a neighbour is used only if its slice_table entry equals
              * the current slice_num; otherwise its type is 0. */
 290         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 291         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 292         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 293         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 294         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 295     }
 296
         /* Intra macroblock: build the *_samples_available bit masks and, for
          * intra 4x4, load the prediction modes of the neighbours. */
 297     if(IS_INTRA(mb_type)){
 298         h->topleft_samples_available=
 299         h->top_samples_available=
 300         h->left_samples_available= 0xFFFF;
 301         h->topright_samples_available= 0xEEEA;
 302
             /* A neighbour is treated as unavailable when it is not intra and is either
              * missing (type 0) or constrained_intra_pred is set. */
 303         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 304             h->topleft_samples_available= 0xB3FF;
 305             h->top_samples_available= 0x33FF;
 306             h->topright_samples_available= 0x26EA;
 307         }
 308         for(i=0; i<2; i++){
 309             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 310                 h->topleft_samples_available&= 0xDF5F;
 311                 h->left_samples_available&= 0x5F5F;
 312             }
 313         }
 314
 315         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 316             h->topleft_samples_available&= 0x7FFF;
 317
 318         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 319             h->topright_samples_available&= 0xFBFF;
 320
 321         if(IS_INTRA4x4(mb_type)){
 322             if(IS_INTRA4x4(top_type)){
 323                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 324                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 325                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 326                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 327             }else{
                     /* top neighbour is not intra 4x4: use -1 if it is missing or (inter and
                      * constrained_intra_pred), otherwise 2 */
 328                 int pred;
 329                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 330                     pred= -1;
 331                 else{
 332                     pred= 2;
 333                 }
 334                 h->intra4x4_pred_mode_cache[4+8*0]=
 335                 h->intra4x4_pred_mode_cache[5+8*0]=
 336                 h->intra4x4_pred_mode_cache[6+8*0]=
 337                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 338             }
 339             for(i=0; i<2; i++){
 340                 if(IS_INTRA4x4(left_type[i])){
 341                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 342                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 343                 }else{
 344                     int pred;
 345                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 346                         pred= -1;
 347                     else{
 348                         pred= 2;
 349                     }
 350                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 351                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 352                 }
 353             }
 354         }
 355     }
 356
 357
 358 /*
 359 0 . T T. T T T T
 360 1 L . .L . . . .
 361 2 L . .L . . . .
 362 3 . T TL . . . .
 363 4 L . .L . . . .
 364 5 L . .. . . . .
 365 */
 366 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* non_zero_count entries of the top neighbour; if it is missing the cache gets
          * 0 for non-intra macroblocks under CABAC and 64 otherwise. */
 367     if(top_type){
 368         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 369         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 370         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 371         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 372
 373         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 374         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 375
 376         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 377         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 378
 379     }else{
 380         h->non_zero_count_cache[4+8*0]=
 381         h->non_zero_count_cache[5+8*0]=
 382         h->non_zero_count_cache[6+8*0]=
 383         h->non_zero_count_cache[7+8*0]=
 384
 385         h->non_zero_count_cache[1+8*0]=
 386         h->non_zero_count_cache[2+8*0]=
 387
 388         h->non_zero_count_cache[1+8*3]=
 389         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 390
 391     }
 392
         /* Same for the two left neighbours; left_block[] selects which of the
          * neighbour's entries are copied. */
 393     for (i=0; i<2; i++) {
 394         if(left_type[i]){
 395             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 396             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 397             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 398             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 399         }else{
 400             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 401             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 402             h->non_zero_count_cache[0+8*1 +   8*i]=
 403             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 404         }
 405     }
 406
         /* CABAC only: cbp_table values of the top and left neighbours; when a
          * neighbour is missing, 0x1C0 is used for intra macroblocks and 0 otherwise. */
 407     if( h->pps.cabac ) {
 408         // top_cbp
 409         if(top_type) {
 410             h->top_cbp = h->cbp_table[top_xy];
 411         } else if(IS_INTRA(mb_type)) {
 412             h->top_cbp = 0x1C0;
 413         } else {
 414             h->top_cbp = 0;
 415         }
 416         // left_cbp
 417         if (left_type[0]) {
 418             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 419         } else if(IS_INTRA(mb_type)) {
 420             h->left_cbp = 0x1C0;
 421         } else {
 422             h->left_cbp = 0;
 423         }
 424         if (left_type[0]) {
 425             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 426         }
 427         if (left_type[1]) {
 428             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 429         }
 430     }
 431
 432 #if 1
         /* Inter or direct macroblock: for each list, load the neighbouring motion
          * vectors and reference indices (and, with CABAC, mvd and direct flags). */
 433     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 434         int list;
 435         for(list=0; list<h->list_count; list++){
 436             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 437                 /*if(!h->mv_cache_clean[list]){
 438                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 439                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 440                     h->mv_cache_clean[list]= 1;
 441                 }*/
 442                 continue;
 443             }
 444             h->mv_cache_clean[list]= 0;
 445
                 /* top neighbour: its bottom row of motion vectors (b_xy + 3*b_stride) */
 446             if(USES_LIST(top_type, list)){
 447                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 448                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 449                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 450                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 451                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 452                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 453                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 454                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 455                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 456                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 457             }else{
 458                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 459                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 460                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 461                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 462                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 463             }
 464
                 /* left neighbours: their right-most column (b_xy + 3), rows chosen by left_block[] */
 465             for(i=0; i<2; i++){
 466                 int cache_idx = scan8[0] - 1 + i*2*8;
 467                 if(USES_LIST(left_type[i], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 469                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 470                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 471                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 472                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 473                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 474                 }else{
 475                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 476                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 477                     h->ref_cache[list][cache_idx  ]=
 478                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 479                 }
 480             }
 481
                 /* Outside MBAFF frames the remaining entries are skipped for deblocking
                  * calls and for direct macroblocks when direct_spatial_mv_pred is 0. */
 482             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 483                 continue;
 484
 485             if(USES_LIST(topleft_type, list)){
 486                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 487                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 488                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 489                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 490             }else{
 491                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 492                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 493             }
 494
 495             if(USES_LIST(topright_type, list)){
 496                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 497                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 498                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 499                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 500             }else{
 501                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 502                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 503             }
 504
                 /* Skip and direct macroblocks need nothing further outside MBAFF frames. */
 505             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 506                 continue;
 507
 508             h->ref_cache[list][scan8[5 ]+1] =
 509             h->ref_cache[list][scan8[7 ]+1] =
 510             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 511             h->ref_cache[list][scan8[4 ]] =
 512             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 513             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 514             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 515             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 516             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 517             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 518
 519             if( h->pps.cabac ) {
 520                 /* XXX beurk, Load mvd */
 521                 if(USES_LIST(top_type, list)){
 522                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 523                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 524                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 525                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 526                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 527                 }else{
 528                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 529                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 530                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 531                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 532                 }
 533                 if(USES_LIST(left_type[0], list)){
 534                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 535                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 536                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 537                 }else{
 538                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 539                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 540                 }
 541                 if(USES_LIST(left_type[1], list)){
 542                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 543                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 544                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 545                 }else{
 546                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 547                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 548                 }
 549                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 550                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 551                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 552                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 553                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 554
                     /* B slices: direct_cache is cleared for the current macroblock and then
                      * gets the direct flags of the top and left neighbours. */
 555                 if(h->slice_type == B_TYPE){
 556                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 557
 558                     if(IS_DIRECT(top_type)){
 559                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 560                     }else if(IS_8X8(top_type)){
 561                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 562                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 563                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 564                     }else{
 565                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 566                     }
 567
 568                     if(IS_DIRECT(left_type[0]))
 569                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 570                     else if(IS_8X8(left_type[0]))
 571                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 572                     else
 573                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 574
 575                     if(IS_DIRECT(left_type[1]))
 576                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 577                     else if(IS_8X8(left_type[1]))
 578                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 579                     else
 580                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 581                 }
 582             }
 583
                 /* MBAFF: convert the cached neighbour entries between frame and field
                  * macroblocks. For a field macroblock, entries coming from non-interlaced
                  * neighbours get their ref index doubled and component [1] of mv/mvd
                  * halved; otherwise entries from interlaced neighbours get the ref index
                  * halved and component [1] doubled. Entries with a negative ref are left
                  * alone. */
 584             if(FRAME_MBAFF){
 585 #define MAP_MVS\
 586                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 587                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 588                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 589                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 590                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 591                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 592                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 593                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 594                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 595                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 596                 if(MB_FIELD){
 597 #define MAP_F2F(idx, mb_type)\
 598                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 599                         h->ref_cache[list][idx] <<= 1;\
 600                         h->mv_cache[list][idx][1] /= 2;\
 601                         h->mvd_cache[list][idx][1] /= 2;\
 602                     }
 603                     MAP_MVS
 604 #undef MAP_F2F
 605                 }else{
 606 #define MAP_F2F(idx, mb_type)\
 607                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 608                         h->ref_cache[list][idx] >>= 1;\
 609                         h->mv_cache[list][idx][1] <<= 1;\
 610                         h->mvd_cache[list][idx][1] <<= 1;\
 611                     }
 612                     MAP_MVS
 613 #undef MAP_F2F
 614                 }
 615             }
 616         }
 617     }
 618 #endif
 619
         /* number (0..2) of the top and first left neighbours that have the 8x8 DCT flag set */
 620     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 621 }
 622
 623 static inline void write_back_intra_pred_mode(H264Context *h){
 624     MpegEncContext * const s = &h->s;
 625     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 626
 627     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 628     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 629     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 630     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 631     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 632     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 633     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 634 }
 635
 636 /**
 637  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 638  */
 639 static inline int check_intra4x4_pred_mode(H264Context *h){
 640     MpegEncContext * const s = &h->s;
 641     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 642     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 643     int i;
 644
 645     if(!(h->top_samples_available&0x8000)){
 646         for(i=0; i<4; i++){
 647             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 648             if(status<0){
 649                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 650                 return -1;
 651             } else if(status){
 652                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 653             }
 654         }
 655     }
 656
 657     if(!(h->left_samples_available&0x8000)){
 658         for(i=0; i<4; i++){
 659             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 660             if(status<0){
 661                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 662                 return -1;
 663             } else if(status){
 664                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 665             }
 666         }
 667     }
 668
 669     return 0;
 670 } //FIXME cleanup like next
 671
 672 /**
 673  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 674  */
 675 static inline int check_intra_pred_mode(H264Context *h, int mode){
 676     MpegEncContext * const s = &h->s;
 677     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 678     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 679
 680     if(mode > 6U) {
 681         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 682         return -1;
 683     }
 684
 685     if(!(h->top_samples_available&0x8000)){
 686         mode= top[ mode ];
 687         if(mode<0){
 688             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 689             return -1;
 690         }
 691     }
 692
 693     if(!(h->left_samples_available&0x8000)){
 694         mode= left[ mode ];
 695         if(mode<0){
 696             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 697             return -1;
 698         }
 699     }
 700
 701     return mode;
 702 }
 703
 704 /**
 705  * gets the predicted intra4x4 prediction mode.
 706  */
 707 static inline int pred_intra_mode(H264Context *h, int n){
 708     const int index8= scan8[n];
 709     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 710     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 711     const int min= FFMIN(left, top);
 712
 713     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 714
 715     if(min<0) return DC_PRED;
 716     else      return min;
 717 }
 718
 719 static inline void write_back_non_zero_count(H264Context *h){
 720     MpegEncContext * const s = &h->s;
 721     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 722
 723     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 724     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 725     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 726     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 727     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 728     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 729     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 730
 731     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 732     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 733     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 734
 735     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 736     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 737     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 738
 739     if(FRAME_MBAFF){
 740         // store all luma nnzs, for deblocking
 741         int v = 0, i;
 742         for(i=0; i<16; i++)
 743             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 744         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 745     }
 746 }
 747
 748 /**
 749  * gets the predicted number of non zero coefficients.
 750  * @param n block index
 751  */
 752 static inline int pred_non_zero_count(H264Context *h, int n){
 753     const int index8= scan8[n];
 754     const int left= h->non_zero_count_cache[index8 - 1];
 755     const int top = h->non_zero_count_cache[index8 - 8];
 756     int i= left + top;
 757
 758     if(i<64) i= (i+1)>>1;
 759
 760     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 761
 762     return i&31;
 763 }
 764
 765 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
     /*
      * Fetches the diagonal neighbour used as a motion vector candidate.
      *   C           receives a pointer to the selected motion vector
      *   i           index into the caches (the callers in this file pass
      *               scan8[] values), not a block number
      *   part_width  offset, in cache entries, from i to the top right
      *               candidate in the row above
      * Returns the reference index that belongs to *C.
      *
      * The preferred candidate is the cache entry at i - 8 + part_width; if
      * that one is PART_NOT_AVAILABLE the entry at i - 8 - 1 is used instead.
      */
 766     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 767     MpegEncContext *s = &h->s;
 768
 769     /* there is no consistent mapping of mvs to neighboring locations that will
 770      * make mbaff happy, so we can't move all this logic to fill_caches */
 771     if(FRAME_MBAFF){
 772         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 773         const int16_t *mv;
         /* cache entry scan8[0]-2 serves as scratch slot for the mv built by
          * SET_DIAG_MV; it is zeroed first, so *C refers to a zero mv when
          * the macro returns LIST_NOT_USED before writing it */
 774         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 775         *C = h->mv_cache[list][scan8[0]-2];
 776
 777         if(!MB_FIELD
 778            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 779             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 780             if(IS_INTERLACED(mb_types[topright_xy])){
                 /* SET_DIAG_MV reads the mv at 4x4 block position (X4,Y4) of
                  * the current picture, applies MV_OP to its vertical
                  * component, stores it in the scratch slot and returns the
                  * matching reference index with REF_OP applied.
                  * Note that it returns from fetch_diagonal_mv() itself. */
 781 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 782                 const int x4 = X4, y4 = Y4;\
 783                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 784                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 785                     return LIST_NOT_USED;\
 786                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 787                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 788                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 789                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 790
 791                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 792             }
 793         }
         /* top right is unavailable and the block sits in cache column 4:
          * take the candidate from the left macroblock when its
          * interlacing differs from that of the current macroblock */
 794         if(topright_ref == PART_NOT_AVAILABLE
 795            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 796            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 797             if(!MB_FIELD
 798                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 799                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 800             }
 801             if(MB_FIELD
 802                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 803                && i >= scan8[0]+8){
 804                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 805                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 806             }
 807         }
 808 #undef SET_DIAG_MV
 809     }
 810
     /* default path: candidates taken directly from the caches */
 811     if(topright_ref != PART_NOT_AVAILABLE){
 812         *C= h->mv_cache[list][ i - 8 + part_width ];
 813         return topright_ref;
 814     }else{
 815         tprintf(s->avctx, "topright MV not available\n");
 816
 817         *C= h->mv_cache[list][ i - 8 - 1 ];
 818         return h->ref_cache[list][ i - 8 - 1 ];
 819     }
 820 }
 821
 822 /**
 823  * gets the predicted MV.
 824  * @param n the block index
 825  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 826  * @param mx the x component of the predicted motion vector
 827  * @param my the y component of the predicted motion vector
 828  */
 829 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 830     const int index8= scan8[n];
 831     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 832     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 833     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 834     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 835     const int16_t * C;
 836     int diagonal_ref, match_count;
 837
 838     assert(part_width==1 || part_width==2 || part_width==4);
 839
 840 /* mv_cache
 841   B . . A T T T T
 842   U . . L . . , .
 843   U . . L . . . .
 844   U . . L . . , .
 845   . . . L . . . .
 846 */
 847
 848     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 849     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 850     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 851     if(match_count > 1){ //most common
 852         *mx= mid_pred(A[0], B[0], C[0]);
 853         *my= mid_pred(A[1], B[1], C[1]);
 854     }else if(match_count==1){
 855         if(left_ref==ref){
 856             *mx= A[0];
 857             *my= A[1];
 858         }else if(top_ref==ref){
 859             *mx= B[0];
 860             *my= B[1];
 861         }else{
 862             *mx= C[0];
 863             *my= C[1];
 864         }
 865     }else{
 866         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 867             *mx= A[0];
 868             *my= A[1];
 869         }else{
 870             *mx= mid_pred(A[0], B[0], C[0]);
 871             *my= mid_pred(A[1], B[1], C[1]);
 872         }
 873     }
 874
 875     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 876 }
 877
 878 /**
 879  * gets the directionally predicted 16x8 MV.
 880  * @param n the block index
 881  * @param mx the x component of the predicted motion vector
 882  * @param my the y component of the predicted motion vector
 883  */
 884 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 885     if(n==0){
 886         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 887         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 888
 889         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 890
 891         if(top_ref == ref){
 892             *mx= B[0];
 893             *my= B[1];
 894             return;
 895         }
 896     }else{
 897         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 898         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 899
 900         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 901
 902         if(left_ref == ref){
 903             *mx= A[0];
 904             *my= A[1];
 905             return;
 906         }
 907     }
 908
 909     //RARE
 910     pred_motion(h, n, 4, list, ref, mx, my);
 911 }
 912
 913 /**
 914  * gets the directionally predicted 8x16 MV.
 915  * @param n the block index
 916  * @param mx the x component of the predicted motion vector
 917  * @param my the y component of the predicted motion vector
 918  */
 919 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 920     if(n==0){
 921         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 922         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 923
 924         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 925
 926         if(left_ref == ref){
 927             *mx= A[0];
 928             *my= A[1];
 929             return;
 930         }
 931     }else{
 932         const int16_t * C;
 933         int diagonal_ref;
 934
 935         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 936
 937         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 938
 939         if(diagonal_ref == ref){
 940             *mx= C[0];
 941             *my= C[1];
 942             return;
 943         }
 944     }
 945
 946     //RARE
 947     pred_motion(h, n, 2, list, ref, mx, my);
 948 }
 949
 950 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 951     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 952     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 953
 954     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 955
 956     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 957        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 958        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 959
 960         *mx = *my = 0;
 961         return;
 962     }
 963
 964     pred_motion(h, 0, 4, 0, 0, mx, my);
 965
 966     return;
 967 }
 968
 969 static inline void direct_dist_scale_factor(H264Context * const h){
 970     const int poc = h->s.current_picture_ptr->poc;
 971     const int poc1 = h->ref_list[1][0].poc;
 972     int i;
 973     for(i=0; i<h->ref_count[0]; i++){
 974         int poc0 = h->ref_list[0][i].poc;
 975         int td = av_clip(poc1 - poc0, -128, 127);
 976         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 977             h->dist_scale_factor[i] = 256;
 978         }else{
 979             int tb = av_clip(poc - poc0, -128, 127);
 980             int tx = (16384 + (FFABS(td) >> 1)) / td;
 981             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 982         }
 983     }
 984     if(FRAME_MBAFF){
 985         for(i=0; i<h->ref_count[0]; i++){
 986             h->dist_scale_factor_field[2*i] =
 987             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 988         }
 989     }
 990 }
 991 static inline void direct_ref_list_init(H264Context * const h){
 992     MpegEncContext * const s = &h->s;
 993     Picture * const ref1 = &h->ref_list[1][0];
 994     Picture * const cur = s->current_picture_ptr;
 995     int list, i, j;
 996     if(cur->pict_type == I_TYPE)
 997         cur->ref_count[0] = 0;
 998     if(cur->pict_type != B_TYPE)
 999         cur->ref_count[1] = 0;
1000     for(list=0; list<2; list++){
1001         cur->ref_count[list] = h->ref_count[list];
1002         for(j=0; j<h->ref_count[list]; j++)
1003             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1004     }
1005     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1006         return;
1007     for(list=0; list<2; list++){
1008         for(i=0; i<ref1->ref_count[list]; i++){
1009             const int poc = ref1->ref_poc[list][i];
1010             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1011             for(j=0; j<h->ref_count[list]; j++)
1012                 if(h->ref_list[list][j].poc == poc){
1013                     h->map_col_to_list0[list][i] = j;
1014                     break;
1015                 }
1016         }
1017     }
1018     if(FRAME_MBAFF){
1019         for(list=0; list<2; list++){
1020             for(i=0; i<ref1->ref_count[list]; i++){
1021                 j = h->map_col_to_list0[list][i];
1022                 h->map_col_to_list0_field[list][2*i] = 2*j;
1023                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1024             }
1025         }
1026     }
1027 }
1028
1029 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
     /*
      * Derives the motion vectors and reference indices of a direct predicted
      * macroblock, or of its direct 8x8 sub blocks when *mb_type is an 8x8
      * type, and stores them in h->mv_cache and h->ref_cache.
      * h->sub_mb_type[] is updated for every sub block handled here.
      *
      * Spatial prediction is used if h->direct_spatial_mv_pred is set,
      * otherwise temporal prediction from the data of h->ref_list[1][0] at
      * the position of the current macroblock.
      *
      * mb_type is in/out: on entry only IS_8X8(*mb_type) is examined, on
      * return it holds the partitioning and list flags chosen here.
      */
1030     MpegEncContext * const s = &h->s;
1031     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1032     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1033     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1034     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1035     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1036     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1037     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1038     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1039     const int is_b8x8 = IS_8X8(*mb_type);
1040     unsigned int sub_mb_type;
1041     int i8, i4;
1042
1043 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
     /* choose the partitioning from the type of the colocated macroblock;
      * every branch overwrites *mb_type */
1044     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1045         /* FIXME save sub mb types from previous frames (or derive from MVs)
1046          * so we know exactly what block size to use */
1047         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1048         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1049     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1050         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1051         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1052     }else{
1053         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1054         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1055     }
1056     if(!is_b8x8)
1057         *mb_type |= MB_TYPE_DIRECT2;
1058     if(MB_FIELD)
1059         *mb_type |= MB_TYPE_INTERLACED;
1060
1061     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1062
1063     if(h->direct_spatial_mv_pred){
1064         int ref[2];
1065         int mv[2][2];
1066         int list;
1067
1068         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1069
1070         /* ref = min(neighbors) */
1071         for(list=0; list<2; list++){
1072             int refa = h->ref_cache[list][scan8[0] - 1];
1073             int refb = h->ref_cache[list][scan8[0] - 8];
1074             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
             /* NOTE(review): -2 is presumably PART_NOT_AVAILABLE (confirm);
              * the neighbour at scan8[0] - 8 - 1 is used in that case */
1075             if(refc == -2)
1076                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
             /* smallest non negative value of refa, refb, refc; -1 if all
              * of them are negative */
1077             ref[list] = refa;
1078             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1079                 ref[list] = refb;
1080             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1081                 ref[list] = refc;
1082             if(ref[list] < 0)
1083                 ref[list] = -1;
1084         }
1085
         /* no usable neighbour reference in either list: reference 0 with
          * zero vectors for both lists */
1086         if(ref[0] < 0 && ref[1] < 0){
1087             ref[0] = ref[1] = 0;
1088             mv[0][0] = mv[0][1] =
1089             mv[1][0] = mv[1][1] = 0;
1090         }else{
1091             for(list=0; list<2; list++){
1092                 if(ref[list] >= 0)
1093                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1094                 else
1095                     mv[list][0] = mv[list][1] = 0;
1096             }
1097         }
1098
         /* remove the list without a reference from the type flags */
1099         if(ref[1] < 0){
1100             *mb_type &= ~MB_TYPE_P0L1;
1101             sub_mb_type &= ~MB_TYPE_P0L1;
1102         }else if(ref[0] < 0){
1103             *mb_type &= ~MB_TYPE_P0L0;
1104             sub_mb_type &= ~MB_TYPE_P0L0;
1105         }
1106
1107         if(IS_16X16(*mb_type)){
1108             int a=0, b=0;
1109
1110             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1111             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
             /* colocated block is not intra, uses reference 0 and has a
              * vector within +-1: the predicted vector is only kept for
              * lists whose reference is > 0, otherwise a/b stay 0.
              * The l1ref1 variant of the test is only applied when
              * h->x264_build is 0 or > 33. */
1112             if(!IS_INTRA(mb_type_col)
1113                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1114                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1115                        && (h->x264_build>33 || !h->x264_build)))){
1116                 if(ref[0] > 0)
1117                     a= pack16to32(mv[0][0],mv[0][1]);
1118                 if(ref[1] > 0)
1119                     b= pack16to32(mv[1][0],mv[1][1]);
1120             }else{
1121                 a= pack16to32(mv[0][0],mv[0][1]);
1122                 b= pack16to32(mv[1][0],mv[1][1]);
1123             }
1124             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1125             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1126         }else{
1127             for(i8=0; i8<4; i8++){
1128                 const int x8 = i8&1;
1129                 const int y8 = i8>>1;
1130
                 /* in an 8x8 macroblock only the direct sub blocks are handled here */
1131                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1132                     continue;
1133                 h->sub_mb_type[i8] = sub_mb_type;
1134
1135                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1136                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1137                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1138                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1139
1140                 /* col_zero_flag */
1141                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1142                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1143                                                   && (h->x264_build>33 || !h->x264_build)))){
1144                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1145                     if(IS_SUB_8X8(sub_mb_type)){
                         /* a single colocated vector, at offset 3*x8, 3*y8,
                          * decides for the whole 8x8 block */
1146                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1147                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1148                             if(ref[0] == 0)
1149                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1150                             if(ref[1] == 0)
1151                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1152                         }
1153                     }else
1154                     for(i4=0; i4<4; i4++){
1155                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1156                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1157                             if(ref[0] == 0)
1158                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1159                             if(ref[1] == 0)
1160                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1161                         }
1162                     }
1163                 }
1164             }
1165         }
1166     }else{ /* direct temporal mv pred */
1167         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1168         const int *dist_scale_factor = h->dist_scale_factor;
1169
1170         if(FRAME_MBAFF){
             /* interlaced macroblocks use the field variants of the tables */
1171             if(IS_INTERLACED(*mb_type)){
1172                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1173                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1174                 dist_scale_factor = h->dist_scale_factor_field;
1175             }
             /* current and colocated macroblock differ in interlacing: the
              * colocated data is read from the macroblock pair and vertical
              * positions and vectors are rescaled; this branch returns */
1176             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1177                 /* FIXME assumes direct_8x8_inference == 1 */
1178                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1179                 int mb_types_col[2];
1180                 int y_shift;
1181
1182                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1183                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1184                          | (*mb_type & MB_TYPE_INTERLACED);
1185                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1186
1187                 if(IS_INTERLACED(*mb_type)){
1188                     /* frame to field scaling */
1189                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1190                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                     /* for the bottom macroblock move the pointers back to
                      * the top macroblock of the pair */
1191                     if(s->mb_y&1){
1192                         l1ref0 -= 2*h->b8_stride;
1193                         l1ref1 -= 2*h->b8_stride;
1194                         l1mv0 -= 4*h->b_stride;
1195                         l1mv1 -= 4*h->b_stride;
1196                     }
1197                     y_shift = 0;
1198
1199                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1200                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1201                        && !is_b8x8)
1202                         *mb_type |= MB_TYPE_16x8;
1203                     else
1204                         *mb_type |= MB_TYPE_8x8;
1205                 }else{
1206                     /* field to frame scaling */
1207                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1208                      * but in MBAFF, top and bottom POC are equal */
1209                     int dy = (s->mb_y&1) ? 1 : 2;
1210                     mb_types_col[0] =
1211                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1212                     l1ref0 += dy*h->b8_stride;
1213                     l1ref1 += dy*h->b8_stride;
1214                     l1mv0 += 2*dy*h->b_stride;
1215                     l1mv1 += 2*dy*h->b_stride;
1216                     y_shift = 2;
1217
1218                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1219                        && !is_b8x8)
1220                         *mb_type |= MB_TYPE_16x16;
1221                     else
1222                         *mb_type |= MB_TYPE_8x8;
1223                 }
1224
1225                 for(i8=0; i8<4; i8++){
1226                     const int x8 = i8&1;
1227                     const int y8 = i8>>1;
1228                     int ref0, scale;
1229                     const int16_t (*l1mv)[2]= l1mv0;
1230
1231                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1232                         continue;
1233                     h->sub_mb_type[i8] = sub_mb_type;
1234
1235                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                     /* intra colocated macroblock: reference 0 and zero
                      * vectors in both lists */
1236                     if(IS_INTRA(mb_types_col[y8])){
1237                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1238                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1239                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1240                         continue;
1241                     }
1242
                     /* use the list 0 reference of the colocated block if
                      * it has one, otherwise its list 1 reference and
                      * vectors */
1243                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1244                     if(ref0 >= 0)
1245                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1246                     else{
1247                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1248                         l1mv= l1mv1;
1249                     }
1250                     scale = dist_scale_factor[ref0];
1251                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1252
1253                     {
1254                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                         /* vertical component halved (y_shift 0) or doubled
                          * (y_shift 2) before scaling */
1255                         int my_col = (mv_col[1]<<y_shift)/2;
1256                         int mx = (scale * mv_col[0] + 128) >> 8;
1257                         int my = (scale * my_col + 128) >> 8;
1258                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1259                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1260                     }
1261                 }
1262                 return;
1263             }
1264         }
1265
1266         /* one-to-one mv scaling */
         /* list 0 vector = (scale * mv_col + 128) >> 8,
          * list 1 vector = list 0 vector - mv_col; list 1 reference is 0 */
1267
1268         if(IS_16X16(*mb_type)){
1269             int ref, mv0, mv1;
1270
1271             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1272             if(IS_INTRA(mb_type_col)){
1273                 ref=mv0=mv1=0;
1274             }else{
1275                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1276                                                 : map_col_to_list0[1][l1ref1[0]];
1277                 const int scale = dist_scale_factor[ref0];
1278                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1279                 int mv_l0[2];
1280                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1281                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1282                 ref= ref0;
1283                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1284                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1285             }
1286             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1287             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1288             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1289         }else{
1290             for(i8=0; i8<4; i8++){
1291                 const int x8 = i8&1;
1292                 const int y8 = i8>>1;
1293                 int ref0, scale;
1294                 const int16_t (*l1mv)[2]= l1mv0;
1295
1296                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1297                     continue;
1298                 h->sub_mb_type[i8] = sub_mb_type;
1299                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1300                 if(IS_INTRA(mb_type_col)){
1301                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1302                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1303                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1304                     continue;
1305                 }
1306
1307                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1308                 if(ref0 >= 0)
1309                     ref0 = map_col_to_list0[0][ref0];
1310                 else{
1311                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1312                     l1mv= l1mv1;
1313                 }
1314                 scale = dist_scale_factor[ref0];
1315
1316                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1317                 if(IS_SUB_8X8(sub_mb_type)){
                     /* one colocated vector, at offset 3*x8, 3*y8, for the
                      * whole 8x8 block */
1318                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1319                     int mx = (scale * mv_col[0] + 128) >> 8;
1320                     int my = (scale * mv_col[1] + 128) >> 8;
1321                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1322                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1323                 }else
1324                 for(i4=0; i4<4; i4++){
1325                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1326                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1327                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1328                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1329                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1330                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1331                 }
1332             }
1333         }
1334     }
1335 }
1336
1337 static inline void write_back_motion(H264Context *h, int mb_type){
1338     MpegEncContext * const s = &h->s;
1339     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1340     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1341     int list;
1342
1343     if(!USES_LIST(mb_type, 0))
1344         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1345
1346     for(list=0; list<h->list_count; list++){
1347         int y;
1348         if(!USES_LIST(mb_type, list))
1349             continue;
1350
1351         for(y=0; y<4; y++){
1352             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1353             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1354         }
1355         if( h->pps.cabac ) {
1356             if(IS_SKIP(mb_type))
1357                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1358             else
1359             for(y=0; y<4; y++){
1360                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1361                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1362             }
1363         }
1364
1365         {
1366             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1367             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1368             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1369             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1370             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1371         }
1372     }
1373
1374     if(h->slice_type == B_TYPE && h->pps.cabac){
1375         if(IS_8X8(mb_type)){
1376             uint8_t *direct_table = &h->direct_table[b8_xy];
1377             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1378             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1379             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1380         }
1381     }
1382 }
1383
1384 /**
1385  * Decodes a network abstraction layer unit.
1386  * @param consumed is the number of bytes used as input
1387  * @param length is the length of the array
1388  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1389  * @returns decoded bytes, might be src+1 if no escapes
1390  */
1391 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1392     int i, si, di;
1393     uint8_t *dst;
1394     int bufidx;
1395
1396 //    src[0]&0x80;                //forbidden bit
1397     h->nal_ref_idc= src[0]>>5;
1398     h->nal_unit_type= src[0]&0x1F;
1399
1400     src++; length--;
1401 #if 0
1402     for(i=0; i<length; i++)
1403         printf("%2X ", src[i]);
1404 #endif
1405     for(i=0; i+1<length; i+=2){
1406         if(src[i]) continue;
1407         if(i>0 && src[i-1]==0) i--;
1408         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1409             if(src[i+2]!=3){
1410                 /* startcode, so we must be past the end */
1411                 length=i;
1412             }
1413             break;
1414         }
1415     }
1416
1417     if(i>=length-1){ //no escaped 0
1418         *dst_length= length;
1419         *consumed= length+1; //+1 for the header
1420         return src;
1421     }
1422
1423     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1424     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1425     dst= h->rbsp_buffer[bufidx];
1426
1427     if (dst == NULL){
1428         return NULL;
1429     }
1430
1431 //printf("decoding esc\n");
1432     si=di=0;
1433     while(si<length){
1434         //remove escapes (very rare 1:2^22)
1435         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 break;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447
1448     *dst_length= di;
1449     *consumed= si + 1;//+1 for the header
1450 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1451     return dst;
1452 }
1453
1454 /**
1455  * identifies the exact end of the bitstream
1456  * @return the length of the trailing, or 0 if damaged
1457  */
1458 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1459     int v= *src;
1460     int r;
1461
1462     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1463
1464     for(r=1; r<9; r++){
1465         if(v&1) return r;
1466         v>>=1;
1467     }
1468     return 0;
1469 }
1470
1471 /**
1472  * idct tranforms the 16 dc values and dequantize them.
1473  * @param qp quantization parameter
1474  */
1475 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1476 #define stride 16
1477     int i;
1478     int temp[16]; //FIXME check if this is a good idea
1479     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1480     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1481
1482 //memset(block, 64, 2*256);
1483 //return;
1484     for(i=0; i<4; i++){
1485         const int offset= y_offset[i];
1486         const int z0= block[offset+stride*0] + block[offset+stride*4];
1487         const int z1= block[offset+stride*0] - block[offset+stride*4];
1488         const int z2= block[offset+stride*1] - block[offset+stride*5];
1489         const int z3= block[offset+stride*1] + block[offset+stride*5];
1490
1491         temp[4*i+0]= z0+z3;
1492         temp[4*i+1]= z1+z2;
1493         temp[4*i+2]= z1-z2;
1494         temp[4*i+3]= z0-z3;
1495     }
1496
1497     for(i=0; i<4; i++){
1498         const int offset= x_offset[i];
1499         const int z0= temp[4*0+i] + temp[4*2+i];
1500         const int z1= temp[4*0+i] - temp[4*2+i];
1501         const int z2= temp[4*1+i] - temp[4*3+i];
1502         const int z3= temp[4*1+i] + temp[4*3+i];
1503
1504         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1505         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1506         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1507         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1508     }
1509 }
1510
#if 0
/**
 * dct transforms the 16 dc values.
 * Disabled code, relies on the stride macro defined by the function above.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    //first pass: butterflies over the input, results go to temp
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];

        temp[4*i+0]= sum04+sum15;
        temp[4*i+1]= dif04+dif15;
        temp[4*i+2]= dif04-dif15;
        temp[4*i+3]= sum04-sum15;
    }

    //second pass: butterflies over temp, halved results go back to block
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02= temp[4*0+i] + temp[4*2+i];
        const int dif02= temp[4*0+i] - temp[4*2+i];
        const int dif13= temp[4*1+i] - temp[4*3+i];
        const int sum13= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1550
1551 #undef xStride
1552 #undef stride
1553
1554 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1555     const int stride= 16*2;
1556     const int xStride= 16;
1557     int a,b,c,d,e;
1558
1559     a= block[stride*0 + xStride*0];
1560     b= block[stride*0 + xStride*1];
1561     c= block[stride*1 + xStride*0];
1562     d= block[stride*1 + xStride*1];
1563
1564     e= a-b;
1565     a= a+b;
1566     b= c-d;
1567     c= c+d;
1568
1569     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1570     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1571     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1572     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1573 }
1574
#if 0
/**
 * Transforms the 2x2 chroma dc values (disabled code).
 * The four values are read from and written back to block at the offsets
 * 0, 16, 32 and 48.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const p00= block;
    DCTELEM * const p01= block + xStride;
    DCTELEM * const p10= block + stride;
    DCTELEM * const p11= block + stride + xStride;
    const int top_sum= *p00 + *p01;
    const int top_dif= *p00 - *p01;
    const int bot_sum= *p10 + *p11;
    const int bot_dif= *p10 - *p11;

    *p00= (top_sum + bot_sum);
    *p01= (top_dif + bot_dif);
    *p10= (top_sum - bot_sum);
    *p11= (top_dif - bot_dif);
}
#endif
1597
1598 /**
1599  * gets the chroma qp.
1600  */
1601 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1602     return h->pps.chroma_qp_table[t][qscale & 0xff];
1603 }
1604
1605 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1606 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1607 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1608     int i;
1609     const int * const quant_table= quant_coeff[qscale];
1610     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1611     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1612     const unsigned int threshold2= (threshold1<<1);
1613     int last_non_zero;
1614
1615     if(separate_dc){
1616         if(qscale<=18){
1617             //avoid overflows
1618             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1619             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1620             const unsigned int dc_threshold2= (dc_threshold1<<1);
1621
1622             int level= block[0]*quant_coeff[qscale+18][0];
1623             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1624                 if(level>0){
1625                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1626                     block[0]= level;
1627                 }else{
1628                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1629                     block[0]= -level;
1630                 }
1631 //                last_non_zero = i;
1632             }else{
1633                 block[0]=0;
1634             }
1635         }else{
1636             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1637             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1638             const unsigned int dc_threshold2= (dc_threshold1<<1);
1639
1640             int level= block[0]*quant_table[0];
1641             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1642                 if(level>0){
1643                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1644                     block[0]= level;
1645                 }else{
1646                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1647                     block[0]= -level;
1648                 }
1649 //                last_non_zero = i;
1650             }else{
1651                 block[0]=0;
1652             }
1653         }
1654         last_non_zero= 0;
1655         i=1;
1656     }else{
1657         last_non_zero= -1;
1658         i=0;
1659     }
1660
1661     for(; i<16; i++){
1662         const int j= scantable[i];
1663         int level= block[j]*quant_table[j];
1664
1665 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1666 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1667         if(((unsigned)(level+threshold1))>threshold2){
1668             if(level>0){
1669                 level= (bias + level)>>QUANT_SHIFT;
1670                 block[j]= level;
1671             }else{
1672                 level= (bias - level)>>QUANT_SHIFT;
1673                 block[j]= -level;
1674             }
1675             last_non_zero = i;
1676         }else{
1677             block[j]=0;
1678         }
1679     }
1680
1681     return last_non_zero;
1682 }
1683
/**
 * 4x4 vertical prediction: the 4 pixels above the block are copied into
 * each of its 4 rows.
 * @param topright not used
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t above= *(uint32_t*)(src-stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= above;
}
1691
/**
 * 4x4 horizontal prediction: every row is filled with the pixel to its left.
 * @param topright not used
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        ((uint32_t*)(src+y*stride))[0]= src[-1+y*stride]*0x01010101U;
    }
}
1698
/**
 * 4x4 dc prediction: the block is filled with the rounded average of the
 * 4 pixels above and the 4 pixels left of it.
 * @param topright not used
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1708
/**
 * 4x4 dc prediction using only the left edge: the block is filled with the
 * rounded average of the 4 pixels left of it.
 * @param topright not used
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1717
/**
 * 4x4 dc prediction using only the top edge: the block is filled with the
 * rounded average of the 4 pixels above it.
 * @param topright not used
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1726
/**
 * 4x4 dc prediction without usable neighbours: the block is filled with 128.
 * @param topright not used
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= grey;
}
1733
1734
/*
 * Helpers for the 4x4 predictors: each macro declares local constants holding
 * neighbouring pixels of the block at src. They need src and stride
 * (LOAD_TOP_RIGHT_EDGE: topright) to be in scope. All constants are marked
 * av_unused as not every predictor uses all of them.
 */

/* t4..t7: the 4 pixels topright points to */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];

/* l0..l3: the 4 pixels left of the block, top to bottom */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];

/* t0..t3: the 4 pixels above the block, left to right */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];

/**
 * 4x4 diagonal down-right prediction: all pixels with the same x-y share one
 * value, a (1,2,1)/4 filtered sample of the left, top-left and top border.
 * @param topright not used
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9]; //border pixels in the order l3,l2,l1,l0,lt,t0,t1,t2,t3
    int diag[7]; //filtered value of each diagonal, index is x-y+3
    int i, x, y;

    //the whole border is read before the first pixel is written
    for(i=0; i<4; i++){
        edge[3-i]= src[-1 + i*stride];
        edge[5+i]= src[ i - stride];
    }
    edge[4]= src[-1-stride];

    for(i=0; i<7; i++)
        diag[i]= (edge[i] + 2*edge[i+1] + edge[i+2] + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= diag[3 + x - y];
}
1775
/**
 * 4x4 diagonal down-left prediction: all pixels with the same x+y share one
 * value, a (1,2,1)/4 filtered sample of the top and top-right border.
 * @param topright the 4 pixels following the top border to the right
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int t[8];    //t0..t3 from above the block, t4..t7 from topright
    int diag[7]; //filtered value of each diagonal, index is x+y
    int i, x, y;

    //the whole border is read before the first pixel is written
    for(i=0; i<4; i++){
        t[i  ]= src[i - stride];
        t[i+4]= topright[i];
    }

    for(i=0; i<6; i++)
        diag[i]= (t[i] + t[i+2] + 2*t[i+1] + 2)>>2;
    diag[6]= (t[6] + 3*t[7] + 2)>>2; //the last border pixel is repeated

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= diag[x + y];
}
1798
/**
 * 4x4 vertical-right prediction from the left, top-left and top border.
 * Rows 2 and 3 repeat rows 0 and 1 shifted right by one pixel, their first
 * pixel comes from the left border.
 * @param topright not used
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-stride];
    const int t0= src[0-stride], t1= src[1-stride], t2= src[2-stride], t3= src[3-stride];
    const int l0= src[-1], l1= src[-1+stride], l2= src[-1+2*stride];
    //2 tap averages, used by the even rows
    const int avg0= (lt + t0 + 1)>>1;
    const int avg1= (t0 + t1 + 1)>>1;
    const int avg2= (t1 + t2 + 1)>>1;
    const int avg3= (t2 + t3 + 1)>>1;
    //3 tap filtered values, used by the odd rows
    const int flt0= (l0 + 2*lt + t0 + 2)>>2;
    const int flt1= (lt + 2*t0 + t1 + 2)>>2;
    const int flt2= (t0 + 2*t1 + t2 + 2)>>2;
    const int flt3= (t1 + 2*t2 + t3 + 2)>>2;
    uint8_t *row= src;

    row[0]= avg0; row[1]= avg1; row[2]= avg2; row[3]= avg3;
    row += stride;
    row[0]= flt0; row[1]= flt1; row[2]= flt2; row[3]= flt3;
    row += stride;
    row[0]= (lt + 2*l0 + l1 + 2)>>2;
    row[1]= avg0; row[2]= avg1; row[3]= avg2;
    row += stride;
    row[0]= (l0 + 2*l1 + l2 + 2)>>2;
    row[1]= flt0; row[2]= flt1; row[3]= flt2;
}
1821
/**
 * 4x4 vertical-left prediction from the top and top-right border.
 * Even rows use the average of two neighbouring border pixels, odd rows a
 * (1,2,1)/4 filter; rows 2 and 3 start one border pixel further right.
 * @param topright the 4 pixels following the top border to the right
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int t[7]; //t0..t3 from above the block, t4..t6 from topright
    int i, x, y;

    //the whole border is read before the first pixel is written
    for(i=0; i<4; i++)
        t[i]= src[i - stride];
    for(i=0; i<3; i++)
        t[4+i]= topright[i];

    for(y=0; y<4; y++){
        const int shift= y>>1;
        for(x=0; x<4; x++){
            const int k= x + shift;
            if(y&1)
                src[x + y*stride]= (t[k] + 2*t[k+1] + t[k+2] + 2)>>2;
            else
                src[x + y*stride]= (t[k] + t[k+1] + 1)>>1;
        }
    }
}
1843
/**
 * 4x4 horizontal-up prediction from the left border only.
 * The value of a pixel depends on x+2*y; positions beyond the end of the
 * border are filled with the lowest left pixel.
 * @param topright not used
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1];
    const int l1= src[-1+  stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int val[6]; //index is x+2*y
    int x, y;

    val[0]= (l0 + l1 + 1)>>1;
    val[1]= (l0 + 2*l1 + l2 + 2)>>2;
    val[2]= (l1 + l2 + 1)>>1;
    val[3]= (l1 + 2*l2 + l3 + 2)>>2;
    val[4]= (l2 + l3 + 1)>>1;
    val[5]= (l2 + 2*l3 + l3 + 2)>>2;

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int k= x + 2*y;
            src[x + y*stride]= k<6 ? val[k] : l3;
        }
    }
}
1864
/**
 * 4x4 horizontal-down prediction from the left, top-left and top border.
 * Every row starts with an (average, filtered) pair derived from the left
 * border and continues with the pair of the row above it.
 * @param topright not used
 */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-stride];
    const int t0= src[0-stride], t1= src[1-stride], t2= src[2-stride];
    const int l0= src[-1], l1= src[-1+stride], l2= src[-1+2*stride], l3= src[-1+3*stride];
    //2 tap averages, first pixel of each row
    const int avg0= (lt + l0 + 1)>>1;
    const int avg1= (l0 + l1 + 1)>>1;
    const int avg2= (l1 + l2 + 1)>>1;
    const int avg3= (l2 + l3 + 1)>>1;
    //3 tap filtered values, second pixel of each row
    const int flt0= (l0 + 2*lt + t0 + 2)>>2;
    const int flt1= (lt + 2*l0 + l1 + 2)>>2;
    const int flt2= (l0 + 2*l1 + l2 + 2)>>2;
    const int flt3= (l1 + 2*l2 + l3 + 2)>>2;
    uint8_t *row= src;

    row[0]= avg0; row[1]= flt0;
    row[2]= (lt + 2*t0 + t1 + 2)>>2;
    row[3]= (t0 + 2*t1 + t2 + 2)>>2;
    row += stride;
    row[0]= avg1; row[1]= flt1; row[2]= avg0; row[3]= flt0;
    row += stride;
    row[0]= avg2; row[1]= flt2; row[2]= avg1; row[3]= flt1;
    row += stride;
    row[0]= avg3; row[1]= flt3; row[2]= avg2; row[3]= flt2;
}
1887
/**
 * 16x16 vertical prediction: the 16 pixels above the block are copied into
 * each of its 16 rows.
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t *row= (uint32_t*)(src + y*stride);
        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
    }
}
1902
/**
 * 16x16 horizontal prediction: every row is filled with the pixel to its left.
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        const uint32_t fill= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1913
/**
 * 16x16 dc prediction: the block is filled with the rounded average of the
 * 16 pixels left of it and the 16 pixels above it.
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned constant: the product overflows a signed int for averages >= 128
    fill= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1934
/**
 * 16x16 dc prediction using only the left edge: the block is filled with the
 * rounded average of the 16 pixels left of it.
 */
void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    //unsigned constant: the product overflows a signed int for averages >= 128
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1951
/**
 * 16x16 dc prediction using only the top edge: the block is filled with the
 * rounded average of the 16 pixels above it.
 */
void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t fill;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned constant: the product overflows a signed int for averages >= 128
    fill= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
1967
/**
 * 16x16 dc prediction without usable neighbours: the block is filled with 128.
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int x, y;

    for(y=0; y<16; y++){
        uint32_t *row= (uint32_t*)(src + y*stride);
        for(x=0; x<4; x++)
            row[x]= grey;
    }
}
1978
1979 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
1980   int i, j, k;
1981   int a;
1982   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1983   const uint8_t * const src0 = src+7-stride;
1984   const uint8_t *src1 = src+8*stride-1;
1985   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
1986   int H = src0[1] - src0[-1];
1987   int V = src1[0] - src2[ 0];
1988   for(k=2; k<=8; ++k) {
1989     src1 += stride; src2 -= stride;
1990     H += k*(src0[k] - src0[-k]);
1991     V += k*(src1[0] - src2[ 0]);
1992   }
1993   if(svq3){
1994     H = ( 5*(H/4) ) / 16;
1995     V = ( 5*(V/4) ) / 16;
1996
1997     /* required for 100% accuracy */
1998     i = H; H = V; V = i;
1999   }else{
2000     H = ( 5*H+32 ) >> 6;
2001     V = ( 5*V+32 ) >> 6;
2002   }
2003
2004   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2005   for(j=16; j>0; --j) {
2006     int b = a;
2007     a += V;
2008     for(i=-16; i<0; i+=4) {
2009       src[16+i] = cm[ (b    ) >> 5 ];
2010       src[17+i] = cm[ (b+  H) >> 5 ];
2011       src[18+i] = cm[ (b+2*H) >> 5 ];
2012       src[19+i] = cm[ (b+3*H) >> 5 ];
2013       b += 4*H;
2014     }
2015     src += stride;
2016   }
2017 }
2018
/**
 * 16x16 plane prediction, calls pred16x16_plane_compat_c() with svq3 set to 0.
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2022
/**
 * 8x8 vertical prediction: the 8 pixels above the block are copied into
 * each of its 8 rows.
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t *above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t *row= (uint32_t*)(src + y*stride);
        row[0]= w0;
        row[1]= w1;
    }
}
2033
/**
 * 8x8 horizontal prediction: every row is filled with the pixel to its left.
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U;
    }
}
2042
/**
 * 8x8 dc prediction without usable neighbours: the block is filled with 128.
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t *row= (uint32_t*)(src + y*stride);
        row[0]= grey;
        row[1]= grey;
    }
}
2051
/**
 * 8x8 dc prediction using only the left edge: the upper 4 rows are filled
 * with the rounded average of the upper 4 left pixels, the lower 4 rows with
 * the rounded average of the lower 4 left pixels.
 */
void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t dc0, dc2;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    //unsigned constant: the products overflow a signed int for averages >= 128
    dc0= 0x01010101U*((sum_upper + 2)>>2);
    dc2= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2073
/**
 * 8x8 dc prediction using only the top edge: the left 4 columns are filled
 * with the rounded average of the left 4 top pixels, the right 4 columns with
 * the rounded average of the right 4 top pixels.
 */
void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t dc0, dc1;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    //unsigned constant: the products overflow a signed int for averages >= 128
    dc0= 0x01010101U*((sum_left  + 2)>>2);
    dc1= 0x01010101U*((sum_right + 2)>>2);

    //all 8 rows are identical
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2095
2096
/**
 * 8x8 dc prediction, each 4x4 quadrant gets its own value:
 * top left     - rounded average of the 4 pixels above and the 4 pixels left of it,
 * top right    - rounded average of the 4 pixels above it,
 * bottom left  - rounded average of the 4 pixels left of it,
 * bottom right - rounded average of the 4 top right and the 4 bottom left border pixels.
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_corner=0, sum_top_right=0, sum_left_bottom=0;
    uint32_t dc0, dc1, dc2, dc3;

    for(i=0;i<4; i++){
        sum_corner     += src[-1+i*stride] + src[i-stride];
        sum_top_right  += src[4+i-stride];
        sum_left_bottom+= src[-1+(i+4)*stride];
    }
    //unsigned constant: the products overflow a signed int for averages >= 128
    dc0= 0x01010101U*((sum_corner + 4)>>3);
    dc1= 0x01010101U*((sum_top_right + 2)>>2);
    dc2= 0x01010101U*((sum_left_bottom + 2)>>2);
    dc3= 0x01010101U*((sum_top_right + sum_left_bottom + 4)>>3);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2121
2122 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2123   int j, k;
2124   int a;
2125   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2126   const uint8_t * const src0 = src+3-stride;
2127   const uint8_t *src1 = src+4*stride-1;
2128   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2129   int H = src0[1] - src0[-1];
2130   int V = src1[0] - src2[ 0];
2131   for(k=2; k<=4; ++k) {
2132     src1 += stride; src2 -= stride;
2133     H += k*(src0[k] - src0[-k]);
2134     V += k*(src1[0] - src2[ 0]);
2135   }
2136   H = ( 17*H+16 ) >> 5;
2137   V = ( 17*V+16 ) >> 5;
2138
2139   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2140   for(j=8; j>0; --j) {
2141     int b = a;
2142     a += V;
2143     src[0] = cm[ (b    ) >> 5 ];
2144     src[1] = cm[ (b+  H) >> 5 ];
2145     src[2] = cm[ (b+2*H) >> 5 ];
2146     src[3] = cm[ (b+3*H) >> 5 ];
2147     src[4] = cm[ (b+4*H) >> 5 ];
2148     src[5] = cm[ (b+5*H) >> 5 ];
2149     src[6] = cm[ (b+6*H) >> 5 ];
2150     src[7] = cm[ (b+7*H) >> 5 ];
2151     src += stride;
2152   }
2153 }
2154
/*
 * Helpers for the pred8x8l_*() predictors. They need src, stride and, where
 * used, has_topleft / has_topright to be in scope.
 */

/* pixel at column x, row y relative to the top left pixel of the block */
#define SRC(x,y) src[(x)+(y)*stride]

/* l<y>: (1,2,1)/4 filtered left border pixel of row y */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* l0..l7: filtered left border; l0 repeats SRC(-1,0) if there is no top left
 * pixel, l7 repeats SRC(-1,7) */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* t<x>: (1,2,1)/4 filtered top border pixel of column x */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* t0..t7: filtered top border; missing top left / top right pixels are
 * replaced by SRC(0,-1) / SRC(7,-1) */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* like PT() but assigns to an already declared variable */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* t8..t15: filtered top right border, all set to SRC(7,-1) if it is missing */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* lt: filtered top left corner pixel */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* fills the 8x8 block with the 32 bit pattern v, src is advanced by 8 rows */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = (v); \
        src += stride; \
    }
2192
/**
 * 8x8 luma dc prediction without usable neighbours: the block is filled
 * with 128.
 * @param has_topleft not used
 * @param has_topright not used
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int y;

    for( y = 0; y < 8; y++ ) {
        uint32_t *row = (uint32_t*)src;
        row[0] = 0x80808080;
        row[1] = 0x80808080;
        src += stride;
    }
}
2197 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2198 {
2199     PREDICT_8x8_LOAD_LEFT;
2200     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2201     PREDICT_8x8_DC(dc);
2202 }
2203 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2204 {
2205     PREDICT_8x8_LOAD_TOP;
2206     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2207     PREDICT_8x8_DC(dc);
2208 }
2209 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2210 {
2211     PREDICT_8x8_LOAD_LEFT;
2212     PREDICT_8x8_LOAD_TOP;
2213     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2214                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2215     PREDICT_8x8_DC(dc);
2216 }
2217 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2218 {
2219     PREDICT_8x8_LOAD_LEFT;
2220 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2221                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2222     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2223 #undef ROW
2224 }
2225 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2226 {
2227     int y;
2228     PREDICT_8x8_LOAD_TOP;
2229     src[0] = t0;
2230     src[1] = t1;
2231     src[2] = t2;
2232     src[3] = t3;
2233     src[4] = t4;
2234     src[5] = t5;
2235     src[6] = t6;
2236     src[7] = t7;
2237     for( y = 1; y < 8; y++ )
2238         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2239 }
/**
 * 8x8 luma intra prediction, "down left" direction (plain C version).
 *
 * Works on t0..t15 as declared by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined outside this excerpt).
 * All SRC(a,b) targets with the same a+b receive one value: the rounded
 * (1,2,1)/4 filter centred on t[a+b+1]. The final target SRC(7,7) weights
 * t15 three times instead of reading a seventeenth value.
 */
2240 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2241 {
2242     PREDICT_8x8_LOAD_TOP;
2243     PREDICT_8x8_LOAD_TOPRIGHT;
    /* one statement per a+b group, from a+b == 0 up to a+b == 14 */
2244     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2245     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2246     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2247     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2248     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2249     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2250     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2251     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2252     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2253     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2254     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2255     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2256     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2257     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2258     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2259 }
/**
 * 8x8 luma intra prediction, "down right" direction (plain C version).
 *
 * Works on t0..t7, l0..l7 and lt as declared by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this excerpt). All SRC(a,b) targets with the same a-b receive one value,
 * a rounded (1,2,1)/4 filter over three consecutive entries of the sequence
 * l7, ..., l0, lt, t0, ..., t7.
 */
2260 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2261 {
2262     PREDICT_8x8_LOAD_TOP;
2263     PREDICT_8x8_LOAD_LEFT;
2264     PREDICT_8x8_LOAD_TOPLEFT;
    /* a-b runs from -7 (first statement) up to +7 (last statement) */
2265     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2266     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2267     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2268     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2269     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2270     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2271     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2272     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2273     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2274     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2275     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2276     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2277     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2278     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2279     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2280
2281 }
/**
 * 8x8 luma intra prediction, "vertical right" direction (plain C version).
 *
 * Works on t0..t7, l0..l6 and lt as declared by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this excerpt). All SRC(a,b) targets with the same 2*a-b receive one value:
 *  - 2*a-b even and >= 0: rounded 2-tap average of two adjacent values
 *    from lt, t0, ..., t7;
 *  - otherwise: rounded (1,2,1)/4 filter over three adjacent values
 *    (taken from the l-values/lt when 2*a-b is negative).
 */
2282 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2283 {
2284     PREDICT_8x8_LOAD_TOP;
2285     PREDICT_8x8_LOAD_LEFT;
2286     PREDICT_8x8_LOAD_TOPLEFT;
    /* 2*a-b < 0: values derived from the l-values */
2287     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2288     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2289     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2290     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2291     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2292     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2293     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    /* 2*a-b >= 0: alternating 2-tap (even) and 3-tap (odd) values from lt/t */
2294     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2295     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2296     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2297     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2298     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2299     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2300     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2301     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2302     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2303     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2304     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2305     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2306     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2307     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2308     SRC(7,0)= (t6 + t7 + 1) >> 1;
2309 }
/**
 * 8x8 luma intra prediction, "horizontal down" direction (plain C version).
 *
 * Works on t0..t6, l0..l7 and lt as declared by PREDICT_8x8_LOAD_TOP,
 * PREDICT_8x8_LOAD_LEFT and PREDICT_8x8_LOAD_TOPLEFT (macros defined outside
 * this excerpt). All SRC(a,b) targets with the same 2*b-a receive one value:
 *  - 2*b-a even and >= 0: rounded 2-tap average of two adjacent values
 *    from lt, l0, ..., l7;
 *  - otherwise: rounded (1,2,1)/4 filter over three adjacent values
 *    (taken from the t-values/lt when 2*b-a is negative).
 */
2310 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2311 {
2312     PREDICT_8x8_LOAD_TOP;
2313     PREDICT_8x8_LOAD_LEFT;
2314     PREDICT_8x8_LOAD_TOPLEFT;
    /* 2*b-a >= 0: alternating 2-tap (even) and 3-tap (odd) values from lt/l */
2315     SRC(0,7)= (l6 + l7 + 1) >> 1;
2316     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2317     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2318     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2319     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2320     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2321     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2322     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2323     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2324     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2325     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2326     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2327     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2328     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2329     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    /* 2*b-a < 0: 3-tap values centred on lt and then on the t-values */
2330     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2331     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2332     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2333     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2334     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2335     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2336     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2337 }
/**
 * 8x8 luma intra prediction, "vertical left" direction (plain C version).
 *
 * Works on t0..t12 as declared by PREDICT_8x8_LOAD_TOP and
 * PREDICT_8x8_LOAD_TOPRIGHT (macros defined outside this excerpt).
 * All SRC(a,b) targets with the same 2*a+b receive one value:
 *  - 2*a+b even: rounded 2-tap average of two adjacent t-values;
 *  - 2*a+b odd:  rounded (1,2,1)/4 filter over three adjacent t-values.
 */
2338 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2339 {
2340     PREDICT_8x8_LOAD_TOP;
2341     PREDICT_8x8_LOAD_TOPRIGHT;
    /* statements are ordered by increasing 2*a+b, from 0 to 21 */
2342     SRC(0,0)= (t0 + t1 + 1) >> 1;
2343     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2344     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2345     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2346     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2347     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2348     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2349     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2350     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2351     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2352     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2353     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2354     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2355     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2356     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2357     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2358     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2359     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2360     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2361     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2362     SRC(7,6)= (t10 + t11 + 1) >> 1;
2363     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2364 }
/**
 * 8x8 luma intra prediction, "horizontal up" direction (plain C version).
 *
 * Works on l0..l7 as declared by PREDICT_8x8_LOAD_LEFT (macro defined outside
 * this excerpt). All SRC(a,b) targets with the same a+2*b receive one value:
 *  - a+2*b even and < 14: rounded 2-tap average of two adjacent l-values;
 *  - a+2*b odd and < 13:  rounded (1,2,1)/4 filter over three adjacent
 *    l-values;
 *  - a+2*b == 13: (l6 + 3*l7 + 2) >> 2;
 *  - a+2*b >= 14: plain l7.
 */
2365 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2366 {
2367     PREDICT_8x8_LOAD_LEFT;
2368     SRC(0,0)= (l0 + l1 + 1) >> 1;
2369     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2370     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2371     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2372     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2373     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2374     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2375     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2376     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2377     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2378     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2379     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2380     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2381     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* every remaining target (a+2*b >= 14) is a plain copy of l7 */
2382     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2383     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2384     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2385     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2386 }
/* Drop the helper macros used by the pred8x8l_* functions above so that
 * they are not visible to the rest of the file. */
2387 #undef PREDICT_8x8_LOAD_LEFT
2388 #undef PREDICT_8x8_LOAD_TOP
2389 #undef PREDICT_8x8_LOAD_TOPLEFT
2390 #undef PREDICT_8x8_LOAD_TOPRIGHT
2391 #undef PREDICT_8x8_DC
2392 #undef PTR
2393 #undef PT
2394 #undef PL
2395 #undef SRC
2396
2397 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2398                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2399                            int src_x_offset, int src_y_offset,
2400                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2401     MpegEncContext * const s = &h->s;
2402     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2403     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2404     const int luma_xy= (mx&3) + ((my&3)<<2);
2405     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2406     uint8_t * src_cb, * src_cr;
2407     int extra_width= h->emu_edge_width;
2408     int extra_height= h->emu_edge_height;
2409     int emu=0;
2410     const int full_mx= mx>>2;
2411     const int full_my= my>>2;
2412     const int pic_width  = 16*s->mb_width;
2413     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2414
2415     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2416         return;
2417
2418     if(mx&7) extra_width -= 3;
2419     if(my&7) extra_height -= 3;
2420
2421     if(   full_mx < 0-extra_width
2422        || full_my < 0-extra_height
2423        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2424        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2425         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2426             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2427         emu=1;
2428     }
2429
2430     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2431     if(!square){
2432         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2433     }
2434
2435     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
2436
2437     if(MB_MBAFF){
2438         // chroma offset when predicting from a field of opposite parity
2439         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2440         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2441     }
2442     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2443     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2444
2445     if(emu){
2446         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2447             src_cb= s->edge_emu_buffer;
2448     }
2449     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2450
2451     if(emu){
2452         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2453             src_cr= s->edge_emu_buffer;
2454     }
2455     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2456 }
2457
2458 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2459                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2460                            int x_offset, int y_offset,
2461                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2462                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2463                            int list0, int list1){
2464     MpegEncContext * const s = &h->s;
2465     qpel_mc_func *qpix_op=  qpix_put;
2466     h264_chroma_mc_func chroma_op= chroma_put;
2467
2468     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2469     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2470     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2471     x_offset += 8*s->mb_x;
2472     y_offset += 8*(s->mb_y >> MB_MBAFF);
2473
2474     if(list0){
2475         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2476         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2477                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2478                            qpix_op, chroma_op);
2479
2480         qpix_op=  qpix_avg;
2481         chroma_op= chroma_avg;
2482     }
2483
2484     if(list1){
2485         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2486         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2487                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2488                            qpix_op, chroma_op);
2489     }
2490 }
2491
2492 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2493                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2494                            int x_offset, int y_offset,
2495                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2496                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2497                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2498                            int list0, int list1){
2499     MpegEncContext * const s = &h->s;
2500
2501     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2502     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2503     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2504     x_offset += 8*s->mb_x;
2505     y_offset += 8*(s->mb_y >> MB_MBAFF);
2506
2507     if(list0 && list1){
2508         /* don't optimize for luma-only case, since B-frames usually
2509          * use implicit weights => chroma too. */
2510         uint8_t *tmp_cb = s->obmc_scratchpad;
2511         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2512         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2513         int refn0 = h->ref_cache[0][ scan8[n] ];
2514         int refn1 = h->ref_cache[1][ scan8[n] ];
2515
2516         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2517                     dest_y, dest_cb, dest_cr,
2518                     x_offset, y_offset, qpix_put, chroma_put);
2519         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2520                     tmp_y, tmp_cb, tmp_cr,
2521                     x_offset, y_offset, qpix_put, chroma_put);
2522
2523         if(h->use_weight == 2){
2524             int weight0 = h->implicit_weight[refn0][refn1];
2525             int weight1 = 64 - weight0;
2526             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2527             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2528             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2529         }else{
2530             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2531                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2532                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2533             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2534                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2535                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2536             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2537                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2538                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2539         }
2540     }else{
2541         int list = list1 ? 1 : 0;
2542         int refn = h->ref_cache[list][ scan8[n] ];
2543         Picture *ref= &h->ref_list[list][refn];
2544         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2545                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2546                     qpix_put, chroma_put);
2547
2548         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2549                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2550         if(h->use_weight_chroma){
2551             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2552                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2553             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2554                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2555         }
2556     }
2557 }
2558
2559 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2560                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2561                            int x_offset, int y_offset,
2562                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2563                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2564                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2565                            int list0, int list1){
2566     if((h->use_weight==2 && list0 && list1
2567         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2568        || h->use_weight==1)
2569         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2570                          x_offset, y_offset, qpix_put, chroma_put,
2571                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2572     else
2573         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2574                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2575 }
2576
2577 static inline void prefetch_motion(H264Context *h, int list){
2578     /* fetch pixels for estimated mv 4 macroblocks ahead
2579      * optimized for 64byte cache lines */
2580     MpegEncContext * const s = &h->s;
2581     const int refn = h->ref_cache[list][scan8[0]];
2582     if(refn >= 0){
2583         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2584         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2585         uint8_t **src= h->ref_list[list][refn].data;
2586         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2587         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2588         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2589         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2590     }
2591 }
2592
/**
 * Performs inter prediction for the current macroblock.
 *
 * The macroblock type stored for (s->mb_x, s->mb_y) selects how the block is
 * split (16x16, 16x8, 8x16, or four 8x8 blocks that may be split further
 * according to h->sub_mb_type[]); mc_part() is called once per partition.
 * Each call passes the function tables matching the partition size, the
 * partition offset inside the macroblock and the IS_DIR() flags for list 0
 * and list 1. prefetch_motion() is called for list 0 before and for list 1
 * after the prediction.
 *
 * @param qpix_put/qpix_avg     luma function tables, first index chosen per partition size
 * @param chroma_put/chroma_avg chroma function tables, index chosen per partition size
 * @param weight_op/weight_avg  weighting function tables; a pointer to the
 *                              entry for the partition shape is passed on
 */
2593 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2594                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2595                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2596                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2597     MpegEncContext * const s = &h->s;
2598     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2599     const int mb_type= s->current_picture.mb_type[mb_xy];
2600
2601     assert(IS_INTER(mb_type));
2602
2603     prefetch_motion(h, 0);
2604
2605     if(IS_16X16(mb_type)){
        /* one square partition covering the whole macroblock */
2606         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2607                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2608                 &weight_op[0], &weight_avg[0],
2609                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2610     }else if(IS_16X8(mb_type)){
        /* two partitions stacked vertically; delta 8 makes mc_dir_part issue
         * a second luma call 8 bytes to the right for each of them */
2611         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2612                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2613                 &weight_op[1], &weight_avg[1],
2614                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2615         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2616                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2617                 &weight_op[1], &weight_avg[1],
2618                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2619     }else if(IS_8X16(mb_type)){
        /* two partitions side by side; the second luma call of each is
         * 8 lines further down (delta = 8*mb_linesize) */
2620         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2621                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2622                 &weight_op[2], &weight_avg[2],
2623                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2624         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2625                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2626                 &weight_op[2], &weight_avg[2],
2627                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2628     }else{
2629         int i;
2630
2631         assert(IS_8X8(mb_type));
2632
        /* four 8x8 blocks, each with its own sub-macroblock type */
2633         for(i=0; i<4; i++){
2634             const int sub_mb_type= h->sub_mb_type[i];
2635             const int n= 4*i;
            /* offsets of 8x8 block i inside the macroblock: 0 or 4 each */
2636             int x_offset= (i&1)<<2;
2637             int y_offset= (i&2)<<1;
2638
2639             if(IS_SUB_8X8(sub_mb_type)){
2640                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2641                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2642                     &weight_op[3], &weight_avg[3],
2643                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2644             }else if(IS_SUB_8X4(sub_mb_type)){
2645                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2646                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2647                     &weight_op[4], &weight_avg[4],
2648                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2649                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2650                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2651                     &weight_op[4], &weight_avg[4],
2652                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2653             }else if(IS_SUB_4X8(sub_mb_type)){
2654                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2655                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2656                     &weight_op[5], &weight_avg[5],
2657                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2658                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2659                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2660                     &weight_op[5], &weight_avg[5],
2661                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2662             }else{
2663                 int j;
2664                 assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside this 8x8 block */
2665                 for(j=0; j<4; j++){
2666                     int sub_x_offset= x_offset + 2*(j&1);
2667                     int sub_y_offset= y_offset +   (j&2);
2668                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2669                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2670                         &weight_op[6], &weight_avg[6],
2671                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2672                 }
2673             }
2674         }
2675     }
2676
2677     prefetch_motion(h, 1);
2678 }
2679
2680 static void decode_init_vlc(void){
2681     static int done = 0;
2682
2683     if (!done) {
2684         int i;
2685         done = 1;
2686
2687         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2688                  &chroma_dc_coeff_token_len [0], 1, 1,
2689                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2690
2691         for(i=0; i<4; i++){
2692             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2693                      &coeff_token_len [i][0], 1, 1,
2694                      &coeff_token_bits[i][0], 1, 1, 1);
2695         }
2696
2697         for(i=0; i<3; i++){
2698             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2699                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2700                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2701         }
2702         for(i=0; i<15; i++){
2703             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2704                      &total_zeros_len [i][0], 1, 1,
2705                      &total_zeros_bits[i][0], 1, 1, 1);
2706         }
2707
2708         for(i=0; i<6; i++){
2709             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2710                      &run_len [i][0], 1, 1,
2711                      &run_bits[i][0], 1, 1, 1);
2712         }
2713         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2714                  &run_len [6][0], 1, 1,
2715                  &run_bits[6][0], 1, 1, 1);
2716     }
2717 }
2718
2719 /**
2720  * Sets the intra prediction function pointers.
 *
 * Fills the four dispatch tables of the context -- pred4x4, pred8x8l,
 * pred8x8 and pred16x16 -- with the plain C implementations, indexed by the
 * prediction mode constants. The pred16x16 table is indexed with the same
 * *_PRED8x8 constants as the pred8x8 table.
2721  */
2722 static void init_pred_ptrs(H264Context *h){
2723 //    MpegEncContext * const s = &h->s;
2724
    /* 4x4 luma blocks */
2725     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2726     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2727     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2728     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2729     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2730     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2731     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2732     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2733     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2734     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2735     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2736     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2737
    /* 8x8 luma blocks, same mode constants as the 4x4 table */
2738     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2739     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2740     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2741     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2742     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2743     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2744     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2745     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2746     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2747     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2748     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2749     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2750
    /* 8x8 blocks served by the ff_pred8x8_* functions */
2751     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
2752     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
2753     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
2754     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
2755     h->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
2756     h->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
2757     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
2758
    /* 16x16 blocks; note the *_PRED8x8 index constants are reused here */
2759     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
2760     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
2761     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
2762     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
2763     h->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
2764     h->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
2765     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
2766 }
2767
2768 static void free_tables(H264Context *h){
2769     int i;
2770     av_freep(&h->intra4x4_pred_mode);
2771     av_freep(&h->chroma_pred_mode_table);
2772     av_freep(&h->cbp_table);
2773     av_freep(&h->mvd_table[0]);
2774     av_freep(&h->mvd_table[1]);
2775     av_freep(&h->direct_table);
2776     av_freep(&h->non_zero_count);
2777     av_freep(&h->slice_table_base);
2778     av_freep(&h->top_borders[1]);
2779     av_freep(&h->top_borders[0]);
2780     h->slice_table= NULL;
2781
2782     av_freep(&h->mb2b_xy);
2783     av_freep(&h->mb2b8_xy);
2784
2785     av_freep(&h->s.obmc_scratchpad);
2786
2787     for(i = 0; i < MAX_SPS_COUNT; i++)
2788         av_freep(h->sps_buffers + i);
2789
2790     for(i = 0; i < MAX_PPS_COUNT; i++)
2791         av_freep(h->pps_buffers + i);
2792 }
2793
2794 static void init_dequant8_coeff_table(H264Context *h){
2795     int i,q,x;
2796     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2797     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2798     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2799
2800     for(i=0; i<2; i++ ){
2801         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2802             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2803             break;
2804         }
2805
2806         for(q=0; q<52; q++){
2807             int shift = ff_div6[q];
2808             int idx = ff_rem6[q];
2809             for(x=0; x<64; x++)
2810                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2811                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2812                     h->pps.scaling_matrix8[i][x]) << shift;
2813         }
2814     }
2815 }
2816
2817 static void init_dequant4_coeff_table(H264Context *h){
2818     int i,j,q,x;
2819     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2820     for(i=0; i<6; i++ ){
2821         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2822         for(j=0; j<i; j++){
2823             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2824                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2825                 break;
2826             }
2827         }
2828         if(j<i)
2829             continue;
2830
2831         for(q=0; q<52; q++){
2832             int shift = ff_div6[q] + 2;
2833             int idx = ff_rem6[q];
2834             for(x=0; x<16; x++)
2835                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2836                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2837                     h->pps.scaling_matrix4[i][x]) << shift;
2838         }
2839     }
2840 }
2841
2842 static void init_dequant_tables(H264Context *h){
2843     int i,x;
2844     init_dequant4_coeff_table(h);
2845     if(h->pps.transform_8x8_mode)
2846         init_dequant8_coeff_table(h);
2847     if(h->sps.transform_bypass){
2848         for(i=0; i<6; i++)
2849             for(x=0; x<16; x++)
2850                 h->dequant4_coeff[i][0][x] = 1<<6;
2851         if(h->pps.transform_8x8_mode)
2852             for(i=0; i<2; i++)
2853                 for(x=0; x<64; x++)
2854                     h->dequant8_coeff[i][0][x] = 1<<6;
2855     }
2856 }
2857
2858
2859 /**
2860  * allocates tables.
2861  * needs width/height
2862  */
2863 static int alloc_tables(H264Context *h){
2864     MpegEncContext * const s = &h->s;
2865     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2866     int x,y;
2867
2868     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2869
2870     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2871     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2872     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2873     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2874     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2875
2876     if( h->pps.cabac ) {
2877         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2878         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2879         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2880         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2881     }
2882
2883     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2884     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2885
2886     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2887     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2888     for(y=0; y<s->mb_height; y++){
2889         for(x=0; x<s->mb_width; x++){
2890             const int mb_xy= x + y*s->mb_stride;
2891             const int b_xy = 4*x + 4*y*h->b_stride;
2892             const int b8_xy= 2*x + 2*y*h->b8_stride;
2893
2894             h->mb2b_xy [mb_xy]= b_xy;
2895             h->mb2b8_xy[mb_xy]= b8_xy;
2896         }
2897     }
2898
2899     s->obmc_scratchpad = NULL;
2900
2901     if(!h->dequant4_coeff[0])
2902         init_dequant_tables(h);
2903
2904     return 0;
2905 fail:
2906     free_tables(h);
2907     return -1;
2908 }
2909
2910 static void common_init(H264Context *h){
2911     MpegEncContext * const s = &h->s;
2912
2913     s->width = s->avctx->width;
2914     s->height = s->avctx->height;
2915     s->codec_id= s->avctx->codec->id;
2916
2917     init_pred_ptrs(h);
2918
2919     h->dequant_coeff_pps= -1;
2920     s->unrestricted_mv=1;
2921     s->decode=1; //FIXME
2922
2923     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2924     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2925 }
2926
2927 static int decode_init(AVCodecContext *avctx){
2928     H264Context *h= avctx->priv_data;
2929     MpegEncContext * const s = &h->s;
2930
2931     MPV_decode_defaults(s);
2932
2933     s->avctx = avctx;
2934     common_init(h);
2935
2936     s->out_format = FMT_H264;
2937     s->workaround_bugs= avctx->workaround_bugs;
2938
2939     // set defaults
2940 //    s->decode_mb= ff_h263_decode_mb;
2941     s->quarter_sample = 1;
2942     s->low_delay= 1;
2943     avctx->pix_fmt= PIX_FMT_YUV420P;
2944
2945     decode_init_vlc();
2946
2947     if(avctx->extradata_size > 0 && avctx->extradata &&
2948        *(char *)avctx->extradata == 1){
2949         h->is_avc = 1;
2950         h->got_avcC = 0;
2951     } else {
2952         h->is_avc = 0;
2953     }
2954
2955     return 0;
2956 }
2957
2958 static int frame_start(H264Context *h){
2959     MpegEncContext * const s = &h->s;
2960     int i;
2961
2962     if(MPV_frame_start(s, s->avctx) < 0)
2963         return -1;
2964     ff_er_frame_start(s);
2965
2966     assert(s->linesize && s->uvlinesize);
2967
2968     for(i=0; i<16; i++){
2969         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2970         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2971     }
2972     for(i=0; i<4; i++){
2973         h->block_offset[16+i]=
2974         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2975         h->block_offset[24+16+i]=
2976         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2977     }
2978
2979     /* can't be in alloc_tables because linesize isn't known there.
2980      * FIXME: redo bipred weight to not require extra buffer? */
2981     if(!s->obmc_scratchpad)
2982         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2983
2984     /* some macroblocks will be accessed before they're available */
2985     if(FRAME_MBAFF)
2986         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2987
2988 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2989     return 0;
2990 }
2991
2992 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2993     MpegEncContext * const s = &h->s;
2994     int i;
2995
2996     src_y  -=   linesize;
2997     src_cb -= uvlinesize;
2998     src_cr -= uvlinesize;
2999
3000     // There are two lines saved, the line above the the top macroblock of a pair,
3001     // and the line above the bottom macroblock
3002     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3003     for(i=1; i<17; i++){
3004         h->left_border[i]= src_y[15+i*  linesize];
3005     }
3006
3007     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3008     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3009
3010     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
3011         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3012         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3013         for(i=1; i<9; i++){
3014             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3015             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3016         }
3017         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3018         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3019     }
3020 }
3021
3022 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
3023     MpegEncContext * const s = &h->s;
3024     int temp8, i;
3025     uint64_t temp64;
3026     int deblock_left;
3027     int deblock_top;
3028     int mb_xy;
3029
3030     if(h->deblocking_filter == 2) {
3031         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
3032         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
3033         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
3034     } else {
3035         deblock_left = (s->mb_x > 0);
3036         deblock_top =  (s->mb_y > 0);
3037     }
3038
3039     src_y  -=   linesize + 1;
3040     src_cb -= uvlinesize + 1;
3041     src_cr -= uvlinesize + 1;
3042
3043 #define XCHG(a,b,t,xchg)\
3044 t= a;\
3045 if(xchg)\
3046     a= b;\
3047 b= t;
3048
3049     if(deblock_left){
3050         for(i = !deblock_top; i<17; i++){
3051             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3052         }
3053     }
3054
3055     if(deblock_top){
3056         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3057         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3058         if(s->mb_x+1 < s->mb_width){
3059             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3060         }
3061     }
3062
3063     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
3064         if(deblock_left){
3065             for(i = !deblock_top; i<9; i++){
3066                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3067                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3068             }
3069         }
3070         if(deblock_top){
3071             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3072             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3073         }
3074     }
3075 }
3076
3077 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3078     MpegEncContext * const s = &h->s;
3079     int i;
3080
3081     src_y  -= 2 *   linesize;
3082     src_cb -= 2 * uvlinesize;
3083     src_cr -= 2 * uvlinesize;
3084
3085     // There are two lines saved, the line above the the top macroblock of a pair,
3086     // and the line above the bottom macroblock
3087     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3088     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3089     for(i=2; i<34; i++){
3090         h->left_border[i]= src_y[15+i*  linesize];
3091     }
3092
3093     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3094     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3095     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3096     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3097
3098     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
3099         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3100         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3101         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3102         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3103         for(i=2; i<18; i++){
3104             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3105             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3106         }
3107         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3108         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3109         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3110         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3111     }
3112 }
3113
3114 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3115     MpegEncContext * const s = &h->s;
3116     int temp8, i;
3117     uint64_t temp64;
3118     int deblock_left = (s->mb_x > 0);
3119     int deblock_top  = (s->mb_y > 1);
3120
3121     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3122
3123     src_y  -= 2 *   linesize + 1;
3124     src_cb -= 2 * uvlinesize + 1;
3125     src_cr -= 2 * uvlinesize + 1;
3126
3127 #define XCHG(a,b,t,xchg)\
3128 t= a;\
3129 if(xchg)\
3130     a= b;\
3131 b= t;
3132
3133     if(deblock_left){
3134         for(i = (!deblock_top)<<1; i<34; i++){
3135             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3136         }
3137     }
3138
3139     if(deblock_top){
3140         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3141         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3142         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3143         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3144         if(s->mb_x+1 < s->mb_width){
3145             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3146             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3147         }
3148     }
3149
3150     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
3151         if(deblock_left){
3152             for(i = (!deblock_top) << 1; i<18; i++){
3153                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3154                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3155             }
3156         }
3157         if(deblock_top){
3158             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3159             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3160             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3161             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3162         }
3163     }
3164 }
3165
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into the current
 * picture: intra or inter prediction, addition of the luma and chroma
 * residual and, when h->deblocking_filter is set, border handling plus the
 * loop filter call.
 * @param simple when non-zero the field/MBAFF and intra PCM paths are
 *               skipped, the gray flag is ignored and the stream is treated
 *               as H.264 (no SVQ3 paths). The two wrappers below pass
 *               the constants 1 and 0.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const unsigned int bottom = mb_y & 1;
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    /* destination of this macroblock in the three planes */
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: doubled line sizes and the second half of
         * block_offset (filled with doubled strides in frame_start()) */
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this func?
            /* odd macroblock row: move the destination up by 15 (luma) / 7 (chroma) lines */
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* rewrite the cached reference indices as (16+ref)^(mb_y&1) */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    /* pick the luma residual functions: plain pixel add for transform
     * bypass, otherwise the 8x8 or 4x4 IDCT */
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    /* MBAFF with deblocking: exchange the pair borders once per pair, for
     * the top macroblock or for a bottom one whose top neighbour is not intra */
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
        int mbt_y = mb_y&~1;
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        unsigned int x, y;

        // The pixels are stored in h->mb array in the same order as levels,
        // copy them in output in the correct order.
        for(i=0; i<16; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=16; i<16+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=20; i<20+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* the border exchange here is paired with the xchg=0 call after
             * the intra prediction below */
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        /* four 8x8 blocks: predict, then add the residual */
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        /* only these two modes get a top-right pointer */
                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                /* unavailable: replicate the last sample of the row above */
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                /* intra 16x16: predict here; the per-block residual is added
                 * in the !IS_INTRA4x4 section further down */
                h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            /* inter macroblock: motion compensation */
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        /* luma residual for everything except intra 4x4 (handled above) */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    /* step over four 4x4 block indices per 8x8 transform block */
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* chroma residual: blocks 16..19 go to cb, 20..23 to cr */
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        if (!simple && FRAME_MBAFF) {
            //FIXME try deblocking one mb at a time?
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            /* NOTE: from here on mb_y and mb_xy shadow the outer variables and
             * refer to the row above the current macroblock */
            const int mb_y = s->mb_y - 1;
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
            const int mb_xy= mb_x + mb_y*s->mb_stride;
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            /* a pair is only filtered once its bottom macroblock is decoded */
            if (!bottom) return;
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

            if(IS_INTRA(mb_type_top | mb_type_bottom))
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);

            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
            // deblock a pair
            // top
            s->mb_y--;
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
            // bottom
            s->mb_y++;
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            /* save this macroblock's borders, then filter it */
            tprintf(h->s.avctx, "call filter_mb\n");
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
3441
3442 /**
3443  * Process a macroblock; this case avoids checks for expensive uncommon cases.
3444  */
3445 static void hl_decode_mb_simple(H264Context *h){
3446     hl_decode_mb_internal(h, 1);
3447 }
3448
3449 /**
3450  * Process a macroblock; this handles edge cases, such as interlacing.
3451  */
3452 static void av_noinline hl_decode_mb_complex(H264Context *h){
3453     hl_decode_mb_internal(h, 0);
3454 }
3455
3456 static void hl_decode_mb(H264Context *h){
3457     MpegEncContext * const s = &h->s;
3458     const int mb_x= s->mb_x;
3459     const int mb_y= s->mb_y;
3460     const int mb_xy= mb_x + mb_y*s->mb_stride;
3461     const int mb_type= s->current_picture.mb_type[mb_xy];
3462     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
3463
3464     if(!s->decode)
3465         return;
3466
3467     if (is_complex)
3468         hl_decode_mb_complex(h);
3469     else hl_decode_mb_simple(h);
3470 }
3471
3472 /**
3473  * fills the default_ref_list.
3474  */
3475 static int fill_default_ref_list(H264Context *h){
3476     MpegEncContext * const s = &h->s;
3477     int i;
3478     int smallest_poc_greater_than_current = -1;
3479     Picture sorted_short_ref[32];
3480
3481     if(h->slice_type==B_TYPE){
3482         int out_i;
3483         int limit= INT_MIN;
3484
3485         /* sort frame according to poc in B slice */
3486         for(out_i=0; out_i<h->short_ref_count; out_i++){
3487             int best_i=INT_MIN;
3488             int best_poc=INT_MAX;
3489
3490             for(i=0; i<h->short_ref_count; i++){
3491                 const int poc= h->short_ref[i]->poc;
3492                 if(poc > limit && poc < best_poc){
3493                     best_poc= poc;
3494                     best_i= i;
3495                 }
3496             }
3497
3498             assert(best_i != INT_MIN);
3499
3500             limit= best_poc;
3501             sorted_short_ref[out_i]= *h->short_ref[best_i];
3502             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3503             if (-1 == smallest_poc_greater_than_current) {
3504                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3505                     smallest_poc_greater_than_current = out_i;
3506                 }
3507             }
3508         }
3509     }
3510
3511     if(s->picture_structure == PICT_FRAME){
3512         if(h->slice_type==B_TYPE){
3513             int list;
3514             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3515
3516             // find the largest poc
3517             for(list=0; list<2; list++){
3518                 int index = 0;
3519                 int j= -99;
3520                 int step= list ? -1 : 1;
3521
3522                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3523                     while(j<0 || j>= h->short_ref_count){
3524                         if(j != -99 && step == (list ? -1 : 1))
3525                             return -1;
3526                         step = -step;
3527                         j= smallest_poc_greater_than_current + (step>>1);
3528                     }
3529                     if(sorted_short_ref[j].reference != 3) continue;
3530                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3531                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3532                 }
3533
3534                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3535                     if(h->long_ref[i] == NULL) continue;
3536                     if(h->long_ref[i]->reference != 3) continue;
3537
3538                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3539                     h->default_ref_list[ list ][index++].pic_id= i;;
3540                 }
3541
3542                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3543                     // swap the two first elements of L1 when
3544                     // L0 and L1 are identical
3545                     Picture temp= h->default_ref_list[1][0];
3546                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3547                     h->default_ref_list[1][1] = temp;
3548                 }
3549
3550                 if(index < h->ref_count[ list ])
3551                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3552             }
3553         }else{
3554             int index=0;
3555             for(i=0; i<h->short_ref_count; i++){
3556                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3557                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3558                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3559             }
3560             for(i = 0; i < 16; i++){
3561                 if(h->long_ref[i] == NULL) continue;
3562                 if(h->long_ref[i]->reference != 3) continue;
3563                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3564                 h->default_ref_list[0][index++].pic_id= i;;
3565             }
3566             if(index < h->ref_count[0])
3567                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3568         }
3569     }else{ //FIELD
3570         if(h->slice_type==B_TYPE){
3571         }else{
3572             //FIXME second field balh
3573         }
3574     }
3575 #ifdef TRACE
3576     for (i=0; i<h->ref_count[0]; i++) {
3577         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3578     }
3579     if(h->slice_type==B_TYPE){
3580         for (i=0; i<h->ref_count[1]; i++) {
3581             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3582         }
3583     }
3584 #endif
3585     return 0;
3586 }
3587
3588 static void print_short_term(H264Context *h);
3589 static void print_long_term(H264Context *h);
3590
3591 static int decode_ref_pic_list_reordering(H264Context *h){
3592     MpegEncContext * const s = &h->s;
3593     int list, index;
3594
3595     print_short_term(h);
3596     print_long_term(h);
3597     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3598
3599     for(list=0; list<h->list_count; list++){
3600         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3601
3602         if(get_bits1(&s->gb)){
3603             int pred= h->curr_pic_num;
3604
3605             for(index=0; ; index++){
3606                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3607                 unsigned int pic_id;
3608                 int i;
3609                 Picture *ref = NULL;
3610
3611                 if(reordering_of_pic_nums_idc==3)
3612                     break;
3613
3614                 if(index >= h->ref_count[list]){
3615                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3616                     return -1;
3617                 }
3618
3619                 if(reordering_of_pic_nums_idc<3){
3620                     if(reordering_of_pic_nums_idc<2){
3621                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3622
3623                         if(abs_diff_pic_num >= h->max_pic_num){
3624                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3625                             return -1;
3626                         }
3627
3628                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3629                         else                                pred+= abs_diff_pic_num;
3630                         pred &= h->max_pic_num - 1;
3631
3632                         for(i= h->short_ref_count-1; i>=0; i--){
3633                             ref = h->short_ref[i];
3634                             assert(ref->reference == 3);
3635                             assert(!ref->long_ref);
3636                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3637                                 break;
3638                         }
3639                         if(i>=0)
3640                             ref->pic_id= ref->frame_num;
3641                     }else{
3642                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3643                         if(pic_id>31){
3644                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3645                             return -1;
3646                         }
3647                         ref = h->long_ref[pic_id];
3648                         if(ref){
3649                             ref->pic_id= pic_id;
3650                             assert(ref->reference == 3);
3651                             assert(ref->long_ref);
3652                             i=0;
3653                         }else{
3654                             i=-1;
3655                         }
3656                     }
3657
3658                     if (i < 0) {
3659                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3660                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3661                     } else {
3662                         for(i=index; i+1<h->ref_count[list]; i++){
3663                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3664                                 break;
3665                         }
3666                         for(; i > index; i--){
3667                             h->ref_list[list][i]= h->ref_list[list][i-1];
3668                         }
3669                         h->ref_list[list][index]= *ref;
3670                     }
3671                 }else{
3672                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3673                     return -1;
3674                 }
3675             }
3676         }
3677     }
3678     for(list=0; list<h->list_count; list++){
3679         for(index= 0; index < h->ref_count[list]; index++){
3680             if(!h->ref_list[list][index].data[0])
3681                 h->ref_list[list][index]= s->current_picture;
3682         }
3683     }
3684
3685     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3686         direct_dist_scale_factor(h);
3687     direct_ref_list_init(h);
3688     return 0;
3689 }
3690
3691 static void fill_mbaff_ref_list(H264Context *h){
3692     int list, i, j;
3693     for(list=0; list<2; list++){ //FIXME try list_count
3694         for(i=0; i<h->ref_count[list]; i++){
3695             Picture *frame = &h->ref_list[list][i];
3696             Picture *field = &h->ref_list[list][16+2*i];
3697             field[0] = *frame;
3698             for(j=0; j<3; j++)
3699                 field[0].linesize[j] <<= 1;
3700             field[1] = field[0];
3701             for(j=0; j<3; j++)
3702                 field[1].data[j] += frame->linesize[j];
3703
3704             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3705             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3706             for(j=0; j<2; j++){
3707                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3708                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3709             }
3710         }
3711     }
3712     for(j=0; j<h->ref_count[1]; j++){
3713         for(i=0; i<h->ref_count[0]; i++)
3714             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3715         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3716         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3717     }
3718 }
3719
3720 static int pred_weight_table(H264Context *h){
3721     MpegEncContext * const s = &h->s;
3722     int list, i;
3723     int luma_def, chroma_def;
3724
3725     h->use_weight= 0;
3726     h->use_weight_chroma= 0;
3727     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3728     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3729     luma_def = 1<<h->luma_log2_weight_denom;
3730     chroma_def = 1<<h->chroma_log2_weight_denom;
3731
3732     for(list=0; list<2; list++){
3733         for(i=0; i<h->ref_count[list]; i++){
3734             int luma_weight_flag, chroma_weight_flag;
3735
3736             luma_weight_flag= get_bits1(&s->gb);
3737             if(luma_weight_flag){
3738                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3739                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3740                 if(   h->luma_weight[list][i] != luma_def
3741                    || h->luma_offset[list][i] != 0)
3742                     h->use_weight= 1;
3743             }else{
3744                 h->luma_weight[list][i]= luma_def;
3745                 h->luma_offset[list][i]= 0;
3746             }
3747
3748             chroma_weight_flag= get_bits1(&s->gb);
3749             if(chroma_weight_flag){
3750                 int j;
3751                 for(j=0; j<2; j++){
3752                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3753                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3754                     if(   h->chroma_weight[list][i][j] != chroma_def
3755                        || h->chroma_offset[list][i][j] != 0)
3756                         h->use_weight_chroma= 1;
3757                 }
3758             }else{
3759                 int j;
3760                 for(j=0; j<2; j++){
3761                     h->chroma_weight[list][i][j]= chroma_def;
3762                     h->chroma_offset[list][i][j]= 0;
3763                 }
3764             }
3765         }
3766         if(h->slice_type != B_TYPE) break;
3767     }
3768     h->use_weight= h->use_weight || h->use_weight_chroma;
3769     return 0;
3770 }
3771
3772 static void implicit_weight_table(H264Context *h){
3773     MpegEncContext * const s = &h->s;
3774     int ref0, ref1;
3775     int cur_poc = s->current_picture_ptr->poc;
3776
3777     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3778        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3779         h->use_weight= 0;
3780         h->use_weight_chroma= 0;
3781         return;
3782     }
3783
3784     h->use_weight= 2;
3785     h->use_weight_chroma= 2;
3786     h->luma_log2_weight_denom= 5;
3787     h->chroma_log2_weight_denom= 5;
3788
3789     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3790         int poc0 = h->ref_list[0][ref0].poc;
3791         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3792             int poc1 = h->ref_list[1][ref1].poc;
3793             int td = av_clip(poc1 - poc0, -128, 127);
3794             if(td){
3795                 int tb = av_clip(cur_poc - poc0, -128, 127);
3796                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3797                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3798                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3799                     h->implicit_weight[ref0][ref1] = 32;
3800                 else
3801                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3802             }else
3803                 h->implicit_weight[ref0][ref1] = 32;
3804         }
3805     }
3806 }
3807
3808 static inline void unreference_pic(H264Context *h, Picture *pic){
3809     int i;
3810     pic->reference=0;
3811     if(pic == h->delayed_output_pic)
3812         pic->reference=1;
3813     else{
3814         for(i = 0; h->delayed_pic[i]; i++)
3815             if(pic == h->delayed_pic[i]){
3816                 pic->reference=1;
3817                 break;
3818             }
3819     }
3820 }
3821
3822 /**
3823  * instantaneous decoder refresh.
3824  */
3825 static void idr(H264Context *h){
3826     int i;
3827
3828     for(i=0; i<16; i++){
3829         if (h->long_ref[i] != NULL) {
3830             unreference_pic(h, h->long_ref[i]);
3831             h->long_ref[i]= NULL;
3832         }
3833     }
3834     h->long_ref_count=0;
3835
3836     for(i=0; i<h->short_ref_count; i++){
3837         unreference_pic(h, h->short_ref[i]);
3838         h->short_ref[i]= NULL;
3839     }
3840     h->short_ref_count=0;
3841 }
3842
3843 /* forget old pics after a seek */
3844 static void flush_dpb(AVCodecContext *avctx){
3845     H264Context *h= avctx->priv_data;
3846     int i;
3847     for(i=0; i<16; i++) {
3848         if(h->delayed_pic[i])
3849             h->delayed_pic[i]->reference= 0;
3850         h->delayed_pic[i]= NULL;
3851     }
3852     if(h->delayed_output_pic)
3853         h->delayed_output_pic->reference= 0;
3854     h->delayed_output_pic= NULL;
3855     idr(h);
3856     if(h->s.current_picture_ptr)
3857         h->s.current_picture_ptr->reference= 0;
3858 }
3859
3860 /**
3861  *
3862  * @return the removed picture or NULL if an error occurs
3863  */
3864 static Picture * remove_short(H264Context *h, int frame_num){
3865     MpegEncContext * const s = &h->s;
3866     int i;
3867
3868     if(s->avctx->debug&FF_DEBUG_MMCO)
3869         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3870
3871     for(i=0; i<h->short_ref_count; i++){
3872         Picture *pic= h->short_ref[i];
3873         if(s->avctx->debug&FF_DEBUG_MMCO)
3874             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3875         if(pic->frame_num == frame_num){
3876             h->short_ref[i]= NULL;
3877             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3878             h->short_ref_count--;
3879             return pic;
3880         }
3881     }
3882     return NULL;
3883 }
3884
3885 /**
3886  *
3887  * @return the removed picture or NULL if an error occurs
3888  */
3889 static Picture * remove_long(H264Context *h, int i){
3890     Picture *pic;
3891
3892     pic= h->long_ref[i];
3893     h->long_ref[i]= NULL;
3894     if(pic) h->long_ref_count--;
3895
3896     return pic;
3897 }
3898
3899 /**
3900  * print short term list
3901  */
3902 static void print_short_term(H264Context *h) {
3903     uint32_t i;
3904     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3905         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3906         for(i=0; i<h->short_ref_count; i++){
3907             Picture *pic= h->short_ref[i];
3908             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3909         }
3910     }
3911 }
3912
3913 /**
3914  * print long term list
3915  */
3916 static void print_long_term(H264Context *h) {
3917     uint32_t i;
3918     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3919         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3920         for(i = 0; i < 16; i++){
3921             Picture *pic= h->long_ref[i];
3922             if (pic) {
3923                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3924             }
3925         }
3926     }
3927 }
3928
3929 /**
3930  * Executes the reference picture marking (memory management control operations).
3931  */
3932 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3933     MpegEncContext * const s = &h->s;
3934     int i, j;
3935     int current_is_long=0;
3936     Picture *pic;
3937
3938     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3939         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3940
3941     for(i=0; i<mmco_count; i++){
3942         if(s->avctx->debug&FF_DEBUG_MMCO)
3943             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3944
3945         switch(mmco[i].opcode){
3946         case MMCO_SHORT2UNUSED:
3947             pic= remove_short(h, mmco[i].short_frame_num);
3948             if(pic)
3949                 unreference_pic(h, pic);
3950             else if(s->avctx->debug&FF_DEBUG_MMCO)
3951                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3952             break;
3953         case MMCO_SHORT2LONG:
3954             pic= remove_long(h, mmco[i].long_index);
3955             if(pic) unreference_pic(h, pic);
3956
3957             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3958             if (h->long_ref[ mmco[i].long_index ]){
3959                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3960                 h->long_ref_count++;
3961             }
3962             break;
3963         case MMCO_LONG2UNUSED:
3964             pic= remove_long(h, mmco[i].long_index);
3965             if(pic)
3966                 unreference_pic(h, pic);
3967             else if(s->avctx->debug&FF_DEBUG_MMCO)
3968                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3969             break;
3970         case MMCO_LONG:
3971             pic= remove_long(h, mmco[i].long_index);
3972             if(pic) unreference_pic(h, pic);
3973
3974             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3975             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3976             h->long_ref_count++;
3977
3978             current_is_long=1;
3979             break;
3980         case MMCO_SET_MAX_LONG:
3981             assert(mmco[i].long_index <= 16);
3982             // just remove the long term which index is greater than new max
3983             for(j = mmco[i].long_index; j<16; j++){
3984                 pic = remove_long(h, j);
3985                 if (pic) unreference_pic(h, pic);
3986             }
3987             break;
3988         case MMCO_RESET:
3989             while(h->short_ref_count){
3990                 pic= remove_short(h, h->short_ref[0]->frame_num);
3991                 if(pic) unreference_pic(h, pic);
3992             }
3993             for(j = 0; j < 16; j++) {
3994                 pic= remove_long(h, j);
3995                 if(pic) unreference_pic(h, pic);
3996             }
3997             break;
3998         default: assert(0);
3999         }
4000     }
4001
4002     if(!current_is_long){
4003         pic= remove_short(h, s->current_picture_ptr->frame_num);
4004         if(pic){
4005             unreference_pic(h, pic);
4006             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4007         }
4008
4009         if(h->short_ref_count)
4010             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4011
4012         h->short_ref[0]= s->current_picture_ptr;
4013         h->short_ref[0]->long_ref=0;
4014         h->short_ref_count++;
4015     }
4016
4017     print_short_term(h);
4018     print_long_term(h);
4019     return 0;
4020 }
4021
4022 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
4023     MpegEncContext * const s = &h->s;
4024     int i;
4025
4026     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4027         s->broken_link= get_bits1(gb) -1;
4028         h->mmco[0].long_index= get_bits1(gb) - 1; // current_long_term_idx
4029         if(h->mmco[0].long_index == -1)
4030             h->mmco_index= 0;
4031         else{
4032             h->mmco[0].opcode= MMCO_LONG;
4033             h->mmco_index= 1;
4034         }
4035     }else{
4036         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
4037             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4038                 MMCOOpcode opcode= get_ue_golomb(gb);
4039
4040                 h->mmco[i].opcode= opcode;
4041                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4042                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4043 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4044                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4045                         return -1;
4046                     }*/
4047                 }
4048                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4049                     unsigned int long_index= get_ue_golomb(gb);
4050                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4051                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4052                         return -1;
4053                     }
4054                     h->mmco[i].long_index= long_index;
4055                 }
4056
4057                 if(opcode > (unsigned)MMCO_LONG){
4058                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4059                     return -1;
4060                 }
4061                 if(opcode == MMCO_END)
4062                     break;
4063             }
4064             h->mmco_index= i;
4065         }else{
4066             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4067
4068             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4069                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4070                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4071                 h->mmco_index= 1;
4072             }else
4073                 h->mmco_index= 0;
4074         }
4075     }
4076
4077     return 0;
4078 }
4079
4080 static int init_poc(H264Context *h){
4081     MpegEncContext * const s = &h->s;
4082     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4083     int field_poc[2];
4084
4085     if(h->nal_unit_type == NAL_IDR_SLICE){
4086         h->frame_num_offset= 0;
4087     }else{
4088         if(h->frame_num < h->prev_frame_num)
4089             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4090         else
4091             h->frame_num_offset= h->prev_frame_num_offset;
4092     }
4093
4094     if(h->sps.poc_type==0){
4095         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4096
4097         if(h->nal_unit_type == NAL_IDR_SLICE){
4098              h->prev_poc_msb=
4099              h->prev_poc_lsb= 0;
4100         }
4101
4102         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4103             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4104         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4105             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4106         else
4107             h->poc_msb = h->prev_poc_msb;
4108 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4109         field_poc[0] =
4110         field_poc[1] = h->poc_msb + h->poc_lsb;
4111         if(s->picture_structure == PICT_FRAME)
4112             field_poc[1] += h->delta_poc_bottom;
4113     }else if(h->sps.poc_type==1){
4114         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4115         int i;
4116
4117         if(h->sps.poc_cycle_length != 0)
4118             abs_frame_num = h->frame_num_offset + h->frame_num;
4119         else
4120             abs_frame_num = 0;
4121
4122         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4123             abs_frame_num--;
4124
4125         expected_delta_per_poc_cycle = 0;
4126         for(i=0; i < h->sps.poc_cycle_length; i++)
4127             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4128
4129         if(abs_frame_num > 0){
4130             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4131             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4132
4133             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4134             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4135                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4136         } else
4137             expectedpoc = 0;
4138
4139         if(h->nal_ref_idc == 0)
4140             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4141
4142         field_poc[0] = expectedpoc + h->delta_poc[0];
4143         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4144
4145         if(s->picture_structure == PICT_FRAME)
4146             field_poc[1] += h->delta_poc[1];
4147     }else{
4148         int poc;
4149         if(h->nal_unit_type == NAL_IDR_SLICE){
4150             poc= 0;
4151         }else{
4152             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4153             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4154         }
4155         field_poc[0]= poc;
4156         field_poc[1]= poc;
4157     }
4158
4159     if(s->picture_structure != PICT_BOTTOM_FIELD)
4160         s->current_picture_ptr->field_poc[0]= field_poc[0];
4161     if(s->picture_structure != PICT_TOP_FIELD)
4162         s->current_picture_ptr->field_poc[1]= field_poc[1];
4163     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4164         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4165
4166     return 0;
4167 }
4168
4169
4170 /**
4171  * initialize scan tables
4172  */
4173 static void init_scan_tables(H264Context *h){
4174     MpegEncContext * const s = &h->s;
4175     int i;
4176     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4177         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4178         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4179     }else{
4180         for(i=0; i<16; i++){
4181 #define T(x) (x>>2) | ((x<<2) & 0xF)
4182             h->zigzag_scan[i] = T(zigzag_scan[i]);
4183             h-> field_scan[i] = T( field_scan[i]);
4184 #undef T
4185         }
4186     }
4187     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4188         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4189         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4190         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4191         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4192     }else{
4193         for(i=0; i<64; i++){
4194 #define T(x) (x>>3) | ((x&7)<<3)
4195             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4196             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4197             h->field_scan8x8[i]        = T(field_scan8x8[i]);
4198             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4199 #undef T
4200         }
4201     }
4202     if(h->sps.transform_bypass){ //FIXME same ugly
4203         h->zigzag_scan_q0          = zigzag_scan;
4204         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4205         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4206         h->field_scan_q0           = field_scan;
4207         h->field_scan8x8_q0        = field_scan8x8;
4208         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4209     }else{
4210         h->zigzag_scan_q0          = h->zigzag_scan;
4211         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4212         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4213         h->field_scan_q0           = h->field_scan;
4214         h->field_scan8x8_q0        = h->field_scan8x8;
4215         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4216     }
4217 }
4218 /**
4219  * decodes a slice header.
4220  * this will also call MPV_common_init() and frame_start() as needed
4221  */
4222 static int decode_slice_header(H264Context *h){
4223     MpegEncContext * const s = &h->s;
4224     unsigned int first_mb_in_slice;
4225     unsigned int pps_id;
4226     int num_ref_idx_active_override_flag;
4227     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4228     unsigned int slice_type, tmp;
4229     int default_ref_list_done = 0;
4230
4231     s->current_picture.reference= h->nal_ref_idc != 0;
4232     s->dropable= h->nal_ref_idc == 0;
4233
4234     first_mb_in_slice= get_ue_golomb(&s->gb);
4235
4236     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
4237         h->slice_num = 0;
4238         s->current_picture_ptr= NULL;
4239     }
4240
4241     slice_type= get_ue_golomb(&s->gb);
4242     if(slice_type > 9){
4243         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4244         return -1;
4245     }
4246     if(slice_type > 4){
4247         slice_type -= 5;
4248         h->slice_type_fixed=1;
4249     }else
4250         h->slice_type_fixed=0;
4251
4252     slice_type= slice_type_map[ slice_type ];
4253     if (slice_type == I_TYPE
4254         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4255         default_ref_list_done = 1;
4256     }
4257     h->slice_type= slice_type;
4258
4259     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4260
4261     pps_id= get_ue_golomb(&s->gb);
4262     if(pps_id>=MAX_PPS_COUNT){
4263         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4264         return -1;
4265     }
4266     if(!h->pps_buffers[pps_id]) {
4267         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4268         return -1;
4269     }
4270     h->pps= *h->pps_buffers[pps_id];
4271
4272     if(!h->sps_buffers[h->pps.sps_id]) {
4273         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4274         return -1;
4275     }
4276     h->sps = *h->sps_buffers[h->pps.sps_id];
4277
4278     if(h->dequant_coeff_pps != pps_id){
4279         h->dequant_coeff_pps = pps_id;
4280         init_dequant_tables(h);
4281     }
4282
4283     s->mb_width= h->sps.mb_width;
4284     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4285
4286     h->b_stride=  s->mb_width*4;
4287     h->b8_stride= s->mb_width*2;
4288
4289     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4290     if(h->sps.frame_mbs_only_flag)
4291         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4292     else
4293         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4294
4295     if (s->context_initialized
4296         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4297         free_tables(h);
4298         MPV_common_end(s);
4299     }
4300     if (!s->context_initialized) {
4301         if (MPV_common_init(s) < 0)
4302             return -1;
4303
4304         init_scan_tables(h);
4305         alloc_tables(h);
4306
4307         s->avctx->width = s->width;
4308         s->avctx->height = s->height;
4309         s->avctx->sample_aspect_ratio= h->sps.sar;
4310         if(!s->avctx->sample_aspect_ratio.den)
4311             s->avctx->sample_aspect_ratio.den = 1;
4312
4313         if(h->sps.timing_info_present_flag){
4314             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4315             if(h->x264_build > 0 && h->x264_build < 44)
4316                 s->avctx->time_base.den *= 2;
4317             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4318                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4319         }
4320     }
4321
4322     if(h->slice_num == 0){
4323         if(frame_start(h) < 0)
4324             return -1;
4325     }
4326
4327     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4328     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4329
4330     h->mb_mbaff = 0;
4331     h->mb_aff_frame = 0;
4332     if(h->sps.frame_mbs_only_flag){
4333         s->picture_structure= PICT_FRAME;
4334     }else{
4335         if(get_bits1(&s->gb)) { //field_pic_flag
4336             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4337             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4338         } else {
4339             s->picture_structure= PICT_FRAME;
4340             h->mb_aff_frame = h->sps.mb_aff;
4341         }
4342     }
4343     assert(s->mb_num == s->mb_width * s->mb_height);
4344     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4345        first_mb_in_slice                    >= s->mb_num){
4346         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4347         return -1;
4348     }
4349     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4350     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4351     assert(s->mb_y < s->mb_height);
4352
4353     if(s->picture_structure==PICT_FRAME){
4354         h->curr_pic_num=   h->frame_num;
4355         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4356     }else{
4357         h->curr_pic_num= 2*h->frame_num;
4358         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4359     }
4360
4361     if(h->nal_unit_type == NAL_IDR_SLICE){
4362         get_ue_golomb(&s->gb); /* idr_pic_id */
4363     }
4364
4365     if(h->sps.poc_type==0){
4366         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4367
4368         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4369             h->delta_poc_bottom= get_se_golomb(&s->gb);
4370         }
4371     }
4372
4373     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4374         h->delta_poc[0]= get_se_golomb(&s->gb);
4375
4376         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4377             h->delta_poc[1]= get_se_golomb(&s->gb);
4378     }
4379
4380     init_poc(h);
4381
4382     if(h->pps.redundant_pic_cnt_present){
4383         h->redundant_pic_count= get_ue_golomb(&s->gb);
4384     }
4385
4386     //set defaults, might be overriden a few line later
4387     h->ref_count[0]= h->pps.ref_count[0];
4388     h->ref_count[1]= h->pps.ref_count[1];
4389
4390     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4391         if(h->slice_type == B_TYPE){
4392             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4393             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4394                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4395         }
4396         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4397
4398         if(num_ref_idx_active_override_flag){
4399             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4400             if(h->slice_type==B_TYPE)
4401                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4402
4403             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4404                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4405                 h->ref_count[0]= h->ref_count[1]= 1;
4406                 return -1;
4407             }
4408         }
4409         if(h->slice_type == B_TYPE)
4410             h->list_count= 2;
4411         else
4412             h->list_count= 1;
4413     }else
4414         h->list_count= 0;
4415
4416     if(!default_ref_list_done){
4417         fill_default_ref_list(h);
4418     }
4419
4420     if(decode_ref_pic_list_reordering(h) < 0)
4421         return -1;
4422
4423     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4424        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4425         pred_weight_table(h);
4426     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4427         implicit_weight_table(h);
4428     else
4429         h->use_weight = 0;
4430
4431     if(s->current_picture.reference)
4432         decode_ref_pic_marking(h, &s->gb);
4433
4434     if(FRAME_MBAFF)
4435         fill_mbaff_ref_list(h);
4436
4437     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4438         tmp = get_ue_golomb(&s->gb);
4439         if(tmp > 2){
4440             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4441             return -1;
4442         }
4443         h->cabac_init_idc= tmp;
4444     }
4445
4446     h->last_qscale_diff = 0;
4447     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4448     if(tmp>51){
4449         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4450         return -1;
4451     }
4452     s->qscale= tmp;
4453     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4454     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4455     //FIXME qscale / qp ... stuff
4456     if(h->slice_type == SP_TYPE){
4457         get_bits1(&s->gb); /* sp_for_switch_flag */
4458     }
4459     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4460         get_se_golomb(&s->gb); /* slice_qs_delta */
4461     }
4462
4463     h->deblocking_filter = 1;
4464     h->slice_alpha_c0_offset = 0;
4465     h->slice_beta_offset = 0;
4466     if( h->pps.deblocking_filter_parameters_present ) {
4467         tmp= get_ue_golomb(&s->gb);
4468         if(tmp > 2){
4469             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4470             return -1;
4471         }
4472         h->deblocking_filter= tmp;
4473         if(h->deblocking_filter < 2)
4474             h->deblocking_filter^= 1; // 1<->0
4475
4476         if( h->deblocking_filter ) {
4477             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4478             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4479         }
4480     }
4481     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4482        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4483        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4484        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4485         h->deblocking_filter= 0;
4486
4487 #if 0 //FMO
4488     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4489         slice_group_change_cycle= get_bits(&s->gb, ?);
4490 #endif
4491
4492     h->slice_num++;
4493
4494     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4495     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4496
4497     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4498         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4499                h->slice_num,
4500                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4501                first_mb_in_slice,
4502                av_get_pict_type_char(h->slice_type),
4503                pps_id, h->frame_num,
4504                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4505                h->ref_count[0], h->ref_count[1],
4506                s->qscale,
4507                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4508                h->use_weight,
4509                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4510                );
4511     }
4512
4513     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4514         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4515         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4516     }else{
4517         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4518         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4519     }
4520
4521     return 0;
4522 }
4523
4524 /**
4525  *
4526  */
4527 static inline int get_level_prefix(GetBitContext *gb){
4528     unsigned int buf;
4529     int log;
4530
4531     OPEN_READER(re, gb);
4532     UPDATE_CACHE(re, gb);
4533     buf=GET_CACHE(re, gb);
4534
4535     log= 32 - av_log2(buf);
4536 #ifdef TRACE
4537     print_bin(buf>>(32-log), log);
4538     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4539 #endif
4540
4541     LAST_SKIP_BITS(re, gb, log);
4542     CLOSE_READER(re, gb);
4543
4544     return log-1;
4545 }
4546
4547 static inline int get_dct8x8_allowed(H264Context *h){
4548     int i;
4549     for(i=0; i<4; i++){
4550         if(!IS_SUB_8X8(h->sub_mb_type[i])
4551            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4552             return 0;
4553     }
4554     return 1;
4555 }
4556
4557 /**
4558  * decodes a residual block.
4559  * @param n block index
4560  * @param scantable scantable
4561  * @param max_coeff number of coefficients in the block
4562  * @return <0 if an error occured
4563  */
4564 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4565     MpegEncContext * const s = &h->s;
4566     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4567     int level[16];
4568     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4569
4570     //FIXME put trailing_onex into the context
4571
4572     if(n == CHROMA_DC_BLOCK_INDEX){
4573         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4574         total_coeff= coeff_token>>2;
4575     }else{
4576         if(n == LUMA_DC_BLOCK_INDEX){
4577             total_coeff= pred_non_zero_count(h, 0);
4578             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4579             total_coeff= coeff_token>>2;
4580         }else{
4581             total_coeff= pred_non_zero_count(h, n);
4582             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4583             total_coeff= coeff_token>>2;
4584             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4585         }
4586     }
4587
4588     //FIXME set last_non_zero?
4589
4590     if(total_coeff==0)
4591         return 0;
4592     if(total_coeff > (unsigned)max_coeff) {
4593         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4594         return -1;
4595     }
4596
4597     trailing_ones= coeff_token&3;
4598     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4599     assert(total_coeff<=16);
4600
4601     for(i=0; i<trailing_ones; i++){
4602         level[i]= 1 - 2*get_bits1(gb);
4603     }
4604
4605     if(i<total_coeff) {
4606         int level_code, mask;
4607         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4608         int prefix= get_level_prefix(gb);
4609
4610         //first coefficient has suffix_length equal to 0 or 1
4611         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4612             if(suffix_length)
4613                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4614             else
4615                 level_code= (prefix<<suffix_length); //part
4616         }else if(prefix==14){
4617             if(suffix_length)
4618                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4619             else
4620                 level_code= prefix + get_bits(gb, 4); //part
4621         }else if(prefix==15){
4622             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4623             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4624         }else{
4625             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4626             return -1;
4627         }
4628
4629         if(trailing_ones < 3) level_code += 2;
4630
4631         suffix_length = 1;
4632         if(level_code > 5)
4633             suffix_length++;
4634         mask= -(level_code&1);
4635         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4636         i++;
4637
4638         //remaining coefficients have suffix_length > 0
4639         for(;i<total_coeff;i++) {
4640             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4641             prefix = get_level_prefix(gb);
4642             if(prefix<15){
4643                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4644             }else if(prefix==15){
4645                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4646             }else{
4647                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4648                 return -1;
4649             }
4650             mask= -(level_code&1);
4651             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4652             if(level_code > suffix_limit[suffix_length])
4653                 suffix_length++;
4654         }
4655     }
4656
4657     if(total_coeff == max_coeff)
4658         zeros_left=0;
4659     else{
4660         if(n == CHROMA_DC_BLOCK_INDEX)
4661             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4662         else
4663             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4664     }
4665
4666     coeff_num = zeros_left + total_coeff - 1;
4667     j = scantable[coeff_num];
4668     if(n > 24){
4669         block[j] = level[0];
4670         for(i=1;i<total_coeff;i++) {
4671             if(zeros_left <= 0)
4672                 run_before = 0;
4673             else if(zeros_left < 7){
4674                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4675             }else{
4676                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4677             }
4678             zeros_left -= run_before;
4679             coeff_num -= 1 + run_before;
4680             j= scantable[ coeff_num ];
4681
4682             block[j]= level[i];
4683         }
4684     }else{
4685         block[j] = (level[0] * qmul[j] + 32)>>6;
4686         for(i=1;i<total_coeff;i++) {
4687             if(zeros_left <= 0)
4688                 run_before = 0;
4689             else if(zeros_left < 7){
4690                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4691             }else{
4692                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4693             }
4694             zeros_left -= run_before;
4695             coeff_num -= 1 + run_before;
4696             j= scantable[ coeff_num ];
4697
4698             block[j]= (level[i] * qmul[j] + 32)>>6;
4699         }
4700     }
4701
4702     if(zeros_left<0){
4703         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4704         return -1;
4705     }
4706
4707     return 0;
4708 }
4709
4710 static void predict_field_decoding_flag(H264Context *h){
4711     MpegEncContext * const s = &h->s;
4712     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4713     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4714                 ? s->current_picture.mb_type[mb_xy-1]
4715                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4716                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4717                 : 0;
4718     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4719 }
4720
4721 /**
4722  * decodes a P_SKIP or B_SKIP macroblock
4723  */
4724 static void decode_mb_skip(H264Context *h){
4725     MpegEncContext * const s = &h->s;
4726     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4727     int mb_type=0;
4728
4729     memset(h->non_zero_count[mb_xy], 0, 16);
4730     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4731
4732     if(MB_FIELD)
4733         mb_type|= MB_TYPE_INTERLACED;
4734
4735     if( h->slice_type == B_TYPE )
4736     {
4737         // just for fill_caches. pred_direct_motion will set the real mb_type
4738         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4739
4740         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4741         pred_direct_motion(h, &mb_type);
4742         mb_type|= MB_TYPE_SKIP;
4743     }
4744     else
4745     {
4746         int mx, my;
4747         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4748
4749         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4750         pred_pskip_motion(h, &mx, &my);
4751         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4752         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4753     }
4754
4755     write_back_motion(h, mb_type);
4756     s->current_picture.mb_type[mb_xy]= mb_type;
4757     s->current_picture.qscale_table[mb_xy]= s->qscale;
4758     h->slice_table[ mb_xy ]= h->slice_num;
4759     h->prev_mb_skipped= 1;
4760 }
4761
4762 /**
4763  * decodes a macroblock
4764  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4765  */
4766 static int decode_mb_cavlc(H264Context *h){
4767     MpegEncContext * const s = &h->s;
4768     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4769     int partition_count;
4770     unsigned int mb_type, cbp;
4771     int dct8x8_allowed= h->pps.transform_8x8_mode;
4772
4773     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4774
4775     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4776     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4777                 down the code */
4778     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4779         if(s->mb_skip_run==-1)
4780             s->mb_skip_run= get_ue_golomb(&s->gb);
4781
4782         if (s->mb_skip_run--) {
4783             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4784                 if(s->mb_skip_run==0)
4785                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4786                 else
4787                     predict_field_decoding_flag(h);
4788             }
4789             decode_mb_skip(h);
4790             return 0;
4791         }
4792     }
4793     if(FRAME_MBAFF){
4794         if( (s->mb_y&1) == 0 )
4795             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4796     }else
4797         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4798
4799     h->prev_mb_skipped= 0;
4800
4801     mb_type= get_ue_golomb(&s->gb);
4802     if(h->slice_type == B_TYPE){
4803         if(mb_type < 23){
4804             partition_count= b_mb_type_info[mb_type].partition_count;
4805             mb_type=         b_mb_type_info[mb_type].type;
4806         }else{
4807             mb_type -= 23;
4808             goto decode_intra_mb;
4809         }
4810     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4811         if(mb_type < 5){
4812             partition_count= p_mb_type_info[mb_type].partition_count;
4813             mb_type=         p_mb_type_info[mb_type].type;
4814         }else{
4815             mb_type -= 5;
4816             goto decode_intra_mb;
4817         }
4818     }else{
4819        assert(h->slice_type == I_TYPE);
4820 decode_intra_mb:
4821         if(mb_type > 25){
4822             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4823             return -1;
4824         }
4825         partition_count=0;
4826         cbp= i_mb_type_info[mb_type].cbp;
4827         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4828         mb_type= i_mb_type_info[mb_type].type;
4829     }
4830
4831     if(MB_FIELD)
4832         mb_type |= MB_TYPE_INTERLACED;
4833
4834     h->slice_table[ mb_xy ]= h->slice_num;
4835
4836     if(IS_INTRA_PCM(mb_type)){
4837         unsigned int x, y;
4838
4839         // We assume these blocks are very rare so we do not optimize it.
4840         align_get_bits(&s->gb);
4841
4842         // The pixels are stored in the same order as levels in h->mb array.
4843         for(y=0; y<16; y++){
4844             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4845             for(x=0; x<16; x++){
4846                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4847                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4848             }
4849         }
4850         for(y=0; y<8; y++){
4851             const int index= 256 + 4*(y&3) + 32*(y>>2);
4852             for(x=0; x<8; x++){
4853                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4854                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4855             }
4856         }
4857         for(y=0; y<8; y++){
4858             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4859             for(x=0; x<8; x++){
4860                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4861                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4862             }
4863         }
4864
4865         // In deblocking, the quantizer is 0
4866         s->current_picture.qscale_table[mb_xy]= 0;
4867         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4868         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4869         // All coeffs are present
4870         memset(h->non_zero_count[mb_xy], 16, 16);
4871
4872         s->current_picture.mb_type[mb_xy]= mb_type;
4873         return 0;
4874     }
4875
4876     if(MB_MBAFF){
4877         h->ref_count[0] <<= 1;
4878         h->ref_count[1] <<= 1;
4879     }
4880
4881     fill_caches(h, mb_type, 0);
4882
4883     //mb_pred
4884     if(IS_INTRA(mb_type)){
4885             int pred_mode;
4886 //            init_top_left_availability(h);
4887             if(IS_INTRA4x4(mb_type)){
4888                 int i;
4889                 int di = 1;
4890                 if(dct8x8_allowed && get_bits1(&s->gb)){
4891                     mb_type |= MB_TYPE_8x8DCT;
4892                     di = 4;
4893                 }
4894
4895 //                fill_intra4x4_pred_table(h);
4896                 for(i=0; i<16; i+=di){
4897                     int mode= pred_intra_mode(h, i);
4898
4899                     if(!get_bits1(&s->gb)){
4900                         const int rem_mode= get_bits(&s->gb, 3);
4901                         mode = rem_mode + (rem_mode >= mode);
4902                     }
4903
4904                     if(di==4)
4905                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4906                     else
4907                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4908                 }
4909                 write_back_intra_pred_mode(h);
4910                 if( check_intra4x4_pred_mode(h) < 0)
4911                     return -1;
4912             }else{
4913                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4914                 if(h->intra16x16_pred_mode < 0)
4915                     return -1;
4916             }
4917
4918             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4919             if(pred_mode < 0)
4920                 return -1;
4921             h->chroma_pred_mode= pred_mode;
4922     }else if(partition_count==4){
4923         int i, j, sub_partition_count[4], list, ref[2][4];
4924
4925         if(h->slice_type == B_TYPE){
4926             for(i=0; i<4; i++){
4927                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4928                 if(h->sub_mb_type[i] >=13){
4929                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4930                     return -1;
4931                 }
4932                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4933                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4934             }
4935             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4936                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4937                 pred_direct_motion(h, &mb_type);
4938                 h->ref_cache[0][scan8[4]] =
4939                 h->ref_cache[1][scan8[4]] =
4940                 h->ref_cache[0][scan8[12]] =
4941                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4942             }
4943         }else{
4944             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4945             for(i=0; i<4; i++){
4946                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4947                 if(h->sub_mb_type[i] >=4){
4948                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4949                     return -1;
4950                 }
4951                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4952                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4953             }
4954         }
4955
4956         for(list=0; list<h->list_count; list++){
4957             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4958             for(i=0; i<4; i++){
4959                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4960                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4961                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4962                     if(tmp>=ref_count){
4963                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4964                         return -1;
4965                     }
4966                     ref[list][i]= tmp;
4967                 }else{
4968                  //FIXME
4969                     ref[list][i] = -1;
4970                 }
4971             }
4972         }
4973
4974         if(dct8x8_allowed)
4975             dct8x8_allowed = get_dct8x8_allowed(h);
4976
4977         for(list=0; list<h->list_count; list++){
4978             for(i=0; i<4; i++){
4979                 if(IS_DIRECT(h->sub_mb_type[i])) {
4980                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4981                     continue;
4982                 }
4983                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4984                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4985
4986                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4987                     const int sub_mb_type= h->sub_mb_type[i];
4988                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4989                     for(j=0; j<sub_partition_count[i]; j++){
4990                         int mx, my;
4991                         const int index= 4*i + block_width*j;
4992                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4993                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4994                         mx += get_se_golomb(&s->gb);
4995                         my += get_se_golomb(&s->gb);
4996                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4997
4998                         if(IS_SUB_8X8(sub_mb_type)){
4999                             mv_cache[ 1 ][0]=
5000                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5001                             mv_cache[ 1 ][1]=
5002                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5003                         }else if(IS_SUB_8X4(sub_mb_type)){
5004                             mv_cache[ 1 ][0]= mx;
5005                             mv_cache[ 1 ][1]= my;
5006                         }else if(IS_SUB_4X8(sub_mb_type)){
5007                             mv_cache[ 8 ][0]= mx;
5008                             mv_cache[ 8 ][1]= my;
5009                         }
5010                         mv_cache[ 0 ][0]= mx;
5011                         mv_cache[ 0 ][1]= my;
5012                     }
5013                 }else{
5014                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5015                     p[0] = p[1]=
5016                     p[8] = p[9]= 0;
5017                 }
5018             }
5019         }
5020     }else if(IS_DIRECT(mb_type)){
5021         pred_direct_motion(h, &mb_type);
5022         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5023     }else{
5024         int list, mx, my, i;
5025          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5026         if(IS_16X16(mb_type)){
5027             for(list=0; list<h->list_count; list++){
5028                     unsigned int val;
5029                     if(IS_DIR(mb_type, 0, list)){
5030                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
5031                         if(val >= h->ref_count[list]){
5032                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5033                             return -1;
5034                         }
5035                     }else
5036                         val= LIST_NOT_USED&0xFF;
5037                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5038             }
5039             for(list=0; list<h->list_count; list++){
5040                 unsigned int val;
5041                 if(IS_DIR(mb_type, 0, list)){
5042                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5043                     mx += get_se_golomb(&s->gb);
5044                     my += get_se_golomb(&s->gb);
5045                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5046
5047                     val= pack16to32(mx,my);
5048                 }else
5049                     val=0;
5050                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
5051             }
5052         }
5053         else if(IS_16X8(mb_type)){
5054             for(list=0; list<h->list_count; list++){
5055                     for(i=0; i<2; i++){
5056                         unsigned int val;
5057                         if(IS_DIR(mb_type, i, list)){
5058                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5059                             if(val >= h->ref_count[list]){
5060                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5061                                 return -1;
5062                             }
5063                         }else
5064                             val= LIST_NOT_USED&0xFF;
5065                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5066                     }
5067             }
5068             for(list=0; list<h->list_count; list++){
5069                 for(i=0; i<2; i++){
5070                     unsigned int val;
5071                     if(IS_DIR(mb_type, i, list)){
5072                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5073                         mx += get_se_golomb(&s->gb);
5074                         my += get_se_golomb(&s->gb);
5075                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5076
5077                         val= pack16to32(mx,my);
5078                     }else
5079                         val=0;
5080                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
5081                 }
5082             }
5083         }else{
5084             assert(IS_8X16(mb_type));
5085             for(list=0; list<h->list_count; list++){
5086                     for(i=0; i<2; i++){
5087                         unsigned int val;
5088                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5089                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5090                             if(val >= h->ref_count[list]){
5091                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5092                                 return -1;
5093                             }
5094                         }else
5095                             val= LIST_NOT_USED&0xFF;
5096                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5097                     }
5098             }
5099             for(list=0; list<h->list_count; list++){
5100                 for(i=0; i<2; i++){
5101                     unsigned int val;
5102                     if(IS_DIR(mb_type, i, list)){
5103                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5104                         mx += get_se_golomb(&s->gb);
5105                         my += get_se_golomb(&s->gb);
5106                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5107
5108                         val= pack16to32(mx,my);
5109                     }else
5110                         val=0;
5111                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
5112                 }
5113             }
5114         }
5115     }
5116
5117     if(IS_INTER(mb_type))
5118         write_back_motion(h, mb_type);
5119
5120     if(!IS_INTRA16x16(mb_type)){
5121         cbp= get_ue_golomb(&s->gb);
5122         if(cbp > 47){
5123             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5124             return -1;
5125         }
5126
5127         if(IS_INTRA4x4(mb_type))
5128             cbp= golomb_to_intra4x4_cbp[cbp];
5129         else
5130             cbp= golomb_to_inter_cbp[cbp];
5131     }
5132     h->cbp = cbp;
5133
5134     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5135         if(get_bits1(&s->gb))
5136             mb_type |= MB_TYPE_8x8DCT;
5137     }
5138     s->current_picture.mb_type[mb_xy]= mb_type;
5139
5140     if(cbp || IS_INTRA16x16(mb_type)){
5141         int i8x8, i4x4, chroma_idx;
5142         int dquant;
5143         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5144         const uint8_t *scan, *scan8x8, *dc_scan;
5145
5146 //        fill_non_zero_count_cache(h);
5147
5148         if(IS_INTERLACED(mb_type)){
5149             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5150             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5151             dc_scan= luma_dc_field_scan;
5152         }else{
5153             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5154             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5155             dc_scan= luma_dc_zigzag_scan;
5156         }
5157
5158         dquant= get_se_golomb(&s->gb);
5159
5160         if( dquant > 25 || dquant < -26 ){
5161             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5162             return -1;
5163         }
5164
5165         s->qscale += dquant;
5166         if(((unsigned)s->qscale) > 51){
5167             if(s->qscale<0) s->qscale+= 52;
5168             else            s->qscale-= 52;
5169         }
5170
5171         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
5172         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
5173         if(IS_INTRA16x16(mb_type)){
5174             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5175                 return -1; //FIXME continue if partitioned and other return -1 too
5176             }
5177
5178             assert((cbp&15) == 0 || (cbp&15) == 15);
5179
5180             if(cbp&15){
5181                 for(i8x8=0; i8x8<4; i8x8++){
5182                     for(i4x4=0; i4x4<4; i4x4++){
5183                         const int index= i4x4 + 4*i8x8;
5184                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5185                             return -1;
5186                         }
5187                     }
5188                 }
5189             }else{
5190                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5191             }
5192         }else{
5193             for(i8x8=0; i8x8<4; i8x8++){
5194                 if(cbp & (1<<i8x8)){
5195                     if(IS_8x8DCT(mb_type)){
5196                         DCTELEM *buf = &h->mb[64*i8x8];
5197                         uint8_t *nnz;
5198                         for(i4x4=0; i4x4<4; i4x4++){
5199                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5200                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5201                                 return -1;
5202                         }
5203                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5204                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5205                     }else{
5206                         for(i4x4=0; i4x4<4; i4x4++){
5207                             const int index= i4x4 + 4*i8x8;
5208
5209                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5210                                 return -1;
5211                             }
5212                         }
5213                     }
5214                 }else{
5215                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5216                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5217                 }
5218             }
5219         }
5220
5221         if(cbp&0x30){
5222             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5223                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5224                     return -1;
5225                 }
5226         }
5227
5228         if(cbp&0x20){
5229             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5230                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
5231                 for(i4x4=0; i4x4<4; i4x4++){
5232                     const int index= 16 + 4*chroma_idx + i4x4;
5233                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
5234                         return -1;
5235                     }
5236                 }
5237             }
5238         }else{
5239             uint8_t * const nnz= &h->non_zero_count_cache[0];
5240             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5241             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5242         }
5243     }else{
5244         uint8_t * const nnz= &h->non_zero_count_cache[0];
5245         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5246         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5247         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5248     }
5249     s->current_picture.qscale_table[mb_xy]= s->qscale;
5250     write_back_non_zero_count(h);
5251
5252     if(MB_MBAFF){
5253         h->ref_count[0] >>= 1;
5254         h->ref_count[1] >>= 1;
5255     }
5256
5257     return 0;
5258 }
5259
5260 static int decode_cabac_field_decoding_flag(H264Context *h) {
5261     MpegEncContext * const s = &h->s;
5262     const int mb_x = s->mb_x;
5263     const int mb_y = s->mb_y & ~1;
5264     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5265     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5266
5267     unsigned int ctx = 0;
5268
5269     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5270         ctx += 1;
5271     }
5272     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5273         ctx += 1;
5274     }
5275
5276     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5277 }
5278
5279 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5280     uint8_t *state= &h->cabac_state[ctx_base];
5281     int mb_type;
5282
5283     if(intra_slice){
5284         MpegEncContext * const s = &h->s;
5285         const int mba_xy = h->left_mb_xy[0];
5286         const int mbb_xy = h->top_mb_xy;
5287         int ctx=0;
5288         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5289             ctx++;
5290         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5291             ctx++;
5292         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5293             return 0;   /* I4x4 */
5294         state += 2;
5295     }else{
5296         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5297             return 0;   /* I4x4 */
5298     }
5299
5300     if( get_cabac_terminate( &h->cabac ) )
5301         return 25;  /* PCM */
5302
5303     mb_type = 1; /* I16x16 */
5304     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5305     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5306         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5307     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5308     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5309     return mb_type;
5310 }
5311
5312 static int decode_cabac_mb_type( H264Context *h ) {
5313     MpegEncContext * const s = &h->s;
5314
5315     if( h->slice_type == I_TYPE ) {
5316         return decode_cabac_intra_mb_type(h, 3, 1);
5317     } else if( h->slice_type == P_TYPE ) {
5318         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5319             /* P-type */
5320             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5321                 /* P_L0_D16x16, P_8x8 */
5322                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5323             } else {
5324                 /* P_L0_D8x16, P_L0_D16x8 */
5325                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5326             }
5327         } else {
5328             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5329         }
5330     } else if( h->slice_type == B_TYPE ) {
5331         const int mba_xy = h->left_mb_xy[0];
5332         const int mbb_xy = h->top_mb_xy;
5333         int ctx = 0;
5334         int bits;
5335
5336         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5337             ctx++;
5338         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5339             ctx++;
5340
5341         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5342             return 0; /* B_Direct_16x16 */
5343
5344         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5345             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5346         }
5347
5348         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5349         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5350         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5351         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5352         if( bits < 8 )
5353             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5354         else if( bits == 13 ) {
5355             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5356         } else if( bits == 14 )
5357             return 11; /* B_L1_L0_8x16 */
5358         else if( bits == 15 )
5359             return 22; /* B_8x8 */
5360
5361         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5362         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5363     } else {
5364         /* TODO SI/SP frames? */
5365         return -1;
5366     }
5367 }
5368
5369 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5370     MpegEncContext * const s = &h->s;
5371     int mba_xy, mbb_xy;
5372     int ctx = 0;
5373
5374     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5375         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5376         mba_xy = mb_xy - 1;
5377         if( (mb_y&1)
5378             && h->slice_table[mba_xy] == h->slice_num
5379             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5380             mba_xy += s->mb_stride;
5381         if( MB_FIELD ){
5382             mbb_xy = mb_xy - s->mb_stride;
5383             if( !(mb_y&1)
5384                 && h->slice_table[mbb_xy] == h->slice_num
5385                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5386                 mbb_xy -= s->mb_stride;
5387         }else
5388             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5389     }else{
5390         int mb_xy = mb_x + mb_y*s->mb_stride;
5391         mba_xy = mb_xy - 1;
5392         mbb_xy = mb_xy - s->mb_stride;
5393     }
5394
5395     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5396         ctx++;
5397     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5398         ctx++;
5399
5400     if( h->slice_type == B_TYPE )
5401         ctx += 13;
5402     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5403 }
5404
5405 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5406     int mode = 0;
5407
5408     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5409         return pred_mode;
5410
5411     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5412     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5413     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5414
5415     if( mode >= pred_mode )
5416         return mode + 1;
5417     else
5418         return mode;
5419 }
5420
5421 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5422     const int mba_xy = h->left_mb_xy[0];
5423     const int mbb_xy = h->top_mb_xy;
5424
5425     int ctx = 0;
5426
5427     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5428     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5429         ctx++;
5430
5431     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5432         ctx++;
5433
5434     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5435         return 0;
5436
5437     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5438         return 1;
5439     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5440         return 2;
5441     else
5442         return 3;
5443 }
5444
/* x coordinate associated with each of the 16 block indices */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* y coordinate associated with each of the 16 block indices */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* inverse mapping: block_idx_xy[x][y] is the block index located at (x, y) */
static const uint8_t block_idx_xy[4][4] = {
    /* x = 0 */ { 0, 2,  8, 10 },
    /* x = 1 */ { 1, 3,  9, 11 },
    /* x = 2 */ { 4, 6, 12, 14 },
    /* x = 3 */ { 5, 7, 13, 15 }
};
5457
5458 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5459     int cbp = 0;
5460     int cbp_b = -1;
5461     int i8x8;
5462
5463     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5464         cbp_b = h->top_cbp;
5465         tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
5466     }
5467
5468     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5469         int cbp_a = -1;
5470         int x, y;
5471         int ctx = 0;
5472
5473         x = block_idx_x[4*i8x8];
5474         y = block_idx_y[4*i8x8];
5475
5476         if( x > 0 )
5477             cbp_a = cbp;
5478         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5479             cbp_a = h->left_cbp;
5480             tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
5481         }
5482
5483         if( y > 0 )
5484             cbp_b = cbp;
5485
5486         /* No need to test for skip as we put 0 for skip block */
5487         /* No need to test for IPCM as we put 1 for IPCM block */
5488         if( cbp_a >= 0 ) {
5489             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5490             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5491                 ctx++;
5492         }
5493
5494         if( cbp_b >= 0 ) {
5495             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5496             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5497                 ctx += 2;
5498         }
5499
5500         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5501             cbp |= 1 << i8x8;
5502         }
5503     }
5504     return cbp;
5505 }
5506 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5507     int ctx;
5508     int cbp_a, cbp_b;
5509
5510     cbp_a = (h->left_cbp>>4)&0x03;
5511     cbp_b = (h-> top_cbp>>4)&0x03;
5512
5513     ctx = 0;
5514     if( cbp_a > 0 ) ctx++;
5515     if( cbp_b > 0 ) ctx += 2;
5516     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5517         return 0;
5518
5519     ctx = 4;
5520     if( cbp_a == 2 ) ctx++;
5521     if( cbp_b == 2 ) ctx += 2;
5522     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5523 }
5524 static int decode_cabac_mb_dqp( H264Context *h) {
5525     MpegEncContext * const s = &h->s;
5526     int mbn_xy;
5527     int   ctx = 0;
5528     int   val = 0;
5529
5530     if( s->mb_x > 0 )
5531         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5532     else
5533         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5534
5535     if( h->last_qscale_diff != 0 )
5536         ctx++;
5537
5538     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5539         if( ctx < 2 )
5540             ctx = 2;
5541         else
5542             ctx = 3;
5543         val++;
5544         if(val > 102) //prevent infinite loop
5545             return INT_MIN;
5546     }
5547
5548     if( val&0x01 )
5549         return (val + 1)/2;
5550     else
5551         return -(val + 1)/2;
5552 }
5553 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5554     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5555         return 0;   /* 8x8 */
5556     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5557         return 1;   /* 8x4 */
5558     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5559         return 2;   /* 4x8 */
5560     return 3;       /* 4x4 */
5561 }
5562 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5563     int type;
5564     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5565         return 0;   /* B_Direct_8x8 */
5566     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5567         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5568     type = 3;
5569     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5570         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5571             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5572         type += 4;
5573     }
5574     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5575     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5576     return type;
5577 }
5578
5579 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5580     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5581 }
5582
5583 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5584     int refa = h->ref_cache[list][scan8[n] - 1];
5585     int refb = h->ref_cache[list][scan8[n] - 8];
5586     int ref  = 0;
5587     int ctx  = 0;
5588
5589     if( h->slice_type == B_TYPE) {
5590         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5591             ctx++;
5592         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5593             ctx += 2;
5594     } else {
5595         if( refa > 0 )
5596             ctx++;
5597         if( refb > 0 )
5598             ctx += 2;
5599     }
5600
5601     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5602         ref++;
5603         if( ctx < 4 )
5604             ctx = 4;
5605         else
5606             ctx = 5;
5607         if(ref >= 32 /*h->ref_list[list]*/){
5608             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5609             return 0; //FIXME we should return -1 and check the return everywhere
5610         }
5611     }
5612     return ref;
5613 }
5614
5615 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5616     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5617                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5618     int ctxbase = (l == 0) ? 40 : 47;
5619     int ctx, mvd;
5620
5621     if( amvd < 3 )
5622         ctx = 0;
5623     else if( amvd > 32 )
5624         ctx = 2;
5625     else
5626         ctx = 1;
5627
5628     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5629         return 0;
5630
5631     mvd= 1;
5632     ctx= 3;
5633     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5634         mvd++;
5635         if( ctx < 6 )
5636             ctx++;
5637     }
5638
5639     if( mvd >= 9 ) {
5640         int k = 3;
5641         while( get_cabac_bypass( &h->cabac ) ) {
5642             mvd += 1 << k;
5643             k++;
5644             if(k>24){
5645                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5646                 return INT_MIN;
5647             }
5648         }
5649         while( k-- ) {
5650             if( get_cabac_bypass( &h->cabac ) )
5651                 mvd += 1 << k;
5652         }
5653     }
5654     return get_cabac_bypass_sign( &h->cabac, -mvd );
5655 }
5656
5657 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5658     int nza, nzb;
5659     int ctx = 0;
5660
5661     if( cat == 0 ) {
5662         nza = h->left_cbp&0x100;
5663         nzb = h-> top_cbp&0x100;
5664     } else if( cat == 1 || cat == 2 ) {
5665         nza = h->non_zero_count_cache[scan8[idx] - 1];
5666         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5667     } else if( cat == 3 ) {
5668         nza = (h->left_cbp>>(6+idx))&0x01;
5669         nzb = (h-> top_cbp>>(6+idx))&0x01;
5670     } else {
5671         assert(cat == 4);
5672         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5673         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5674     }
5675
5676     if( nza > 0 )
5677         ctx++;
5678
5679     if( nzb > 0 )
5680         ctx += 2;
5681
5682     return ctx + 4 * cat;
5683 }
5684
5685 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5686     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5687     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5688     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5689     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5690 };
5691
5692 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5693     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5694     static const int significant_coeff_flag_offset[2][6] = {
5695       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5696       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5697     };
5698     static const int last_coeff_flag_offset[2][6] = {
5699       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5700       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5701     };
5702     static const int coeff_abs_level_m1_offset[6] = {
5703         227+0, 227+10, 227+20, 227+30, 227+39, 426
5704     };
5705     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5706       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5707         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5708         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5709        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5710       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5711         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5712         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5713         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5714     };
5715
5716     int index[64];
5717
5718     int av_unused last;
5719     int coeff_count = 0;
5720
5721     int abslevel1 = 1;
5722     int abslevelgt1 = 0;
5723
5724     uint8_t *significant_coeff_ctx_base;
5725     uint8_t *last_coeff_ctx_base;
5726     uint8_t *abs_level_m1_ctx_base;
5727
5728 #ifndef ARCH_X86
5729 #define CABAC_ON_STACK
5730 #endif
5731 #ifdef CABAC_ON_STACK
5732 #define CC &cc
5733     CABACContext cc;
5734     cc.range     = h->cabac.range;
5735     cc.low       = h->cabac.low;
5736     cc.bytestream= h->cabac.bytestream;
5737 #else
5738 #define CC &h->cabac
5739 #endif
5740
5741
5742     /* cat: 0-> DC 16x16  n = 0
5743      *      1-> AC 16x16  n = luma4x4idx
5744      *      2-> Luma4x4   n = luma4x4idx
5745      *      3-> DC Chroma n = iCbCr
5746      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5747      *      5-> Luma8x8   n = 4 * luma8x8idx
5748      */
5749
5750     /* read coded block flag */
5751     if( cat != 5 ) {
5752         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5753             if( cat == 1 || cat == 2 )
5754                 h->non_zero_count_cache[scan8[n]] = 0;
5755             else if( cat == 4 )
5756                 h->non_zero_count_cache[scan8[16+n]] = 0;
5757 #ifdef CABAC_ON_STACK
5758             h->cabac.range     = cc.range     ;
5759             h->cabac.low       = cc.low       ;
5760             h->cabac.bytestream= cc.bytestream;
5761 #endif
5762             return 0;
5763         }
5764     }
5765
5766     significant_coeff_ctx_base = h->cabac_state
5767         + significant_coeff_flag_offset[MB_FIELD][cat];
5768     last_coeff_ctx_base = h->cabac_state
5769         + last_coeff_flag_offset[MB_FIELD][cat];
5770     abs_level_m1_ctx_base = h->cabac_state
5771         + coeff_abs_level_m1_offset[cat];
5772
5773     if( cat == 5 ) {
5774 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5775         for(last= 0; last < coefs; last++) { \
5776             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5777             if( get_cabac( CC, sig_ctx )) { \
5778                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5779                 index[coeff_count++] = last; \
5780                 if( get_cabac( CC, last_ctx ) ) { \
5781                     last= max_coeff; \
5782                     break; \
5783                 } \
5784             } \
5785         }\
5786         if( last == max_coeff -1 ) {\
5787             index[coeff_count++] = last;\
5788         }
5789         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5790 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5791         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5792     } else {
5793         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5794 #else
5795         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5796     } else {
5797         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5798 #endif
5799     }
5800     assert(coeff_count > 0);
5801
5802     if( cat == 0 )
5803         h->cbp_table[mb_xy] |= 0x100;
5804     else if( cat == 1 || cat == 2 )
5805         h->non_zero_count_cache[scan8[n]] = coeff_count;
5806     else if( cat == 3 )
5807         h->cbp_table[mb_xy] |= 0x40 << n;
5808     else if( cat == 4 )
5809         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5810     else {
5811         assert( cat == 5 );
5812         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5813     }
5814
5815     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5816         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5817         int j= scantable[index[coeff_count]];
5818
5819         if( get_cabac( CC, ctx ) == 0 ) {
5820             if( !qmul ) {
5821                 block[j] = get_cabac_bypass_sign( CC, -1);
5822             }else{
5823                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5824             }
5825
5826             abslevel1++;
5827         } else {
5828             int coeff_abs = 2;
5829             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5830             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5831                 coeff_abs++;
5832             }
5833
5834             if( coeff_abs >= 15 ) {
5835                 int j = 0;
5836                 while( get_cabac_bypass( CC ) ) {
5837                     j++;
5838                 }
5839
5840                 coeff_abs=1;
5841                 while( j-- ) {
5842                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5843                 }
5844                 coeff_abs+= 14;
5845             }
5846
5847             if( !qmul ) {
5848                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5849                 else                                block[j] =  coeff_abs;
5850             }else{
5851                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5852                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5853             }
5854
5855             abslevelgt1++;
5856         }
5857     }
5858 #ifdef CABAC_ON_STACK
5859             h->cabac.range     = cc.range     ;
5860             h->cabac.low       = cc.low       ;
5861             h->cabac.bytestream= cc.bytestream;
5862 #endif
5863     return 0;
5864 }
5865
5866 static inline void compute_mb_neighbors(H264Context *h)
5867 {
5868     MpegEncContext * const s = &h->s;
5869     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5870     h->top_mb_xy     = mb_xy - s->mb_stride;
5871     h->left_mb_xy[0] = mb_xy - 1;
5872     if(FRAME_MBAFF){
5873         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5874         const int top_pair_xy      = pair_xy     - s->mb_stride;
5875         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5876         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5877         const int curr_mb_frame_flag = !MB_FIELD;
5878         const int bottom = (s->mb_y & 1);
5879         if (bottom
5880                 ? !curr_mb_frame_flag // bottom macroblock
5881                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5882                 ) {
5883             h->top_mb_xy -= s->mb_stride;
5884         }
5885         if (left_mb_frame_flag != curr_mb_frame_flag) {
5886             h->left_mb_xy[0] = pair_xy - 1;
5887         }
5888     }
5889     return;
5890 }
5891
5892 /**
5893  * decodes a macroblock
5894  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5895  */
5896 static int decode_mb_cabac(H264Context *h) {
5897     MpegEncContext * const s = &h->s;
5898     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5899     int mb_type, partition_count, cbp = 0;
5900     int dct8x8_allowed= h->pps.transform_8x8_mode;
5901
5902     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5903
5904     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5905     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5906         int skip;
5907         /* a skipped mb needs the aff flag from the following mb */
5908         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5909             predict_field_decoding_flag(h);
5910         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5911             skip = h->next_mb_skipped;
5912         else
5913             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5914         /* read skip flags */
5915         if( skip ) {
5916             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5917                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5918                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5919                 if(h->next_mb_skipped)
5920                     predict_field_decoding_flag(h);
5921                 else
5922                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5923             }
5924
5925             decode_mb_skip(h);
5926
5927             h->cbp_table[mb_xy] = 0;
5928             h->chroma_pred_mode_table[mb_xy] = 0;
5929             h->last_qscale_diff = 0;
5930
5931             return 0;
5932
5933         }
5934     }
5935     if(FRAME_MBAFF){
5936         if( (s->mb_y&1) == 0 )
5937             h->mb_mbaff =
5938             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5939     }else
5940         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5941
5942     h->prev_mb_skipped = 0;
5943
5944     compute_mb_neighbors(h);
5945     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5946         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5947         return -1;
5948     }
5949
5950     if( h->slice_type == B_TYPE ) {
5951         if( mb_type < 23 ){
5952             partition_count= b_mb_type_info[mb_type].partition_count;
5953             mb_type=         b_mb_type_info[mb_type].type;
5954         }else{
5955             mb_type -= 23;
5956             goto decode_intra_mb;
5957         }
5958     } else if( h->slice_type == P_TYPE ) {
5959         if( mb_type < 5) {
5960             partition_count= p_mb_type_info[mb_type].partition_count;
5961             mb_type=         p_mb_type_info[mb_type].type;
5962         } else {
5963             mb_type -= 5;
5964             goto decode_intra_mb;
5965         }
5966     } else {
5967        assert(h->slice_type == I_TYPE);
5968 decode_intra_mb:
5969         partition_count = 0;
5970         cbp= i_mb_type_info[mb_type].cbp;
5971         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5972         mb_type= i_mb_type_info[mb_type].type;
5973     }
5974     if(MB_FIELD)
5975         mb_type |= MB_TYPE_INTERLACED;
5976
5977     h->slice_table[ mb_xy ]= h->slice_num;
5978
5979     if(IS_INTRA_PCM(mb_type)) {
5980         const uint8_t *ptr;
5981         unsigned int x, y;
5982
5983         // We assume these blocks are very rare so we do not optimize it.
5984         // FIXME The two following lines get the bitstream position in the cabac
5985         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5986         ptr= h->cabac.bytestream;
5987         if(h->cabac.low&0x1) ptr--;
5988         if(CABAC_BITS==16){
5989             if(h->cabac.low&0x1FF) ptr--;
5990         }
5991
5992         // The pixels are stored in the same order as levels in h->mb array.
5993         for(y=0; y<16; y++){
5994             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5995             for(x=0; x<16; x++){
5996                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5997                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5998             }
5999         }
6000         for(y=0; y<8; y++){
6001             const int index= 256 + 4*(y&3) + 32*(y>>2);
6002             for(x=0; x<8; x++){
6003                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6004                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6005             }
6006         }
6007         for(y=0; y<8; y++){
6008             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6009             for(x=0; x<8; x++){
6010                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6011                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6012             }
6013         }
6014
6015         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6016
6017         // All blocks are present
6018         h->cbp_table[mb_xy] = 0x1ef;
6019         h->chroma_pred_mode_table[mb_xy] = 0;
6020         // In deblocking, the quantizer is 0
6021         s->current_picture.qscale_table[mb_xy]= 0;
6022         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
6023         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
6024         // All coeffs are present
6025         memset(h->non_zero_count[mb_xy], 16, 16);
6026         s->current_picture.mb_type[mb_xy]= mb_type;
6027         return 0;
6028     }
6029
6030     if(MB_MBAFF){
6031         h->ref_count[0] <<= 1;
6032         h->ref_count[1] <<= 1;
6033     }
6034
6035     fill_caches(h, mb_type, 0);
6036
6037     if( IS_INTRA( mb_type ) ) {
6038         int i, pred_mode;
6039         if( IS_INTRA4x4( mb_type ) ) {
6040             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6041                 mb_type |= MB_TYPE_8x8DCT;
6042                 for( i = 0; i < 16; i+=4 ) {
6043                     int pred = pred_intra_mode( h, i );
6044                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6045                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6046                 }
6047             } else {
6048                 for( i = 0; i < 16; i++ ) {
6049                     int pred = pred_intra_mode( h, i );
6050                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6051
6052                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6053                 }
6054             }
6055             write_back_intra_pred_mode(h);
6056             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6057         } else {
6058             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6059             if( h->intra16x16_pred_mode < 0 ) return -1;
6060         }
6061         h->chroma_pred_mode_table[mb_xy] =
6062         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6063
6064         pred_mode= check_intra_pred_mode( h, pred_mode );
6065         if( pred_mode < 0 ) return -1;
6066         h->chroma_pred_mode= pred_mode;
6067     } else if( partition_count == 4 ) {
6068         int i, j, sub_partition_count[4], list, ref[2][4];
6069
6070         if( h->slice_type == B_TYPE ) {
6071             for( i = 0; i < 4; i++ ) {
6072                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6073                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6074                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6075             }
6076             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6077                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6078                 pred_direct_motion(h, &mb_type);
6079                 h->ref_cache[0][scan8[4]] =
6080                 h->ref_cache[1][scan8[4]] =
6081                 h->ref_cache[0][scan8[12]] =
6082                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
6083                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6084                     for( i = 0; i < 4; i++ )
6085                         if( IS_DIRECT(h->sub_mb_type[i]) )
6086                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6087                 }
6088             }
6089         } else {
6090             for( i = 0; i < 4; i++ ) {
6091                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6092                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6093                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6094             }
6095         }
6096
6097         for( list = 0; list < h->list_count; list++ ) {
6098                 for( i = 0; i < 4; i++ ) {
6099                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6100                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6101                         if( h->ref_count[list] > 1 )
6102                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6103                         else
6104                             ref[list][i] = 0;
6105                     } else {
6106                         ref[list][i] = -1;
6107                     }
6108                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6109                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6110                 }
6111         }
6112
6113         if(dct8x8_allowed)
6114             dct8x8_allowed = get_dct8x8_allowed(h);
6115
6116         for(list=0; list<h->list_count; list++){
6117             for(i=0; i<4; i++){
6118                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6119                 if(IS_DIRECT(h->sub_mb_type[i])){
6120                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6121                     continue;
6122                 }
6123
6124                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6125                     const int sub_mb_type= h->sub_mb_type[i];
6126                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6127                     for(j=0; j<sub_partition_count[i]; j++){
6128                         int mpx, mpy;
6129                         int mx, my;
6130                         const int index= 4*i + block_width*j;
6131                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6132                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6133                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6134
6135                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6136                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6137                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6138
6139                         if(IS_SUB_8X8(sub_mb_type)){
6140                             mv_cache[ 1 ][0]=
6141                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6142                             mv_cache[ 1 ][1]=
6143                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6144
6145                             mvd_cache[ 1 ][0]=
6146                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6147                             mvd_cache[ 1 ][1]=
6148                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6149                         }else if(IS_SUB_8X4(sub_mb_type)){
6150                             mv_cache[ 1 ][0]= mx;
6151                             mv_cache[ 1 ][1]= my;
6152
6153                             mvd_cache[ 1 ][0]= mx - mpx;
6154                             mvd_cache[ 1 ][1]= my - mpy;
6155                         }else if(IS_SUB_4X8(sub_mb_type)){
6156                             mv_cache[ 8 ][0]= mx;
6157                             mv_cache[ 8 ][1]= my;
6158
6159                             mvd_cache[ 8 ][0]= mx - mpx;
6160                             mvd_cache[ 8 ][1]= my - mpy;
6161                         }
6162                         mv_cache[ 0 ][0]= mx;
6163                         mv_cache[ 0 ][1]= my;
6164
6165                         mvd_cache[ 0 ][0]= mx - mpx;
6166                         mvd_cache[ 0 ][1]= my - mpy;
6167                     }
6168                 }else{
6169                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6170                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6171                     p[0] = p[1] = p[8] = p[9] = 0;
6172                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6173                 }
6174             }
6175         }
6176     } else if( IS_DIRECT(mb_type) ) {
6177         pred_direct_motion(h, &mb_type);
6178         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6179         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6180         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6181     } else {
6182         int list, mx, my, i, mpx, mpy;
6183         if(IS_16X16(mb_type)){
6184             for(list=0; list<h->list_count; list++){
6185                 if(IS_DIR(mb_type, 0, list)){
6186                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6187                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6188                 }else
6189                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
6190             }
6191             for(list=0; list<h->list_count; list++){
6192                 if(IS_DIR(mb_type, 0, list)){
6193                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6194
6195                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6196                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6197                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6198
6199                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6200                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6201                 }else
6202                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6203             }
6204         }
6205         else if(IS_16X8(mb_type)){
6206             for(list=0; list<h->list_count; list++){
6207                     for(i=0; i<2; i++){
6208                         if(IS_DIR(mb_type, i, list)){
6209                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6210                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6211                         }else
6212                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6213                     }
6214             }
6215             for(list=0; list<h->list_count; list++){
6216                 for(i=0; i<2; i++){
6217                     if(IS_DIR(mb_type, i, list)){
6218                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6219                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6220                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6221                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6222
6223                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6224                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6225                     }else{
6226                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6227                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6228                     }
6229                 }
6230             }
6231         }else{
6232             assert(IS_8X16(mb_type));
6233             for(list=0; list<h->list_count; list++){
6234                     for(i=0; i<2; i++){
6235                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6236                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6237                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6238                         }else
6239                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6240                     }
6241             }
6242             for(list=0; list<h->list_count; list++){
6243                 for(i=0; i<2; i++){
6244                     if(IS_DIR(mb_type, i, list)){
6245                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6246                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6247                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6248
6249                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6250                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6251                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6252                     }else{
6253                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6254                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6255                     }
6256                 }
6257             }
6258         }
6259     }
6260
6261    if( IS_INTER( mb_type ) ) {
6262         h->chroma_pred_mode_table[mb_xy] = 0;
6263         write_back_motion( h, mb_type );
6264    }
6265
6266     if( !IS_INTRA16x16( mb_type ) ) {
6267         cbp  = decode_cabac_mb_cbp_luma( h );
6268         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6269     }
6270
6271     h->cbp_table[mb_xy] = h->cbp = cbp;
6272
6273     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6274         if( decode_cabac_mb_transform_size( h ) )
6275             mb_type |= MB_TYPE_8x8DCT;
6276     }
6277     s->current_picture.mb_type[mb_xy]= mb_type;
6278
6279     if( cbp || IS_INTRA16x16( mb_type ) ) {
6280         const uint8_t *scan, *scan8x8, *dc_scan;
6281         int dqp;
6282
6283         if(IS_INTERLACED(mb_type)){
6284             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6285             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6286             dc_scan= luma_dc_field_scan;
6287         }else{
6288             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6289             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6290             dc_scan= luma_dc_zigzag_scan;
6291         }
6292
6293         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6294         if( dqp == INT_MIN ){
6295             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6296             return -1;
6297         }
6298         s->qscale += dqp;
6299         if(((unsigned)s->qscale) > 51){
6300             if(s->qscale<0) s->qscale+= 52;
6301             else            s->qscale-= 52;
6302         }
6303         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
6304         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
6305
6306         if( IS_INTRA16x16( mb_type ) ) {
6307             int i;
6308             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6309             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6310                 return -1;
6311             if( cbp&15 ) {
6312                 for( i = 0; i < 16; i++ ) {
6313                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6314                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6315                         return -1;
6316                 }
6317             } else {
6318                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6319             }
6320         } else {
6321             int i8x8, i4x4;
6322             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6323                 if( cbp & (1<<i8x8) ) {
6324                     if( IS_8x8DCT(mb_type) ) {
6325                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6326                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6327                             return -1;
6328                     } else
6329                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6330                         const int index = 4*i8x8 + i4x4;
6331                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6332 //START_TIMER
6333                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6334                             return -1;
6335 //STOP_TIMER("decode_residual")
6336                     }
6337                 } else {
6338                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6339                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6340                 }
6341             }
6342         }
6343
6344         if( cbp&0x30 ){
6345             int c;
6346             for( c = 0; c < 2; c++ ) {
6347                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6348                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6349                     return -1;
6350             }
6351         }
6352
6353         if( cbp&0x20 ) {
6354             int c, i;
6355             for( c = 0; c < 2; c++ ) {
6356                 const uint32_t *qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6357                 for( i = 0; i < 4; i++ ) {
6358                     const int index = 16 + 4 * c + i;
6359                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6360                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15) < 0)
6361                         return -1;
6362                 }
6363             }
6364         } else {
6365             uint8_t * const nnz= &h->non_zero_count_cache[0];
6366             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6367             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6368         }
6369     } else {
6370         uint8_t * const nnz= &h->non_zero_count_cache[0];
6371         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6372         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6373         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6374         h->last_qscale_diff = 0;
6375     }
6376
6377     s->current_picture.qscale_table[mb_xy]= s->qscale;
6378     write_back_non_zero_count(h);
6379
6380     if(MB_MBAFF){
6381         h->ref_count[0] >>= 1;
6382         h->ref_count[1] >>= 1;
6383     }
6384
6385     return 0;
6386 }
6387
6388
6389 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6390     int i, d;
6391     const int index_a = qp + h->slice_alpha_c0_offset;
6392     const int alpha = (alpha_table+52)[index_a];
6393     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6394
6395     if( bS[0] < 4 ) {
6396         int8_t tc[4];
6397         for(i=0; i<4; i++)
6398             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6399         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6400     } else {
6401         /* 16px edge length, because bS=4 is triggered by being at
6402          * the edge of an intra MB, so all 4 bS are the same */
6403             for( d = 0; d < 16; d++ ) {
6404                 const int p0 = pix[-1];
6405                 const int p1 = pix[-2];
6406                 const int p2 = pix[-3];
6407
6408                 const int q0 = pix[0];
6409                 const int q1 = pix[1];
6410                 const int q2 = pix[2];
6411
6412                 if( FFABS( p0 - q0 ) < alpha &&
6413                     FFABS( p1 - p0 ) < beta &&
6414                     FFABS( q1 - q0 ) < beta ) {
6415
6416                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6417                         if( FFABS( p2 - p0 ) < beta)
6418                         {
6419                             const int p3 = pix[-4];
6420                             /* p0', p1', p2' */
6421                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6422                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6423                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6424                         } else {
6425                             /* p0' */
6426                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6427                         }
6428                         if( FFABS( q2 - q0 ) < beta)
6429                         {
6430                             const int q3 = pix[3];
6431                             /* q0', q1', q2' */
6432                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6433                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6434                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6435                         } else {
6436                             /* q0' */
6437                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6438                         }
6439                     }else{
6440                         /* p0', q0' */
6441                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6442                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6443                     }
6444                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6445                 }
6446                 pix += stride;
6447             }
6448     }
6449 }
6450 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6451     int i;
6452     const int index_a = qp + h->slice_alpha_c0_offset;
6453     const int alpha = (alpha_table+52)[index_a];
6454     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6455
6456     if( bS[0] < 4 ) {
6457         int8_t tc[4];
6458         for(i=0; i<4; i++)
6459             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6460         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6461     } else {
6462         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6463     }
6464 }
6465
6466 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6467     int i;
6468     for( i = 0; i < 16; i++, pix += stride) {
6469         int index_a;
6470         int alpha;
6471         int beta;
6472
6473         int qp_index;
6474         int bS_index = (i >> 1);
6475         if (!MB_FIELD) {
6476             bS_index &= ~1;
6477             bS_index |= (i & 1);
6478         }
6479
6480         if( bS[bS_index] == 0 ) {
6481             continue;
6482         }
6483
6484         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6485         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6486         alpha = (alpha_table+52)[index_a];
6487         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6488
6489         if( bS[bS_index] < 4 ) {
6490             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6491             const int p0 = pix[-1];
6492             const int p1 = pix[-2];
6493             const int p2 = pix[-3];
6494             const int q0 = pix[0];
6495             const int q1 = pix[1];
6496             const int q2 = pix[2];
6497
6498             if( FFABS( p0 - q0 ) < alpha &&
6499                 FFABS( p1 - p0 ) < beta &&
6500                 FFABS( q1 - q0 ) < beta ) {
6501                 int tc = tc0;
6502                 int i_delta;
6503
6504                 if( FFABS( p2 - p0 ) < beta ) {
6505                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6506                     tc++;
6507                 }
6508                 if( FFABS( q2 - q0 ) < beta ) {
6509                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6510                     tc++;
6511                 }
6512
6513                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6514                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6515                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6516                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6517             }
6518         }else{
6519             const int p0 = pix[-1];
6520             const int p1 = pix[-2];
6521             const int p2 = pix[-3];
6522
6523             const int q0 = pix[0];
6524             const int q1 = pix[1];
6525             const int q2 = pix[2];
6526
6527             if( FFABS( p0 - q0 ) < alpha &&
6528                 FFABS( p1 - p0 ) < beta &&
6529                 FFABS( q1 - q0 ) < beta ) {
6530
6531                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6532                     if( FFABS( p2 - p0 ) < beta)
6533                     {
6534                         const int p3 = pix[-4];
6535                         /* p0', p1', p2' */
6536                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6537                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6538                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6539                     } else {
6540                         /* p0' */
6541                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6542                     }
6543                     if( FFABS( q2 - q0 ) < beta)
6544                     {
6545                         const int q3 = pix[3];
6546                         /* q0', q1', q2' */
6547                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6548                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6549                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6550                     } else {
6551                         /* q0' */
6552                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6553                     }
6554                 }else{
6555                     /* p0', q0' */
6556                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6557                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6558                 }
6559                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6560             }
6561         }
6562     }
6563 }
6564 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6565     int i;
6566     for( i = 0; i < 8; i++, pix += stride) {
6567         int index_a;
6568         int alpha;
6569         int beta;
6570
6571         int qp_index;
6572         int bS_index = i;
6573
6574         if( bS[bS_index] == 0 ) {
6575             continue;
6576         }
6577
6578         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6579         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6580         alpha = (alpha_table+52)[index_a];
6581         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6582
6583         if( bS[bS_index] < 4 ) {
6584             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6585             const int p0 = pix[-1];
6586             const int p1 = pix[-2];
6587             const int q0 = pix[0];
6588             const int q1 = pix[1];
6589
6590             if( FFABS( p0 - q0 ) < alpha &&
6591                 FFABS( p1 - p0 ) < beta &&
6592                 FFABS( q1 - q0 ) < beta ) {
6593                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6594
6595                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6596                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6597                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6598             }
6599         }else{
6600             const int p0 = pix[-1];
6601             const int p1 = pix[-2];
6602             const int q0 = pix[0];
6603             const int q1 = pix[1];
6604
6605             if( FFABS( p0 - q0 ) < alpha &&
6606                 FFABS( p1 - p0 ) < beta &&
6607                 FFABS( q1 - q0 ) < beta ) {
6608
6609                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6610                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6611                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6612             }
6613         }
6614     }
6615 }
6616
6617 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6618     int i, d;
6619     const int index_a = qp + h->slice_alpha_c0_offset;
6620     const int alpha = (alpha_table+52)[index_a];
6621     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6622     const int pix_next  = stride;
6623
6624     if( bS[0] < 4 ) {
6625         int8_t tc[4];
6626         for(i=0; i<4; i++)
6627             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6628         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6629     } else {
6630         /* 16px edge length, see filter_mb_edgev */
6631             for( d = 0; d < 16; d++ ) {
6632                 const int p0 = pix[-1*pix_next];
6633                 const int p1 = pix[-2*pix_next];
6634                 const int p2 = pix[-3*pix_next];
6635                 const int q0 = pix[0];
6636                 const int q1 = pix[1*pix_next];
6637                 const int q2 = pix[2*pix_next];
6638
6639                 if( FFABS( p0 - q0 ) < alpha &&
6640                     FFABS( p1 - p0 ) < beta &&
6641                     FFABS( q1 - q0 ) < beta ) {
6642
6643                     const int p3 = pix[-4*pix_next];
6644                     const int q3 = pix[ 3*pix_next];
6645
6646                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6647                         if( FFABS( p2 - p0 ) < beta) {
6648                             /* p0', p1', p2' */
6649                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6650                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6651                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6652                         } else {
6653                             /* p0' */
6654                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6655                         }
6656                         if( FFABS( q2 - q0 ) < beta) {
6657                             /* q0', q1', q2' */
6658                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6659                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6660                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6661                         } else {
6662                             /* q0' */
6663                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6664                         }
6665                     }else{
6666                         /* p0', q0' */
6667                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6668                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6669                     }
6670                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6671                 }
6672                 pix++;
6673             }
6674     }
6675 }
6676
6677 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6678     int i;
6679     const int index_a = qp + h->slice_alpha_c0_offset;
6680     const int alpha = (alpha_table+52)[index_a];
6681     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6682
6683     if( bS[0] < 4 ) {
6684         int8_t tc[4];
6685         for(i=0; i<4; i++)
6686             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6687         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6688     } else {
6689         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6690     }
6691 }
6692
6693 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6694     MpegEncContext * const s = &h->s;
6695     int mb_xy, mb_type;
6696     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6697
6698     mb_xy = mb_x + mb_y*s->mb_stride;
6699
6700     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6701        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6702                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6703         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6704         return;
6705     }
6706     assert(!FRAME_MBAFF);
6707
6708     mb_type = s->current_picture.mb_type[mb_xy];
6709     qp = s->current_picture.qscale_table[mb_xy];
6710     qp0 = s->current_picture.qscale_table[mb_xy-1];
6711     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6712     qpc = get_chroma_qp( h, 0, qp );
6713     qpc0 = get_chroma_qp( h, 0, qp0 );
6714     qpc1 = get_chroma_qp( h, 0, qp1 );
6715     qp0 = (qp + qp0 + 1) >> 1;
6716     qp1 = (qp + qp1 + 1) >> 1;
6717     qpc0 = (qpc + qpc0 + 1) >> 1;
6718     qpc1 = (qpc + qpc1 + 1) >> 1;
6719     qp_thresh = 15 - h->slice_alpha_c0_offset;
6720     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6721        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6722         return;
6723
6724     if( IS_INTRA(mb_type) ) {
6725         int16_t bS4[4] = {4,4,4,4};
6726         int16_t bS3[4] = {3,3,3,3};
6727         if( IS_8x8DCT(mb_type) ) {
6728             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6729             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6730             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6731             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6732         } else {
6733             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6734             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6735             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6736             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6737             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6738             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6739             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6740             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6741         }
6742         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6743         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6744         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6745         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6746         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6747         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6748         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6749         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6750         return;
6751     } else {
6752         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6753         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6754         int edges;
6755         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6756             edges = 4;
6757             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6758         } else {
6759             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6760                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6761             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6762                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6763                              ? 3 : 0;
6764             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6765             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6766             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6767                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6768         }
6769         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6770             bSv[0][0] = 0x0004000400040004ULL;
6771         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6772             bSv[1][0] = 0x0004000400040004ULL;
6773
6774 #define FILTER(hv,dir,edge)\
6775         if(bSv[dir][edge]) {\
6776             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6777             if(!(edge&1)) {\
6778                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6779                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6780             }\
6781         }
6782         if( edges == 1 ) {
6783             FILTER(v,0,0);
6784             FILTER(h,1,0);
6785         } else if( IS_8x8DCT(mb_type) ) {
6786             FILTER(v,0,0);
6787             FILTER(v,0,2);
6788             FILTER(h,1,0);
6789             FILTER(h,1,2);
6790         } else {
6791             FILTER(v,0,0);
6792             FILTER(v,0,1);
6793             FILTER(v,0,2);
6794             FILTER(v,0,3);
6795             FILTER(h,1,0);
6796             FILTER(h,1,1);
6797             FILTER(h,1,2);
6798             FILTER(h,1,3);
6799         }
6800 #undef FILTER
6801     }
6802 }
6803
6804 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6805     MpegEncContext * const s = &h->s;
6806     const int mb_xy= mb_x + mb_y*s->mb_stride;
6807     const int mb_type = s->current_picture.mb_type[mb_xy];
6808     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6809     int first_vertical_edge_done = 0;
6810     int dir;
6811     /* FIXME: A given frame may occupy more than one position in
6812      * the reference list. So ref2frm should be populated with
6813      * frame numbers, not indices. */
6814     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6815                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6816
6817     //for sufficiently low qp, filtering wouldn't do anything
6818     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6819     if(!FRAME_MBAFF){
6820         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6821         int qp = s->current_picture.qscale_table[mb_xy];
6822         if(qp <= qp_thresh
6823            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6824            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6825             return;
6826         }
6827     }
6828
6829     if (FRAME_MBAFF
6830             // left mb is in picture
6831             && h->slice_table[mb_xy-1] != 255
6832             // and current and left pair do not have the same interlaced type
6833             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6834             // and left mb is in the same slice if deblocking_filter == 2
6835             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6836         /* First vertical edge is different in MBAFF frames
6837          * There are 8 different bS to compute and 2 different Qp
6838          */
6839         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6840         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6841         int16_t bS[8];
6842         int qp[2];
6843         int bqp[2];
6844         int rqp[2];
6845         int mb_qp, mbn0_qp, mbn1_qp;
6846         int i;
6847         first_vertical_edge_done = 1;
6848
6849         if( IS_INTRA(mb_type) )
6850             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6851         else {
6852             for( i = 0; i < 8; i++ ) {
6853                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6854
6855                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6856                     bS[i] = 4;
6857                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6858                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6859                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6860                     bS[i] = 2;
6861                 else
6862                     bS[i] = 1;
6863             }
6864         }
6865
6866         mb_qp = s->current_picture.qscale_table[mb_xy];
6867         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6868         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6869         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6870         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6871                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6872         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6873                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6874         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6875         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6876                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6877         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6878                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6879
6880         /* Filter edge */
6881         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6882         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6883         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6884         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6885         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6886     }
6887     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6888     for( dir = 0; dir < 2; dir++ )
6889     {
6890         int edge;
6891         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6892         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6893         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6894
6895         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6896                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6897         // how often to recheck mv-based bS when iterating between edges
6898         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6899                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6900         // how often to recheck mv-based bS when iterating along each edge
6901         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6902
6903         if (first_vertical_edge_done) {
6904             start = 1;
6905             first_vertical_edge_done = 0;
6906         }
6907
6908         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6909             start = 1;
6910
6911         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6912             && !IS_INTERLACED(mb_type)
6913             && IS_INTERLACED(mbm_type)
6914             ) {
6915             // This is a special case in the norm where the filtering must
6916             // be done twice (one each of the field) even if we are in a
6917             // frame macroblock.
6918             //
6919             static const int nnz_idx[4] = {4,5,6,3};
6920             unsigned int tmp_linesize   = 2 *   linesize;
6921             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6922             int mbn_xy = mb_xy - 2 * s->mb_stride;
6923             int qp;
6924             int i, j;
6925             int16_t bS[4];
6926
6927             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6928                 if( IS_INTRA(mb_type) ||
6929                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6930                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6931                 } else {
6932                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6933                     for( i = 0; i < 4; i++ ) {
6934                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6935                             mbn_nnz[nnz_idx[i]] != 0 )
6936                             bS[i] = 2;
6937                         else
6938                             bS[i] = 1;
6939                     }
6940                 }
6941                 // Do not use s->qscale as luma quantizer because it has not the same
6942                 // value in IPCM macroblocks.
6943                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6944                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6945                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6946                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6947                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6948                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6949                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6950                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6951             }
6952
6953             start = 1;
6954         }
6955
6956         /* Calculate bS */
6957         for( edge = start; edge < edges; edge++ ) {
6958             /* mbn_xy: neighbor macroblock */
6959             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6960             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6961             int16_t bS[4];
6962             int qp;
6963
6964             if( (edge&1) && IS_8x8DCT(mb_type) )
6965                 continue;
6966
6967             if( IS_INTRA(mb_type) ||
6968                 IS_INTRA(mbn_type) ) {
6969                 int value;
6970                 if (edge == 0) {
6971                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6972                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6973                     ) {
6974                         value = 4;
6975                     } else {
6976                         value = 3;
6977                     }
6978                 } else {
6979                     value = 3;
6980                 }
6981                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6982             } else {
6983                 int i, l;
6984                 int mv_done;
6985
6986                 if( edge & mask_edge ) {
6987                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6988                     mv_done = 1;
6989                 }
6990                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6991                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6992                     mv_done = 1;
6993                 }
6994                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6995                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6996                     int bn_idx= b_idx - (dir ? 8:1);
6997                     int v = 0;
6998                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6999                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7000                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7001                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7002                     }
7003                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7004                     mv_done = 1;
7005                 }
7006                 else
7007                     mv_done = 0;
7008
7009                 for( i = 0; i < 4; i++ ) {
7010                     int x = dir == 0 ? edge : i;
7011                     int y = dir == 0 ? i    : edge;
7012                     int b_idx= 8 + 4 + x + 8*y;
7013                     int bn_idx= b_idx - (dir ? 8:1);
7014
7015                     if( h->non_zero_count_cache[b_idx] != 0 ||
7016                         h->non_zero_count_cache[bn_idx] != 0 ) {
7017                         bS[i] = 2;
7018                     }
7019                     else if(!mv_done)
7020                     {
7021                         bS[i] = 0;
7022                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7023                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7024                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7025                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7026                                 bS[i] = 1;
7027                                 break;
7028                             }
7029                         }
7030                     }
7031                 }
7032
7033                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7034                     continue;
7035             }
7036
7037             /* Filter edge */
7038             // Do not use s->qscale as luma quantizer because it has not the same
7039             // value in IPCM macroblocks.
7040             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7041             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7042             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7043             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7044             if( dir == 0 ) {
7045                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7046                 if( (edge&1) == 0 ) {
7047                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
7048                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7049                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
7050                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7051                 }
7052             } else {
7053                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7054                 if( (edge&1) == 0 ) {
7055                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
7056                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7057                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
7058                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
7059                 }
7060             }
7061         }
7062     }
7063 }
7064
7065 static int decode_slice(H264Context *h){
7066     MpegEncContext * const s = &h->s;
7067     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7068
7069     s->mb_skip_run= -1;
7070
7071     if( h->pps.cabac ) {
7072         int i;
7073
7074         /* realign */
7075         align_get_bits( &s->gb );
7076
7077         /* init cabac */
7078         ff_init_cabac_states( &h->cabac);
7079         ff_init_cabac_decoder( &h->cabac,
7080                                s->gb.buffer + get_bits_count(&s->gb)/8,
7081                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7082         /* calculate pre-state */
7083         for( i= 0; i < 460; i++ ) {
7084             int pre;
7085             if( h->slice_type == I_TYPE )
7086                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7087             else
7088                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7089
7090             if( pre <= 63 )
7091                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7092             else
7093                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7094         }
7095
7096         for(;;){
7097 //START_TIMER
7098             int ret = decode_mb_cabac(h);
7099             int eos;
7100 //STOP_TIMER("decode_mb_cabac")
7101
7102             if(ret>=0) hl_decode_mb(h);
7103
7104             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7105                 s->mb_y++;
7106
7107                 if(ret>=0) ret = decode_mb_cabac(h);
7108
7109                 if(ret>=0) hl_decode_mb(h);
7110                 s->mb_y--;
7111             }
7112             eos = get_cabac_terminate( &h->cabac );
7113
7114             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7115                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7116                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7117                 return -1;
7118             }
7119
7120             if( ++s->mb_x >= s->mb_width ) {
7121                 s->mb_x = 0;
7122                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7123                 ++s->mb_y;
7124                 if(FRAME_MBAFF) {
7125                     ++s->mb_y;
7126                 }
7127             }
7128
7129             if( eos || s->mb_y >= s->mb_height ) {
7130                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7131                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7132                 return 0;
7133             }
7134         }
7135
7136     } else {
7137         for(;;){
7138             int ret = decode_mb_cavlc(h);
7139
7140             if(ret>=0) hl_decode_mb(h);
7141
7142             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7143                 s->mb_y++;
7144                 ret = decode_mb_cavlc(h);
7145
7146                 if(ret>=0) hl_decode_mb(h);
7147                 s->mb_y--;
7148             }
7149
7150             if(ret<0){
7151                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7152                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7153
7154                 return -1;
7155             }
7156
7157             if(++s->mb_x >= s->mb_width){
7158                 s->mb_x=0;
7159                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7160                 ++s->mb_y;
7161                 if(FRAME_MBAFF) {
7162                     ++s->mb_y;
7163                 }
7164                 if(s->mb_y >= s->mb_height){
7165                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7166
7167                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7168                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7169
7170                         return 0;
7171                     }else{
7172                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7173
7174                         return -1;
7175                     }
7176                 }
7177             }
7178
7179             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7180                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7181                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7182                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7183
7184                     return 0;
7185                 }else{
7186                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7187
7188                     return -1;
7189                 }
7190             }
7191         }
7192     }
7193
7194 #if 0
7195     for(;s->mb_y < s->mb_height; s->mb_y++){
7196         for(;s->mb_x < s->mb_width; s->mb_x++){
7197             int ret= decode_mb(h);
7198
7199             hl_decode_mb(h);
7200
7201             if(ret<0){
7202                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7203                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7204
7205                 return -1;
7206             }
7207
7208             if(++s->mb_x >= s->mb_width){
7209                 s->mb_x=0;
7210                 if(++s->mb_y >= s->mb_height){
7211                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7212                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7213
7214                         return 0;
7215                     }else{
7216                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7217
7218                         return -1;
7219                     }
7220                 }
7221             }
7222
7223             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7224                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7225                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7226
7227                     return 0;
7228                 }else{
7229                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7230
7231                     return -1;
7232                 }
7233             }
7234         }
7235         s->mb_x=0;
7236         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7237     }
7238 #endif
7239     return -1; //not reached
7240 }
7241
7242 static int decode_unregistered_user_data(H264Context *h, int size){
7243     MpegEncContext * const s = &h->s;
7244     uint8_t user_data[16+256];
7245     int e, build, i;
7246
7247     if(size<16)
7248         return -1;
7249
7250     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7251         user_data[i]= get_bits(&s->gb, 8);
7252     }
7253
7254     user_data[i]= 0;
7255     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7256     if(e==1 && build>=0)
7257         h->x264_build= build;
7258
7259     if(s->avctx->debug & FF_DEBUG_BUGS)
7260         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7261
7262     for(; i<size; i++)
7263         skip_bits(&s->gb, 8);
7264
7265     return 0;
7266 }
7267
7268 static int decode_sei(H264Context *h){
7269     MpegEncContext * const s = &h->s;
7270
7271     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7272         int size, type;
7273
7274         type=0;
7275         do{
7276             type+= show_bits(&s->gb, 8);
7277         }while(get_bits(&s->gb, 8) == 255);
7278
7279         size=0;
7280         do{
7281             size+= show_bits(&s->gb, 8);
7282         }while(get_bits(&s->gb, 8) == 255);
7283
7284         switch(type){
7285         case 5:
7286             if(decode_unregistered_user_data(h, size) < 0)
7287                 return -1;
7288             break;
7289         default:
7290             skip_bits(&s->gb, 8*size);
7291         }
7292
7293         //FIXME check bits here
7294         align_get_bits(&s->gb);
7295     }
7296
7297     return 0;
7298 }
7299
7300 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7301     MpegEncContext * const s = &h->s;
7302     int cpb_count, i;
7303     cpb_count = get_ue_golomb(&s->gb) + 1;
7304     get_bits(&s->gb, 4); /* bit_rate_scale */
7305     get_bits(&s->gb, 4); /* cpb_size_scale */
7306     for(i=0; i<cpb_count; i++){
7307         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7308         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7309         get_bits1(&s->gb);     /* cbr_flag */
7310     }
7311     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7312     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7313     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7314     get_bits(&s->gb, 5); /* time_offset_length */
7315 }
7316
7317 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7318     MpegEncContext * const s = &h->s;
7319     int aspect_ratio_info_present_flag;
7320     unsigned int aspect_ratio_idc;
7321     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7322
7323     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7324
7325     if( aspect_ratio_info_present_flag ) {
7326         aspect_ratio_idc= get_bits(&s->gb, 8);
7327         if( aspect_ratio_idc == EXTENDED_SAR ) {
7328             sps->sar.num= get_bits(&s->gb, 16);
7329             sps->sar.den= get_bits(&s->gb, 16);
7330         }else if(aspect_ratio_idc < 14){
7331             sps->sar=  pixel_aspect[aspect_ratio_idc];
7332         }else{
7333             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7334             return -1;
7335         }
7336     }else{
7337         sps->sar.num=
7338         sps->sar.den= 0;
7339     }
7340 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7341
7342     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7343         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7344     }
7345
7346     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7347         get_bits(&s->gb, 3);    /* video_format */
7348         get_bits1(&s->gb);      /* video_full_range_flag */
7349         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7350             get_bits(&s->gb, 8); /* colour_primaries */
7351             get_bits(&s->gb, 8); /* transfer_characteristics */
7352             get_bits(&s->gb, 8); /* matrix_coefficients */
7353         }
7354     }
7355
7356     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7357         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7358         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7359     }
7360
7361     sps->timing_info_present_flag = get_bits1(&s->gb);
7362     if(sps->timing_info_present_flag){
7363         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7364         sps->time_scale = get_bits_long(&s->gb, 32);
7365         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7366     }
7367
7368     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7369     if(nal_hrd_parameters_present_flag)
7370         decode_hrd_parameters(h, sps);
7371     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7372     if(vcl_hrd_parameters_present_flag)
7373         decode_hrd_parameters(h, sps);
7374     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7375         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7376     get_bits1(&s->gb);         /* pic_struct_present_flag */
7377
7378     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7379     if(sps->bitstream_restriction_flag){
7380         unsigned int num_reorder_frames;
7381         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7382         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7383         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7384         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7385         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7386         num_reorder_frames= get_ue_golomb(&s->gb);
7387         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7388
7389         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7390             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7391             return -1;
7392         }
7393
7394         sps->num_reorder_frames= num_reorder_frames;
7395     }
7396
7397     return 0;
7398 }
7399
7400 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7401                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7402     MpegEncContext * const s = &h->s;
7403     int i, last = 8, next = 8;
7404     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7405     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7406         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7407     else
7408     for(i=0;i<size;i++){
7409         if(next)
7410             next = (last + get_se_golomb(&s->gb)) & 0xff;
7411         if(!i && !next){ /* matrix not written, we use the preset one */
7412             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7413             break;
7414         }
7415         last = factors[scan[i]] = next ? next : last;
7416     }
7417 }
7418
7419 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7420                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7421     MpegEncContext * const s = &h->s;
7422     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7423     const uint8_t *fallback[4] = {
7424         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7425         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7426         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7427         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7428     };
7429     if(get_bits1(&s->gb)){
7430         sps->scaling_matrix_present |= is_sps;
7431         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7432         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7433         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7434         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7435         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7436         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7437         if(is_sps || pps->transform_8x8_mode){
7438             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7439             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7440         }
7441     } else if(fallback_sps) {
7442         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7443         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7444     }
7445 }
7446
7447 /**
7448  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7449  */
7450 static void *
7451 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7452                     const size_t size, const char *name)
7453 {
7454     if(id>=max) {
7455         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7456         return NULL;
7457     }
7458
7459     if(!vec[id]) {
7460         vec[id] = av_mallocz(size);
7461         if(vec[id] == NULL)
7462             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7463     }
7464     return vec[id];
7465 }
7466
7467 static inline int decode_seq_parameter_set(H264Context *h){
7468     MpegEncContext * const s = &h->s;
7469     int profile_idc, level_idc;
7470     unsigned int sps_id, tmp, mb_width, mb_height;
7471     int i;
7472     SPS *sps;
7473
7474     profile_idc= get_bits(&s->gb, 8);
7475     get_bits1(&s->gb);   //constraint_set0_flag
7476     get_bits1(&s->gb);   //constraint_set1_flag
7477     get_bits1(&s->gb);   //constraint_set2_flag
7478     get_bits1(&s->gb);   //constraint_set3_flag
7479     get_bits(&s->gb, 4); // reserved
7480     level_idc= get_bits(&s->gb, 8);
7481     sps_id= get_ue_golomb(&s->gb);
7482
7483     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7484     if(sps == NULL)
7485         return -1;
7486
7487     sps->profile_idc= profile_idc;
7488     sps->level_idc= level_idc;
7489
7490     if(sps->profile_idc >= 100){ //high profile
7491         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7492             get_bits1(&s->gb);  //residual_color_transform_flag
7493         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7494         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7495         sps->transform_bypass = get_bits1(&s->gb);
7496         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7497     }else
7498         sps->scaling_matrix_present = 0;
7499
7500     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7501     sps->poc_type= get_ue_golomb(&s->gb);
7502
7503     if(sps->poc_type == 0){ //FIXME #define
7504         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7505     } else if(sps->poc_type == 1){//FIXME #define
7506         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7507         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7508         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7509         tmp= get_ue_golomb(&s->gb);
7510
7511         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7512             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7513             return -1;
7514         }
7515         sps->poc_cycle_length= tmp;
7516
7517         for(i=0; i<sps->poc_cycle_length; i++)
7518             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7519     }else if(sps->poc_type != 2){
7520         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7521         return -1;
7522     }
7523
7524     tmp= get_ue_golomb(&s->gb);
7525     if(tmp > MAX_PICTURE_COUNT-2){
7526         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7527     }
7528     sps->ref_frame_count= tmp;
7529     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7530     mb_width= get_ue_golomb(&s->gb) + 1;
7531     mb_height= get_ue_golomb(&s->gb) + 1;
7532     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7533        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7534         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7535         return -1;
7536     }
7537     sps->mb_width = mb_width;
7538     sps->mb_height= mb_height;
7539
7540     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7541     if(!sps->frame_mbs_only_flag)
7542         sps->mb_aff= get_bits1(&s->gb);
7543     else
7544         sps->mb_aff= 0;
7545
7546     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7547
7548 #ifndef ALLOW_INTERLACE
7549     if(sps->mb_aff)
7550         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7551 #endif
7552     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7553         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7554
7555     sps->crop= get_bits1(&s->gb);
7556     if(sps->crop){
7557         sps->crop_left  = get_ue_golomb(&s->gb);
7558         sps->crop_right = get_ue_golomb(&s->gb);
7559         sps->crop_top   = get_ue_golomb(&s->gb);
7560         sps->crop_bottom= get_ue_golomb(&s->gb);
7561         if(sps->crop_left || sps->crop_top){
7562             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7563         }
7564     }else{
7565         sps->crop_left  =
7566         sps->crop_right =
7567         sps->crop_top   =
7568         sps->crop_bottom= 0;
7569     }
7570
7571     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7572     if( sps->vui_parameters_present_flag )
7573         decode_vui_parameters(h, sps);
7574
7575     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7576         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7577                sps_id, sps->profile_idc, sps->level_idc,
7578                sps->poc_type,
7579                sps->ref_frame_count,
7580                sps->mb_width, sps->mb_height,
7581                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7582                sps->direct_8x8_inference_flag ? "8B8" : "",
7583                sps->crop_left, sps->crop_right,
7584                sps->crop_top, sps->crop_bottom,
7585                sps->vui_parameters_present_flag ? "VUI" : ""
7586                );
7587     }
7588     return 0;
7589 }
7590
7591 static void
7592 build_qp_table(PPS *pps, int t, int index)
7593 {
7594     int i;
7595     for(i = 0; i < 255; i++)
7596         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7597 }
7598
7599 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7600     MpegEncContext * const s = &h->s;
7601     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7602     PPS *pps;
7603
7604     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7605     if(pps == NULL)
7606         return -1;
7607
7608     tmp= get_ue_golomb(&s->gb);
7609     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7610         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7611         return -1;
7612     }
7613     pps->sps_id= tmp;
7614
7615     pps->cabac= get_bits1(&s->gb);
7616     pps->pic_order_present= get_bits1(&s->gb);
7617     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7618     if(pps->slice_group_count > 1 ){
7619         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7620         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7621         switch(pps->mb_slice_group_map_type){
7622         case 0:
7623 #if 0
7624 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7625 |    run_length[ i ]                                |1  |ue(v)   |
7626 #endif
7627             break;
7628         case 2:
7629 #if 0
7630 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7631 |{                                                  |   |        |
7632 |    top_left_mb[ i ]                               |1  |ue(v)   |
7633 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7634 |   }                                               |   |        |
7635 #endif
7636             break;
7637         case 3:
7638         case 4:
7639         case 5:
7640 #if 0
7641 |   slice_group_change_direction_flag               |1  |u(1)    |
7642 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7643 #endif
7644             break;
7645         case 6:
7646 #if 0
7647 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7648 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7649 |)                                                  |   |        |
7650 |    slice_group_id[ i ]                            |1  |u(v)    |
7651 #endif
7652             break;
7653         }
7654     }
7655     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7656     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7657     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7658         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7659         pps->ref_count[0]= pps->ref_count[1]= 1;
7660         return -1;
7661     }
7662
7663     pps->weighted_pred= get_bits1(&s->gb);
7664     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7665     pps->init_qp= get_se_golomb(&s->gb) + 26;
7666     pps->init_qs= get_se_golomb(&s->gb) + 26;
7667     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7668     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7669     pps->constrained_intra_pred= get_bits1(&s->gb);
7670     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7671
7672     pps->transform_8x8_mode= 0;
7673     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7674     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7675     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7676
7677     if(get_bits_count(&s->gb) < bit_length){
7678         pps->transform_8x8_mode= get_bits1(&s->gb);
7679         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7680         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7681     } else {
7682         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7683     }
7684
7685     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7686     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7687         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7688         h->pps.chroma_qp_diff= 1;
7689     } else
7690         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7691
7692     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7693         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7694                pps_id, pps->sps_id,
7695                pps->cabac ? "CABAC" : "CAVLC",
7696                pps->slice_group_count,
7697                pps->ref_count[0], pps->ref_count[1],
7698                pps->weighted_pred ? "weighted" : "",
7699                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7700                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7701                pps->constrained_intra_pred ? "CONSTR" : "",
7702                pps->redundant_pic_cnt_present ? "REDU" : "",
7703                pps->transform_8x8_mode ? "8x8DCT" : ""
7704                );
7705     }
7706
7707     return 0;
7708 }
7709
7710 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7711     MpegEncContext * const s = &h->s;
7712     AVCodecContext * const avctx= s->avctx;
7713     int buf_index=0;
7714 #if 0
7715     int i;
7716     for(i=0; i<50; i++){
7717         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7718     }
7719 #endif
7720     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7721         h->slice_num = 0;
7722         s->current_picture_ptr= NULL;
7723     }
7724
7725     for(;;){
7726         int consumed;
7727         int dst_length;
7728         int bit_length;
7729         uint8_t *ptr;
7730         int i, nalsize = 0;
7731
7732         if(h->is_avc) {
7733             if(buf_index >= buf_size) break;
7734             nalsize = 0;
7735             for(i = 0; i < h->nal_length_size; i++)
7736                 nalsize = (nalsize << 8) | buf[buf_index++];
7737             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7738                 if(nalsize == 1){
7739                     buf_index++;
7740                     continue;
7741                 }else{
7742                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7743                     break;
7744                 }
7745             }
7746         } else {
7747             // start code prefix search
7748             for(; buf_index + 3 < buf_size; buf_index++){
7749                 // This should always succeed in the first iteration.
7750                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7751                     break;
7752             }
7753
7754             if(buf_index+3 >= buf_size) break;
7755
7756             buf_index+=3;
7757         }
7758
7759         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7760         if (ptr==NULL || dst_length < 0){
7761             return -1;
7762         }
7763         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7764             dst_length--;
7765         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7766
7767         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7768             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7769         }
7770
7771         if (h->is_avc && (nalsize != consumed))
7772             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7773
7774         buf_index += consumed;
7775
7776         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7777            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7778             continue;
7779
7780         switch(h->nal_unit_type){
7781         case NAL_IDR_SLICE:
7782             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7783         case NAL_SLICE:
7784             init_get_bits(&s->gb, ptr, bit_length);
7785             h->intra_gb_ptr=
7786             h->inter_gb_ptr= &s->gb;
7787             s->data_partitioning = 0;
7788
7789             if(decode_slice_header(h) < 0){
7790                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7791                 break;
7792             }
7793             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7794             if(h->redundant_pic_count==0 && s->hurry_up < 5
7795                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7796                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7797                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7798                && avctx->skip_frame < AVDISCARD_ALL)
7799                 decode_slice(h);
7800             break;
7801         case NAL_DPA:
7802             init_get_bits(&s->gb, ptr, bit_length);
7803             h->intra_gb_ptr=
7804             h->inter_gb_ptr= NULL;
7805             s->data_partitioning = 1;
7806
7807             if(decode_slice_header(h) < 0){
7808                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7809             }
7810             break;
7811         case NAL_DPB:
7812             init_get_bits(&h->intra_gb, ptr, bit_length);
7813             h->intra_gb_ptr= &h->intra_gb;
7814             break;
7815         case NAL_DPC:
7816             init_get_bits(&h->inter_gb, ptr, bit_length);
7817             h->inter_gb_ptr= &h->inter_gb;
7818
7819             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7820                && s->context_initialized
7821                && s->hurry_up < 5
7822                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7823                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7824                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7825                && avctx->skip_frame < AVDISCARD_ALL)
7826                 decode_slice(h);
7827             break;
7828         case NAL_SEI:
7829             init_get_bits(&s->gb, ptr, bit_length);
7830             decode_sei(h);
7831             break;
7832         case NAL_SPS:
7833             init_get_bits(&s->gb, ptr, bit_length);
7834             decode_seq_parameter_set(h);
7835
7836             if(s->flags& CODEC_FLAG_LOW_DELAY)
7837                 s->low_delay=1;
7838
7839             if(avctx->has_b_frames < 2)
7840                 avctx->has_b_frames= !s->low_delay;
7841             break;
7842         case NAL_PPS:
7843             init_get_bits(&s->gb, ptr, bit_length);
7844
7845             decode_picture_parameter_set(h, bit_length);
7846
7847             break;
7848         case NAL_AUD:
7849         case NAL_END_SEQUENCE:
7850         case NAL_END_STREAM:
7851         case NAL_FILLER_DATA:
7852         case NAL_SPS_EXT:
7853         case NAL_AUXILIARY_SLICE:
7854             break;
7855         default:
7856             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7857         }
7858     }
7859
7860     return buf_index;
7861 }
7862
7863 /**
7864  * returns the number of bytes consumed for building the current frame
7865  */
7866 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7867     if(s->flags&CODEC_FLAG_TRUNCATED){
7868         pos -= s->parse_context.last_index;
7869         if(pos<0) pos=0; // FIXME remove (unneeded?)
7870
7871         return pos;
7872     }else{
7873         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7874         if(pos+10>buf_size) pos=buf_size; // oops ;)
7875
7876         return pos;
7877     }
7878 }
7879
7880 static int decode_frame(AVCodecContext *avctx,
7881                              void *data, int *data_size,
7882                              uint8_t *buf, int buf_size)
7883 {
7884     H264Context *h = avctx->priv_data;
7885     MpegEncContext *s = &h->s;
7886     AVFrame *pict = data;
7887     int buf_index;
7888
7889     s->flags= avctx->flags;
7890     s->flags2= avctx->flags2;
7891
7892    /* no supplementary picture */
7893     if (buf_size == 0) {
7894         Picture *out;
7895         int i, out_idx;
7896
7897 //FIXME factorize this with the output code below
7898         out = h->delayed_pic[0];
7899         out_idx = 0;
7900         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7901             if(h->delayed_pic[i]->poc < out->poc){
7902                 out = h->delayed_pic[i];
7903                 out_idx = i;
7904             }
7905
7906         for(i=out_idx; h->delayed_pic[i]; i++)
7907             h->delayed_pic[i] = h->delayed_pic[i+1];
7908
7909         if(out){
7910             *data_size = sizeof(AVFrame);
7911             *pict= *(AVFrame*)out;
7912         }
7913
7914         return 0;
7915     }
7916
7917     if(s->flags&CODEC_FLAG_TRUNCATED){
7918         int next= ff_h264_find_frame_end(h, buf, buf_size);
7919
7920         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7921             return buf_size;
7922 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7923     }
7924
7925     if(h->is_avc && !h->got_avcC) {
7926         int i, cnt, nalsize;
7927         unsigned char *p = avctx->extradata;
7928         if(avctx->extradata_size < 7) {
7929             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7930             return -1;
7931         }
7932         if(*p != 1) {
7933             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7934             return -1;
7935         }
7936         /* sps and pps in the avcC always have length coded with 2 bytes,
7937            so put a fake nal_length_size = 2 while parsing them */
7938         h->nal_length_size = 2;
7939         // Decode sps from avcC
7940         cnt = *(p+5) & 0x1f; // Number of sps
7941         p += 6;
7942         for (i = 0; i < cnt; i++) {
7943             nalsize = AV_RB16(p) + 2;
7944             if(decode_nal_units(h, p, nalsize) < 0) {
7945                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7946                 return -1;
7947             }
7948             p += nalsize;
7949         }
7950         // Decode pps from avcC
7951         cnt = *(p++); // Number of pps
7952         for (i = 0; i < cnt; i++) {
7953             nalsize = AV_RB16(p) + 2;
7954             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7955                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7956                 return -1;
7957             }
7958             p += nalsize;
7959         }
7960         // Now store right nal length size, that will be use to parse all other nals
7961         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7962         // Do not reparse avcC
7963         h->got_avcC = 1;
7964     }
7965
7966     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7967         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7968             return -1;
7969     }
7970
7971     buf_index=decode_nal_units(h, buf, buf_size);
7972     if(buf_index < 0)
7973         return -1;
7974
7975     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7976         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7977         return -1;
7978     }
7979
7980     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7981         Picture *out = s->current_picture_ptr;
7982         Picture *cur = s->current_picture_ptr;
7983         Picture *prev = h->delayed_output_pic;
7984         int i, pics, cross_idr, out_of_order, out_idx;
7985
7986         s->mb_y= 0;
7987
7988         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7989         s->current_picture_ptr->pict_type= s->pict_type;
7990
7991         h->prev_frame_num_offset= h->frame_num_offset;
7992         h->prev_frame_num= h->frame_num;
7993         if(s->current_picture_ptr->reference){
7994             h->prev_poc_msb= h->poc_msb;
7995             h->prev_poc_lsb= h->poc_lsb;
7996         }
7997         if(s->current_picture_ptr->reference)
7998             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7999
8000         ff_er_frame_end(s);
8001
8002         MPV_frame_end(s);
8003
8004     //FIXME do something with unavailable reference frames
8005
8006 #if 0 //decode order
8007         *data_size = sizeof(AVFrame);
8008 #else
8009         /* Sort B-frames into display order */
8010
8011         if(h->sps.bitstream_restriction_flag
8012            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8013             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8014             s->low_delay = 0;
8015         }
8016
8017         pics = 0;
8018         while(h->delayed_pic[pics]) pics++;
8019
8020         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
8021
8022         h->delayed_pic[pics++] = cur;
8023         if(cur->reference == 0)
8024             cur->reference = 1;
8025
8026         cross_idr = 0;
8027         for(i=0; h->delayed_pic[i]; i++)
8028             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8029                 cross_idr = 1;
8030
8031         out = h->delayed_pic[0];
8032         out_idx = 0;
8033         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8034             if(h->delayed_pic[i]->poc < out->poc){
8035                 out = h->delayed_pic[i];
8036                 out_idx = i;
8037             }
8038
8039         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8040         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8041             { }
8042         else if(prev && pics <= s->avctx->has_b_frames)
8043             out = prev;
8044         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8045            || (s->low_delay &&
8046             ((!cross_idr && prev && out->poc > prev->poc + 2)
8047              || cur->pict_type == B_TYPE)))
8048         {
8049             s->low_delay = 0;
8050             s->avctx->has_b_frames++;
8051             out = prev;
8052         }
8053         else if(out_of_order)
8054             out = prev;
8055
8056         if(out_of_order || pics > s->avctx->has_b_frames){
8057             for(i=out_idx; h->delayed_pic[i]; i++)
8058                 h->delayed_pic[i] = h->delayed_pic[i+1];
8059         }
8060
8061         if(prev == out)
8062             *data_size = 0;
8063         else
8064             *data_size = sizeof(AVFrame);
8065         if(prev && prev != out && prev->reference == 1)
8066             prev->reference = 0;
8067         h->delayed_output_pic = out;
8068 #endif
8069
8070         if(out)
8071             *pict= *(AVFrame*)out;
8072         else
8073             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8074     }
8075
8076     assert(pict->data[0] || !*data_size);
8077     ff_print_debug_info(s, pict);
8078 //printf("out %d\n", (int)pict->data[0]);
8079 #if 0 //?
8080
8081     /* Return the Picture timestamp as the frame number */
8082     /* we subtract 1 because it is added in utils.c      */
8083     avctx->frame_number = s->picture_number - 1;
8084 #endif
8085     return get_consumed_bytes(s, buf_index, buf_size);
8086 }
8087 #if 0
8088 static inline void fill_mb_avail(H264Context *h){
8089     MpegEncContext * const s = &h->s;
8090     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8091
8092     if(s->mb_y){
8093         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8094         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
8095         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8096     }else{
8097         h->mb_avail[0]=
8098         h->mb_avail[1]=
8099         h->mb_avail[2]= 0;
8100     }
8101     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8102     h->mb_avail[4]= 1; //FIXME move out
8103     h->mb_avail[5]= 0; //FIXME move out
8104 }
8105 #endif
8106
8107 #if 0 //selftest
8108 #undef random
8109 #define COUNT 8000
8110 #define SIZE (COUNT*40)
8111 int main(){
8112     int i;
8113     uint8_t temp[SIZE];
8114     PutBitContext pb;
8115     GetBitContext gb;
8116 //    int int_temp[10000];
8117     DSPContext dsp;
8118     AVCodecContext avctx;
8119
8120     dsputil_init(&dsp, &avctx);
8121
8122     init_put_bits(&pb, temp, SIZE);
8123     printf("testing unsigned exp golomb\n");
8124     for(i=0; i<COUNT; i++){
8125         START_TIMER
8126         set_ue_golomb(&pb, i);
8127         STOP_TIMER("set_ue_golomb");
8128     }
8129     flush_put_bits(&pb);
8130
8131     init_get_bits(&gb, temp, 8*SIZE);
8132     for(i=0; i<COUNT; i++){
8133         int j, s;
8134
8135         s= show_bits(&gb, 24);
8136
8137         START_TIMER
8138         j= get_ue_golomb(&gb);
8139         if(j != i){
8140             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8141 //            return -1;
8142         }
8143         STOP_TIMER("get_ue_golomb");
8144     }
8145
8146
8147     init_put_bits(&pb, temp, SIZE);
8148     printf("testing signed exp golomb\n");
8149     for(i=0; i<COUNT; i++){
8150         START_TIMER
8151         set_se_golomb(&pb, i - COUNT/2);
8152         STOP_TIMER("set_se_golomb");
8153     }
8154     flush_put_bits(&pb);
8155
8156     init_get_bits(&gb, temp, 8*SIZE);
8157     for(i=0; i<COUNT; i++){
8158         int j, s;
8159
8160         s= show_bits(&gb, 24);
8161
8162         START_TIMER
8163         j= get_se_golomb(&gb);
8164         if(j != i - COUNT/2){
8165             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8166 //            return -1;
8167         }
8168         STOP_TIMER("get_se_golomb");
8169     }
8170
8171     printf("testing 4x4 (I)DCT\n");
8172
8173     DCTELEM block[16];
8174     uint8_t src[16], ref[16];
8175     uint64_t error= 0, max_error=0;
8176
8177     for(i=0; i<COUNT; i++){
8178         int j;
8179 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8180         for(j=0; j<16; j++){
8181             ref[j]= random()%255;
8182             src[j]= random()%255;
8183         }
8184
8185         h264_diff_dct_c(block, src, ref, 4);
8186
8187         //normalize
8188         for(j=0; j<16; j++){
8189 //            printf("%d ", block[j]);
8190             block[j]= block[j]*4;
8191             if(j&1) block[j]= (block[j]*4 + 2)/5;
8192             if(j&4) block[j]= (block[j]*4 + 2)/5;
8193         }
8194 //        printf("\n");
8195
8196         s->dsp.h264_idct_add(ref, block, 4);
8197 /*        for(j=0; j<16; j++){
8198             printf("%d ", ref[j]);
8199         }
8200         printf("\n");*/
8201
8202         for(j=0; j<16; j++){
8203             int diff= FFABS(src[j] - ref[j]);
8204
8205             error+= diff*diff;
8206             max_error= FFMAX(max_error, diff);
8207         }
8208     }
8209     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8210 #if 0
8211     printf("testing quantizer\n");
8212     for(qp=0; qp<52; qp++){
8213         for(i=0; i<16; i++)
8214             src1_block[i]= src2_block[i]= random()%255;
8215
8216     }
8217 #endif
8218     printf("Testing NAL layer\n");
8219
8220     uint8_t bitstream[COUNT];
8221     uint8_t nal[COUNT*2];
8222     H264Context h;
8223     memset(&h, 0, sizeof(H264Context));
8224
8225     for(i=0; i<COUNT; i++){
8226         int zeros= i;
8227         int nal_length;
8228         int consumed;
8229         int out_length;
8230         uint8_t *out;
8231         int j;
8232
8233         for(j=0; j<COUNT; j++){
8234             bitstream[j]= (random() % 255) + 1;
8235         }
8236
8237         for(j=0; j<zeros; j++){
8238             int pos= random() % COUNT;
8239             while(bitstream[pos] == 0){
8240                 pos++;
8241                 pos %= COUNT;
8242             }
8243             bitstream[pos]=0;
8244         }
8245
8246         START_TIMER
8247
8248         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8249         if(nal_length<0){
8250             printf("encoding failed\n");
8251             return -1;
8252         }
8253
8254         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8255
8256         STOP_TIMER("NAL")
8257
8258         if(out_length != COUNT){
8259             printf("incorrect length %d %d\n", out_length, COUNT);
8260             return -1;
8261         }
8262
8263         if(consumed != nal_length){
8264             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8265             return -1;
8266         }
8267
8268         if(memcmp(bitstream, out, COUNT)){
8269             printf("mismatch\n");
8270             return -1;
8271         }
8272     }
8273
8274     printf("Testing RBSP\n");
8275
8276
8277     return 0;
8278 }
8279 #endif
8280
8281
8282 static int decode_end(AVCodecContext *avctx)
8283 {
8284     H264Context *h = avctx->priv_data;
8285     MpegEncContext *s = &h->s;
8286
8287     av_freep(&h->rbsp_buffer[0]);
8288     av_freep(&h->rbsp_buffer[1]);
8289     free_tables(h); //FIXME cleanup init stuff perhaps
8290     MPV_common_end(s);
8291
8292 //    memset(h, 0, sizeof(H264Context));
8293
8294     return 0;
8295 }
8296
8297
8298 AVCodec h264_decoder = {
8299     "h264",
8300     CODEC_TYPE_VIDEO,
8301     CODEC_ID_H264,
8302     sizeof(H264Context),
8303     decode_init,
8304     NULL,
8305     decode_end,
8306     decode_frame,
8307     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8308     .flush= flush_dpb,
8309 };
8310
8311 #include "svq3.c"