git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  *
  21  */
  22
  23 /**
  24  * @file h264.c
  25  * H.264 / AVC / MPEG4 part10 codec.
  26  * @author Michael Niedermayer <michaelni@gmx.at>
  27  */
  28
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36
  37 #include "cabac.h"
  38
  39 //#undef NDEBUG
  40 #include <assert.h>
  41
  42 static VLC coeff_token_vlc[4]; // file-scope VLC tables; NOTE(review): names follow the CAVLC residual syntax elements (coeff_token, total_zeros, run_before) - their initialisation is outside this chunk, confirm there
  43 static VLC chroma_dc_coeff_token_vlc;
  44
  45 static VLC total_zeros_vlc[15];
  46 static VLC chroma_dc_total_zeros_vlc[3];
  47
  48 static VLC run_vlc[6];
  49 static VLC run7_vlc;
  50
     // Forward declarations of static functions whose definitions are not visible in this chunk.
  51 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  52 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  53 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  54 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  55
  55
  /**
   * Packs two 16-bit values into one 32-bit word.
   * The halves are arranged so that, once the word is stored to memory,
   * a occupies the first two bytes and b the following two; this is why the
   * layout is swapped when WORDS_BIGENDIAN is defined.
   * @param a value for the first 16 bits in memory (only the low 16 bits are kept)
   * @param b value for the second 16 bits in memory (only the low 16 bits are kept)
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
      /* The shift is done in uint32_t: left-shifting a negative int is
       * undefined behaviour, and callers do pass negative values. */
  #ifdef WORDS_BIGENDIAN
      return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
      return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  63
  /**
   * Lookup table: ff_rem6[i] == i % 6 for i in 0..51.
   * NOTE(review): the 52 entries presumably cover the quantiser range 0..51 - confirm against users.
   */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };

  /**
   * Lookup table: ff_div6[i] == i / 6 for i in 0..51.
   */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  71
  72
  /**
   * fill a rectangle.
   * Writes val into a block of w x h elements, each of size bytes; rows are
   * stride elements apart. All stores are unrolled, so the supported shapes
   * are limited: after scaling by size the row width must be 2, 4, 8 or 16
   * bytes, h must be 1, 2 or 4, and 16-byte wide rows do not support h==1.
   * @param vp pointer to the top left element; must be aligned to
   *           FFMIN(row width in bytes, STRIDE_ALIGN)
   * @param w width of the rectangle in elements (at most 4), should be a constant
   * @param h height of the rectangle, should be a constant
   * @param stride distance between rows in elements; in bytes it must be a
   *               multiple of the row width in bytes
   * @param val value to store; for size 1 the low byte is replicated,
   *            for size 4 the 32-bit value is stored as is
   * @param size the size of val (1 or 4), should be a constant
   */
  static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
      uint8_t *p= (uint8_t*)vp;
      assert(size==1 || size==4);
      assert(w<=4);

      /* from here on w and stride are measured in bytes */
      w      *= size;
      stride *= size;

      /* uintptr_t rather than long: long is narrower than a pointer on LLP64 targets */
      assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
      assert((stride&(w-1))==0);
      if(w==2){
          const uint16_t v= size==4 ? val : val*0x0101;
          *(uint16_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint16_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint16_t*)(p + 2*stride)=
          *(uint16_t*)(p + 3*stride)= v;
      }else if(w==4){
          const uint32_t v= size==4 ? val : val*0x01010101;
          *(uint32_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint32_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint32_t*)(p + 2*stride)=
          *(uint32_t*)(p + 3*stride)= v;
      }else if(w==8){
      //gcc can't optimize 64bit math on x86_32
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          /* val duplicated into both 32-bit halves of one 64-bit store */
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0*stride)= v;
          if(h==1) return;
          *(uint64_t*)(p + 1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 2*stride)=
          *(uint64_t*)(p + 3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)=
          *(uint32_t*)(p + 4+0*stride)= val;
          if(h==1) return;
          *(uint32_t*)(p + 0+1*stride)=
          *(uint32_t*)(p + 4+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)=
          *(uint32_t*)(p + 4+2*stride)=
          *(uint32_t*)(p + 0+3*stride)=
          *(uint32_t*)(p + 4+3*stride)= val;
  #endif
      }else if(w==16){
          /* note: rows 0 and 1 are always written together, h==1 is not handled here */
  #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
          const uint64_t v= val*0x0100000001ULL;
          *(uint64_t*)(p + 0+0*stride)=
          *(uint64_t*)(p + 8+0*stride)=
          *(uint64_t*)(p + 0+1*stride)=
          *(uint64_t*)(p + 8+1*stride)= v;
          if(h==2) return;
          *(uint64_t*)(p + 0+2*stride)=
          *(uint64_t*)(p + 8+2*stride)=
          *(uint64_t*)(p + 0+3*stride)=
          *(uint64_t*)(p + 8+3*stride)= v;
  #else
          *(uint32_t*)(p + 0+0*stride)=
          *(uint32_t*)(p + 4+0*stride)=
          *(uint32_t*)(p + 8+0*stride)=
          *(uint32_t*)(p +12+0*stride)=
          *(uint32_t*)(p + 0+1*stride)=
          *(uint32_t*)(p + 4+1*stride)=
          *(uint32_t*)(p + 8+1*stride)=
          *(uint32_t*)(p +12+1*stride)= val;
          if(h==2) return;
          *(uint32_t*)(p + 0+2*stride)=
          *(uint32_t*)(p + 4+2*stride)=
          *(uint32_t*)(p + 8+2*stride)=
          *(uint32_t*)(p +12+2*stride)=
          *(uint32_t*)(p + 0+3*stride)=
          *(uint32_t*)(p + 4+3*stride)=
          *(uint32_t*)(p + 8+3*stride)=
          *(uint32_t*)(p +12+3*stride)= val;
  #endif
      }else
          assert(0);
      /* every path that falls through to here has written exactly four rows */
      assert(h==4);
  }
 160
 161 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     /*
      * Loads data of the macroblocks neighbouring the current one (top, left,
      * top-left, top-right) into the per-macroblock caches of h:
      * the intra *_samples_available masks, intra4x4_pred_mode_cache,
      * non_zero_count_cache, top_cbp / left_cbp (CABAC only), mv_cache,
      * ref_cache, mvd_cache, direct_cache and neighbor_transform_size.
      * It also records the chosen neighbour indices in h->top_mb_xy and
      * h->left_mb_xy[].
      *
      * h           decoder context; the current macroblock position is read
      *             from h->s.mb_x / h->s.mb_y
      * mb_type     type flags of the current macroblock
      * for_deblock nonzero selects the variant used for deblocking: a
      *             neighbour counts as present when its slice_table entry is
      *             below 255 (instead of equal to h->slice_num), the diagonal
      *             neighbours are ignored, and in MBAFF frames the current
      *             macroblock's own nnz/mv/ref data is reloaded as well
      */
 162     MpegEncContext * const s = &h->s;
 163     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 164     int topleft_xy, top_xy, topright_xy, left_xy[2];
 165     int topleft_type, top_type, topright_type, left_type[2];
 166     int left_block[8];
 167     int i;
 168
 169     //FIXME deblocking could skip the intra and nnz parts.
         // Deblocking shortcut: nothing is loaded when slice_num is 1 or the macroblock
         // one row up has the same slice_table entry, provided the frame is not MBAFF.
 170     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
 171         return;
 172
 173     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 174
         // Default (non-MBAFF) neighbour indices: directly above, above-left, above-right and left.
 175     top_xy     = mb_xy  - s->mb_stride;
 176     topleft_xy = top_xy - 1;
 177     topright_xy= top_xy + 1;
 178     left_xy[1] = left_xy[0] = mb_xy-1;
         // left_block[0..3] select which entries/rows of the left neighbour feed the four
         // cache rows (used below for intra4x4_pred_mode, non_zero_count and, scaled by
         // b_stride, motion_val); left_block[4..7] index the left neighbour's
         // non_zero_count[] for cache column 0.
 179     left_block[0]= 0;
 180     left_block[1]= 1;
 181     left_block[2]= 2;
 182     left_block[3]= 3;
 183     left_block[4]= 7;
 184     left_block[5]= 10;
 185     left_block[6]= 8;
 186     left_block[7]= 11;
 187     if(FRAME_MBAFF){
             // MBAFF: neighbours are chosen per macroblock pair; a pair's "frame flag"
             // is set when its mb_type is not interlaced.
 188         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 189         const int top_pair_xy      = pair_xy     - s->mb_stride;
 190         const int topleft_pair_xy  = top_pair_xy - 1;
 191         const int topright_pair_xy = top_pair_xy + 1;
 192         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 193         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 194         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 195         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 196         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 197         const int bottom = (s->mb_y & 1);
 198         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
             // Each of the three upper neighbours moves one more row up when the current
             // macroblock is a field macroblock and either it is the bottom one of its
             // pair or the corresponding upper pair is a field pair too.
 199         if (bottom
 200                 ? !curr_mb_frame_flag // bottom macroblock
 201                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 202                 ) {
 203             top_xy -= s->mb_stride;
 204         }
 205         if (bottom
 206                 ? !curr_mb_frame_flag // bottom macroblock
 207                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 208                 ) {
 209             topleft_xy -= s->mb_stride;
 210         }
 211         if (bottom
 212                 ? !curr_mb_frame_flag // bottom macroblock
 213                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 214                 ) {
 215             topright_xy -= s->mb_stride;
 216         }
             // Left pair coded differently (frame vs. field) from the current macroblock:
             // start from the top macroblock of the left pair and remap left_block[].
 217         if (left_mb_frame_flag != curr_mb_frame_flag) {
 218             left_xy[1] = left_xy[0] = pair_xy - 1;
 219             if (curr_mb_frame_flag) {
 220                 if (bottom) {
 221                     left_block[0]= 2;
 222                     left_block[1]= 2;
 223                     left_block[2]= 3;
 224                     left_block[3]= 3;
 225                     left_block[4]= 8;
 226                     left_block[5]= 11;
 227                     left_block[6]= 8;
 228                     left_block[7]= 11;
 229                 } else {
 230                     left_block[0]= 0;
 231                     left_block[1]= 0;
 232                     left_block[2]= 1;
 233                     left_block[3]= 1;
 234                     left_block[4]= 7;
 235                     left_block[5]= 10;
 236                     left_block[6]= 7;
 237                     left_block[7]= 10;
 238                 }
 239             } else {
                     // current is field, left pair is frame: the second left entry
                     // refers to the bottom macroblock of the left pair
 240                 left_xy[1] += s->mb_stride;
 241                 //left_block[0]= 0;
 242                 left_block[1]= 2;
 243                 left_block[2]= 0;
 244                 left_block[3]= 2;
 245                 //left_block[4]= 7;
 246                 left_block[5]= 10;
 247                 left_block[6]= 7;
 248                 left_block[7]= 10;
 249             }
 250         }
 251     }
 252
         // Publish the selected neighbour indices in the context.
 253     h->top_mb_xy = top_xy;
 254     h->left_mb_xy[0] = left_xy[0];
 255     h->left_mb_xy[1] = left_xy[1];
         // Neighbour types; 0 means "treated as not available" in everything below.
 256     if(for_deblock){
 257         topleft_type = 0;
 258         topright_type = 0;
 259         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 260         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 261         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 262
 263         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
                 // Reload the current macroblock's own data: one bit per block i (0..15) of
                 // the 16-bit word read at non_zero_count[mb_xy][14] goes into the nnz cache,
                 // and mv_cache / ref_cache are refilled from current_picture.
 264             int list;
 265             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 266             for(i=0; i<16; i++)
 267                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 268             for(list=0; list<h->list_count; list++){
 269                 if(USES_LIST(mb_type,list)){
 270                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 271                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 272                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                         // copy 4 rows of 4 packed motion vectors; a cache row is 8 entries wide
 273                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 274                         dst[0] = src[0];
 275                         dst[1] = src[1];
 276                         dst[2] = src[2];
 277                         dst[3] = src[3];
 278                     }
                         // each pair of reference indices is widened to 4 bytes and written to two cache rows
 279                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 280                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 281                     ref += h->b8_stride;
 282                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 283                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 284                 }else{
 285                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 286                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 287                 }
 288             }
 289         }
 290     }else{
             // Decoding: a neighbour is only usable when it belongs to the current slice.
 291         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 292         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 293         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 294         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 295         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 296     }
 297
 298     if(IS_INTRA(mb_type)){
             // Intra prediction: the availability masks start from their "all neighbours
             // present" values; bits are cleared for each neighbour that is non-intra and
             // either absent (type 0) or excluded by constrained_intra_pred.
 299         h->topleft_samples_available=
 300         h->top_samples_available=
 301         h->left_samples_available= 0xFFFF;
 302         h->topright_samples_available= 0xEEEA;
 303
 304         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 305             h->topleft_samples_available= 0xB3FF;
 306             h->top_samples_available= 0x33FF;
 307             h->topright_samples_available= 0x26EA;
 308         }
 309         for(i=0; i<2; i++){
 310             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 311                 h->topleft_samples_available&= 0xDF5F;
 312                 h->left_samples_available&= 0x5F5F;
 313             }
 314         }
 315
 316         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 317             h->topleft_samples_available&= 0x7FFF;
 318
 319         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 320             h->topright_samples_available&= 0xFBFF;
 321
 322         if(IS_INTRA4x4(mb_type)){
                 // Neighbouring 4x4 prediction modes: copied from intra4x4 neighbours,
                 // otherwise -1 (neighbour absent, or inter with constrained_intra_pred)
                 // or 2. NOTE(review): 2 is presumably the DC prediction mode - confirm.
 323             if(IS_INTRA4x4(top_type)){
 324                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 325                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 326                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 327                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 328             }else{
 329                 int pred;
 330                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 331                     pred= -1;
 332                 else{
 333                     pred= 2;
 334                 }
 335                 h->intra4x4_pred_mode_cache[4+8*0]=
 336                 h->intra4x4_pred_mode_cache[5+8*0]=
 337                 h->intra4x4_pred_mode_cache[6+8*0]=
 338                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 339             }
 340             for(i=0; i<2; i++){
 341                 if(IS_INTRA4x4(left_type[i])){
 342                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 343                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 344                 }else{
 345                     int pred;
 346                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 347                         pred= -1;
 348                     else{
 349                         pred= 2;
 350                     }
 351                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 352                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 353                 }
 354             }
 355         }
 356     }
 357
 358
 359 /*
 360 0 . T T. T T T T
 361 1 L . .L . . . .
 362 2 L . .L . . . .
 363 3 . T TL . . . .
 364 4 L . .L . . . .
 365 5 L . .. . . . .
 366 */
 367 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         // Non-zero coefficient counts of the neighbours (cache layout sketched above:
         // T = entry taken from the top neighbour, L = from the left). A missing
         // neighbour yields 64, or 0 when CABAC is used and the current macroblock is
         // not intra.
 368     if(top_type){
 369         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 370         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 371         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 372         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 373
 374         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 375         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 376
 377         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 378         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 379
 380     }else{
 381         h->non_zero_count_cache[4+8*0]=
 382         h->non_zero_count_cache[5+8*0]=
 383         h->non_zero_count_cache[6+8*0]=
 384         h->non_zero_count_cache[7+8*0]=
 385
 386         h->non_zero_count_cache[1+8*0]=
 387         h->non_zero_count_cache[2+8*0]=
 388
 389         h->non_zero_count_cache[1+8*3]=
 390         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 391
 392     }
 393
 394     for (i=0; i<2; i++) {
 395         if(left_type[i]){
 396             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 397             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 398             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 399             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 400         }else{
 401             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 402             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 403             h->non_zero_count_cache[0+8*1 +   8*i]=
 404             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 405         }
 406     }
 407
 408     if( h->pps.cabac ) {
             // CABAC only: coded block patterns of the top and left neighbours. A missing
             // neighbour gives 0x1C0 when the current macroblock is intra, 0 otherwise.
 409         // top_cbp
 410         if(top_type) {
 411             h->top_cbp = h->cbp_table[top_xy];
 412         } else if(IS_INTRA(mb_type)) {
 413             h->top_cbp = 0x1C0;
 414         } else {
 415             h->top_cbp = 0;
 416         }
 417         // left_cbp
 418         if (left_type[0]) {
 419             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 420         } else if(IS_INTRA(mb_type)) {
 421             h->left_cbp = 0x1C0;
 422         } else {
 423             h->left_cbp = 0;
 424         }
             // bits 1 and 3 of left_cbp are taken from the cbp bit selected by
             // left_block[0] resp. left_block[2] of the corresponding left neighbour
 425         if (left_type[0]) {
 426             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 427         }
 428         if (left_type[1]) {
 429             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 430         }
 431     }
 432
 433 #if 1
         // Motion data of the neighbours, per reference list.
 434     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 435         int list;
 436         for(list=0; list<h->list_count; list++){
                 // a list the current macroblock does not use is skipped, unless the
                 // macroblock is direct or the deblocking filter is enabled
 437             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 438                 /*if(!h->mv_cache_clean[list]){
 439                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 440                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 441                     h->mv_cache_clean[list]= 1;
 442                 }*/
 443                 continue;
 444             }
 445             h->mv_cache_clean[list]= 0;
 446
                 // top neighbour: its last row of motion vectors (offset 3*b_stride) and the
                 // matching reference indices go into the cache row above the current block;
                 // otherwise zero vectors with LIST_NOT_USED (neighbour present) or
                 // PART_NOT_AVAILABLE (neighbour absent)
 447             if(USES_LIST(top_type, list)){
 448                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 450                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 451                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 452                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 453                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 454                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 455                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 456                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 457                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 458             }else{
 459                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 460                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 461                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 462                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 463                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 464             }
 465
                 // left neighbours: each of the two left entries fills two cache rows
                 // in the column left of the current block
 466             for(i=0; i<2; i++){
 467                 int cache_idx = scan8[0] - 1 + i*2*8;
 468                 if(USES_LIST(left_type[i], list)){
 469                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 470                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 471                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 472                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 473                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 474                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 475                 }else{
 476                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 477                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 478                     h->ref_cache[list][cache_idx  ]=
 479                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 480                 }
 481             }
 482
                 // outside MBAFF, the diagonal neighbours and everything after them are not
                 // loaded for deblocking or for direct macroblocks without spatial prediction
 483             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 484                 continue;
 485
                 // top-left neighbour: its last motion vector (row 3, column 3)
 486             if(USES_LIST(topleft_type, list)){
 487                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 488                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 489                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 490                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 491             }else{
 492                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 493                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 494             }
 495
                 // top-right neighbour: first motion vector of its last row
 496             if(USES_LIST(topright_type, list)){
 497                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 498                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 499                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 500                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 501             }else{
 502                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 503                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 504             }
 505
                 // skipped and direct macroblocks need nothing further outside MBAFF
 506             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 507                 continue;
 508
                 // mark these five cache positions as PART_NOT_AVAILABLE with zero vectors
 509             h->ref_cache[list][scan8[5 ]+1] =
 510             h->ref_cache[list][scan8[7 ]+1] =
 511             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 512             h->ref_cache[list][scan8[4 ]] =
 513             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 514             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 515             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 516             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 517             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 518             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 519
 520             if( h->pps.cabac ) {
                     // CABAC: the mvd values of the top and left neighbours are cached in
                     // the same layout as the motion vectors, zero where the neighbour
                     // does not use this list
 521                 /* XXX beurk, Load mvd */
 522                 if(USES_LIST(top_type, list)){
 523                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 524                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 525                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 526                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 527                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 528                 }else{
 529                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 530                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 531                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 532                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 533                 }
 534                 if(USES_LIST(left_type[0], list)){
 535                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 536                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 537                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 538                 }else{
 539                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 540                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 541                 }
 542                 if(USES_LIST(left_type[1], list)){
 543                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 544                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 545                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 546                 }else{
 547                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 548                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 549                 }
 550                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 551                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 552                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 553                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 554                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 555
 556                 if(h->slice_type == B_TYPE){
                         // B slices: direct_cache is cleared for the current block, then the
                         // neighbour entries are set to 1 for direct neighbours, copied from
                         // direct_table for 8x8 neighbours, and 0 otherwise
 557                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 558
 559                     if(IS_DIRECT(top_type)){
 560                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 561                     }else if(IS_8X8(top_type)){
 562                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 563                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 564                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 565                     }else{
 566                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 567                     }
 568
 569                     if(IS_DIRECT(left_type[0]))
 570                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 571                     else if(IS_8X8(left_type[0]))
 572                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 573                     else
 574                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 575
 576                     if(IS_DIRECT(left_type[1]))
 577                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 578                     else if(IS_8X8(left_type[1]))
 579                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 580                     else
 581                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 582                 }
 583             }
 584
 585             if(FRAME_MBAFF){
                     // MBAFF: rescale cached neighbour data whose frame/field coding differs
                     // from the current macroblock. MAP_MVS applies MAP_F2F to every cached
                     // neighbour position; MAP_F2F is defined twice below, once per direction.
 586 #define MAP_MVS\
 587                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 588                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 589                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 590                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 591                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 592                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 593                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 594                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 595                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 596                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 597                 if(MB_FIELD){
                         // MB_FIELD set, neighbour not interlaced: reference index doubled,
                         // vertical mv and mvd components halved
 598 #define MAP_F2F(idx, mb_type)\
 599                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 600                         h->ref_cache[list][idx] <<= 1;\
 601                         h->mv_cache[list][idx][1] /= 2;\
 602                         h->mvd_cache[list][idx][1] /= 2;\
 603                     }
 604                     MAP_MVS
 605 #undef MAP_F2F
 606                 }else{
                         // MB_FIELD clear, neighbour interlaced: reference index halved,
                         // vertical mv and mvd components doubled
 607 #define MAP_F2F(idx, mb_type)\
 608                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 609                         h->ref_cache[list][idx] >>= 1;\
 610                         h->mv_cache[list][idx][1] <<= 1;\
 611                         h->mvd_cache[list][idx][1] <<= 1;\
 612                     }
 613                     MAP_MVS
 614 #undef MAP_F2F
 615                 }
 616             }
 617         }
 618     }
 619 #endif
 620
         // number (0..2) of top / first-left neighbours that use the 8x8 transform
 621     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 622 }
 623
 624 static inline void write_back_intra_pred_mode(H264Context *h){
 625     MpegEncContext * const s = &h->s;
 626     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 627
 628     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 629     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 630     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 631     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 632     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 633     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 634     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 635 }
 636
 637 /**
 638  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 639  */
 640 static inline int check_intra4x4_pred_mode(H264Context *h){
 641     MpegEncContext * const s = &h->s;
 642     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 643     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 644     int i;
 645
 646     if(!(h->top_samples_available&0x8000)){
 647         for(i=0; i<4; i++){
 648             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 649             if(status<0){
 650                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 651                 return -1;
 652             } else if(status){
 653                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 654             }
 655         }
 656     }
 657
 658     if(!(h->left_samples_available&0x8000)){
 659         for(i=0; i<4; i++){
 660             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 661             if(status<0){
 662                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 663                 return -1;
 664             } else if(status){
 665                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 666             }
 667         }
 668     }
 669
 670     return 0;
 671 } //FIXME cleanup like next
 672
 673 /**
 674  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 675  */
 676 static inline int check_intra_pred_mode(H264Context *h, int mode){
 677     MpegEncContext * const s = &h->s;
 678     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 679     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 680
 681     if(mode > 6U) {
 682         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 683         return -1;
 684     }
 685
 686     if(!(h->top_samples_available&0x8000)){
 687         mode= top[ mode ];
 688         if(mode<0){
 689             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 690             return -1;
 691         }
 692     }
 693
 694     if(!(h->left_samples_available&0x8000)){
 695         mode= left[ mode ];
 696         if(mode<0){
 697             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 698             return -1;
 699         }
 700     }
 701
 702     return mode;
 703 }
 704
 705 /**
 706  * gets the predicted intra4x4 prediction mode.
 707  */
 708 static inline int pred_intra_mode(H264Context *h, int n){
 709     const int index8= scan8[n];
 710     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 711     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 712     const int min= FFMIN(left, top);
 713
 714     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 715
 716     if(min<0) return DC_PRED;
 717     else      return min;
 718 }
 719
 720 static inline void write_back_non_zero_count(H264Context *h){
 721     MpegEncContext * const s = &h->s;
 722     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 723
 724     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 725     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 726     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 727     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 728     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 729     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 730     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 731
 732     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 733     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 734     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 735
 736     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 737     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 738     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 739
 740     if(FRAME_MBAFF){
 741         // store all luma nnzs, for deblocking
 742         int v = 0, i;
 743         for(i=0; i<16; i++)
 744             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 745         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 746     }
 747 }
 748
 749 /**
 750  * gets the predicted number of non zero coefficients.
 751  * @param n block index
 752  */
 753 static inline int pred_non_zero_count(H264Context *h, int n){
 754     const int index8= scan8[n];
 755     const int left= h->non_zero_count_cache[index8 - 1];
 756     const int top = h->non_zero_count_cache[index8 - 8];
 757     int i= left + top;
 758
 759     if(i<64) i= (i+1)>>1;
 760
 761     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 762
 763     return i&31;
 764 }
 765
 766 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
         /*
          * Fetches the MV & reference index of the diagonal neighbour (C) of a partition.
          * The entry at cache position i - 8 + part_width (top-right) is used unless its
          * reference is PART_NOT_AVAILABLE, in which case i - 8 - 1 (top-left) is used.
          * With FRAME_MBAFF some neighbours are instead read from the current picture's
          * motion_val / ref_index tables and rescaled into a scratch mv_cache entry.
          *   C          set to point at the selected MV
          *   i          cache position of the partition (a scan8[] value, see pred_motion)
          *   part_width width of the partition in cache entries
          * Returns the reference index that belongs to *C.
          */
 767     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 768     MpegEncContext *s = &h->s;
 769
 770     /* there is no consistent mapping of mvs to neighboring locations that will
 771      * make mbaff happy, so we can't move all this logic to fill_caches */
 772     if(FRAME_MBAFF){
 773         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 774         const int16_t *mv;
             /* scratch entry: zeroed here, filled by SET_DIAG_MV; *C only keeps pointing
              * at it if one of the SET_DIAG_MV paths below returns */
 775         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 776         *C = h->mv_cache[list][scan8[0]-2];
 777
             /* frame MB on an odd MB row, partition in the first cache row of the MB,
              * top-right neighbour present and belonging to an interlaced MB */
 778         if(!MB_FIELD
 779            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 780             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 781             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV: reads the MV at 4x4 block position (X4,Y4) of the current
                      * picture into the scratch entry, applying MV_OP to its vertical component,
                      * and RETURNS from fetch_diagonal_mv with the matching ref_index modified by
                      * REF_OP. Returns LIST_NOT_USED if the MB at that position neither uses this
                      * list nor is 8x8. */
 782 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 783                 const int x4 = X4, y4 = Y4;\
 784                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 785                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 786                     return LIST_NOT_USED;\
 787                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 788                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 789                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 790                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 791
 792                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 793             }
 794         }
             /* no top-right neighbour, partition at cache column 4 and left neighbour
              * present: C is taken from the column of 4x4 blocks left of the MB
              * (x4 = mb_x*4-1) when current and left MB differ in field/frame coding */
 795         if(topright_ref == PART_NOT_AVAILABLE
 796            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 797            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 798             if(!MB_FIELD
 799                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 800                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 801             }
 802             if(MB_FIELD
 803                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 804                && i >= scan8[0]+8){
 805                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 806                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 807             }
 808         }
 809 #undef SET_DIAG_MV
 810     }
 811
         /* generic path: top-right entry of the caches, else the top-left one */
 812     if(topright_ref != PART_NOT_AVAILABLE){
 813         *C= h->mv_cache[list][ i - 8 + part_width ];
 814         return topright_ref;
 815     }else{
 816         tprintf(s->avctx, "topright MV not available\n");
 817
 818         *C= h->mv_cache[list][ i - 8 - 1 ];
 819         return h->ref_cache[list][ i - 8 - 1 ];
 820     }
 821 }
 822
 823 /**
 824  * gets the predicted MV.
 825  * @param n the block index
 826  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 827  * @param mx the x component of the predicted motion vector
 828  * @param my the y component of the predicted motion vector
 829  */
 830 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 831     const int index8= scan8[n];
 832     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 833     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 834     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 835     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 836     const int16_t * C;
 837     int diagonal_ref, match_count;
 838
 839     assert(part_width==1 || part_width==2 || part_width==4);
 840
 841 /* mv_cache
 842   B . . A T T T T
 843   U . . L . . , .
 844   U . . L . . . .
 845   U . . L . . , .
 846   . . . L . . . .
 847 */
 848
 849     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 850     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 851     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 852     if(match_count > 1){ //most common
 853         *mx= mid_pred(A[0], B[0], C[0]);
 854         *my= mid_pred(A[1], B[1], C[1]);
 855     }else if(match_count==1){
 856         if(left_ref==ref){
 857             *mx= A[0];
 858             *my= A[1];
 859         }else if(top_ref==ref){
 860             *mx= B[0];
 861             *my= B[1];
 862         }else{
 863             *mx= C[0];
 864             *my= C[1];
 865         }
 866     }else{
 867         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 868             *mx= A[0];
 869             *my= A[1];
 870         }else{
 871             *mx= mid_pred(A[0], B[0], C[0]);
 872             *my= mid_pred(A[1], B[1], C[1]);
 873         }
 874     }
 875
 876     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 877 }
 878
 879 /**
 880  * gets the directionally predicted 16x8 MV.
 881  * @param n the block index
 882  * @param mx the x component of the predicted motion vector
 883  * @param my the y component of the predicted motion vector
 884  */
 885 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 886     if(n==0){
 887         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 888         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 889
 890         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 891
 892         if(top_ref == ref){
 893             *mx= B[0];
 894             *my= B[1];
 895             return;
 896         }
 897     }else{
 898         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 899         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 900
 901         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 902
 903         if(left_ref == ref){
 904             *mx= A[0];
 905             *my= A[1];
 906             return;
 907         }
 908     }
 909
 910     //RARE
 911     pred_motion(h, n, 4, list, ref, mx, my);
 912 }
 913
 914 /**
 915  * gets the directionally predicted 8x16 MV.
 916  * @param n the block index
 917  * @param mx the x component of the predicted motion vector
 918  * @param my the y component of the predicted motion vector
 919  */
 920 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 921     if(n==0){
 922         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 923         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 924
 925         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 926
 927         if(left_ref == ref){
 928             *mx= A[0];
 929             *my= A[1];
 930             return;
 931         }
 932     }else{
 933         const int16_t * C;
 934         int diagonal_ref;
 935
 936         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 937
 938         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 939
 940         if(diagonal_ref == ref){
 941             *mx= C[0];
 942             *my= C[1];
 943             return;
 944         }
 945     }
 946
 947     //RARE
 948     pred_motion(h, n, 2, list, ref, mx, my);
 949 }
 950
 951 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 952     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 953     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 954
 955     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 956
 957     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 958        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 959        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 960
 961         *mx = *my = 0;
 962         return;
 963     }
 964
 965     pred_motion(h, 0, 4, 0, 0, mx, my);
 966
 967     return;
 968 }
 969
 970 static inline void direct_dist_scale_factor(H264Context * const h){
 971     const int poc = h->s.current_picture_ptr->poc;
 972     const int poc1 = h->ref_list[1][0].poc;
 973     int i;
 974     for(i=0; i<h->ref_count[0]; i++){
 975         int poc0 = h->ref_list[0][i].poc;
 976         int td = av_clip(poc1 - poc0, -128, 127);
 977         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 978             h->dist_scale_factor[i] = 256;
 979         }else{
 980             int tb = av_clip(poc - poc0, -128, 127);
 981             int tx = (16384 + (FFABS(td) >> 1)) / td;
 982             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 983         }
 984     }
 985     if(FRAME_MBAFF){
 986         for(i=0; i<h->ref_count[0]; i++){
 987             h->dist_scale_factor_field[2*i] =
 988             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 989         }
 990     }
 991 }
 992 static inline void direct_ref_list_init(H264Context * const h){
 993     MpegEncContext * const s = &h->s;
 994     Picture * const ref1 = &h->ref_list[1][0];
 995     Picture * const cur = s->current_picture_ptr;
 996     int list, i, j;
 997     if(cur->pict_type == I_TYPE)
 998         cur->ref_count[0] = 0;
 999     if(cur->pict_type != B_TYPE)
1000         cur->ref_count[1] = 0;
1001     for(list=0; list<2; list++){
1002         cur->ref_count[list] = h->ref_count[list];
1003         for(j=0; j<h->ref_count[list]; j++)
1004             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1005     }
1006     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1007         return;
1008     for(list=0; list<2; list++){
1009         for(i=0; i<ref1->ref_count[list]; i++){
1010             const int poc = ref1->ref_poc[list][i];
1011             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1012             for(j=0; j<h->ref_count[list]; j++)
1013                 if(h->ref_list[list][j].poc == poc){
1014                     h->map_col_to_list0[list][i] = j;
1015                     break;
1016                 }
1017         }
1018     }
1019     if(FRAME_MBAFF){
1020         for(list=0; list<2; list++){
1021             for(i=0; i<ref1->ref_count[list]; i++){
1022                 j = h->map_col_to_list0[list][i];
1023                 h->map_col_to_list0_field[list][2*i] = 2*j;
1024                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1025             }
1026         }
1027     }
1028 }
1029
1030 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
         /*
          * Derives the reference indices and motion vectors of a direct predicted
          * macroblock (or of its direct 8x8 sub blocks when *mb_type is 8x8) and
          * stores them in h->ref_cache / h->mv_cache.
          * Colocated data (mb_type, motion_val, ref_index) is read from the first
          * list 1 reference, h->ref_list[1][0].
          * *mb_type is updated with the partitioning / list usage that was chosen and
          * h->sub_mb_type[] is set for every 8x8 block that is processed.
          * h->direct_spatial_mv_pred selects spatial instead of temporal prediction.
          */
1031     MpegEncContext * const s = &h->s;
1032     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1033     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1034     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1035     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1036     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1037     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1038     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1039     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1040     const int is_b8x8 = IS_8X8(*mb_type);
1041     unsigned int sub_mb_type;
1042     int i8, i4;
1043
         /* NOTE: this macro is not #undef'ed inside this function */
1044 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
         /* pick the partitioning of the direct MB from the type of the colocated MB */
1045     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1046         /* FIXME save sub mb types from previous frames (or derive from MVs)
1047          * so we know exactly what block size to use */
1048         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1049         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1050     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1051         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1052         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1053     }else{
1054         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1055         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1056     }
1057     if(!is_b8x8)
1058         *mb_type |= MB_TYPE_DIRECT2;
1059     if(MB_FIELD)
1060         *mb_type |= MB_TYPE_INTERLACED;
1061
1062     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1063
1064     if(h->direct_spatial_mv_pred){
1065         int ref[2];
1066         int mv[2][2];
1067         int list;
1068
1069         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1070
1071         /* ref = min(neighbors) */
             /* i.e. the smallest non negative reference among the left, top and top-right
              * neighbours; -1 if none of them is non negative */
1072         for(list=0; list<2; list++){
1073             int refa = h->ref_cache[list][scan8[0] - 1];
1074             int refb = h->ref_cache[list][scan8[0] - 8];
1075             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* -2 is presumably PART_NOT_AVAILABLE (TODO confirm): fall back to the top-left neighbour */
1076             if(refc == -2)
1077                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1078             ref[list] = refa;
1079             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1080                 ref[list] = refb;
1081             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1082                 ref[list] = refc;
1083             if(ref[list] < 0)
1084                 ref[list] = -1;
1085         }
1086
             /* no usable reference in either list: use reference 0 with zero MVs for both
              * lists, otherwise predict the MV of every list that has a reference */
1087         if(ref[0] < 0 && ref[1] < 0){
1088             ref[0] = ref[1] = 0;
1089             mv[0][0] = mv[0][1] =
1090             mv[1][0] = mv[1][1] = 0;
1091         }else{
1092             for(list=0; list<2; list++){
1093                 if(ref[list] >= 0)
1094                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1095                 else
1096                     mv[list][0] = mv[list][1] = 0;
1097             }
1098         }
1099
             /* remove the list without a reference from the (sub) MB type */
1100         if(ref[1] < 0){
1101             *mb_type &= ~MB_TYPE_P0L1;
1102             sub_mb_type &= ~MB_TYPE_P0L1;
1103         }else if(ref[0] < 0){
1104             *mb_type &= ~MB_TYPE_P0L0;
1105             sub_mb_type &= ~MB_TYPE_P0L0;
1106         }
1107
1108         if(IS_16X16(*mb_type)){
1109             int a=0, b=0;
1110
1111             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1112             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* colocated block is not intra, uses reference 0 and has both MV components
                  * within +-1: the predicted MV is only kept for lists whose reference is > 0,
                  * the others get a zero MV */
1113             if(!IS_INTRA(mb_type_col)
1114                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1115                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1116                        && (h->x264_build>33 || !h->x264_build)))){
1117                 if(ref[0] > 0)
1118                     a= pack16to32(mv[0][0],mv[0][1]);
1119                 if(ref[1] > 0)
1120                     b= pack16to32(mv[1][0],mv[1][1]);
1121             }else{
1122                 a= pack16to32(mv[0][0],mv[0][1]);
1123                 b= pack16to32(mv[1][0],mv[1][1]);
1124             }
1125             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1126             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1127         }else{
1128             for(i8=0; i8<4; i8++){
1129                 const int x8 = i8&1;
1130                 const int y8 = i8>>1;
1131
                     /* in an 8x8 MB only the sub blocks coded as direct are filled in */
1132                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1133                     continue;
1134                 h->sub_mb_type[i8] = sub_mb_type;
1135
1136                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1137                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1138                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1139                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1140
1141                 /* col_zero_flag */
1142                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1143                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1144                                                   && (h->x264_build>33 || !h->x264_build)))){
1145                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1146                     if(IS_SUB_8X8(sub_mb_type)){
                             /* a single colocated MV (offset x8*3, y8*3) decides for the whole 8x8 block */
1147                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1148                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1149                             if(ref[0] == 0)
1150                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1151                             if(ref[1] == 0)
1152                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1153                         }
1154                     }else
1155                     for(i4=0; i4<4; i4++){
1156                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1157                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1158                             if(ref[0] == 0)
1159                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1160                             if(ref[1] == 0)
1161                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1162                         }
1163                     }
1164                 }
1165             }
1166         }
1167     }else{ /* direct temporal mv pred */
             /* list 0 reference: colocated reference mapped through map_col_to_list0
              * list 0 MV: colocated MV scaled by dist_scale_factor/256 (rounded)
              * list 1 reference is 0, list 1 MV = list 0 MV - colocated MV */
1168         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1169         const int *dist_scale_factor = h->dist_scale_factor;
1170
1171         if(FRAME_MBAFF){
1172             if(IS_INTERLACED(*mb_type)){
1173                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1174                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1175                 dist_scale_factor = h->dist_scale_factor_field;
1176             }
                 /* current and colocated MB differ in field/frame coding; this branch
                  * fills the caches itself and returns */
1177             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1178                 /* FIXME assumes direct_8x8_inference == 1 */
1179                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1180                 int mb_types_col[2];
1181                 int y_shift;
1182
1183                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1184                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1185                          | (*mb_type & MB_TYPE_INTERLACED);
1186                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1187
1188                 if(IS_INTERLACED(*mb_type)){
1189                     /* frame to field scaling */
1190                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1191                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
                         /* on odd MB rows move the colocated pointers up by one MB row */
1192                     if(s->mb_y&1){
1193                         l1ref0 -= 2*h->b8_stride;
1194                         l1ref1 -= 2*h->b8_stride;
1195                         l1mv0 -= 4*h->b_stride;
1196                         l1mv1 -= 4*h->b_stride;
1197                     }
1198                     y_shift = 0;
1199
1200                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1201                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1202                        && !is_b8x8)
1203                         *mb_type |= MB_TYPE_16x8;
1204                     else
1205                         *mb_type |= MB_TYPE_8x8;
1206                 }else{
1207                     /* field to frame scaling */
1208                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1209                      * but in MBAFF, top and bottom POC are equal */
1210                     int dy = (s->mb_y&1) ? 1 : 2;
1211                     mb_types_col[0] =
1212                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1213                     l1ref0 += dy*h->b8_stride;
1214                     l1ref1 += dy*h->b8_stride;
1215                     l1mv0 += 2*dy*h->b_stride;
1216                     l1mv1 += 2*dy*h->b_stride;
1217                     y_shift = 2;
1218
1219                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1220                        && !is_b8x8)
1221                         *mb_type |= MB_TYPE_16x16;
1222                     else
1223                         *mb_type |= MB_TYPE_8x8;
1224                 }
1225
1226                 for(i8=0; i8<4; i8++){
1227                     const int x8 = i8&1;
1228                     const int y8 = i8>>1;
1229                     int ref0, scale;
1230                     const int16_t (*l1mv)[2]= l1mv0;
1231
1232                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1233                         continue;
1234                     h->sub_mb_type[i8] = sub_mb_type;
1235
1236                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra colocated MB: reference 0 and zero MVs in both lists */
1237                     if(IS_INTRA(mb_types_col[y8])){
1238                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1239                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1240                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1241                         continue;
1242                     }
1243
                         /* use the colocated list 0 reference & MV if present, else list 1 */
1244                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1245                     if(ref0 >= 0)
1246                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1247                     else{
1248                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1249                         l1mv= l1mv1;
1250                     }
1251                     scale = dist_scale_factor[ref0];
1252                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1253
1254                     {
1255                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* y_shift 0 (frame to field): vertical MV halved,
                              * y_shift 2 (field to frame): vertical MV doubled */
1256                         int my_col = (mv_col[1]<<y_shift)/2;
1257                         int mx = (scale * mv_col[0] + 128) >> 8;
1258                         int my = (scale * my_col + 128) >> 8;
1259                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1260                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1261                     }
1262                 }
1263                 return;
1264             }
1265         }
1266
1267         /* one-to-one mv scaling */
1268
1269         if(IS_16X16(*mb_type)){
1270             int ref, mv0, mv1;
1271
1272             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1273             if(IS_INTRA(mb_type_col)){
1274                 ref=mv0=mv1=0;
1275             }else{
1276                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1277                                                 : map_col_to_list0[1][l1ref1[0]];
1278                 const int scale = dist_scale_factor[ref0];
1279                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1280                 int mv_l0[2];
1281                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1282                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1283                 ref= ref0;
1284                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1285                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1286             }
1287             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1288             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1289             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1290         }else{
1291             for(i8=0; i8<4; i8++){
1292                 const int x8 = i8&1;
1293                 const int y8 = i8>>1;
1294                 int ref0, scale;
1295                 const int16_t (*l1mv)[2]= l1mv0;
1296
1297                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1298                     continue;
1299                 h->sub_mb_type[i8] = sub_mb_type;
1300                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1301                 if(IS_INTRA(mb_type_col)){
1302                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1303                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1304                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1305                     continue;
1306                 }
1307
1308                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1309                 if(ref0 >= 0)
1310                     ref0 = map_col_to_list0[0][ref0];
1311                 else{
1312                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1313                     l1mv= l1mv1;
1314                 }
1315                 scale = dist_scale_factor[ref0];
1316
1317                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1318                 if(IS_SUB_8X8(sub_mb_type)){
                         /* one colocated MV (offset x8*3, y8*3) is scaled for the whole 8x8 block */
1319                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1320                     int mx = (scale * mv_col[0] + 128) >> 8;
1321                     int my = (scale * mv_col[1] + 128) >> 8;
1322                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1323                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1324                 }else
                     /* otherwise every 4x4 block scales its own colocated MV */
1325                 for(i4=0; i4<4; i4++){
1326                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1327                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1328                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1329                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1330                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1331                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1332                 }
1333             }
1334         }
1335     }
1336 }
1337
1338 static inline void write_back_motion(H264Context *h, int mb_type){
1339     MpegEncContext * const s = &h->s;
1340     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1341     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1342     int list;
1343
1344     if(!USES_LIST(mb_type, 0))
1345         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1346
1347     for(list=0; list<h->list_count; list++){
1348         int y;
1349         if(!USES_LIST(mb_type, list))
1350             continue;
1351
1352         for(y=0; y<4; y++){
1353             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1354             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1355         }
1356         if( h->pps.cabac ) {
1357             if(IS_SKIP(mb_type))
1358                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1359             else
1360             for(y=0; y<4; y++){
1361                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1362                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1363             }
1364         }
1365
1366         {
1367             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1368             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1369             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1370             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1371             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1372         }
1373     }
1374
1375     if(h->slice_type == B_TYPE && h->pps.cabac){
1376         if(IS_8X8(mb_type)){
1377             uint8_t *direct_table = &h->direct_table[b8_xy];
1378             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1379             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1380             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1381         }
1382     }
1383 }
1384
1385 /**
1386  * Decodes a network abstraction layer unit.
1387  * @param consumed is the number of bytes used as input
1388  * @param length is the length of the array
1389  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1390  * @returns decoded bytes, might be src+1 if no escapes
1391  */
1392 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1393     int i, si, di;
1394     uint8_t *dst;
1395
1396 //    src[0]&0x80;                //forbidden bit
1397     h->nal_ref_idc= src[0]>>5;
1398     h->nal_unit_type= src[0]&0x1F;
1399
1400     src++; length--;
1401 #if 0
1402     for(i=0; i<length; i++)
1403         printf("%2X ", src[i]);
1404 #endif
1405     for(i=0; i+1<length; i+=2){
1406         if(src[i]) continue;
1407         if(i>0 && src[i-1]==0) i--;
1408         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1409             if(src[i+2]!=3){
1410                 /* startcode, so we must be past the end */
1411                 length=i;
1412             }
1413             break;
1414         }
1415     }
1416
1417     if(i>=length-1){ //no escaped 0
1418         *dst_length= length;
1419         *consumed= length+1; //+1 for the header
1420         return src;
1421     }
1422
1423     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1424     dst= h->rbsp_buffer;
1425
1426     if (dst == NULL){
1427         return NULL;
1428     }
1429
1430 //printf("decoding esc\n");
1431     si=di=0;
1432     while(si<length){
1433         //remove escapes (very rare 1:2^22)
1434         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1435             if(src[si+2]==3){ //escape
1436                 dst[di++]= 0;
1437                 dst[di++]= 0;
1438                 si+=3;
1439                 continue;
1440             }else //next start code
1441                 break;
1442         }
1443
1444         dst[di++]= src[si++];
1445     }
1446
1447     *dst_length= di;
1448     *consumed= si + 1;//+1 for the header
1449 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1450     return dst;
1451 }
1452
1453 /**
1454  * identifies the exact end of the bitstream
1455  * @return the length of the trailing, or 0 if damaged
1456  */
1457 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1458     int v= *src;
1459     int r;
1460
1461     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1462
1463     for(r=1; r<9; r++){
1464         if(v&1) return r;
1465         v>>=1;
1466     }
1467     return 0;
1468 }
1469
1470 /**
1471  * idct tranforms the 16 dc values and dequantize them.
1472  * @param qp quantization parameter
1473  */
1474 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1475 #define stride 16
1476     int i;
1477     int temp[16]; //FIXME check if this is a good idea
1478     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1479     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1480
1481 //memset(block, 64, 2*256);
1482 //return;
1483     for(i=0; i<4; i++){
1484         const int offset= y_offset[i];
1485         const int z0= block[offset+stride*0] + block[offset+stride*4];
1486         const int z1= block[offset+stride*0] - block[offset+stride*4];
1487         const int z2= block[offset+stride*1] - block[offset+stride*5];
1488         const int z3= block[offset+stride*1] + block[offset+stride*5];
1489
1490         temp[4*i+0]= z0+z3;
1491         temp[4*i+1]= z1+z2;
1492         temp[4*i+2]= z1-z2;
1493         temp[4*i+3]= z0-z3;
1494     }
1495
1496     for(i=0; i<4; i++){
1497         const int offset= x_offset[i];
1498         const int z0= temp[4*0+i] + temp[4*2+i];
1499         const int z1= temp[4*0+i] - temp[4*2+i];
1500         const int z2= temp[4*1+i] - temp[4*3+i];
1501         const int z3= temp[4*1+i] + temp[4*3+i];
1502
1503         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1504         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1505         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1506         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1507     }
1508 }
1509
#if 0
/**
 * dct transforms the 16 dc values.
 * Disabled code (inside #if 0). It uses the same butterflies and offset
 * tables as h264_luma_dc_dequant_idct_c() but halves the results instead of
 * dequantizing them, and it relies on the `stride` macro still being defined.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: results are kept in temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: written back to block with a shift right by 1 */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1549
1550 #undef xStride
1551 #undef stride
1552
1553 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1554     const int stride= 16*2;
1555     const int xStride= 16;
1556     int a,b,c,d,e;
1557
1558     a= block[stride*0 + xStride*0];
1559     b= block[stride*0 + xStride*1];
1560     c= block[stride*1 + xStride*0];
1561     d= block[stride*1 + xStride*1];
1562
1563     e= a-b;
1564     a= a+b;
1565     b= c-d;
1566     c= c+d;
1567
1568     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1569     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1570     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1571     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1572 }
1573
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values in place.
 * Disabled code (inside #if 0); same butterflies as
 * chroma_dc_dequant_idct_c() but without any scaling of the results.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* sums and differences of the two values of each row */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1596
1597 /**
1598  * gets the chroma qp.
1599  */
1600 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1601
1602     return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)];
1603 }
1604
//FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
/**
 * Quantizes the 16 coefficients of a block in place.
 * Coefficients are visited in scantable order; a coefficient whose product
 * with its quant_coeff entry lies within [-threshold1, threshold1] is set
 * to 0, everything else is biased, shifted and stored with its sign.
 * @param block       coefficients, overwritten with the quantized levels
 * @param scantable   maps the scan position to the index within block
 * @param qscale      selects the row of quant_coeff
 * @param intra       non zero selects the larger rounding bias (1/3 instead of 1/6)
 * @param separate_dc non zero quantizes block[0] with its own shift and
 *                    skips scan position 0 in the main loop
 * @return scan position of the last non zero coefficient found by the main
 *         loop; if none was found, 0 with separate_dc and -1 without
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(separate_dc){
        if(qscale<=18){
            //avoid overflows
            /* uses the table row of qscale+18 together with a shift that is
             * 2 bits shorter */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            /* single unsigned compare: true when level is outside
             * [-dc_threshold1, dc_threshold1] */
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            /* regular table row, but with a shift that is 1 bit longer */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        /* the dc is reported as position 0 whether or not it is zero */
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        /* true when level is outside [-threshold1, threshold1] */
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
1683
/**
 * 4x4 vertical prediction: copies the four pixels above the block into each
 * of its four rows.
 * @param topright not used
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t above= *(uint32_t*)(src-stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= above;
}
1691
/**
 * 4x4 horizontal prediction: fills each row with the pixel to its left.
 * @param topright not used
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    /* unsigned constant: pixel*0x01010101 overflows a signed int for
     * pixel values >= 128 */
    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= src[-1+y*stride]*0x01010101U;
}
1698
/**
 * 4x4 DC prediction: fills the block with the rounded average of the four
 * pixels above and the four pixels to the left.
 * @param topright not used
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1708
/**
 * 4x4 DC prediction using only the left neighbours: fills the block with
 * the rounded average of the four pixels to its left.
 * @param topright not used
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1717
/**
 * 4x4 DC prediction using only the top neighbours: fills the block with the
 * rounded average of the four pixels above it.
 * @param topright not used
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1726
/**
 * 4x4 DC prediction without neighbours: fills the block with the value 128.
 * @param topright not used
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= grey;
}
1733
1734
/* Declares t4..t7 and loads them from topright[0..3]. Expects a pointer
 * named topright in scope. Each declaration is tagged av_unused because not
 * every user reads all four values. The macro body already ends in ';'. */
#define LOAD_TOP_RIGHT_EDGE\
    const int av_unused t4= topright[0];\
    const int av_unused t5= topright[1];\
    const int av_unused t6= topright[2];\
    const int av_unused t7= topright[3];\

/* Declares l0..l3 and loads them from the column left of the block
 * (src[-1 + y*stride], y = 0..3). Expects src and stride in scope. */
#define LOAD_LEFT_EDGE\
    const int av_unused l0= src[-1+0*stride];\
    const int av_unused l1= src[-1+1*stride];\
    const int av_unused l2= src[-1+2*stride];\
    const int av_unused l3= src[-1+3*stride];\

/* Declares t0..t3 and loads them from the row above the block
 * (src[x - stride], x = 0..3). Expects src and stride in scope. */
#define LOAD_TOP_EDGE\
    const int av_unused t0= src[ 0-1*stride];\
    const int av_unused t1= src[ 1-1*stride];\
    const int av_unused t2= src[ 2-1*stride];\
    const int av_unused t3= src[ 3-1*stride];\
1752
1753 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
1754     const int lt= src[-1-1*stride];
1755     LOAD_TOP_EDGE
1756     LOAD_LEFT_EDGE
1757
1758     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
1759     src[0+2*stride]=
1760     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
1761     src[0+1*stride]=
1762     src[1+2*stride]=
1763     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
1764     src[0+0*stride]=
1765     src[1+1*stride]=
1766     src[2+2*stride]=
1767     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1768     src[1+0*stride]=
1769     src[2+1*stride]=
1770     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
1771     src[2+0*stride]=
1772     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1773     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1774 }
1775
1776 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
1777     LOAD_TOP_EDGE
1778     LOAD_TOP_RIGHT_EDGE
1779 //    LOAD_LEFT_EDGE
1780
1781     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
1782     src[1+0*stride]=
1783     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
1784     src[2+0*stride]=
1785     src[1+1*stride]=
1786     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
1787     src[3+0*stride]=
1788     src[2+1*stride]=
1789     src[1+2*stride]=
1790     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
1791     src[3+1*stride]=
1792     src[2+2*stride]=
1793     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
1794     src[3+2*stride]=
1795     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
1796     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
1797 }
1798
1799 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
1800     const int lt= src[-1-1*stride];
1801     LOAD_TOP_EDGE
1802     LOAD_LEFT_EDGE
1803
1804     src[0+0*stride]=
1805     src[1+2*stride]=(lt + t0 + 1)>>1;
1806     src[1+0*stride]=
1807     src[2+2*stride]=(t0 + t1 + 1)>>1;
1808     src[2+0*stride]=
1809     src[3+2*stride]=(t1 + t2 + 1)>>1;
1810     src[3+0*stride]=(t2 + t3 + 1)>>1;
1811     src[0+1*stride]=
1812     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1813     src[1+1*stride]=
1814     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
1815     src[2+1*stride]=
1816     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1817     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1818     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1819     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1820 }
1821
1822 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
1823     LOAD_TOP_EDGE
1824     LOAD_TOP_RIGHT_EDGE
1825
1826     src[0+0*stride]=(t0 + t1 + 1)>>1;
1827     src[1+0*stride]=
1828     src[0+2*stride]=(t1 + t2 + 1)>>1;
1829     src[2+0*stride]=
1830     src[1+2*stride]=(t2 + t3 + 1)>>1;
1831     src[3+0*stride]=
1832     src[2+2*stride]=(t3 + t4+ 1)>>1;
1833     src[3+2*stride]=(t4 + t5+ 1)>>1;
1834     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1835     src[1+1*stride]=
1836     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1837     src[2+1*stride]=
1838     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
1839     src[3+1*stride]=
1840     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
1841     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
1842 }
1843
1844 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
1845     LOAD_LEFT_EDGE
1846
1847     src[0+0*stride]=(l0 + l1 + 1)>>1;
1848     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1849     src[2+0*stride]=
1850     src[0+1*stride]=(l1 + l2 + 1)>>1;
1851     src[3+0*stride]=
1852     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1853     src[2+1*stride]=
1854     src[0+2*stride]=(l2 + l3 + 1)>>1;
1855     src[3+1*stride]=
1856     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
1857     src[3+2*stride]=
1858     src[1+3*stride]=
1859     src[0+3*stride]=
1860     src[2+2*stride]=
1861     src[2+3*stride]=
1862     src[3+3*stride]=l3;
1863 }
1864
1865 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
1866     const int lt= src[-1-1*stride];
1867     LOAD_TOP_EDGE
1868     LOAD_LEFT_EDGE
1869
1870     src[0+0*stride]=
1871     src[2+1*stride]=(lt + l0 + 1)>>1;
1872     src[1+0*stride]=
1873     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
1874     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
1875     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1876     src[0+1*stride]=
1877     src[2+2*stride]=(l0 + l1 + 1)>>1;
1878     src[1+1*stride]=
1879     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
1880     src[0+2*stride]=
1881     src[2+3*stride]=(l1 + l2+ 1)>>1;
1882     src[1+2*stride]=
1883     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
1884     src[0+3*stride]=(l2 + l3 + 1)>>1;
1885     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
1886 }
1887
/**
 * 16x16 vertical prediction: copies the 16 pixels above the block into each
 * of its 16 rows.
 */
void ff_pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t * const out= (uint32_t*)(src+y*stride);

        out[0]= w0;
        out[1]= w1;
        out[2]= w2;
        out[3]= w3;
    }
}
1902
/**
 * 16x16 horizontal prediction: fills each row with the pixel to its left.
 */
void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        /* unsigned constant: pixel*0x01010101 overflows a signed int for
         * pixel values >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
1913
/**
 * 16x16 DC prediction: fills the block with the rounded average of the 16
 * pixels to the left and the 16 pixels above.
 */
void ff_pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    splat= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
1934
/**
 * 16x16 DC prediction using only the left neighbours: fills the block with
 * the rounded average of the 16 pixels to its left.
 */
void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
1951
/**
 * 16x16 DC prediction using only the top neighbours: fills the block with
 * the rounded average of the 16 pixels above it.
 */
void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= splat;
    }
}
1967
/**
 * 16x16 DC prediction without neighbours: fills the block with the value 128.
 */
void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y, w;

    for(y=0; y<16; y++){
        uint32_t * const out= (uint32_t*)(src+y*stride);

        for(w=0; w<4; w++)
            out[w]= grey;
    }
}
1978
1979 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
1980   int i, j, k;
1981   int a;
1982   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1983   const uint8_t * const src0 = src+7-stride;
1984   const uint8_t *src1 = src+8*stride-1;
1985   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
1986   int H = src0[1] - src0[-1];
1987   int V = src1[0] - src2[ 0];
1988   for(k=2; k<=8; ++k) {
1989     src1 += stride; src2 -= stride;
1990     H += k*(src0[k] - src0[-k]);
1991     V += k*(src1[0] - src2[ 0]);
1992   }
1993   if(svq3){
1994     H = ( 5*(H/4) ) / 16;
1995     V = ( 5*(V/4) ) / 16;
1996
1997     /* required for 100% accuracy */
1998     i = H; H = V; V = i;
1999   }else{
2000     H = ( 5*H+32 ) >> 6;
2001     V = ( 5*V+32 ) >> 6;
2002   }
2003
2004   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2005   for(j=16; j>0; --j) {
2006     int b = a;
2007     a += V;
2008     for(i=-16; i<0; i+=4) {
2009       src[16+i] = cm[ (b    ) >> 5 ];
2010       src[17+i] = cm[ (b+  H) >> 5 ];
2011       src[18+i] = cm[ (b+2*H) >> 5 ];
2012       src[19+i] = cm[ (b+3*H) >> 5 ];
2013       b += 4*H;
2014     }
2015     src += stride;
2016   }
2017 }
2018
/**
 * 16x16 plane prediction; calls pred16x16_plane_compat_c() with svq3 set to 0.
 */
void ff_pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
2022
/**
 * 8x8 vertical prediction: copies the 8 pixels above the block into each of
 * its 8 rows.
 */
void ff_pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (const uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t * const out= (uint32_t*)(src+y*stride);

        out[0]= w0;
        out[1]= w1;
    }
}
2033
/**
 * 8x8 horizontal prediction: fills each row with the pixel to its left.
 */
void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        /* unsigned constant: pixel*0x01010101 overflows a signed int for
         * pixel values >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat;
    }
}
2042
/**
 * 8x8 DC prediction without neighbours: fills the block with the value 128.
 */
void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t * const out= (uint32_t*)(src+y*stride);

        out[0]= grey;
        out[1]= grey;
    }
}
2051
/**
 * 8x8 DC prediction using only the left neighbours.
 * The upper four rows are filled with the rounded average of the left
 * pixels of rows 0..3, the lower four rows with that of rows 4..7.
 */
void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum2;
    uint32_t dc0, dc2;

    sum0=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<8; i++){
        const uint32_t splat= i<4 ? dc0 : dc2;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat;
    }
}
2073
/**
 * 8x8 DC prediction using only the top neighbours.
 * The left four columns are filled with the rounded average of the top
 * pixels 0..3, the right four columns with that of the top pixels 4..7.
 */
void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1;
    uint32_t dc0, dc1;

    sum0=sum1=0;
    for(i=0;i<4; i++){
        sum0+= src[i-stride];
        sum1+= src[4+i-stride];
    }
    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc1= 0x01010101U*((sum1 + 2)>>2);

    /* all eight rows get the same two words */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2095
2096
/**
 * 8x8 DC prediction, done separately for each 4x4 quadrant:
 * - top left:     average of top pixels 0..3 and left pixels 0..3
 * - top right:    average of top pixels 4..7
 * - bottom left:  average of left pixels 4..7
 * - bottom right: average of top pixels 4..7 and left pixels 4..7
 */
void ff_pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1, sum2;
    uint32_t dc0, dc1, dc2, dc3;

    sum0=sum1=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* unsigned constant: 0x01010101*dc overflows a signed int for dc >= 128 */
    dc3= 0x01010101U*((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*((sum0 + 4)>>3);
    dc1= 0x01010101U*((sum1 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2121
2122 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2123   int j, k;
2124   int a;
2125   uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2126   const uint8_t * const src0 = src+3-stride;
2127   const uint8_t *src1 = src+4*stride-1;
2128   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2129   int H = src0[1] - src0[-1];
2130   int V = src1[0] - src2[ 0];
2131   for(k=2; k<=4; ++k) {
2132     src1 += stride; src2 -= stride;
2133     H += k*(src0[k] - src0[-k]);
2134     V += k*(src1[0] - src2[ 0]);
2135   }
2136   H = ( 17*H+16 ) >> 5;
2137   V = ( 17*V+16 ) >> 5;
2138
2139   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2140   for(j=8; j>0; --j) {
2141     int b = a;
2142     a += V;
2143     src[0] = cm[ (b    ) >> 5 ];
2144     src[1] = cm[ (b+  H) >> 5 ];
2145     src[2] = cm[ (b+2*H) >> 5 ];
2146     src[3] = cm[ (b+3*H) >> 5 ];
2147     src[4] = cm[ (b+4*H) >> 5 ];
2148     src[5] = cm[ (b+5*H) >> 5 ];
2149     src[6] = cm[ (b+6*H) >> 5 ];
2150     src[7] = cm[ (b+7*H) >> 5 ];
2151     src += stride;
2152   }
2153 }
2154
/* Pixel at column x, row y relative to src; negative coordinates address
 * the neighbours above and to the left. Expects src and stride in scope. */
#define SRC(x,y) src[(x)+(y)*stride]
/* Declares l<y>: left neighbour of row y, filtered with the weights 1 2 1
 * over the left neighbours of rows y-1, y and y+1. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Declares the filtered left neighbours l0..l7. l0 uses the top-left pixel
 * when has_topleft is set and repeats SRC(-1,0) otherwise; l7 gives the last
 * left pixel the weight 3 because there is no row 8. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* Declares t<x>: top neighbour of column x, filtered with the weights 1 2 1
 * over the top neighbours of columns x-1, x and x+1. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares the filtered top neighbours t0..t7. t0 uses the top-left pixel
 * when has_topleft is set and repeats SRC(0,-1) otherwise; t7 uses SRC(8,-1)
 * when has_topright is set and repeats SRC(7,-1) otherwise. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* Like PT() but assigns to an already declared t<x>. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t8..t15. With has_topright they are the filtered pixels
 * SRC(8..15,-1); without it all of them are set to the unfiltered SRC(7,-1). */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Declares lt: the top-left pixel filtered with its two neighbours
 * SRC(-1,0) and SRC(0,-1). */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fills 8 rows of 8 bytes with the 32 bit pattern v. Declares y and
 * advances src by stride for every row, so src is modified. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2192
/**
 * 8x8 luma DC prediction without neighbours: fills the block with the
 * value 128 (0x80 in every byte).
 * has_topleft and has_topright are not used.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int row;

    for( row = 0; row < 8; row++ ) {
        uint32_t * const out = (uint32_t*)src;

        out[0] = 0x80808080;
        out[1] = 0x80808080;
        src += stride;
    }
}
2197 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2198 {
2199     PREDICT_8x8_LOAD_LEFT;
2200     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2201     PREDICT_8x8_DC(dc);
2202 }
2203 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2204 {
2205     PREDICT_8x8_LOAD_TOP;
2206     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2207     PREDICT_8x8_DC(dc);
2208 }
2209 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2210 {
2211     PREDICT_8x8_LOAD_LEFT;
2212     PREDICT_8x8_LOAD_TOP;
2213     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2214                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2215     PREDICT_8x8_DC(dc);
2216 }
2217 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2218 {
2219     PREDICT_8x8_LOAD_LEFT;
2220 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2221                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2222     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2223 #undef ROW
2224 }
2225 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2226 {
2227     int y;
2228     PREDICT_8x8_LOAD_TOP;
2229     src[0] = t0;
2230     src[1] = t1;
2231     src[2] = t2;
2232     src[3] = t3;
2233     src[4] = t4;
2234     src[5] = t5;
2235     src[6] = t6;
2236     src[7] = t7;
2237     for( y = 1; y < 8; y++ )
2238         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2239 }
2240 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2241 {
2242     PREDICT_8x8_LOAD_TOP;
2243     PREDICT_8x8_LOAD_TOPRIGHT;
2244     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2245     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2246     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2247     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2248     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2249     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2250     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2251     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2252     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2253     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2254     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2255     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2256     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2257     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2258     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2259 }
2260 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2261 {
2262     PREDICT_8x8_LOAD_TOP;
2263     PREDICT_8x8_LOAD_LEFT;
2264     PREDICT_8x8_LOAD_TOPLEFT;
2265     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2266     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2267     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2268     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2269     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2270     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2271     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2272     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2273     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2274     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2275     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2276     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2277     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2278     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2279     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2280
2281 }
/**
 * Predictor installed in h->pred8x8l[VERT_RIGHT_PRED] (see init_pred_ptrs()).
 *
 * Fills all 64 positions of the block through SRC(x,y) from the neighbour
 * values l0..l6, lt and t0..t7 declared by the PREDICT_8x8_LOAD_* macros
 * (defined earlier in the file; presumably SRC(x,y) is column x, row y and
 * the macros consume has_topleft/has_topright/stride -- confirm there).
 *
 * Two kinds of values are written:
 *   - 2-tap rounded averages  (a + b + 1) >> 1
 *   - 3-tap rounded averages  (a + 2*b + c + 2) >> 2
 * Each statement assigns one value to a chain of positions; successive
 * positions in a chain differ by (+1,+2) in the SRC arguments.
 */
2282 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2283 {
2284     PREDICT_8x8_LOAD_TOP;
2285     PREDICT_8x8_LOAD_LEFT;
2286     PREDICT_8x8_LOAD_TOPLEFT;
    /* values built only from the l / lt / t0 neighbours (3-tap) */
2287     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2288     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2289     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2290     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2291     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2292     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2293     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    /* from here on 2-tap and 3-tap values built from lt / t alternate */
2294     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2295     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2296     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2297     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2298     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2299     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2300     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2301     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2302     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2303     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2304     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2305     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2306     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2307     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2308     SRC(7,0)= (t6 + t7 + 1) >> 1;
2309 }
/**
 * Predictor installed in h->pred8x8l[HOR_DOWN_PRED] (see init_pred_ptrs()).
 *
 * Fills all 64 positions of the block through SRC(x,y) from the neighbour
 * values l0..l7, lt and t0..t6 declared by the PREDICT_8x8_LOAD_* macros
 * (defined earlier in the file; presumably SRC(x,y) is column x, row y and
 * the macros consume has_topleft/has_topright/stride -- confirm there).
 *
 * Two kinds of values are written:
 *   - 2-tap rounded averages  (a + b + 1) >> 1
 *   - 3-tap rounded averages  (a + 2*b + c + 2) >> 2
 * Each statement assigns one value to a chain of positions; successive
 * positions in a chain differ by (+2,+1) in the SRC arguments.
 */
2310 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2311 {
2312     PREDICT_8x8_LOAD_TOP;
2313     PREDICT_8x8_LOAD_LEFT;
2314     PREDICT_8x8_LOAD_TOPLEFT;
    /* alternating 2-tap / 3-tap values built from the l / lt neighbours */
2315     SRC(0,7)= (l6 + l7 + 1) >> 1;
2316     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2317     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2318     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2319     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2320     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2321     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2322     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2323     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2324     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2325     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2326     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2327     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2328     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2329     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    /* the rest is 3-tap only and moves on to the t neighbours */
2330     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2331     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2332     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2333     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2334     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2335     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2336     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2337 }
/**
 * Predictor installed in h->pred8x8l[VERT_LEFT_PRED] (see init_pred_ptrs()).
 *
 * Fills all 64 positions of the block through SRC(x,y) from the values
 * t0..t12 declared by PREDICT_8x8_LOAD_TOP / PREDICT_8x8_LOAD_TOPRIGHT
 * (defined earlier in the file; presumably SRC(x,y) is column x, row y and
 * the macros consume has_topleft/has_topright/stride -- confirm there).
 *
 * The statements alternate between
 *   - 2-tap rounded averages  (a + b + 1) >> 1          and
 *   - 3-tap rounded averages  (a + 2*b + c + 2) >> 2,
 * each assigned to a chain of positions whose SRC arguments differ by
 * (+1,-2) from one position to the next.
 */
2338 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2339 {
2340     PREDICT_8x8_LOAD_TOP;
2341     PREDICT_8x8_LOAD_TOPRIGHT;
2342     SRC(0,0)= (t0 + t1 + 1) >> 1;
2343     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2344     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2345     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2346     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2347     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2348     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2349     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2350     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2351     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2352     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2353     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2354     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2355     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2356     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2357     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2358     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2359     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2360     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2361     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2362     SRC(7,6)= (t10 + t11 + 1) >> 1;
2363     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2364 }
/**
 * Predictor installed in h->pred8x8l[HOR_UP_PRED] (see init_pred_ptrs()).
 *
 * Fills all 64 positions of the block through SRC(x,y) using only the
 * values l0..l7 declared by PREDICT_8x8_LOAD_LEFT (defined earlier in the
 * file; presumably SRC(x,y) is column x, row y and the macro consumes
 * has_topleft/stride -- confirm there).
 *
 * The first 14 statements alternate between 2-tap rounded averages
 * (a + b + 1) >> 1 and 3-tap rounded averages (a + 2*b + c + 2) >> 2,
 * each assigned to a chain of positions whose SRC arguments differ by
 * (+2,-1).  The final statement sets the remaining 20 positions to l7.
 */
2365 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2366 {
2367     PREDICT_8x8_LOAD_LEFT;
2368     SRC(0,0)= (l0 + l1 + 1) >> 1;
2369     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2370     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2371     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2372     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2373     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2374     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2375     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2376     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2377     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2378     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2379     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2380     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    /* no neighbour after l7: it is weighted 3 instead of 2+1 */
2381     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything beyond that is a plain copy of l7 */
2382     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2383     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2384     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2385     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2386 }
2387 #undef PREDICT_8x8_LOAD_LEFT
2388 #undef PREDICT_8x8_LOAD_TOP
2389 #undef PREDICT_8x8_LOAD_TOPLEFT
2390 #undef PREDICT_8x8_LOAD_TOPRIGHT
2391 #undef PREDICT_8x8_DC
2392 #undef PTR
2393 #undef PT
2394 #undef PL
2395 #undef SRC
2396
2397 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2398                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2399                            int src_x_offset, int src_y_offset,
2400                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2401     MpegEncContext * const s = &h->s;
2402     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2403     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2404     const int luma_xy= (mx&3) + ((my&3)<<2);
2405     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2406     uint8_t * src_cb, * src_cr;
2407     int extra_width= h->emu_edge_width;
2408     int extra_height= h->emu_edge_height;
2409     int emu=0;
2410     const int full_mx= mx>>2;
2411     const int full_my= my>>2;
2412     const int pic_width  = 16*s->mb_width;
2413     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2414
2415     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
2416         return;
2417
2418     if(mx&7) extra_width -= 3;
2419     if(my&7) extra_height -= 3;
2420
2421     if(   full_mx < 0-extra_width
2422        || full_my < 0-extra_height
2423        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2424        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2425         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2426             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2427         emu=1;
2428     }
2429
2430     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2431     if(!square){
2432         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2433     }
2434
2435     if(s->flags&CODEC_FLAG_GRAY) return;
2436
2437     if(MB_MBAFF){
2438         // chroma offset when predicting from a field of opposite parity
2439         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2440         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2441     }
2442     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2443     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2444
2445     if(emu){
2446         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2447             src_cb= s->edge_emu_buffer;
2448     }
2449     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2450
2451     if(emu){
2452         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2453             src_cr= s->edge_emu_buffer;
2454     }
2455     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2456 }
2457
2458 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2459                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2460                            int x_offset, int y_offset,
2461                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2462                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2463                            int list0, int list1){
2464     MpegEncContext * const s = &h->s;
2465     qpel_mc_func *qpix_op=  qpix_put;
2466     h264_chroma_mc_func chroma_op= chroma_put;
2467
2468     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2469     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2470     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2471     x_offset += 8*s->mb_x;
2472     y_offset += 8*(s->mb_y >> MB_MBAFF);
2473
2474     if(list0){
2475         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2476         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2477                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2478                            qpix_op, chroma_op);
2479
2480         qpix_op=  qpix_avg;
2481         chroma_op= chroma_avg;
2482     }
2483
2484     if(list1){
2485         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2486         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2487                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2488                            qpix_op, chroma_op);
2489     }
2490 }
2491
2492 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2493                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2494                            int x_offset, int y_offset,
2495                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2496                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2497                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2498                            int list0, int list1){
2499     MpegEncContext * const s = &h->s;
2500
2501     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2502     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2503     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2504     x_offset += 8*s->mb_x;
2505     y_offset += 8*(s->mb_y >> MB_MBAFF);
2506
2507     if(list0 && list1){
2508         /* don't optimize for luma-only case, since B-frames usually
2509          * use implicit weights => chroma too. */
2510         uint8_t *tmp_cb = s->obmc_scratchpad;
2511         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2512         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2513         int refn0 = h->ref_cache[0][ scan8[n] ];
2514         int refn1 = h->ref_cache[1][ scan8[n] ];
2515
2516         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2517                     dest_y, dest_cb, dest_cr,
2518                     x_offset, y_offset, qpix_put, chroma_put);
2519         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2520                     tmp_y, tmp_cb, tmp_cr,
2521                     x_offset, y_offset, qpix_put, chroma_put);
2522
2523         if(h->use_weight == 2){
2524             int weight0 = h->implicit_weight[refn0][refn1];
2525             int weight1 = 64 - weight0;
2526             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2527             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2528             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2529         }else{
2530             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2531                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2532                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2533             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2534                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2535                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2536             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2537                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2538                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2539         }
2540     }else{
2541         int list = list1 ? 1 : 0;
2542         int refn = h->ref_cache[list][ scan8[n] ];
2543         Picture *ref= &h->ref_list[list][refn];
2544         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2545                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2546                     qpix_put, chroma_put);
2547
2548         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
2549                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2550         if(h->use_weight_chroma){
2551             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2552                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2553             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
2554                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2555         }
2556     }
2557 }
2558
2559 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2560                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2561                            int x_offset, int y_offset,
2562                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2563                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2564                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2565                            int list0, int list1){
2566     if((h->use_weight==2 && list0 && list1
2567         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2568        || h->use_weight==1)
2569         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2570                          x_offset, y_offset, qpix_put, chroma_put,
2571                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2572     else
2573         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2574                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2575 }
2576
2577 static inline void prefetch_motion(H264Context *h, int list){
2578     /* fetch pixels for estimated mv 4 macroblocks ahead
2579      * optimized for 64byte cache lines */
2580     MpegEncContext * const s = &h->s;
2581     const int refn = h->ref_cache[list][scan8[0]];
2582     if(refn >= 0){
2583         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2584         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2585         uint8_t **src= h->ref_list[list][refn].data;
2586         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
2587         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2588         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2589         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2590     }
2591 }
2592
/**
 * Inter prediction of the current macroblock.
 *
 * Looks up the macroblock type of (s->mb_x, s->mb_y) and calls mc_part()
 * once per partition.  The positional arguments of every mc_part() call are
 *   (h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
 *    x_offset, y_offset, ...)
 * where n is the index of the partition's first block, x_offset/y_offset
 * are in chroma samples (mc_part_std()/mc_part_weighted() double them for
 * luma) and delta is the luma offset of the second half of a non-square
 * partition.
 *
 * @param qpix_put   luma "put" function tables, one row per block size
 * @param chroma_put chroma "put" functions, one per block size
 * @param qpix_avg   luma "avg" function tables
 * @param chroma_avg chroma "avg" functions
 * @param weight_op  weighting functions; the slice starting at the index
 *                   matching the partition shape is handed to mc_part()
 * @param weight_avg bi-directional weighting functions, indexed likewise
 */
2593 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2594                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2595                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2596                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2597     MpegEncContext * const s = &h->s;
2598     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2599     const int mb_type= s->current_picture.mb_type[mb_xy];
2600
2601     assert(IS_INTER(mb_type));
2602
    /* list 0 is prefetched before, list 1 after the compensation */
2603     prefetch_motion(h, 0);
2604
2605     if(IS_16X16(mb_type)){
        /* one square partition covering the whole macroblock */
2606         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2607                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2608                 &weight_op[0], &weight_avg[0],
2609                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2610     }else if(IS_16X8(mb_type)){
        /* two partitions stacked vertically (second one at y_offset 4);
         * delta 8 is the horizontal step to the second luma call */
2611         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2612                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2613                 &weight_op[1], &weight_avg[1],
2614                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2615         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2616                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2617                 &weight_op[1], &weight_avg[1],
2618                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2619     }else if(IS_8X16(mb_type)){
        /* two partitions side by side (second one at x_offset 4);
         * delta is 8 luma lines down */
2620         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
2621                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2622                 &weight_op[2], &weight_avg[2],
2623                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2624         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
2625                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2626                 &weight_op[2], &weight_avg[2],
2627                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2628     }else{
2629         int i;
2630
2631         assert(IS_8X8(mb_type));
2632
        /* four sub-macroblocks, each with its own sub type */
2633         for(i=0; i<4; i++){
2634             const int sub_mb_type= h->sub_mb_type[i];
2635             const int n= 4*i;
            /* i&1 selects the right column, i&2 the bottom row; both offsets become 0 or 4 */
2636             int x_offset= (i&1)<<2;
2637             int y_offset= (i&2)<<1;
2638
2639             if(IS_SUB_8X8(sub_mb_type)){
2640                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2641                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2642                     &weight_op[3], &weight_avg[3],
2643                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2644             }else if(IS_SUB_8X4(sub_mb_type)){
                /* upper and lower half; both use the sub type's direction flags */
2645                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2646                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2647                     &weight_op[4], &weight_avg[4],
2648                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2649                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2650                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2651                     &weight_op[4], &weight_avg[4],
2652                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2653             }else if(IS_SUB_4X8(sub_mb_type)){
                /* left and right half */
2654                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2655                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2656                     &weight_op[5], &weight_avg[5],
2657                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2658                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2659                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2660                     &weight_op[5], &weight_avg[5],
2661                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2662             }else{
2663                 int j;
2664                 assert(IS_SUB_4X4(sub_mb_type));
                /* four square blocks; j&1 / j&2 add a further offset of 0 or 2 */
2665                 for(j=0; j<4; j++){
2666                     int sub_x_offset= x_offset + 2*(j&1);
2667                     int sub_y_offset= y_offset +   (j&2);
2668                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2669                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2670                         &weight_op[6], &weight_avg[6],
2671                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2672                 }
2673             }
2674         }
2675     }
2676
2677     prefetch_motion(h, 1);
2678 }
2679
2680 static void decode_init_vlc(void){
2681     static int done = 0;
2682
2683     if (!done) {
2684         int i;
2685         done = 1;
2686
2687         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2688                  &chroma_dc_coeff_token_len [0], 1, 1,
2689                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2690
2691         for(i=0; i<4; i++){
2692             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2693                      &coeff_token_len [i][0], 1, 1,
2694                      &coeff_token_bits[i][0], 1, 1, 1);
2695         }
2696
2697         for(i=0; i<3; i++){
2698             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2699                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2700                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2701         }
2702         for(i=0; i<15; i++){
2703             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2704                      &total_zeros_len [i][0], 1, 1,
2705                      &total_zeros_bits[i][0], 1, 1, 1);
2706         }
2707
2708         for(i=0; i<6; i++){
2709             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2710                      &run_len [i][0], 1, 1,
2711                      &run_bits[i][0], 1, 1, 1);
2712         }
2713         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2714                  &run_len [6][0], 1, 1,
2715                  &run_bits[6][0], 1, 1, 1);
2716     }
2717 }
2718
2719 /**
2720  * Sets the intra prediction function pointers.
2721  */
2722 static void init_pred_ptrs(H264Context *h){
2723 //    MpegEncContext * const s = &h->s;
2724
2725     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2726     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2727     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2728     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2729     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2730     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2731     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2732     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2733     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2734     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2735     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2736     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2737
2738     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2739     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2740     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2741     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2742     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2743     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2744     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2745     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2746     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2747     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2748     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2749     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2750
2751     h->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
2752     h->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
2753     h->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
2754     h->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
2755     h->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
2756     h->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
2757     h->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
2758
2759     h->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
2760     h->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
2761     h->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
2762     h->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
2763     h->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
2764     h->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
2765     h->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
2766 }
2767
2768 static void free_tables(H264Context *h){
2769     int i;
2770     av_freep(&h->intra4x4_pred_mode);
2771     av_freep(&h->chroma_pred_mode_table);
2772     av_freep(&h->cbp_table);
2773     av_freep(&h->mvd_table[0]);
2774     av_freep(&h->mvd_table[1]);
2775     av_freep(&h->direct_table);
2776     av_freep(&h->non_zero_count);
2777     av_freep(&h->slice_table_base);
2778     av_freep(&h->top_borders[1]);
2779     av_freep(&h->top_borders[0]);
2780     h->slice_table= NULL;
2781
2782     av_freep(&h->mb2b_xy);
2783     av_freep(&h->mb2b8_xy);
2784
2785     av_freep(&h->s.obmc_scratchpad);
2786
2787     for(i = 0; i < MAX_SPS_COUNT; i++)
2788         av_freep(h->sps_buffers + i);
2789
2790     for(i = 0; i < MAX_PPS_COUNT; i++)
2791         av_freep(h->pps_buffers + i);
2792 }
2793
2794 static void init_dequant8_coeff_table(H264Context *h){
2795     int i,q,x;
2796     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2797     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2798     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2799
2800     for(i=0; i<2; i++ ){
2801         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2802             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2803             break;
2804         }
2805
2806         for(q=0; q<52; q++){
2807             int shift = ff_div6[q];
2808             int idx = ff_rem6[q];
2809             for(x=0; x<64; x++)
2810                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2811                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2812                     h->pps.scaling_matrix8[i][x]) << shift;
2813         }
2814     }
2815 }
2816
2817 static void init_dequant4_coeff_table(H264Context *h){
2818     int i,j,q,x;
2819     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2820     for(i=0; i<6; i++ ){
2821         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2822         for(j=0; j<i; j++){
2823             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2824                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2825                 break;
2826             }
2827         }
2828         if(j<i)
2829             continue;
2830
2831         for(q=0; q<52; q++){
2832             int shift = ff_div6[q] + 2;
2833             int idx = ff_rem6[q];
2834             for(x=0; x<16; x++)
2835                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2836                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2837                     h->pps.scaling_matrix4[i][x]) << shift;
2838         }
2839     }
2840 }
2841
2842 static void init_dequant_tables(H264Context *h){
2843     int i,x;
2844     init_dequant4_coeff_table(h);
2845     if(h->pps.transform_8x8_mode)
2846         init_dequant8_coeff_table(h);
2847     if(h->sps.transform_bypass){
2848         for(i=0; i<6; i++)
2849             for(x=0; x<16; x++)
2850                 h->dequant4_coeff[i][0][x] = 1<<6;
2851         if(h->pps.transform_8x8_mode)
2852             for(i=0; i<2; i++)
2853                 for(x=0; x<64; x++)
2854                     h->dequant8_coeff[i][0][x] = 1<<6;
2855     }
2856 }
2857
2858
2859 /**
2860  * allocates tables.
2861  * needs width/height
2862  */
2863 static int alloc_tables(H264Context *h){
2864     MpegEncContext * const s = &h->s;
2865     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2866     int x,y;
2867
2868     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2869
2870     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2871     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2872     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2873     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2874     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2875
2876     if( h->pps.cabac ) {
2877         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2878         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2879         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2880         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2881     }
2882
2883     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2884     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2885
2886     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2887     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2888     for(y=0; y<s->mb_height; y++){
2889         for(x=0; x<s->mb_width; x++){
2890             const int mb_xy= x + y*s->mb_stride;
2891             const int b_xy = 4*x + 4*y*h->b_stride;
2892             const int b8_xy= 2*x + 2*y*h->b8_stride;
2893
2894             h->mb2b_xy [mb_xy]= b_xy;
2895             h->mb2b8_xy[mb_xy]= b8_xy;
2896         }
2897     }
2898
2899     s->obmc_scratchpad = NULL;
2900
2901     if(!h->dequant4_coeff[0])
2902         init_dequant_tables(h);
2903
2904     return 0;
2905 fail:
2906     free_tables(h);
2907     return -1;
2908 }
2909
2910 static void common_init(H264Context *h){
2911     MpegEncContext * const s = &h->s;
2912
2913     s->width = s->avctx->width;
2914     s->height = s->avctx->height;
2915     s->codec_id= s->avctx->codec->id;
2916
2917     init_pred_ptrs(h);
2918
2919     h->dequant_coeff_pps= -1;
2920     s->unrestricted_mv=1;
2921     s->decode=1; //FIXME
2922
2923     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2924     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2925 }
2926
2927 static int decode_init(AVCodecContext *avctx){
2928     H264Context *h= avctx->priv_data;
2929     MpegEncContext * const s = &h->s;
2930
2931     MPV_decode_defaults(s);
2932
2933     s->avctx = avctx;
2934     common_init(h);
2935
2936     s->out_format = FMT_H264;
2937     s->workaround_bugs= avctx->workaround_bugs;
2938
2939     // set defaults
2940 //    s->decode_mb= ff_h263_decode_mb;
2941     s->low_delay= 1;
2942     avctx->pix_fmt= PIX_FMT_YUV420P;
2943
2944     decode_init_vlc();
2945
2946     if(avctx->extradata_size > 0 && avctx->extradata &&
2947        *(char *)avctx->extradata == 1){
2948         h->is_avc = 1;
2949         h->got_avcC = 0;
2950     } else {
2951         h->is_avc = 0;
2952     }
2953
2954     return 0;
2955 }
2956
2957 static int frame_start(H264Context *h){
2958     MpegEncContext * const s = &h->s;
2959     int i;
2960
2961     if(MPV_frame_start(s, s->avctx) < 0)
2962         return -1;
2963     ff_er_frame_start(s);
2964
2965     assert(s->linesize && s->uvlinesize);
2966
2967     for(i=0; i<16; i++){
2968         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2969         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2970     }
2971     for(i=0; i<4; i++){
2972         h->block_offset[16+i]=
2973         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2974         h->block_offset[24+16+i]=
2975         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2976     }
2977
2978     /* can't be in alloc_tables because linesize isn't known there.
2979      * FIXME: redo bipred weight to not require extra buffer? */
2980     if(!s->obmc_scratchpad)
2981         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2982
2983     /* some macroblocks will be accessed before they're available */
2984     if(FRAME_MBAFF)
2985         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2986
2987 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2988     return 0;
2989 }
2990
2991 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2992     MpegEncContext * const s = &h->s;
2993     int i;
2994
2995     src_y  -=   linesize;
2996     src_cb -= uvlinesize;
2997     src_cr -= uvlinesize;
2998
2999     // There are two lines saved, the line above the the top macroblock of a pair,
3000     // and the line above the bottom macroblock
3001     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3002     for(i=1; i<17; i++){
3003         h->left_border[i]= src_y[15+i*  linesize];
3004     }
3005
3006     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3007     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3008
3009     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3010         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3011         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3012         for(i=1; i<9; i++){
3013             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3014             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3015         }
3016         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3017         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3018     }
3019 }
3020
3021 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
3022     MpegEncContext * const s = &h->s;
3023     int temp8, i;
3024     uint64_t temp64;
3025     int deblock_left = (s->mb_x > 0);
3026     int deblock_top  = (s->mb_y > 0);
3027
3028     src_y  -=   linesize + 1;
3029     src_cb -= uvlinesize + 1;
3030     src_cr -= uvlinesize + 1;
3031
3032 #define XCHG(a,b,t,xchg)\
3033 t= a;\
3034 if(xchg)\
3035     a= b;\
3036 b= t;
3037
3038     if(deblock_left){
3039         for(i = !deblock_top; i<17; i++){
3040             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3041         }
3042     }
3043
3044     if(deblock_top){
3045         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3046         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3047         if(s->mb_x+1 < s->mb_width){
3048             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3049         }
3050     }
3051
3052     if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3053         if(deblock_left){
3054             for(i = !deblock_top; i<9; i++){
3055                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3056                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3057             }
3058         }
3059         if(deblock_top){
3060             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3061             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3062         }
3063     }
3064 }
3065
3066 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3067     MpegEncContext * const s = &h->s;
3068     int i;
3069
3070     src_y  -= 2 *   linesize;
3071     src_cb -= 2 * uvlinesize;
3072     src_cr -= 2 * uvlinesize;
3073
3074     // There are two lines saved, the line above the the top macroblock of a pair,
3075     // and the line above the bottom macroblock
3076     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3077     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3078     for(i=2; i<34; i++){
3079         h->left_border[i]= src_y[15+i*  linesize];
3080     }
3081
3082     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3083     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3084     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3085     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3086
3087     if(!(s->flags&CODEC_FLAG_GRAY)){
3088         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3089         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3090         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3091         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3092         for(i=2; i<18; i++){
3093             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3094             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3095         }
3096         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3097         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3098         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3099         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3100     }
3101 }
3102
3103 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3104     MpegEncContext * const s = &h->s;
3105     int temp8, i;
3106     uint64_t temp64;
3107     int deblock_left = (s->mb_x > 0);
3108     int deblock_top  = (s->mb_y > 1);
3109
3110     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3111
3112     src_y  -= 2 *   linesize + 1;
3113     src_cb -= 2 * uvlinesize + 1;
3114     src_cr -= 2 * uvlinesize + 1;
3115
3116 #define XCHG(a,b,t,xchg)\
3117 t= a;\
3118 if(xchg)\
3119     a= b;\
3120 b= t;
3121
3122     if(deblock_left){
3123         for(i = (!deblock_top)<<1; i<34; i++){
3124             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3125         }
3126     }
3127
3128     if(deblock_top){
3129         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3130         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3131         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3132         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3133         if(s->mb_x+1 < s->mb_width){
3134             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3135             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3136         }
3137     }
3138
3139     if(!(s->flags&CODEC_FLAG_GRAY)){
3140         if(deblock_left){
3141             for(i = (!deblock_top) << 1; i<18; i++){
3142                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3143                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3144             }
3145         }
3146         if(deblock_top){
3147             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3148             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3149             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3150             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3151         }
3152     }
3153 }
3154
/**
 * Reconstructs the macroblock at (s->mb_x, s->mb_y) into s->current_picture:
 * intra prediction or motion compensation, addition of the residual and,
 * when h->deblocking_filter is set, border backup plus loop filtering.
 * @param h      decoder context
 * @param simple nonzero selects the fast path: the MB_FIELD, FRAME_MBAFF,
 *               IPCM, CODEC_FLAG_GRAY, s->encoding and non-H.264 codec checks
 *               are short-circuited by the "simple ||" / "!simple &&" tests
 */
3155 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
3156     MpegEncContext * const s = &h->s;
3157     const int mb_x= s->mb_x;
3158     const int mb_y= s->mb_y;
3159     const int mb_xy= mb_x + mb_y*s->mb_stride;
3160     const int mb_type= s->current_picture.mb_type[mb_xy];
3161     uint8_t  *dest_y, *dest_cb, *dest_cr;
3162     int linesize, uvlinesize /*dct_offset*/;
3163     int i;
3164     int *block_offset = &h->block_offset[0];
3165     const unsigned int bottom = mb_y & 1;
3166     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
3167     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3168     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3169
         /* top-left sample of this macroblock in each plane of the current picture */
3170     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3171     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3172     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3173
3174     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
3175     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
3176
         /* MB_FIELD: use twice the picture stride and the second set of block
          * offsets (block_offset[24..]) */
3177     if (!simple && MB_FIELD) {
3178         linesize   = h->mb_linesize   = s->linesize * 2;
3179         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3180         block_offset = &h->block_offset[24];
             /* odd mb_y: move the destination up by 15 luma / 7 chroma lines */
3181         if(mb_y&1){ //FIXME move out of this func?
3182             dest_y -= s->linesize*15;
3183             dest_cb-= s->uvlinesize*7;
3184             dest_cr-= s->uvlinesize*7;
3185         }
             /* rewrite the cached reference indices of every used list to
              * (16+ref)^(mb_y&1); note that '+' binds tighter than '^' */
3186         if(FRAME_MBAFF) {
3187             int list;
3188             for(list=0; list<h->list_count; list++){
3189                 if(!USES_LIST(mb_type, list))
3190                     continue;
3191                 if(IS_16X16(mb_type)){
3192                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3193                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3194                 }else{
3195                     for(i=0; i<16; i+=4){
3196                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3197                         int ref = h->ref_cache[list][scan8[i]];
3198                         if(ref >= 0)
3199                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3200                     }
3201                 }
3202             }
3203         }
3204     } else {
3205         linesize   = h->mb_linesize   = s->linesize;
3206         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3207 //        dct_offset = s->linesize * 16;
3208     }
3209
         /* pick the luma residual-add functions: plain pixel add for transform
          * bypass, otherwise the 8x8 or the 4x4 H.264 IDCT */
3210     if(transform_bypass){
3211         idct_dc_add =
3212         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3213     }else if(IS_8x8DCT(mb_type)){
3214         idct_dc_add = s->dsp.h264_idct8_dc_add;
3215         idct_add = s->dsp.h264_idct8_add;
3216     }else{
3217         idct_dc_add = s->dsp.h264_idct_dc_add;
3218         idct_add = s->dsp.h264_idct_add;
3219     }
3220
         /* MBAFF intra macroblock with deblocking enabled: exchange the borders
          * of the pair (xchg=1) before predicting. Skipped for a bottom
          * macroblock when the macroblock one row above is intra as well. */
3221     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3222        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3223         int mbt_y = mb_y&~1;
3224         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3225         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3226         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3227         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3228     }
3229
3230     if (!simple && IS_INTRA_PCM(mb_type)) {
3231         unsigned int x, y;
3232
3233         // The pixels are stored in h->mb array in the same order as levels,
3234         // copy them in output in the correct order.
             /* blocks 0..15 are luma, 16..19 go to Cb and 20..23 to Cr */
3235         for(i=0; i<16; i++) {
3236             for (y=0; y<4; y++) {
3237                 for (x=0; x<4; x++) {
3238                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3239                 }
3240             }
3241         }
3242         for(i=16; i<16+4; i++) {
3243             for (y=0; y<4; y++) {
3244                 for (x=0; x<4; x++) {
3245                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3246                 }
3247             }
3248         }
3249         for(i=20; i<20+4; i++) {
3250             for (y=0; y<4; y++) {
3251                 for (x=0; x<4; x++) {
3252                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3253                 }
3254             }
3255         }
3256     } else {
3257         if(IS_INTRA(mb_type)){
                 /* with deblocking on (and no MBAFF) the saved borders are
                  * exchanged with the picture samples for the duration of the
                  * intra prediction; the xchg=0 call at the end of this branch
                  * copies the buffer contents back into the picture */
3258             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3259                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
3260
                 /* chroma intra prediction, same mode for Cb and Cr */
3261             if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3262                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3263                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3264             }
3265
3266             if(IS_INTRA4x4(mb_type)){
3267                 if(simple || !s->encoding){
                         /* 8x8 transform: predict and add the residual for the
                          * four 8x8 blocks (i = 0, 4, 8, 12) */
3268                     if(IS_8x8DCT(mb_type)){
3269                         for(i=0; i<16; i+=4){
3270                             uint8_t * const ptr= dest_y + block_offset[i];
3271                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3272                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3273                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3274                                                    (h->topright_samples_available<<i)&0x4000, linesize);
3275                             if(nnz){
3276                                 if(nnz == 1 && h->mb[i*16])
3277                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3278                                 else
3279                                     idct_add(ptr, h->mb + i*16, linesize);
3280                             }
3281                         }
3282                     }else
                         /* 4x4 transform: each block is predicted and its
                          * residual added before the next block is predicted */
3283                     for(i=0; i<16; i++){
3284                         uint8_t * const ptr= dest_y + block_offset[i];
3285                         uint8_t *topright;
3286                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3287                         int nnz, tr;
3288
                             /* only these two modes get a top-right pointer; if
                              * the top-right samples are unavailable, the sample
                              * at ptr[3 - linesize] is replicated four times */
3289                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3290                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3291                             assert(mb_y || linesize <= block_offset[i]);
3292                             if(!topright_avail){
3293                                 tr= ptr[3 - linesize]*0x01010101;
3294                                 topright= (uint8_t*) &tr;
3295                             }else
3296                                 topright= ptr + 4 - linesize;
3297                         }else
3298                             topright= NULL;
3299
3300                         h->pred4x4[ dir ](ptr, topright, linesize);
3301                         nnz = h->non_zero_count_cache[ scan8[i] ];
3302                         if(nnz){
3303                             if(is_h264){
3304                                 if(nnz == 1 && h->mb[i*16])
3305                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3306                                 else
3307                                     idct_add(ptr, h->mb + i*16, linesize);
3308                             }else
3309                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3310                         }
3311                     }
3312                 }
3313             }else{
                     /* intra 16x16: predict the whole luma block, then process
                      * the luma DC coefficients (not done for transform bypass) */
3314                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3315                 if(is_h264){
3316                     if(!transform_bypass)
3317                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3318                 }else
3319                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3320             }
3321             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
3322                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
3323         }else if(is_h264){
                 /* inter macroblock: motion compensation */
3324             hl_motion(h, dest_y, dest_cb, dest_cr,
3325                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
3326                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
3327                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3328         }
3329
3330
             /* luma residual for everything except intra 4x4, which was
              * already handled block by block above */
3331         if(!IS_INTRA4x4(mb_type)){
3332             if(is_h264){
3333                 if(IS_INTRA16x16(mb_type)){
3334                     for(i=0; i<16; i++){
3335                         if(h->non_zero_count_cache[ scan8[i] ])
3336                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3337                         else if(h->mb[i*16])
3338                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3339                     }
3340                 }else{
3341                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3342                     for(i=0; i<16; i+=di){
3343                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3344                         if(nnz){
3345                             if(nnz==1 && h->mb[i*16])
3346                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3347                             else
3348                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3349                         }
3350                     }
3351                 }
3352             }else{
3353                 for(i=0; i<16; i++){
3354                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3355                         uint8_t * const ptr= dest_y + block_offset[i];
3356                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3357                     }
3358                 }
3359             }
3360         }
3361
             /* chroma residual: blocks 16..19 go to Cb and 20..23 to Cr;
              * dest[(i&4)>>2] selects the plane */
3362         if(simple || !(s->flags&CODEC_FLAG_GRAY)){
3363             uint8_t *dest[2] = {dest_cb, dest_cr};
3364             if(transform_bypass){
3365                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3366             }else{
3367                 idct_add = s->dsp.h264_idct_add;
3368                 idct_dc_add = s->dsp.h264_idct_dc_add;
3369                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3370                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3371             }
3372             if(is_h264){
3373                 for(i=16; i<16+8; i++){
3374                     if(h->non_zero_count_cache[ scan8[i] ])
3375                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3376                     else if(h->mb[i*16])
3377                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3378                 }
3379             }else{
3380                 for(i=16; i<16+8; i++){
3381                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3382                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3383                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3384                     }
3385                 }
3386             }
3387         }
3388     }
3389     if(h->deblocking_filter) {
3390         if (!simple && FRAME_MBAFF) {
3391             //FIXME try deblocking one mb at a time?
3392             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
                 /* NOTE: mb_y and mb_xy declared here shadow the outer ones and
                  * refer to the row above the current macroblock. The pair is
                  * only filtered once its bottom macroblock has been decoded
                  * (see the early return below). */
3393             const int mb_y = s->mb_y - 1;
3394             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3395             const int mb_xy= mb_x + mb_y*s->mb_stride;
3396             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3397             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3398             if (!bottom) return;
3399             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3400             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3401             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3402
3403             if(IS_INTRA(mb_type_top | mb_type_bottom))
3404                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3405
3406             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3407             // deblock a pair
3408             // top
                 /* s->mb_y is temporarily decremented for the top macroblock
                  * and restored before the bottom one is filtered */
3409             s->mb_y--;
3410             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3411             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3412             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3413             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3414             // bottom
3415             s->mb_y++;
3416             tprintf(h->s.avctx, "call mbaff filter_mb\n");
3417             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3418             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3419             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3420         } else {
                 /* non-MBAFF: save this macroblock's borders, then filter it */
3421             tprintf(h->s.avctx, "call filter_mb\n");
3422             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
3423             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3424             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3425         }
3426     }
3427 }
3428
3429 /**
3430  * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * Thin wrapper that calls hl_decode_mb_internal() with simple=1, so that the
 * always-inlined body is instantiated with a constant flag.
3431  */
3432 static void hl_decode_mb_simple(H264Context *h){
3433     hl_decode_mb_internal(h, 1);
3434 }
3435
3436 /**
3437  * Process a macroblock; this handles edge cases, such as interlacing.
 * Thin wrapper that calls hl_decode_mb_internal() with simple=0; it is marked
 * av_noinline, unlike hl_decode_mb_simple().
3438  */
3439 static void av_noinline hl_decode_mb_complex(H264Context *h){
3440     hl_decode_mb_internal(h, 0);
3441 }
3442
3443 static void hl_decode_mb(H264Context *h){
3444     MpegEncContext * const s = &h->s;
3445     const int mb_x= s->mb_x;
3446     const int mb_y= s->mb_y;
3447     const int mb_xy= mb_x + mb_y*s->mb_stride;
3448     const int mb_type= s->current_picture.mb_type[mb_xy];
3449     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (s->flags&CODEC_FLAG_GRAY) || s->encoding;
3450
3451     if(!s->decode)
3452         return;
3453
3454     if (is_complex)
3455         hl_decode_mb_complex(h);
3456     else hl_decode_mb_simple(h);
3457 }
3458
3459 /**
3460  * fills the default_ref_list.
3461  */
3462 static int fill_default_ref_list(H264Context *h){
3463     MpegEncContext * const s = &h->s;
3464     int i;
3465     int smallest_poc_greater_than_current = -1;
3466     Picture sorted_short_ref[32];
3467
3468     if(h->slice_type==B_TYPE){
3469         int out_i;
3470         int limit= INT_MIN;
3471
3472         /* sort frame according to poc in B slice */
3473         for(out_i=0; out_i<h->short_ref_count; out_i++){
3474             int best_i=INT_MIN;
3475             int best_poc=INT_MAX;
3476
3477             for(i=0; i<h->short_ref_count; i++){
3478                 const int poc= h->short_ref[i]->poc;
3479                 if(poc > limit && poc < best_poc){
3480                     best_poc= poc;
3481                     best_i= i;
3482                 }
3483             }
3484
3485             assert(best_i != INT_MIN);
3486
3487             limit= best_poc;
3488             sorted_short_ref[out_i]= *h->short_ref[best_i];
3489             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3490             if (-1 == smallest_poc_greater_than_current) {
3491                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3492                     smallest_poc_greater_than_current = out_i;
3493                 }
3494             }
3495         }
3496     }
3497
3498     if(s->picture_structure == PICT_FRAME){
3499         if(h->slice_type==B_TYPE){
3500             int list;
3501             tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3502
3503             // find the largest poc
3504             for(list=0; list<2; list++){
3505                 int index = 0;
3506                 int j= -99;
3507                 int step= list ? -1 : 1;
3508
3509                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3510                     while(j<0 || j>= h->short_ref_count){
3511                         if(j != -99 && step == (list ? -1 : 1))
3512                             return -1;
3513                         step = -step;
3514                         j= smallest_poc_greater_than_current + (step>>1);
3515                     }
3516                     if(sorted_short_ref[j].reference != 3) continue;
3517                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3518                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3519                 }
3520
3521                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3522                     if(h->long_ref[i] == NULL) continue;
3523                     if(h->long_ref[i]->reference != 3) continue;
3524
3525                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3526                     h->default_ref_list[ list ][index++].pic_id= i;;
3527                 }
3528
3529                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3530                     // swap the two first elements of L1 when
3531                     // L0 and L1 are identical
3532                     Picture temp= h->default_ref_list[1][0];
3533                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3534                     h->default_ref_list[1][1] = temp;
3535                 }
3536
3537                 if(index < h->ref_count[ list ])
3538                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3539             }
3540         }else{
3541             int index=0;
3542             for(i=0; i<h->short_ref_count; i++){
3543                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3544                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3545                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3546             }
3547             for(i = 0; i < 16; i++){
3548                 if(h->long_ref[i] == NULL) continue;
3549                 if(h->long_ref[i]->reference != 3) continue;
3550                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3551                 h->default_ref_list[0][index++].pic_id= i;;
3552             }
3553             if(index < h->ref_count[0])
3554                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3555         }
3556     }else{ //FIELD
3557         if(h->slice_type==B_TYPE){
3558         }else{
3559             //FIXME second field balh
3560         }
3561     }
3562 #ifdef TRACE
3563     for (i=0; i<h->ref_count[0]; i++) {
3564         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3565     }
3566     if(h->slice_type==B_TYPE){
3567         for (i=0; i<h->ref_count[1]; i++) {
3568             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3569         }
3570     }
3571 #endif
3572     return 0;
3573 }
3574
3575 static void print_short_term(H264Context *h);
3576 static void print_long_term(H264Context *h);
3577
3578 static int decode_ref_pic_list_reordering(H264Context *h){
3579     MpegEncContext * const s = &h->s;
3580     int list, index;
3581
3582     print_short_term(h);
3583     print_long_term(h);
3584     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3585
3586     for(list=0; list<h->list_count; list++){
3587         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3588
3589         if(get_bits1(&s->gb)){
3590             int pred= h->curr_pic_num;
3591
3592             for(index=0; ; index++){
3593                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3594                 unsigned int pic_id;
3595                 int i;
3596                 Picture *ref = NULL;
3597
3598                 if(reordering_of_pic_nums_idc==3)
3599                     break;
3600
3601                 if(index >= h->ref_count[list]){
3602                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3603                     return -1;
3604                 }
3605
3606                 if(reordering_of_pic_nums_idc<3){
3607                     if(reordering_of_pic_nums_idc<2){
3608                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3609
3610                         if(abs_diff_pic_num >= h->max_pic_num){
3611                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3612                             return -1;
3613                         }
3614
3615                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3616                         else                                pred+= abs_diff_pic_num;
3617                         pred &= h->max_pic_num - 1;
3618
3619                         for(i= h->short_ref_count-1; i>=0; i--){
3620                             ref = h->short_ref[i];
3621                             assert(ref->reference == 3);
3622                             assert(!ref->long_ref);
3623                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3624                                 break;
3625                         }
3626                         if(i>=0)
3627                             ref->pic_id= ref->frame_num;
3628                     }else{
3629                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3630                         if(pic_id>31){
3631                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3632                             return -1;
3633                         }
3634                         ref = h->long_ref[pic_id];
3635                         if(ref){
3636                             ref->pic_id= pic_id;
3637                             assert(ref->reference == 3);
3638                             assert(ref->long_ref);
3639                             i=0;
3640                         }else{
3641                             i=-1;
3642                         }
3643                     }
3644
3645                     if (i < 0) {
3646                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3647                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3648                     } else {
3649                         for(i=index; i+1<h->ref_count[list]; i++){
3650                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3651                                 break;
3652                         }
3653                         for(; i > index; i--){
3654                             h->ref_list[list][i]= h->ref_list[list][i-1];
3655                         }
3656                         h->ref_list[list][index]= *ref;
3657                     }
3658                 }else{
3659                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3660                     return -1;
3661                 }
3662             }
3663         }
3664     }
3665     for(list=0; list<h->list_count; list++){
3666         for(index= 0; index < h->ref_count[list]; index++){
3667             if(!h->ref_list[list][index].data[0])
3668                 h->ref_list[list][index]= s->current_picture;
3669         }
3670     }
3671
3672     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3673         direct_dist_scale_factor(h);
3674     direct_ref_list_init(h);
3675     return 0;
3676 }
3677
3678 static void fill_mbaff_ref_list(H264Context *h){
3679     int list, i, j;
3680     for(list=0; list<2; list++){ //FIXME try list_count
3681         for(i=0; i<h->ref_count[list]; i++){
3682             Picture *frame = &h->ref_list[list][i];
3683             Picture *field = &h->ref_list[list][16+2*i];
3684             field[0] = *frame;
3685             for(j=0; j<3; j++)
3686                 field[0].linesize[j] <<= 1;
3687             field[1] = field[0];
3688             for(j=0; j<3; j++)
3689                 field[1].data[j] += frame->linesize[j];
3690
3691             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3692             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3693             for(j=0; j<2; j++){
3694                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3695                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3696             }
3697         }
3698     }
3699     for(j=0; j<h->ref_count[1]; j++){
3700         for(i=0; i<h->ref_count[0]; i++)
3701             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3702         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3703         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3704     }
3705 }
3706
3707 static int pred_weight_table(H264Context *h){
3708     MpegEncContext * const s = &h->s;
3709     int list, i;
3710     int luma_def, chroma_def;
3711
3712     h->use_weight= 0;
3713     h->use_weight_chroma= 0;
3714     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3715     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3716     luma_def = 1<<h->luma_log2_weight_denom;
3717     chroma_def = 1<<h->chroma_log2_weight_denom;
3718
3719     for(list=0; list<2; list++){
3720         for(i=0; i<h->ref_count[list]; i++){
3721             int luma_weight_flag, chroma_weight_flag;
3722
3723             luma_weight_flag= get_bits1(&s->gb);
3724             if(luma_weight_flag){
3725                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3726                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3727                 if(   h->luma_weight[list][i] != luma_def
3728                    || h->luma_offset[list][i] != 0)
3729                     h->use_weight= 1;
3730             }else{
3731                 h->luma_weight[list][i]= luma_def;
3732                 h->luma_offset[list][i]= 0;
3733             }
3734
3735             chroma_weight_flag= get_bits1(&s->gb);
3736             if(chroma_weight_flag){
3737                 int j;
3738                 for(j=0; j<2; j++){
3739                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3740                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3741                     if(   h->chroma_weight[list][i][j] != chroma_def
3742                        || h->chroma_offset[list][i][j] != 0)
3743                         h->use_weight_chroma= 1;
3744                 }
3745             }else{
3746                 int j;
3747                 for(j=0; j<2; j++){
3748                     h->chroma_weight[list][i][j]= chroma_def;
3749                     h->chroma_offset[list][i][j]= 0;
3750                 }
3751             }
3752         }
3753         if(h->slice_type != B_TYPE) break;
3754     }
3755     h->use_weight= h->use_weight || h->use_weight_chroma;
3756     return 0;
3757 }
3758
3759 static void implicit_weight_table(H264Context *h){
3760     MpegEncContext * const s = &h->s;
3761     int ref0, ref1;
3762     int cur_poc = s->current_picture_ptr->poc;
3763
3764     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3765        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3766         h->use_weight= 0;
3767         h->use_weight_chroma= 0;
3768         return;
3769     }
3770
3771     h->use_weight= 2;
3772     h->use_weight_chroma= 2;
3773     h->luma_log2_weight_denom= 5;
3774     h->chroma_log2_weight_denom= 5;
3775
3776     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3777         int poc0 = h->ref_list[0][ref0].poc;
3778         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3779             int poc1 = h->ref_list[1][ref1].poc;
3780             int td = av_clip(poc1 - poc0, -128, 127);
3781             if(td){
3782                 int tb = av_clip(cur_poc - poc0, -128, 127);
3783                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3784                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3785                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3786                     h->implicit_weight[ref0][ref1] = 32;
3787                 else
3788                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3789             }else
3790                 h->implicit_weight[ref0][ref1] = 32;
3791         }
3792     }
3793 }
3794
3795 static inline void unreference_pic(H264Context *h, Picture *pic){
3796     int i;
3797     pic->reference=0;
3798     if(pic == h->delayed_output_pic)
3799         pic->reference=1;
3800     else{
3801         for(i = 0; h->delayed_pic[i]; i++)
3802             if(pic == h->delayed_pic[i]){
3803                 pic->reference=1;
3804                 break;
3805             }
3806     }
3807 }
3808
3809 /**
3810  * instantaneous decoder refresh.
3811  */
3812 static void idr(H264Context *h){
3813     int i;
3814
3815     for(i=0; i<16; i++){
3816         if (h->long_ref[i] != NULL) {
3817             unreference_pic(h, h->long_ref[i]);
3818             h->long_ref[i]= NULL;
3819         }
3820     }
3821     h->long_ref_count=0;
3822
3823     for(i=0; i<h->short_ref_count; i++){
3824         unreference_pic(h, h->short_ref[i]);
3825         h->short_ref[i]= NULL;
3826     }
3827     h->short_ref_count=0;
3828 }
3829
3830 /* forget old pics after a seek */
3831 static void flush_dpb(AVCodecContext *avctx){
3832     H264Context *h= avctx->priv_data;
3833     int i;
3834     for(i=0; i<16; i++) {
3835         if(h->delayed_pic[i])
3836             h->delayed_pic[i]->reference= 0;
3837         h->delayed_pic[i]= NULL;
3838     }
3839     if(h->delayed_output_pic)
3840         h->delayed_output_pic->reference= 0;
3841     h->delayed_output_pic= NULL;
3842     idr(h);
3843     if(h->s.current_picture_ptr)
3844         h->s.current_picture_ptr->reference= 0;
3845 }
3846
3847 /**
3848  *
3849  * @return the removed picture or NULL if an error occurs
3850  */
3851 static Picture * remove_short(H264Context *h, int frame_num){
3852     MpegEncContext * const s = &h->s;
3853     int i;
3854
3855     if(s->avctx->debug&FF_DEBUG_MMCO)
3856         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3857
3858     for(i=0; i<h->short_ref_count; i++){
3859         Picture *pic= h->short_ref[i];
3860         if(s->avctx->debug&FF_DEBUG_MMCO)
3861             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3862         if(pic->frame_num == frame_num){
3863             h->short_ref[i]= NULL;
3864             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3865             h->short_ref_count--;
3866             return pic;
3867         }
3868     }
3869     return NULL;
3870 }
3871
3872 /**
3873  *
3874  * @return the removed picture or NULL if an error occurs
3875  */
3876 static Picture * remove_long(H264Context *h, int i){
3877     Picture *pic;
3878
3879     pic= h->long_ref[i];
3880     h->long_ref[i]= NULL;
3881     if(pic) h->long_ref_count--;
3882
3883     return pic;
3884 }
3885
3886 /**
3887  * print short term list
3888  */
3889 static void print_short_term(H264Context *h) {
3890     uint32_t i;
3891     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3892         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3893         for(i=0; i<h->short_ref_count; i++){
3894             Picture *pic= h->short_ref[i];
3895             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3896         }
3897     }
3898 }
3899
3900 /**
3901  * print long term list
3902  */
3903 static void print_long_term(H264Context *h) {
3904     uint32_t i;
3905     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3906         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3907         for(i = 0; i < 16; i++){
3908             Picture *pic= h->long_ref[i];
3909             if (pic) {
3910                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3911             }
3912         }
3913     }
3914 }
3915
3916 /**
3917  * Executes the reference picture marking (memory management control operations).
3918  */
3919 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3920     MpegEncContext * const s = &h->s;
3921     int i, j;
3922     int current_is_long=0;
3923     Picture *pic;
3924
3925     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3926         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3927
3928     for(i=0; i<mmco_count; i++){
3929         if(s->avctx->debug&FF_DEBUG_MMCO)
3930             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3931
3932         switch(mmco[i].opcode){
3933         case MMCO_SHORT2UNUSED:
3934             pic= remove_short(h, mmco[i].short_frame_num);
3935             if(pic)
3936                 unreference_pic(h, pic);
3937             else if(s->avctx->debug&FF_DEBUG_MMCO)
3938                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
3939             break;
3940         case MMCO_SHORT2LONG:
3941             pic= remove_long(h, mmco[i].long_index);
3942             if(pic) unreference_pic(h, pic);
3943
3944             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3945             if (h->long_ref[ mmco[i].long_index ]){
3946                 h->long_ref[ mmco[i].long_index ]->long_ref=1;
3947                 h->long_ref_count++;
3948             }
3949             break;
3950         case MMCO_LONG2UNUSED:
3951             pic= remove_long(h, mmco[i].long_index);
3952             if(pic)
3953                 unreference_pic(h, pic);
3954             else if(s->avctx->debug&FF_DEBUG_MMCO)
3955                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
3956             break;
3957         case MMCO_LONG:
3958             pic= remove_long(h, mmco[i].long_index);
3959             if(pic) unreference_pic(h, pic);
3960
3961             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3962             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3963             h->long_ref_count++;
3964
3965             current_is_long=1;
3966             break;
3967         case MMCO_SET_MAX_LONG:
3968             assert(mmco[i].long_index <= 16);
3969             // just remove the long term which index is greater than new max
3970             for(j = mmco[i].long_index; j<16; j++){
3971                 pic = remove_long(h, j);
3972                 if (pic) unreference_pic(h, pic);
3973             }
3974             break;
3975         case MMCO_RESET:
3976             while(h->short_ref_count){
3977                 pic= remove_short(h, h->short_ref[0]->frame_num);
3978                 if(pic) unreference_pic(h, pic);
3979             }
3980             for(j = 0; j < 16; j++) {
3981                 pic= remove_long(h, j);
3982                 if(pic) unreference_pic(h, pic);
3983             }
3984             break;
3985         default: assert(0);
3986         }
3987     }
3988
3989     if(!current_is_long){
3990         pic= remove_short(h, s->current_picture_ptr->frame_num);
3991         if(pic){
3992             unreference_pic(h, pic);
3993             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3994         }
3995
3996         if(h->short_ref_count)
3997             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3998
3999         h->short_ref[0]= s->current_picture_ptr;
4000         h->short_ref[0]->long_ref=0;
4001         h->short_ref_count++;
4002     }
4003
4004     print_short_term(h);
4005     print_long_term(h);
4006     return 0;
4007 }
4008
4009 static int decode_ref_pic_marking(H264Context *h){
4010     MpegEncContext * const s = &h->s;
4011     int i;
4012
4013     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4014         s->broken_link= get_bits1(&s->gb) -1;
4015         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4016         if(h->mmco[0].long_index == -1)
4017             h->mmco_index= 0;
4018         else{
4019             h->mmco[0].opcode= MMCO_LONG;
4020             h->mmco_index= 1;
4021         }
4022     }else{
4023         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4024             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4025                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4026
4027                 h->mmco[i].opcode= opcode;
4028                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4029                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4030 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4031                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4032                         return -1;
4033                     }*/
4034                 }
4035                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4036                     unsigned int long_index= get_ue_golomb(&s->gb);
4037                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ long_index >= 16){
4038                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4039                         return -1;
4040                     }
4041                     h->mmco[i].long_index= long_index;
4042                 }
4043
4044                 if(opcode > (unsigned)MMCO_LONG){
4045                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4046                     return -1;
4047                 }
4048                 if(opcode == MMCO_END)
4049                     break;
4050             }
4051             h->mmco_index= i;
4052         }else{
4053             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4054
4055             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4056                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4057                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4058                 h->mmco_index= 1;
4059             }else
4060                 h->mmco_index= 0;
4061         }
4062     }
4063
4064     return 0;
4065 }
4066
4067 static int init_poc(H264Context *h){
4068     MpegEncContext * const s = &h->s;
4069     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4070     int field_poc[2];
4071
4072     if(h->nal_unit_type == NAL_IDR_SLICE){
4073         h->frame_num_offset= 0;
4074     }else{
4075         if(h->frame_num < h->prev_frame_num)
4076             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4077         else
4078             h->frame_num_offset= h->prev_frame_num_offset;
4079     }
4080
4081     if(h->sps.poc_type==0){
4082         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4083
4084         if(h->nal_unit_type == NAL_IDR_SLICE){
4085              h->prev_poc_msb=
4086              h->prev_poc_lsb= 0;
4087         }
4088
4089         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4090             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4091         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4092             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4093         else
4094             h->poc_msb = h->prev_poc_msb;
4095 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4096         field_poc[0] =
4097         field_poc[1] = h->poc_msb + h->poc_lsb;
4098         if(s->picture_structure == PICT_FRAME)
4099             field_poc[1] += h->delta_poc_bottom;
4100     }else if(h->sps.poc_type==1){
4101         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4102         int i;
4103
4104         if(h->sps.poc_cycle_length != 0)
4105             abs_frame_num = h->frame_num_offset + h->frame_num;
4106         else
4107             abs_frame_num = 0;
4108
4109         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4110             abs_frame_num--;
4111
4112         expected_delta_per_poc_cycle = 0;
4113         for(i=0; i < h->sps.poc_cycle_length; i++)
4114             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4115
4116         if(abs_frame_num > 0){
4117             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4118             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4119
4120             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4121             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4122                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4123         } else
4124             expectedpoc = 0;
4125
4126         if(h->nal_ref_idc == 0)
4127             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4128
4129         field_poc[0] = expectedpoc + h->delta_poc[0];
4130         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4131
4132         if(s->picture_structure == PICT_FRAME)
4133             field_poc[1] += h->delta_poc[1];
4134     }else{
4135         int poc;
4136         if(h->nal_unit_type == NAL_IDR_SLICE){
4137             poc= 0;
4138         }else{
4139             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4140             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4141         }
4142         field_poc[0]= poc;
4143         field_poc[1]= poc;
4144     }
4145
4146     if(s->picture_structure != PICT_BOTTOM_FIELD)
4147         s->current_picture_ptr->field_poc[0]= field_poc[0];
4148     if(s->picture_structure != PICT_TOP_FIELD)
4149         s->current_picture_ptr->field_poc[1]= field_poc[1];
4150     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4151         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4152
4153     return 0;
4154 }
4155
4156
4157 /**
4158  * initialize scan tables
4159  */
4160 static void init_scan_tables(H264Context *h){
4161     MpegEncContext * const s = &h->s;
4162     int i;
4163     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4164         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4165         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4166     }else{
4167         for(i=0; i<16; i++){
4168 #define T(x) (x>>2) | ((x<<2) & 0xF)
4169             h->zigzag_scan[i] = T(zigzag_scan[i]);
4170             h-> field_scan[i] = T( field_scan[i]);
4171 #undef T
4172         }
4173     }
4174     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4175         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4176         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4177         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4178         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4179     }else{
4180         for(i=0; i<64; i++){
4181 #define T(x) (x>>3) | ((x&7)<<3)
4182             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4183             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4184             h->field_scan8x8[i]        = T(field_scan8x8[i]);
4185             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4186 #undef T
4187         }
4188     }
4189     if(h->sps.transform_bypass){ //FIXME same ugly
4190         h->zigzag_scan_q0          = zigzag_scan;
4191         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4192         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4193         h->field_scan_q0           = field_scan;
4194         h->field_scan8x8_q0        = field_scan8x8;
4195         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4196     }else{
4197         h->zigzag_scan_q0          = h->zigzag_scan;
4198         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4199         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4200         h->field_scan_q0           = h->field_scan;
4201         h->field_scan8x8_q0        = h->field_scan8x8;
4202         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4203     }
4204 }
/**
 * Decodes a slice header.
 * This will also call MPV_common_init() and frame_start() as needed.
 */
4209 static int decode_slice_header(H264Context *h){
4210     MpegEncContext * const s = &h->s;
4211     unsigned int first_mb_in_slice;
4212     unsigned int pps_id;
4213     int num_ref_idx_active_override_flag;
4214     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4215     unsigned int slice_type, tmp;
4216     int default_ref_list_done = 0;
4217
4218     s->current_picture.reference= h->nal_ref_idc != 0;
4219     s->dropable= h->nal_ref_idc == 0;
4220
4221     first_mb_in_slice= get_ue_golomb(&s->gb);
4222
4223     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
4224         h->slice_num = 0;
4225         s->current_picture_ptr= NULL;
4226     }
4227
4228     slice_type= get_ue_golomb(&s->gb);
4229     if(slice_type > 9){
4230         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4231         return -1;
4232     }
4233     if(slice_type > 4){
4234         slice_type -= 5;
4235         h->slice_type_fixed=1;
4236     }else
4237         h->slice_type_fixed=0;
4238
4239     slice_type= slice_type_map[ slice_type ];
4240     if (slice_type == I_TYPE
4241         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4242         default_ref_list_done = 1;
4243     }
4244     h->slice_type= slice_type;
4245
4246     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4247
4248     pps_id= get_ue_golomb(&s->gb);
4249     if(pps_id>=MAX_PPS_COUNT){
4250         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4251         return -1;
4252     }
4253     if(!h->pps_buffers[pps_id]) {
4254         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4255         return -1;
4256     }
4257     h->pps= *h->pps_buffers[pps_id];
4258
4259     if(!h->sps_buffers[h->pps.sps_id]) {
4260         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4261         return -1;
4262     }
4263     h->sps = *h->sps_buffers[h->pps.sps_id];
4264
4265     if(h->dequant_coeff_pps != pps_id){
4266         h->dequant_coeff_pps = pps_id;
4267         init_dequant_tables(h);
4268     }
4269
4270     s->mb_width= h->sps.mb_width;
4271     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4272
4273     h->b_stride=  s->mb_width*4;
4274     h->b8_stride= s->mb_width*2;
4275
4276     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4277     if(h->sps.frame_mbs_only_flag)
4278         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4279     else
4280         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4281
4282     if (s->context_initialized
4283         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4284         free_tables(h);
4285         MPV_common_end(s);
4286     }
4287     if (!s->context_initialized) {
4288         if (MPV_common_init(s) < 0)
4289             return -1;
4290
4291         init_scan_tables(h);
4292         alloc_tables(h);
4293
4294         s->avctx->width = s->width;
4295         s->avctx->height = s->height;
4296         s->avctx->sample_aspect_ratio= h->sps.sar;
4297         if(!s->avctx->sample_aspect_ratio.den)
4298             s->avctx->sample_aspect_ratio.den = 1;
4299
4300         if(h->sps.timing_info_present_flag){
4301             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4302             if(h->x264_build > 0 && h->x264_build < 44)
4303                 s->avctx->time_base.den *= 2;
4304             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4305                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4306         }
4307     }
4308
4309     if(h->slice_num == 0){
4310         if(frame_start(h) < 0)
4311             return -1;
4312     }
4313
4314     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4315     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4316
4317     h->mb_mbaff = 0;
4318     h->mb_aff_frame = 0;
4319     if(h->sps.frame_mbs_only_flag){
4320         s->picture_structure= PICT_FRAME;
4321     }else{
4322         if(get_bits1(&s->gb)) { //field_pic_flag
4323             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4324             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4325         } else {
4326             s->picture_structure= PICT_FRAME;
4327             h->mb_aff_frame = h->sps.mb_aff;
4328         }
4329     }
4330     assert(s->mb_num == s->mb_width * s->mb_height);
4331     if(first_mb_in_slice << h->mb_aff_frame >= s->mb_num ||
4332        first_mb_in_slice                    >= s->mb_num){
4333         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4334         return -1;
4335     }
4336     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4337     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4338     assert(s->mb_y < s->mb_height);
4339
4340     if(s->picture_structure==PICT_FRAME){
4341         h->curr_pic_num=   h->frame_num;
4342         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4343     }else{
4344         h->curr_pic_num= 2*h->frame_num;
4345         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4346     }
4347
4348     if(h->nal_unit_type == NAL_IDR_SLICE){
4349         get_ue_golomb(&s->gb); /* idr_pic_id */
4350     }
4351
4352     if(h->sps.poc_type==0){
4353         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4354
4355         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4356             h->delta_poc_bottom= get_se_golomb(&s->gb);
4357         }
4358     }
4359
4360     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4361         h->delta_poc[0]= get_se_golomb(&s->gb);
4362
4363         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4364             h->delta_poc[1]= get_se_golomb(&s->gb);
4365     }
4366
4367     init_poc(h);
4368
4369     if(h->pps.redundant_pic_cnt_present){
4370         h->redundant_pic_count= get_ue_golomb(&s->gb);
4371     }
4372
4373     //set defaults, might be overriden a few line later
4374     h->ref_count[0]= h->pps.ref_count[0];
4375     h->ref_count[1]= h->pps.ref_count[1];
4376
4377     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4378         if(h->slice_type == B_TYPE){
4379             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4380             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4381                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4382         }
4383         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4384
4385         if(num_ref_idx_active_override_flag){
4386             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4387             if(h->slice_type==B_TYPE)
4388                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4389
4390             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4391                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4392                 h->ref_count[0]= h->ref_count[1]= 1;
4393                 return -1;
4394             }
4395         }
4396         if(h->slice_type == B_TYPE)
4397             h->list_count= 2;
4398         else
4399             h->list_count= 1;
4400     }else
4401         h->list_count= 0;
4402
4403     if(!default_ref_list_done){
4404         fill_default_ref_list(h);
4405     }
4406
4407     if(decode_ref_pic_list_reordering(h) < 0)
4408         return -1;
4409
4410     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4411        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4412         pred_weight_table(h);
4413     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4414         implicit_weight_table(h);
4415     else
4416         h->use_weight = 0;
4417
4418     if(s->current_picture.reference)
4419         decode_ref_pic_marking(h);
4420
4421     if(FRAME_MBAFF)
4422         fill_mbaff_ref_list(h);
4423
4424     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4425         tmp = get_ue_golomb(&s->gb);
4426         if(tmp > 2){
4427             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4428             return -1;
4429         }
4430         h->cabac_init_idc= tmp;
4431     }
4432
4433     h->last_qscale_diff = 0;
4434     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4435     if(tmp>51){
4436         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4437         return -1;
4438     }
4439     s->qscale= tmp;
4440     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4441     //FIXME qscale / qp ... stuff
4442     if(h->slice_type == SP_TYPE){
4443         get_bits1(&s->gb); /* sp_for_switch_flag */
4444     }
4445     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4446         get_se_golomb(&s->gb); /* slice_qs_delta */
4447     }
4448
4449     h->deblocking_filter = 1;
4450     h->slice_alpha_c0_offset = 0;
4451     h->slice_beta_offset = 0;
4452     if( h->pps.deblocking_filter_parameters_present ) {
4453         tmp= get_ue_golomb(&s->gb);
4454         if(tmp > 2){
4455             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4456             return -1;
4457         }
4458         h->deblocking_filter= tmp;
4459         if(h->deblocking_filter < 2)
4460             h->deblocking_filter^= 1; // 1<->0
4461
4462         if( h->deblocking_filter ) {
4463             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4464             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4465         }
4466     }
4467     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4468        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4469        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4470        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4471         h->deblocking_filter= 0;
4472
4473 #if 0 //FMO
4474     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4475         slice_group_change_cycle= get_bits(&s->gb, ?);
4476 #endif
4477
4478     h->slice_num++;
4479
4480     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4481     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4482
4483     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4484         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4485                h->slice_num,
4486                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4487                first_mb_in_slice,
4488                av_get_pict_type_char(h->slice_type),
4489                pps_id, h->frame_num,
4490                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4491                h->ref_count[0], h->ref_count[1],
4492                s->qscale,
4493                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4494                h->use_weight,
4495                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4496                );
4497     }
4498
4499     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !s->current_picture.reference){
4500         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
4501         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
4502     }else{
4503         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
4504         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
4505     }
4506
4507     return 0;
4508 }
4509
4510 /**
4511  *
4512  */
4513 static inline int get_level_prefix(GetBitContext *gb){
4514     unsigned int buf;
4515     int log;
4516
4517     OPEN_READER(re, gb);
4518     UPDATE_CACHE(re, gb);
4519     buf=GET_CACHE(re, gb);
4520
4521     log= 32 - av_log2(buf);
4522 #ifdef TRACE
4523     print_bin(buf>>(32-log), log);
4524     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4525 #endif
4526
4527     LAST_SKIP_BITS(re, gb, log);
4528     CLOSE_READER(re, gb);
4529
4530     return log-1;
4531 }
4532
4533 static inline int get_dct8x8_allowed(H264Context *h){
4534     int i;
4535     for(i=0; i<4; i++){
4536         if(!IS_SUB_8X8(h->sub_mb_type[i])
4537            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4538             return 0;
4539     }
4540     return 1;
4541 }
4542
4543 /**
4544  * decodes a residual block.
4545  * @param n block index
4546  * @param scantable scantable
4547  * @param max_coeff number of coefficients in the block
4548  * @return <0 if an error occured
4549  */
4550 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4551     MpegEncContext * const s = &h->s;
4552     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4553     int level[16];
4554     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4555
4556     //FIXME put trailing_onex into the context
4557
4558     if(n == CHROMA_DC_BLOCK_INDEX){
4559         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4560         total_coeff= coeff_token>>2;
4561     }else{
4562         if(n == LUMA_DC_BLOCK_INDEX){
4563             total_coeff= pred_non_zero_count(h, 0);
4564             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4565             total_coeff= coeff_token>>2;
4566         }else{
4567             total_coeff= pred_non_zero_count(h, n);
4568             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4569             total_coeff= coeff_token>>2;
4570             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4571         }
4572     }
4573
4574     //FIXME set last_non_zero?
4575
4576     if(total_coeff==0)
4577         return 0;
4578     if(total_coeff > (unsigned)max_coeff) {
4579         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4580         return -1;
4581     }
4582
4583     trailing_ones= coeff_token&3;
4584     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4585     assert(total_coeff<=16);
4586
4587     for(i=0; i<trailing_ones; i++){
4588         level[i]= 1 - 2*get_bits1(gb);
4589     }
4590
4591     if(i<total_coeff) {
4592         int level_code, mask;
4593         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4594         int prefix= get_level_prefix(gb);
4595
4596         //first coefficient has suffix_length equal to 0 or 1
4597         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4598             if(suffix_length)
4599                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4600             else
4601                 level_code= (prefix<<suffix_length); //part
4602         }else if(prefix==14){
4603             if(suffix_length)
4604                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4605             else
4606                 level_code= prefix + get_bits(gb, 4); //part
4607         }else if(prefix==15){
4608             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4609             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4610         }else{
4611             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4612             return -1;
4613         }
4614
4615         if(trailing_ones < 3) level_code += 2;
4616
4617         suffix_length = 1;
4618         if(level_code > 5)
4619             suffix_length++;
4620         mask= -(level_code&1);
4621         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4622         i++;
4623
4624         //remaining coefficients have suffix_length > 0
4625         for(;i<total_coeff;i++) {
4626             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4627             prefix = get_level_prefix(gb);
4628             if(prefix<15){
4629                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4630             }else if(prefix==15){
4631                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4632             }else{
4633                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4634                 return -1;
4635             }
4636             mask= -(level_code&1);
4637             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4638             if(level_code > suffix_limit[suffix_length])
4639                 suffix_length++;
4640         }
4641     }
4642
4643     if(total_coeff == max_coeff)
4644         zeros_left=0;
4645     else{
4646         if(n == CHROMA_DC_BLOCK_INDEX)
4647             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4648         else
4649             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4650     }
4651
4652     coeff_num = zeros_left + total_coeff - 1;
4653     j = scantable[coeff_num];
4654     if(n > 24){
4655         block[j] = level[0];
4656         for(i=1;i<total_coeff;i++) {
4657             if(zeros_left <= 0)
4658                 run_before = 0;
4659             else if(zeros_left < 7){
4660                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4661             }else{
4662                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4663             }
4664             zeros_left -= run_before;
4665             coeff_num -= 1 + run_before;
4666             j= scantable[ coeff_num ];
4667
4668             block[j]= level[i];
4669         }
4670     }else{
4671         block[j] = (level[0] * qmul[j] + 32)>>6;
4672         for(i=1;i<total_coeff;i++) {
4673             if(zeros_left <= 0)
4674                 run_before = 0;
4675             else if(zeros_left < 7){
4676                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4677             }else{
4678                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4679             }
4680             zeros_left -= run_before;
4681             coeff_num -= 1 + run_before;
4682             j= scantable[ coeff_num ];
4683
4684             block[j]= (level[i] * qmul[j] + 32)>>6;
4685         }
4686     }
4687
4688     if(zeros_left<0){
4689         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4690         return -1;
4691     }
4692
4693     return 0;
4694 }
4695
4696 static void predict_field_decoding_flag(H264Context *h){
4697     MpegEncContext * const s = &h->s;
4698     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4699     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4700                 ? s->current_picture.mb_type[mb_xy-1]
4701                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4702                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4703                 : 0;
4704     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4705 }
4706
4707 /**
4708  * decodes a P_SKIP or B_SKIP macroblock
4709  */
4710 static void decode_mb_skip(H264Context *h){
4711     MpegEncContext * const s = &h->s;
4712     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4713     int mb_type=0;
4714
4715     memset(h->non_zero_count[mb_xy], 0, 16);
4716     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4717
4718     if(MB_FIELD)
4719         mb_type|= MB_TYPE_INTERLACED;
4720
4721     if( h->slice_type == B_TYPE )
4722     {
4723         // just for fill_caches. pred_direct_motion will set the real mb_type
4724         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4725
4726         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4727         pred_direct_motion(h, &mb_type);
4728         mb_type|= MB_TYPE_SKIP;
4729     }
4730     else
4731     {
4732         int mx, my;
4733         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4734
4735         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4736         pred_pskip_motion(h, &mx, &my);
4737         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4738         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4739     }
4740
4741     write_back_motion(h, mb_type);
4742     s->current_picture.mb_type[mb_xy]= mb_type;
4743     s->current_picture.qscale_table[mb_xy]= s->qscale;
4744     h->slice_table[ mb_xy ]= h->slice_num;
4745     h->prev_mb_skipped= 1;
4746 }
4747
4748 /**
4749  * decodes a macroblock
4750  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4751  */
4752 static int decode_mb_cavlc(H264Context *h){
4753     MpegEncContext * const s = &h->s;
4754     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4755     int partition_count;
4756     unsigned int mb_type, cbp;
4757     int dct8x8_allowed= h->pps.transform_8x8_mode;
4758
4759     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4760
4761     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4762     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4763                 down the code */
4764     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4765         if(s->mb_skip_run==-1)
4766             s->mb_skip_run= get_ue_golomb(&s->gb);
4767
4768         if (s->mb_skip_run--) {
4769             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4770                 if(s->mb_skip_run==0)
4771                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4772                 else
4773                     predict_field_decoding_flag(h);
4774             }
4775             decode_mb_skip(h);
4776             return 0;
4777         }
4778     }
4779     if(FRAME_MBAFF){
4780         if( (s->mb_y&1) == 0 )
4781             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4782     }else
4783         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4784
4785     h->prev_mb_skipped= 0;
4786
4787     mb_type= get_ue_golomb(&s->gb);
4788     if(h->slice_type == B_TYPE){
4789         if(mb_type < 23){
4790             partition_count= b_mb_type_info[mb_type].partition_count;
4791             mb_type=         b_mb_type_info[mb_type].type;
4792         }else{
4793             mb_type -= 23;
4794             goto decode_intra_mb;
4795         }
4796     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4797         if(mb_type < 5){
4798             partition_count= p_mb_type_info[mb_type].partition_count;
4799             mb_type=         p_mb_type_info[mb_type].type;
4800         }else{
4801             mb_type -= 5;
4802             goto decode_intra_mb;
4803         }
4804     }else{
4805        assert(h->slice_type == I_TYPE);
4806 decode_intra_mb:
4807         if(mb_type > 25){
4808             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4809             return -1;
4810         }
4811         partition_count=0;
4812         cbp= i_mb_type_info[mb_type].cbp;
4813         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4814         mb_type= i_mb_type_info[mb_type].type;
4815     }
4816
4817     if(MB_FIELD)
4818         mb_type |= MB_TYPE_INTERLACED;
4819
4820     h->slice_table[ mb_xy ]= h->slice_num;
4821
4822     if(IS_INTRA_PCM(mb_type)){
4823         unsigned int x, y;
4824
4825         // We assume these blocks are very rare so we do not optimize it.
4826         align_get_bits(&s->gb);
4827
4828         // The pixels are stored in the same order as levels in h->mb array.
4829         for(y=0; y<16; y++){
4830             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4831             for(x=0; x<16; x++){
4832                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4833                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4834             }
4835         }
4836         for(y=0; y<8; y++){
4837             const int index= 256 + 4*(y&3) + 32*(y>>2);
4838             for(x=0; x<8; x++){
4839                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4840                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4841             }
4842         }
4843         for(y=0; y<8; y++){
4844             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4845             for(x=0; x<8; x++){
4846                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4847                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4848             }
4849         }
4850
4851         // In deblocking, the quantizer is 0
4852         s->current_picture.qscale_table[mb_xy]= 0;
4853         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4854         // All coeffs are present
4855         memset(h->non_zero_count[mb_xy], 16, 16);
4856
4857         s->current_picture.mb_type[mb_xy]= mb_type;
4858         return 0;
4859     }
4860
4861     if(MB_MBAFF){
4862         h->ref_count[0] <<= 1;
4863         h->ref_count[1] <<= 1;
4864     }
4865
4866     fill_caches(h, mb_type, 0);
4867
4868     //mb_pred
4869     if(IS_INTRA(mb_type)){
4870             int pred_mode;
4871 //            init_top_left_availability(h);
4872             if(IS_INTRA4x4(mb_type)){
4873                 int i;
4874                 int di = 1;
4875                 if(dct8x8_allowed && get_bits1(&s->gb)){
4876                     mb_type |= MB_TYPE_8x8DCT;
4877                     di = 4;
4878                 }
4879
4880 //                fill_intra4x4_pred_table(h);
4881                 for(i=0; i<16; i+=di){
4882                     int mode= pred_intra_mode(h, i);
4883
4884                     if(!get_bits1(&s->gb)){
4885                         const int rem_mode= get_bits(&s->gb, 3);
4886                         mode = rem_mode + (rem_mode >= mode);
4887                     }
4888
4889                     if(di==4)
4890                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4891                     else
4892                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4893                 }
4894                 write_back_intra_pred_mode(h);
4895                 if( check_intra4x4_pred_mode(h) < 0)
4896                     return -1;
4897             }else{
4898                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4899                 if(h->intra16x16_pred_mode < 0)
4900                     return -1;
4901             }
4902
4903             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4904             if(pred_mode < 0)
4905                 return -1;
4906             h->chroma_pred_mode= pred_mode;
4907     }else if(partition_count==4){
4908         int i, j, sub_partition_count[4], list, ref[2][4];
4909
4910         if(h->slice_type == B_TYPE){
4911             for(i=0; i<4; i++){
4912                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4913                 if(h->sub_mb_type[i] >=13){
4914                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4915                     return -1;
4916                 }
4917                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4918                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4919             }
4920             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4921                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4922                 pred_direct_motion(h, &mb_type);
4923                 h->ref_cache[0][scan8[4]] =
4924                 h->ref_cache[1][scan8[4]] =
4925                 h->ref_cache[0][scan8[12]] =
4926                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4927             }
4928         }else{
4929             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4930             for(i=0; i<4; i++){
4931                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4932                 if(h->sub_mb_type[i] >=4){
4933                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4934                     return -1;
4935                 }
4936                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4937                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4938             }
4939         }
4940
4941         for(list=0; list<h->list_count; list++){
4942             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4943             for(i=0; i<4; i++){
4944                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4945                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4946                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4947                     if(tmp>=ref_count){
4948                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4949                         return -1;
4950                     }
4951                     ref[list][i]= tmp;
4952                 }else{
4953                  //FIXME
4954                     ref[list][i] = -1;
4955                 }
4956             }
4957         }
4958
4959         if(dct8x8_allowed)
4960             dct8x8_allowed = get_dct8x8_allowed(h);
4961
4962         for(list=0; list<h->list_count; list++){
4963             for(i=0; i<4; i++){
4964                 if(IS_DIRECT(h->sub_mb_type[i])) {
4965                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4966                     continue;
4967                 }
4968                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4969                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4970
4971                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4972                     const int sub_mb_type= h->sub_mb_type[i];
4973                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4974                     for(j=0; j<sub_partition_count[i]; j++){
4975                         int mx, my;
4976                         const int index= 4*i + block_width*j;
4977                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4978                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4979                         mx += get_se_golomb(&s->gb);
4980                         my += get_se_golomb(&s->gb);
4981                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4982
4983                         if(IS_SUB_8X8(sub_mb_type)){
4984                             mv_cache[ 1 ][0]=
4985                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4986                             mv_cache[ 1 ][1]=
4987                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4988                         }else if(IS_SUB_8X4(sub_mb_type)){
4989                             mv_cache[ 1 ][0]= mx;
4990                             mv_cache[ 1 ][1]= my;
4991                         }else if(IS_SUB_4X8(sub_mb_type)){
4992                             mv_cache[ 8 ][0]= mx;
4993                             mv_cache[ 8 ][1]= my;
4994                         }
4995                         mv_cache[ 0 ][0]= mx;
4996                         mv_cache[ 0 ][1]= my;
4997                     }
4998                 }else{
4999                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5000                     p[0] = p[1]=
5001                     p[8] = p[9]= 0;
5002                 }
5003             }
5004         }
5005     }else if(IS_DIRECT(mb_type)){
5006         pred_direct_motion(h, &mb_type);
5007         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5008     }else{
5009         int list, mx, my, i;
5010          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5011         if(IS_16X16(mb_type)){
5012             for(list=0; list<h->list_count; list++){
5013                     unsigned int val;
5014                     if(IS_DIR(mb_type, 0, list)){
5015                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
5016                         if(val >= h->ref_count[list]){
5017                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5018                             return -1;
5019                         }
5020                     }else
5021                         val= LIST_NOT_USED&0xFF;
5022                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5023             }
5024             for(list=0; list<h->list_count; list++){
5025                 unsigned int val;
5026                 if(IS_DIR(mb_type, 0, list)){
5027                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5028                     mx += get_se_golomb(&s->gb);
5029                     my += get_se_golomb(&s->gb);
5030                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5031
5032                     val= pack16to32(mx,my);
5033                 }else
5034                     val=0;
5035                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
5036             }
5037         }
5038         else if(IS_16X8(mb_type)){
5039             for(list=0; list<h->list_count; list++){
5040                     for(i=0; i<2; i++){
5041                         unsigned int val;
5042                         if(IS_DIR(mb_type, i, list)){
5043                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5044                             if(val >= h->ref_count[list]){
5045                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5046                                 return -1;
5047                             }
5048                         }else
5049                             val= LIST_NOT_USED&0xFF;
5050                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5051                     }
5052             }
5053             for(list=0; list<h->list_count; list++){
5054                 for(i=0; i<2; i++){
5055                     unsigned int val;
5056                     if(IS_DIR(mb_type, i, list)){
5057                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5058                         mx += get_se_golomb(&s->gb);
5059                         my += get_se_golomb(&s->gb);
5060                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5061
5062                         val= pack16to32(mx,my);
5063                     }else
5064                         val=0;
5065                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
5066                 }
5067             }
5068         }else{
5069             assert(IS_8X16(mb_type));
5070             for(list=0; list<h->list_count; list++){
5071                     for(i=0; i<2; i++){
5072                         unsigned int val;
5073                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5074                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
5075                             if(val >= h->ref_count[list]){
5076                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
5077                                 return -1;
5078                             }
5079                         }else
5080                             val= LIST_NOT_USED&0xFF;
5081                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5082                     }
5083             }
5084             for(list=0; list<h->list_count; list++){
5085                 for(i=0; i<2; i++){
5086                     unsigned int val;
5087                     if(IS_DIR(mb_type, i, list)){
5088                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5089                         mx += get_se_golomb(&s->gb);
5090                         my += get_se_golomb(&s->gb);
5091                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5092
5093                         val= pack16to32(mx,my);
5094                     }else
5095                         val=0;
5096                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
5097                 }
5098             }
5099         }
5100     }
5101
5102     if(IS_INTER(mb_type))
5103         write_back_motion(h, mb_type);
5104
5105     if(!IS_INTRA16x16(mb_type)){
5106         cbp= get_ue_golomb(&s->gb);
5107         if(cbp > 47){
5108             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
5109             return -1;
5110         }
5111
5112         if(IS_INTRA4x4(mb_type))
5113             cbp= golomb_to_intra4x4_cbp[cbp];
5114         else
5115             cbp= golomb_to_inter_cbp[cbp];
5116     }
5117     h->cbp = cbp;
5118
5119     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5120         if(get_bits1(&s->gb))
5121             mb_type |= MB_TYPE_8x8DCT;
5122     }
5123     s->current_picture.mb_type[mb_xy]= mb_type;
5124
5125     if(cbp || IS_INTRA16x16(mb_type)){
5126         int i8x8, i4x4, chroma_idx;
5127         int chroma_qp, dquant;
5128         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5129         const uint8_t *scan, *scan8x8, *dc_scan;
5130
5131 //        fill_non_zero_count_cache(h);
5132
5133         if(IS_INTERLACED(mb_type)){
5134             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5135             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5136             dc_scan= luma_dc_field_scan;
5137         }else{
5138             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5139             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5140             dc_scan= luma_dc_zigzag_scan;
5141         }
5142
5143         dquant= get_se_golomb(&s->gb);
5144
5145         if( dquant > 25 || dquant < -26 ){
5146             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5147             return -1;
5148         }
5149
5150         s->qscale += dquant;
5151         if(((unsigned)s->qscale) > 51){
5152             if(s->qscale<0) s->qscale+= 52;
5153             else            s->qscale-= 52;
5154         }
5155
5156         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5157         if(IS_INTRA16x16(mb_type)){
5158             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5159                 return -1; //FIXME continue if partitioned and other return -1 too
5160             }
5161
5162             assert((cbp&15) == 0 || (cbp&15) == 15);
5163
5164             if(cbp&15){
5165                 for(i8x8=0; i8x8<4; i8x8++){
5166                     for(i4x4=0; i4x4<4; i4x4++){
5167                         const int index= i4x4 + 4*i8x8;
5168                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5169                             return -1;
5170                         }
5171                     }
5172                 }
5173             }else{
5174                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5175             }
5176         }else{
5177             for(i8x8=0; i8x8<4; i8x8++){
5178                 if(cbp & (1<<i8x8)){
5179                     if(IS_8x8DCT(mb_type)){
5180                         DCTELEM *buf = &h->mb[64*i8x8];
5181                         uint8_t *nnz;
5182                         for(i4x4=0; i4x4<4; i4x4++){
5183                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5184                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5185                                 return -1;
5186                         }
5187                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5188                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5189                     }else{
5190                         for(i4x4=0; i4x4<4; i4x4++){
5191                             const int index= i4x4 + 4*i8x8;
5192
5193                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5194                                 return -1;
5195                             }
5196                         }
5197                     }
5198                 }else{
5199                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5200                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5201                 }
5202             }
5203         }
5204
5205         if(cbp&0x30){
5206             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5207                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5208                     return -1;
5209                 }
5210         }
5211
5212         if(cbp&0x20){
5213             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5214                 for(i4x4=0; i4x4<4; i4x4++){
5215                     const int index= 16 + 4*chroma_idx + i4x4;
5216                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5217                         return -1;
5218                     }
5219                 }
5220             }
5221         }else{
5222             uint8_t * const nnz= &h->non_zero_count_cache[0];
5223             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5224             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5225         }
5226     }else{
5227         uint8_t * const nnz= &h->non_zero_count_cache[0];
5228         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5229         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5230         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5231     }
5232     s->current_picture.qscale_table[mb_xy]= s->qscale;
5233     write_back_non_zero_count(h);
5234
5235     if(MB_MBAFF){
5236         h->ref_count[0] >>= 1;
5237         h->ref_count[1] >>= 1;
5238     }
5239
5240     return 0;
5241 }
5242
5243 static int decode_cabac_field_decoding_flag(H264Context *h) {
5244     MpegEncContext * const s = &h->s;
5245     const int mb_x = s->mb_x;
5246     const int mb_y = s->mb_y & ~1;
5247     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5248     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5249
5250     unsigned int ctx = 0;
5251
5252     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5253         ctx += 1;
5254     }
5255     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5256         ctx += 1;
5257     }
5258
5259     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5260 }
5261
5262 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5263     uint8_t *state= &h->cabac_state[ctx_base];
5264     int mb_type;
5265
5266     if(intra_slice){
5267         MpegEncContext * const s = &h->s;
5268         const int mba_xy = h->left_mb_xy[0];
5269         const int mbb_xy = h->top_mb_xy;
5270         int ctx=0;
5271         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5272             ctx++;
5273         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5274             ctx++;
5275         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5276             return 0;   /* I4x4 */
5277         state += 2;
5278     }else{
5279         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5280             return 0;   /* I4x4 */
5281     }
5282
5283     if( get_cabac_terminate( &h->cabac ) )
5284         return 25;  /* PCM */
5285
5286     mb_type = 1; /* I16x16 */
5287     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5288     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5289         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5290     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5291     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5292     return mb_type;
5293 }
5294
5295 static int decode_cabac_mb_type( H264Context *h ) {
5296     MpegEncContext * const s = &h->s;
5297
5298     if( h->slice_type == I_TYPE ) {
5299         return decode_cabac_intra_mb_type(h, 3, 1);
5300     } else if( h->slice_type == P_TYPE ) {
5301         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5302             /* P-type */
5303             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5304                 /* P_L0_D16x16, P_8x8 */
5305                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5306             } else {
5307                 /* P_L0_D8x16, P_L0_D16x8 */
5308                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5309             }
5310         } else {
5311             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5312         }
5313     } else if( h->slice_type == B_TYPE ) {
5314         const int mba_xy = h->left_mb_xy[0];
5315         const int mbb_xy = h->top_mb_xy;
5316         int ctx = 0;
5317         int bits;
5318
5319         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5320             ctx++;
5321         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5322             ctx++;
5323
5324         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5325             return 0; /* B_Direct_16x16 */
5326
5327         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5328             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5329         }
5330
5331         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5332         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5333         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5334         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5335         if( bits < 8 )
5336             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5337         else if( bits == 13 ) {
5338             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5339         } else if( bits == 14 )
5340             return 11; /* B_L1_L0_8x16 */
5341         else if( bits == 15 )
5342             return 22; /* B_8x8 */
5343
5344         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5345         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5346     } else {
5347         /* TODO SI/SP frames? */
5348         return -1;
5349     }
5350 }
5351
5352 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5353     MpegEncContext * const s = &h->s;
5354     int mba_xy, mbb_xy;
5355     int ctx = 0;
5356
5357     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5358         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5359         mba_xy = mb_xy - 1;
5360         if( (mb_y&1)
5361             && h->slice_table[mba_xy] == h->slice_num
5362             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5363             mba_xy += s->mb_stride;
5364         if( MB_FIELD ){
5365             mbb_xy = mb_xy - s->mb_stride;
5366             if( !(mb_y&1)
5367                 && h->slice_table[mbb_xy] == h->slice_num
5368                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5369                 mbb_xy -= s->mb_stride;
5370         }else
5371             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5372     }else{
5373         int mb_xy = mb_x + mb_y*s->mb_stride;
5374         mba_xy = mb_xy - 1;
5375         mbb_xy = mb_xy - s->mb_stride;
5376     }
5377
5378     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5379         ctx++;
5380     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5381         ctx++;
5382
5383     if( h->slice_type == B_TYPE )
5384         ctx += 13;
5385     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5386 }
5387
5388 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5389     int mode = 0;
5390
5391     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5392         return pred_mode;
5393
5394     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5395     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5396     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5397
5398     if( mode >= pred_mode )
5399         return mode + 1;
5400     else
5401         return mode;
5402 }
5403
5404 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5405     const int mba_xy = h->left_mb_xy[0];
5406     const int mbb_xy = h->top_mb_xy;
5407
5408     int ctx = 0;
5409
5410     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5411     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5412         ctx++;
5413
5414     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5415         ctx++;
5416
5417     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5418         return 0;
5419
5420     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5421         return 1;
5422     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5423         return 2;
5424     else
5425         return 3;
5426 }
5427
/* column (0..3) of each 4x4 block, indexed by block number */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};
/* row (0..3) of each 4x4 block, indexed by block number */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};
/* inverse of the two tables above: block number indexed by [column][row] */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
5440
5441 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5442     int cbp = 0;
5443     int cbp_b = -1;
5444     int i8x8;
5445
5446     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5447         cbp_b = h->top_cbp;
5448         tprintf(h->s.avctx, "cbp_b = top_cbp = %x\n", cbp_b);
5449     }
5450
5451     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5452         int cbp_a = -1;
5453         int x, y;
5454         int ctx = 0;
5455
5456         x = block_idx_x[4*i8x8];
5457         y = block_idx_y[4*i8x8];
5458
5459         if( x > 0 )
5460             cbp_a = cbp;
5461         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5462             cbp_a = h->left_cbp;
5463             tprintf(h->s.avctx, "cbp_a = left_cbp = %x\n", cbp_a);
5464         }
5465
5466         if( y > 0 )
5467             cbp_b = cbp;
5468
5469         /* No need to test for skip as we put 0 for skip block */
5470         /* No need to test for IPCM as we put 1 for IPCM block */
5471         if( cbp_a >= 0 ) {
5472             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5473             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5474                 ctx++;
5475         }
5476
5477         if( cbp_b >= 0 ) {
5478             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5479             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5480                 ctx += 2;
5481         }
5482
5483         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5484             cbp |= 1 << i8x8;
5485         }
5486     }
5487     return cbp;
5488 }
5489 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5490     int ctx;
5491     int cbp_a, cbp_b;
5492
5493     cbp_a = (h->left_cbp>>4)&0x03;
5494     cbp_b = (h-> top_cbp>>4)&0x03;
5495
5496     ctx = 0;
5497     if( cbp_a > 0 ) ctx++;
5498     if( cbp_b > 0 ) ctx += 2;
5499     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5500         return 0;
5501
5502     ctx = 4;
5503     if( cbp_a == 2 ) ctx++;
5504     if( cbp_b == 2 ) ctx += 2;
5505     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5506 }
5507 static int decode_cabac_mb_dqp( H264Context *h) {
5508     MpegEncContext * const s = &h->s;
5509     int mbn_xy;
5510     int   ctx = 0;
5511     int   val = 0;
5512
5513     if( s->mb_x > 0 )
5514         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5515     else
5516         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5517
5518     if( h->last_qscale_diff != 0 )
5519         ctx++;
5520
5521     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5522         if( ctx < 2 )
5523             ctx = 2;
5524         else
5525             ctx = 3;
5526         val++;
5527         if(val > 102) //prevent infinite loop
5528             return INT_MIN;
5529     }
5530
5531     if( val&0x01 )
5532         return (val + 1)/2;
5533     else
5534         return -(val + 1)/2;
5535 }
5536 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5537     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5538         return 0;   /* 8x8 */
5539     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5540         return 1;   /* 8x4 */
5541     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5542         return 2;   /* 4x8 */
5543     return 3;       /* 4x4 */
5544 }
5545 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5546     int type;
5547     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5548         return 0;   /* B_Direct_8x8 */
5549     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5550         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5551     type = 3;
5552     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5553         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5554             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5555         type += 4;
5556     }
5557     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5558     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5559     return type;
5560 }
5561
5562 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5563     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5564 }
5565
5566 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5567     int refa = h->ref_cache[list][scan8[n] - 1];
5568     int refb = h->ref_cache[list][scan8[n] - 8];
5569     int ref  = 0;
5570     int ctx  = 0;
5571
5572     if( h->slice_type == B_TYPE) {
5573         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5574             ctx++;
5575         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5576             ctx += 2;
5577     } else {
5578         if( refa > 0 )
5579             ctx++;
5580         if( refb > 0 )
5581             ctx += 2;
5582     }
5583
5584     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5585         ref++;
5586         if( ctx < 4 )
5587             ctx = 4;
5588         else
5589             ctx = 5;
5590         if(ref >= 32 /*h->ref_list[list]*/){
5591             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5592             return 0; //FIXME we should return -1 and check the return everywhere
5593         }
5594     }
5595     return ref;
5596 }
5597
5598 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5599     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5600                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5601     int ctxbase = (l == 0) ? 40 : 47;
5602     int ctx, mvd;
5603
5604     if( amvd < 3 )
5605         ctx = 0;
5606     else if( amvd > 32 )
5607         ctx = 2;
5608     else
5609         ctx = 1;
5610
5611     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5612         return 0;
5613
5614     mvd= 1;
5615     ctx= 3;
5616     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5617         mvd++;
5618         if( ctx < 6 )
5619             ctx++;
5620     }
5621
5622     if( mvd >= 9 ) {
5623         int k = 3;
5624         while( get_cabac_bypass( &h->cabac ) ) {
5625             mvd += 1 << k;
5626             k++;
5627             if(k>24){
5628                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5629                 return INT_MIN;
5630             }
5631         }
5632         while( k-- ) {
5633             if( get_cabac_bypass( &h->cabac ) )
5634                 mvd += 1 << k;
5635         }
5636     }
5637     return get_cabac_bypass_sign( &h->cabac, -mvd );
5638 }
5639
5640 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5641     int nza, nzb;
5642     int ctx = 0;
5643
5644     if( cat == 0 ) {
5645         nza = h->left_cbp&0x100;
5646         nzb = h-> top_cbp&0x100;
5647     } else if( cat == 1 || cat == 2 ) {
5648         nza = h->non_zero_count_cache[scan8[idx] - 1];
5649         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5650     } else if( cat == 3 ) {
5651         nza = (h->left_cbp>>(6+idx))&0x01;
5652         nzb = (h-> top_cbp>>(6+idx))&0x01;
5653     } else {
5654         assert(cat == 4);
5655         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5656         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5657     }
5658
5659     if( nza > 0 )
5660         ctx++;
5661
5662     if( nzb > 0 )
5663         ctx += 2;
5664
5665     return ctx + 4 * cat;
5666 }
5667
5668 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5669     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5670     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5671     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5672     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5673 };
5674
5675 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5676     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5677     static const int significant_coeff_flag_offset[2][6] = {
5678       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5679       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5680     };
5681     static const int last_coeff_flag_offset[2][6] = {
5682       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5683       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5684     };
5685     static const int coeff_abs_level_m1_offset[6] = {
5686         227+0, 227+10, 227+20, 227+30, 227+39, 426
5687     };
5688     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5689       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5690         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5691         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5692        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5693       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5694         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5695         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5696         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5697     };
5698
5699     int index[64];
5700
5701     int last;
5702     int coeff_count = 0;
5703
5704     int abslevel1 = 1;
5705     int abslevelgt1 = 0;
5706
5707     uint8_t *significant_coeff_ctx_base;
5708     uint8_t *last_coeff_ctx_base;
5709     uint8_t *abs_level_m1_ctx_base;
5710
5711 #ifndef ARCH_X86
5712 #define CABAC_ON_STACK
5713 #endif
5714 #ifdef CABAC_ON_STACK
5715 #define CC &cc
5716     CABACContext cc;
5717     cc.range     = h->cabac.range;
5718     cc.low       = h->cabac.low;
5719     cc.bytestream= h->cabac.bytestream;
5720 #else
5721 #define CC &h->cabac
5722 #endif
5723
5724
5725     /* cat: 0-> DC 16x16  n = 0
5726      *      1-> AC 16x16  n = luma4x4idx
5727      *      2-> Luma4x4   n = luma4x4idx
5728      *      3-> DC Chroma n = iCbCr
5729      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5730      *      5-> Luma8x8   n = 4 * luma8x8idx
5731      */
5732
5733     /* read coded block flag */
5734     if( cat != 5 ) {
5735         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5736             if( cat == 1 || cat == 2 )
5737                 h->non_zero_count_cache[scan8[n]] = 0;
5738             else if( cat == 4 )
5739                 h->non_zero_count_cache[scan8[16+n]] = 0;
5740 #ifdef CABAC_ON_STACK
5741             h->cabac.range     = cc.range     ;
5742             h->cabac.low       = cc.low       ;
5743             h->cabac.bytestream= cc.bytestream;
5744 #endif
5745             return 0;
5746         }
5747     }
5748
5749     significant_coeff_ctx_base = h->cabac_state
5750         + significant_coeff_flag_offset[MB_FIELD][cat];
5751     last_coeff_ctx_base = h->cabac_state
5752         + last_coeff_flag_offset[MB_FIELD][cat];
5753     abs_level_m1_ctx_base = h->cabac_state
5754         + coeff_abs_level_m1_offset[cat];
5755
5756     if( cat == 5 ) {
5757 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5758         for(last= 0; last < coefs; last++) { \
5759             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5760             if( get_cabac( CC, sig_ctx )) { \
5761                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5762                 index[coeff_count++] = last; \
5763                 if( get_cabac( CC, last_ctx ) ) { \
5764                     last= max_coeff; \
5765                     break; \
5766                 } \
5767             } \
5768         }\
5769         if( last == max_coeff -1 ) {\
5770             index[coeff_count++] = last;\
5771         }
5772         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5773 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5774         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5775     } else {
5776         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5777 #else
5778         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5779     } else {
5780         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5781 #endif
5782     }
5783     assert(coeff_count > 0);
5784
5785     if( cat == 0 )
5786         h->cbp_table[mb_xy] |= 0x100;
5787     else if( cat == 1 || cat == 2 )
5788         h->non_zero_count_cache[scan8[n]] = coeff_count;
5789     else if( cat == 3 )
5790         h->cbp_table[mb_xy] |= 0x40 << n;
5791     else if( cat == 4 )
5792         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5793     else {
5794         assert( cat == 5 );
5795         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5796     }
5797
5798     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5799         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5800         int j= scantable[index[coeff_count]];
5801
5802         if( get_cabac( CC, ctx ) == 0 ) {
5803             if( !qmul ) {
5804                 block[j] = get_cabac_bypass_sign( CC, -1);
5805             }else{
5806                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5807             }
5808
5809             abslevel1++;
5810         } else {
5811             int coeff_abs = 2;
5812             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5813             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5814                 coeff_abs++;
5815             }
5816
5817             if( coeff_abs >= 15 ) {
5818                 int j = 0;
5819                 while( get_cabac_bypass( CC ) ) {
5820                     j++;
5821                 }
5822
5823                 coeff_abs=1;
5824                 while( j-- ) {
5825                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5826                 }
5827                 coeff_abs+= 14;
5828             }
5829
5830             if( !qmul ) {
5831                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5832                 else                                block[j] =  coeff_abs;
5833             }else{
5834                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5835                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5836             }
5837
5838             abslevelgt1++;
5839         }
5840     }
5841 #ifdef CABAC_ON_STACK
5842             h->cabac.range     = cc.range     ;
5843             h->cabac.low       = cc.low       ;
5844             h->cabac.bytestream= cc.bytestream;
5845 #endif
5846     return 0;
5847 }
5848
5849 static inline void compute_mb_neighbors(H264Context *h)
5850 {
5851     MpegEncContext * const s = &h->s;
5852     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5853     h->top_mb_xy     = mb_xy - s->mb_stride;
5854     h->left_mb_xy[0] = mb_xy - 1;
5855     if(FRAME_MBAFF){
5856         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5857         const int top_pair_xy      = pair_xy     - s->mb_stride;
5858         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5859         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5860         const int curr_mb_frame_flag = !MB_FIELD;
5861         const int bottom = (s->mb_y & 1);
5862         if (bottom
5863                 ? !curr_mb_frame_flag // bottom macroblock
5864                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5865                 ) {
5866             h->top_mb_xy -= s->mb_stride;
5867         }
5868         if (left_mb_frame_flag != curr_mb_frame_flag) {
5869             h->left_mb_xy[0] = pair_xy - 1;
5870         }
5871     }
5872     return;
5873 }
5874
5875 /**
5876  * decodes a macroblock
5877  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5878  */
5879 static int decode_mb_cabac(H264Context *h) {
5880     MpegEncContext * const s = &h->s;
5881     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5882     int mb_type, partition_count, cbp = 0;
5883     int dct8x8_allowed= h->pps.transform_8x8_mode;
5884
5885     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5886
5887     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5888     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5889         int skip;
5890         /* a skipped mb needs the aff flag from the following mb */
5891         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5892             predict_field_decoding_flag(h);
5893         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5894             skip = h->next_mb_skipped;
5895         else
5896             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5897         /* read skip flags */
5898         if( skip ) {
5899             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5900                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5901                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5902                 if(h->next_mb_skipped)
5903                     predict_field_decoding_flag(h);
5904                 else
5905                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5906             }
5907
5908             decode_mb_skip(h);
5909
5910             h->cbp_table[mb_xy] = 0;
5911             h->chroma_pred_mode_table[mb_xy] = 0;
5912             h->last_qscale_diff = 0;
5913
5914             return 0;
5915
5916         }
5917     }
5918     if(FRAME_MBAFF){
5919         if( (s->mb_y&1) == 0 )
5920             h->mb_mbaff =
5921             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5922     }else
5923         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5924
5925     h->prev_mb_skipped = 0;
5926
5927     compute_mb_neighbors(h);
5928     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5929         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5930         return -1;
5931     }
5932
5933     if( h->slice_type == B_TYPE ) {
5934         if( mb_type < 23 ){
5935             partition_count= b_mb_type_info[mb_type].partition_count;
5936             mb_type=         b_mb_type_info[mb_type].type;
5937         }else{
5938             mb_type -= 23;
5939             goto decode_intra_mb;
5940         }
5941     } else if( h->slice_type == P_TYPE ) {
5942         if( mb_type < 5) {
5943             partition_count= p_mb_type_info[mb_type].partition_count;
5944             mb_type=         p_mb_type_info[mb_type].type;
5945         } else {
5946             mb_type -= 5;
5947             goto decode_intra_mb;
5948         }
5949     } else {
5950        assert(h->slice_type == I_TYPE);
5951 decode_intra_mb:
5952         partition_count = 0;
5953         cbp= i_mb_type_info[mb_type].cbp;
5954         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5955         mb_type= i_mb_type_info[mb_type].type;
5956     }
5957     if(MB_FIELD)
5958         mb_type |= MB_TYPE_INTERLACED;
5959
5960     h->slice_table[ mb_xy ]= h->slice_num;
5961
5962     if(IS_INTRA_PCM(mb_type)) {
5963         const uint8_t *ptr;
5964         unsigned int x, y;
5965
5966         // We assume these blocks are very rare so we do not optimize it.
5967         // FIXME The two following lines get the bitstream position in the cabac
5968         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5969         ptr= h->cabac.bytestream;
5970         if(h->cabac.low&0x1) ptr--;
5971         if(CABAC_BITS==16){
5972             if(h->cabac.low&0x1FF) ptr--;
5973         }
5974
5975         // The pixels are stored in the same order as levels in h->mb array.
5976         for(y=0; y<16; y++){
5977             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5978             for(x=0; x<16; x++){
5979                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5980                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5981             }
5982         }
5983         for(y=0; y<8; y++){
5984             const int index= 256 + 4*(y&3) + 32*(y>>2);
5985             for(x=0; x<8; x++){
5986                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5987                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5988             }
5989         }
5990         for(y=0; y<8; y++){
5991             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5992             for(x=0; x<8; x++){
5993                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5994                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5995             }
5996         }
5997
5998         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5999
6000         // All blocks are present
6001         h->cbp_table[mb_xy] = 0x1ef;
6002         h->chroma_pred_mode_table[mb_xy] = 0;
6003         // In deblocking, the quantizer is 0
6004         s->current_picture.qscale_table[mb_xy]= 0;
6005         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6006         // All coeffs are present
6007         memset(h->non_zero_count[mb_xy], 16, 16);
6008         s->current_picture.mb_type[mb_xy]= mb_type;
6009         return 0;
6010     }
6011
6012     if(MB_MBAFF){
6013         h->ref_count[0] <<= 1;
6014         h->ref_count[1] <<= 1;
6015     }
6016
6017     fill_caches(h, mb_type, 0);
6018
6019     if( IS_INTRA( mb_type ) ) {
6020         int i, pred_mode;
6021         if( IS_INTRA4x4( mb_type ) ) {
6022             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6023                 mb_type |= MB_TYPE_8x8DCT;
6024                 for( i = 0; i < 16; i+=4 ) {
6025                     int pred = pred_intra_mode( h, i );
6026                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6027                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6028                 }
6029             } else {
6030                 for( i = 0; i < 16; i++ ) {
6031                     int pred = pred_intra_mode( h, i );
6032                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6033
6034                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6035                 }
6036             }
6037             write_back_intra_pred_mode(h);
6038             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6039         } else {
6040             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6041             if( h->intra16x16_pred_mode < 0 ) return -1;
6042         }
6043         h->chroma_pred_mode_table[mb_xy] =
6044         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
6045
6046         pred_mode= check_intra_pred_mode( h, pred_mode );
6047         if( pred_mode < 0 ) return -1;
6048         h->chroma_pred_mode= pred_mode;
6049     } else if( partition_count == 4 ) {
6050         int i, j, sub_partition_count[4], list, ref[2][4];
6051
6052         if( h->slice_type == B_TYPE ) {
6053             for( i = 0; i < 4; i++ ) {
6054                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6055                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6056                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6057             }
6058             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
6059                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
6060                 pred_direct_motion(h, &mb_type);
6061                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6062                     for( i = 0; i < 4; i++ )
6063                         if( IS_DIRECT(h->sub_mb_type[i]) )
6064                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6065                 }
6066             }
6067         } else {
6068             for( i = 0; i < 4; i++ ) {
6069                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6070                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6071                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6072             }
6073         }
6074
6075         for( list = 0; list < h->list_count; list++ ) {
6076                 for( i = 0; i < 4; i++ ) {
6077                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6078                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6079                         if( h->ref_count[list] > 1 )
6080                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6081                         else
6082                             ref[list][i] = 0;
6083                     } else {
6084                         ref[list][i] = -1;
6085                     }
6086                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6087                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6088                 }
6089         }
6090
6091         if(dct8x8_allowed)
6092             dct8x8_allowed = get_dct8x8_allowed(h);
6093
6094         for(list=0; list<h->list_count; list++){
6095             for(i=0; i<4; i++){
6096                 if(IS_DIRECT(h->sub_mb_type[i])){
6097                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6098                     continue;
6099                 }
6100                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6101
6102                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6103                     const int sub_mb_type= h->sub_mb_type[i];
6104                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6105                     for(j=0; j<sub_partition_count[i]; j++){
6106                         int mpx, mpy;
6107                         int mx, my;
6108                         const int index= 4*i + block_width*j;
6109                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6110                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6111                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6112
6113                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6114                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6115                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6116
6117                         if(IS_SUB_8X8(sub_mb_type)){
6118                             mv_cache[ 1 ][0]=
6119                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6120                             mv_cache[ 1 ][1]=
6121                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6122
6123                             mvd_cache[ 1 ][0]=
6124                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6125                             mvd_cache[ 1 ][1]=
6126                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6127                         }else if(IS_SUB_8X4(sub_mb_type)){
6128                             mv_cache[ 1 ][0]= mx;
6129                             mv_cache[ 1 ][1]= my;
6130
6131                             mvd_cache[ 1 ][0]= mx - mpx;
6132                             mvd_cache[ 1 ][1]= my - mpy;
6133                         }else if(IS_SUB_4X8(sub_mb_type)){
6134                             mv_cache[ 8 ][0]= mx;
6135                             mv_cache[ 8 ][1]= my;
6136
6137                             mvd_cache[ 8 ][0]= mx - mpx;
6138                             mvd_cache[ 8 ][1]= my - mpy;
6139                         }
6140                         mv_cache[ 0 ][0]= mx;
6141                         mv_cache[ 0 ][1]= my;
6142
6143                         mvd_cache[ 0 ][0]= mx - mpx;
6144                         mvd_cache[ 0 ][1]= my - mpy;
6145                     }
6146                 }else{
6147                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6148                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6149                     p[0] = p[1] = p[8] = p[9] = 0;
6150                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6151                 }
6152             }
6153         }
6154     } else if( IS_DIRECT(mb_type) ) {
6155         pred_direct_motion(h, &mb_type);
6156         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6157         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6158         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6159     } else {
6160         int list, mx, my, i, mpx, mpy;
6161         if(IS_16X16(mb_type)){
6162             for(list=0; list<h->list_count; list++){
6163                 if(IS_DIR(mb_type, 0, list)){
6164                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6165                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6166                 }else
6167                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
6168             }
6169             for(list=0; list<h->list_count; list++){
6170                 if(IS_DIR(mb_type, 0, list)){
6171                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6172
6173                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6174                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6175                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6176
6177                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6178                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6179                 }else
6180                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6181             }
6182         }
6183         else if(IS_16X8(mb_type)){
6184             for(list=0; list<h->list_count; list++){
6185                     for(i=0; i<2; i++){
6186                         if(IS_DIR(mb_type, i, list)){
6187                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6188                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6189                         }else
6190                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6191                     }
6192             }
6193             for(list=0; list<h->list_count; list++){
6194                 for(i=0; i<2; i++){
6195                     if(IS_DIR(mb_type, i, list)){
6196                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6197                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6198                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6199                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6200
6201                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6202                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6203                     }else{
6204                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6205                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6206                     }
6207                 }
6208             }
6209         }else{
6210             assert(IS_8X16(mb_type));
6211             for(list=0; list<h->list_count; list++){
6212                     for(i=0; i<2; i++){
6213                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6214                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6215                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6216                         }else
6217                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6218                     }
6219             }
6220             for(list=0; list<h->list_count; list++){
6221                 for(i=0; i<2; i++){
6222                     if(IS_DIR(mb_type, i, list)){
6223                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6224                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6225                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6226
6227                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
6228                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6229                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6230                     }else{
6231                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6232                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6233                     }
6234                 }
6235             }
6236         }
6237     }
6238
6239    if( IS_INTER( mb_type ) ) {
6240         h->chroma_pred_mode_table[mb_xy] = 0;
6241         write_back_motion( h, mb_type );
6242    }
6243
6244     if( !IS_INTRA16x16( mb_type ) ) {
6245         cbp  = decode_cabac_mb_cbp_luma( h );
6246         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6247     }
6248
6249     h->cbp_table[mb_xy] = h->cbp = cbp;
6250
6251     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6252         if( decode_cabac_mb_transform_size( h ) )
6253             mb_type |= MB_TYPE_8x8DCT;
6254     }
6255     s->current_picture.mb_type[mb_xy]= mb_type;
6256
6257     if( cbp || IS_INTRA16x16( mb_type ) ) {
6258         const uint8_t *scan, *scan8x8, *dc_scan;
6259         int dqp;
6260
6261         if(IS_INTERLACED(mb_type)){
6262             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6263             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6264             dc_scan= luma_dc_field_scan;
6265         }else{
6266             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6267             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6268             dc_scan= luma_dc_zigzag_scan;
6269         }
6270
6271         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6272         if( dqp == INT_MIN ){
6273             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6274             return -1;
6275         }
6276         s->qscale += dqp;
6277         if(((unsigned)s->qscale) > 51){
6278             if(s->qscale<0) s->qscale+= 52;
6279             else            s->qscale-= 52;
6280         }
6281         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6282
6283         if( IS_INTRA16x16( mb_type ) ) {
6284             int i;
6285             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6286             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6287                 return -1;
6288             if( cbp&15 ) {
6289                 for( i = 0; i < 16; i++ ) {
6290                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6291                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6292                         return -1;
6293                 }
6294             } else {
6295                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6296             }
6297         } else {
6298             int i8x8, i4x4;
6299             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6300                 if( cbp & (1<<i8x8) ) {
6301                     if( IS_8x8DCT(mb_type) ) {
6302                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6303                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6304                             return -1;
6305                     } else
6306                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6307                         const int index = 4*i8x8 + i4x4;
6308                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6309 //START_TIMER
6310                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6311                             return -1;
6312 //STOP_TIMER("decode_residual")
6313                     }
6314                 } else {
6315                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6316                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6317                 }
6318             }
6319         }
6320
6321         if( cbp&0x30 ){
6322             int c;
6323             for( c = 0; c < 2; c++ ) {
6324                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6325                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6326                     return -1;
6327             }
6328         }
6329
6330         if( cbp&0x20 ) {
6331             int c, i;
6332             for( c = 0; c < 2; c++ ) {
6333                 for( i = 0; i < 4; i++ ) {
6334                     const int index = 16 + 4 * c + i;
6335                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6336                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6337                         return -1;
6338                 }
6339             }
6340         } else {
6341             uint8_t * const nnz= &h->non_zero_count_cache[0];
6342             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6343             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6344         }
6345     } else {
6346         uint8_t * const nnz= &h->non_zero_count_cache[0];
6347         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6348         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6349         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6350         h->last_qscale_diff = 0;
6351     }
6352
6353     s->current_picture.qscale_table[mb_xy]= s->qscale;
6354     write_back_non_zero_count(h);
6355
6356     if(MB_MBAFF){
6357         h->ref_count[0] >>= 1;
6358         h->ref_count[1] >>= 1;
6359     }
6360
6361     return 0;
6362 }
6363
6364
6365 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6366     int i, d;
6367     const int index_a = qp + h->slice_alpha_c0_offset;
6368     const int alpha = (alpha_table+52)[index_a];
6369     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6370
6371     if( bS[0] < 4 ) {
6372         int8_t tc[4];
6373         for(i=0; i<4; i++)
6374             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6375         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6376     } else {
6377         /* 16px edge length, because bS=4 is triggered by being at
6378          * the edge of an intra MB, so all 4 bS are the same */
6379             for( d = 0; d < 16; d++ ) {
6380                 const int p0 = pix[-1];
6381                 const int p1 = pix[-2];
6382                 const int p2 = pix[-3];
6383
6384                 const int q0 = pix[0];
6385                 const int q1 = pix[1];
6386                 const int q2 = pix[2];
6387
6388                 if( FFABS( p0 - q0 ) < alpha &&
6389                     FFABS( p1 - p0 ) < beta &&
6390                     FFABS( q1 - q0 ) < beta ) {
6391
6392                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6393                         if( FFABS( p2 - p0 ) < beta)
6394                         {
6395                             const int p3 = pix[-4];
6396                             /* p0', p1', p2' */
6397                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6398                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6399                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6400                         } else {
6401                             /* p0' */
6402                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6403                         }
6404                         if( FFABS( q2 - q0 ) < beta)
6405                         {
6406                             const int q3 = pix[3];
6407                             /* q0', q1', q2' */
6408                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6409                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6410                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6411                         } else {
6412                             /* q0' */
6413                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6414                         }
6415                     }else{
6416                         /* p0', q0' */
6417                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6418                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6419                     }
6420                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6421                 }
6422                 pix += stride;
6423             }
6424     }
6425 }
6426 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6427     int i;
6428     const int index_a = qp + h->slice_alpha_c0_offset;
6429     const int alpha = (alpha_table+52)[index_a];
6430     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6431
6432     if( bS[0] < 4 ) {
6433         int8_t tc[4];
6434         for(i=0; i<4; i++)
6435             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6436         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6437     } else {
6438         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6439     }
6440 }
6441
6442 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6443     int i;
6444     for( i = 0; i < 16; i++, pix += stride) {
6445         int index_a;
6446         int alpha;
6447         int beta;
6448
6449         int qp_index;
6450         int bS_index = (i >> 1);
6451         if (!MB_FIELD) {
6452             bS_index &= ~1;
6453             bS_index |= (i & 1);
6454         }
6455
6456         if( bS[bS_index] == 0 ) {
6457             continue;
6458         }
6459
6460         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6461         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6462         alpha = (alpha_table+52)[index_a];
6463         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6464
6465         if( bS[bS_index] < 4 ) {
6466             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6467             const int p0 = pix[-1];
6468             const int p1 = pix[-2];
6469             const int p2 = pix[-3];
6470             const int q0 = pix[0];
6471             const int q1 = pix[1];
6472             const int q2 = pix[2];
6473
6474             if( FFABS( p0 - q0 ) < alpha &&
6475                 FFABS( p1 - p0 ) < beta &&
6476                 FFABS( q1 - q0 ) < beta ) {
6477                 int tc = tc0;
6478                 int i_delta;
6479
6480                 if( FFABS( p2 - p0 ) < beta ) {
6481                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6482                     tc++;
6483                 }
6484                 if( FFABS( q2 - q0 ) < beta ) {
6485                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6486                     tc++;
6487                 }
6488
6489                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6490                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6491                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6492                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6493             }
6494         }else{
6495             const int p0 = pix[-1];
6496             const int p1 = pix[-2];
6497             const int p2 = pix[-3];
6498
6499             const int q0 = pix[0];
6500             const int q1 = pix[1];
6501             const int q2 = pix[2];
6502
6503             if( FFABS( p0 - q0 ) < alpha &&
6504                 FFABS( p1 - p0 ) < beta &&
6505                 FFABS( q1 - q0 ) < beta ) {
6506
6507                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6508                     if( FFABS( p2 - p0 ) < beta)
6509                     {
6510                         const int p3 = pix[-4];
6511                         /* p0', p1', p2' */
6512                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6513                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6514                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6515                     } else {
6516                         /* p0' */
6517                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6518                     }
6519                     if( FFABS( q2 - q0 ) < beta)
6520                     {
6521                         const int q3 = pix[3];
6522                         /* q0', q1', q2' */
6523                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6524                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6525                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6526                     } else {
6527                         /* q0' */
6528                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6529                     }
6530                 }else{
6531                     /* p0', q0' */
6532                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6533                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6534                 }
6535                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6536             }
6537         }
6538     }
6539 }
6540 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6541     int i;
6542     for( i = 0; i < 8; i++, pix += stride) {
6543         int index_a;
6544         int alpha;
6545         int beta;
6546
6547         int qp_index;
6548         int bS_index = i;
6549
6550         if( bS[bS_index] == 0 ) {
6551             continue;
6552         }
6553
6554         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6555         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6556         alpha = (alpha_table+52)[index_a];
6557         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6558
6559         if( bS[bS_index] < 4 ) {
6560             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6561             const int p0 = pix[-1];
6562             const int p1 = pix[-2];
6563             const int q0 = pix[0];
6564             const int q1 = pix[1];
6565
6566             if( FFABS( p0 - q0 ) < alpha &&
6567                 FFABS( p1 - p0 ) < beta &&
6568                 FFABS( q1 - q0 ) < beta ) {
6569                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6570
6571                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6572                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6573                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6574             }
6575         }else{
6576             const int p0 = pix[-1];
6577             const int p1 = pix[-2];
6578             const int q0 = pix[0];
6579             const int q1 = pix[1];
6580
6581             if( FFABS( p0 - q0 ) < alpha &&
6582                 FFABS( p1 - p0 ) < beta &&
6583                 FFABS( q1 - q0 ) < beta ) {
6584
6585                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6586                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6587                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6588             }
6589         }
6590     }
6591 }
6592
6593 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6594     int i, d;
6595     const int index_a = qp + h->slice_alpha_c0_offset;
6596     const int alpha = (alpha_table+52)[index_a];
6597     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6598     const int pix_next  = stride;
6599
6600     if( bS[0] < 4 ) {
6601         int8_t tc[4];
6602         for(i=0; i<4; i++)
6603             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6604         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6605     } else {
6606         /* 16px edge length, see filter_mb_edgev */
6607             for( d = 0; d < 16; d++ ) {
6608                 const int p0 = pix[-1*pix_next];
6609                 const int p1 = pix[-2*pix_next];
6610                 const int p2 = pix[-3*pix_next];
6611                 const int q0 = pix[0];
6612                 const int q1 = pix[1*pix_next];
6613                 const int q2 = pix[2*pix_next];
6614
6615                 if( FFABS( p0 - q0 ) < alpha &&
6616                     FFABS( p1 - p0 ) < beta &&
6617                     FFABS( q1 - q0 ) < beta ) {
6618
6619                     const int p3 = pix[-4*pix_next];
6620                     const int q3 = pix[ 3*pix_next];
6621
6622                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6623                         if( FFABS( p2 - p0 ) < beta) {
6624                             /* p0', p1', p2' */
6625                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6626                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6627                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6628                         } else {
6629                             /* p0' */
6630                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6631                         }
6632                         if( FFABS( q2 - q0 ) < beta) {
6633                             /* q0', q1', q2' */
6634                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6635                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6636                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6637                         } else {
6638                             /* q0' */
6639                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6640                         }
6641                     }else{
6642                         /* p0', q0' */
6643                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6644                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6645                     }
6646                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6647                 }
6648                 pix++;
6649             }
6650     }
6651 }
6652
6653 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6654     int i;
6655     const int index_a = qp + h->slice_alpha_c0_offset;
6656     const int alpha = (alpha_table+52)[index_a];
6657     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6658
6659     if( bS[0] < 4 ) {
6660         int8_t tc[4];
6661         for(i=0; i<4; i++)
6662             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6663         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6664     } else {
6665         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6666     }
6667 }
6668
6669 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6670     MpegEncContext * const s = &h->s;
6671     int mb_xy, mb_type;
6672     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6673
6674     mb_xy = mb_x + mb_y*s->mb_stride;
6675
6676     if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength ||
6677        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6678                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6679         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6680         return;
6681     }
6682     assert(!FRAME_MBAFF);
6683
6684     mb_type = s->current_picture.mb_type[mb_xy];
6685     qp = s->current_picture.qscale_table[mb_xy];
6686     qp0 = s->current_picture.qscale_table[mb_xy-1];
6687     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6688     qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
6689     qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
6690     qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
6691     qp0 = (qp + qp0 + 1) >> 1;
6692     qp1 = (qp + qp1 + 1) >> 1;
6693     qpc0 = (qpc + qpc0 + 1) >> 1;
6694     qpc1 = (qpc + qpc1 + 1) >> 1;
6695     qp_thresh = 15 - h->slice_alpha_c0_offset;
6696     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6697        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6698         return;
6699
6700     if( IS_INTRA(mb_type) ) {
6701         int16_t bS4[4] = {4,4,4,4};
6702         int16_t bS3[4] = {3,3,3,3};
6703         if( IS_8x8DCT(mb_type) ) {
6704             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6705             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6706             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6707             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6708         } else {
6709             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6710             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6711             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6712             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6713             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
6714             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6715             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6716             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6717         }
6718         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6719         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6720         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6721         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6722         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6723         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6724         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
6725         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6726         return;
6727     } else {
6728         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6729         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6730         int edges;
6731         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6732             edges = 4;
6733             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6734         } else {
6735             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6736                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6737             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6738                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6739                              ? 3 : 0;
6740             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6741             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6742             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6743                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6744         }
6745         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6746             bSv[0][0] = 0x0004000400040004ULL;
6747         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6748             bSv[1][0] = 0x0004000400040004ULL;
6749
6750 #define FILTER(hv,dir,edge)\
6751         if(bSv[dir][edge]) {\
6752             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6753             if(!(edge&1)) {\
6754                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6755                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6756             }\
6757         }
6758         if( edges == 1 ) {
6759             FILTER(v,0,0);
6760             FILTER(h,1,0);
6761         } else if( IS_8x8DCT(mb_type) ) {
6762             FILTER(v,0,0);
6763             FILTER(v,0,2);
6764             FILTER(h,1,0);
6765             FILTER(h,1,2);
6766         } else {
6767             FILTER(v,0,0);
6768             FILTER(v,0,1);
6769             FILTER(v,0,2);
6770             FILTER(v,0,3);
6771             FILTER(h,1,0);
6772             FILTER(h,1,1);
6773             FILTER(h,1,2);
6774             FILTER(h,1,3);
6775         }
6776 #undef FILTER
6777     }
6778 }
6779
6780 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6781     MpegEncContext * const s = &h->s;
6782     const int mb_xy= mb_x + mb_y*s->mb_stride;
6783     const int mb_type = s->current_picture.mb_type[mb_xy];
6784     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6785     int first_vertical_edge_done = 0;
6786     int dir;
6787     /* FIXME: A given frame may occupy more than one position in
6788      * the reference list. So ref2frm should be populated with
6789      * frame numbers, not indices. */
6790     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6791                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6792
6793     //for sufficiently low qp, filtering wouldn't do anything
6794     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6795     if(!FRAME_MBAFF){
6796         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
6797         int qp = s->current_picture.qscale_table[mb_xy];
6798         if(qp <= qp_thresh
6799            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6800            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6801             return;
6802         }
6803     }
6804
6805     if (FRAME_MBAFF
6806             // left mb is in picture
6807             && h->slice_table[mb_xy-1] != 255
6808             // and current and left pair do not have the same interlaced type
6809             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6810             // and left mb is in the same slice if deblocking_filter == 2
6811             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6812         /* First vertical edge is different in MBAFF frames
6813          * There are 8 different bS to compute and 2 different Qp
6814          */
6815         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6816         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6817         int16_t bS[8];
6818         int qp[2];
6819         int chroma_qp[2];
6820         int mb_qp, mbn0_qp, mbn1_qp;
6821         int i;
6822         first_vertical_edge_done = 1;
6823
6824         if( IS_INTRA(mb_type) )
6825             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6826         else {
6827             for( i = 0; i < 8; i++ ) {
6828                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6829
6830                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6831                     bS[i] = 4;
6832                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6833                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6834                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6835                     bS[i] = 2;
6836                 else
6837                     bS[i] = 1;
6838             }
6839         }
6840
6841         mb_qp = s->current_picture.qscale_table[mb_xy];
6842         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6843         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6844         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6845         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
6846                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
6847         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6848         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
6849                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
6850
6851         /* Filter edge */
6852         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6853         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6854         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6855         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6856         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6857     }
6858     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6859     for( dir = 0; dir < 2; dir++ )
6860     {
6861         int edge;
6862         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6863         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6864         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6865
6866         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6867                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6868         // how often to recheck mv-based bS when iterating between edges
6869         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6870                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6871         // how often to recheck mv-based bS when iterating along each edge
6872         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6873
6874         if (first_vertical_edge_done) {
6875             start = 1;
6876             first_vertical_edge_done = 0;
6877         }
6878
6879         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6880             start = 1;
6881
6882         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6883             && !IS_INTERLACED(mb_type)
6884             && IS_INTERLACED(mbm_type)
6885             ) {
6886             // This is a special case in the norm where the filtering must
6887             // be done twice (one each of the field) even if we are in a
6888             // frame macroblock.
6889             //
6890             static const int nnz_idx[4] = {4,5,6,3};
6891             unsigned int tmp_linesize   = 2 *   linesize;
6892             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6893             int mbn_xy = mb_xy - 2 * s->mb_stride;
6894             int qp, chroma_qp;
6895             int i, j;
6896             int16_t bS[4];
6897
6898             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6899                 if( IS_INTRA(mb_type) ||
6900                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6901                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6902                 } else {
6903                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6904                     for( i = 0; i < 4; i++ ) {
6905                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6906                             mbn_nnz[nnz_idx[i]] != 0 )
6907                             bS[i] = 2;
6908                         else
6909                             bS[i] = 1;
6910                     }
6911                 }
6912                 // Do not use s->qscale as luma quantizer because it has not the same
6913                 // value in IPCM macroblocks.
6914                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6915                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6916                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6917                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6918                 chroma_qp = ( h->chroma_qp +
6919                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6920                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6921                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6922             }
6923
6924             start = 1;
6925         }
6926
6927         /* Calculate bS */
6928         for( edge = start; edge < edges; edge++ ) {
6929             /* mbn_xy: neighbor macroblock */
6930             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6931             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6932             int16_t bS[4];
6933             int qp;
6934
6935             if( (edge&1) && IS_8x8DCT(mb_type) )
6936                 continue;
6937
6938             if( IS_INTRA(mb_type) ||
6939                 IS_INTRA(mbn_type) ) {
6940                 int value;
6941                 if (edge == 0) {
6942                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6943                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6944                     ) {
6945                         value = 4;
6946                     } else {
6947                         value = 3;
6948                     }
6949                 } else {
6950                     value = 3;
6951                 }
6952                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6953             } else {
6954                 int i, l;
6955                 int mv_done;
6956
6957                 if( edge & mask_edge ) {
6958                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6959                     mv_done = 1;
6960                 }
6961                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6962                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6963                     mv_done = 1;
6964                 }
6965                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6966                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6967                     int bn_idx= b_idx - (dir ? 8:1);
6968                     int v = 0;
6969                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6970                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6971                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6972                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6973                     }
6974                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6975                     mv_done = 1;
6976                 }
6977                 else
6978                     mv_done = 0;
6979
6980                 for( i = 0; i < 4; i++ ) {
6981                     int x = dir == 0 ? edge : i;
6982                     int y = dir == 0 ? i    : edge;
6983                     int b_idx= 8 + 4 + x + 8*y;
6984                     int bn_idx= b_idx - (dir ? 8:1);
6985
6986                     if( h->non_zero_count_cache[b_idx] != 0 ||
6987                         h->non_zero_count_cache[bn_idx] != 0 ) {
6988                         bS[i] = 2;
6989                     }
6990                     else if(!mv_done)
6991                     {
6992                         bS[i] = 0;
6993                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6994                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6995                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6996                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6997                                 bS[i] = 1;
6998                                 break;
6999                             }
7000                         }
7001                     }
7002                 }
7003
7004                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7005                     continue;
7006             }
7007
7008             /* Filter edge */
7009             // Do not use s->qscale as luma quantizer because it has not the same
7010             // value in IPCM macroblocks.
7011             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7012             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7013             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7014             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
7015             if( dir == 0 ) {
7016                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7017                 if( (edge&1) == 0 ) {
7018                     int chroma_qp = ( h->chroma_qp +
7019                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7020                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7021                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7022                 }
7023             } else {
7024                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7025                 if( (edge&1) == 0 ) {
7026                     int chroma_qp = ( h->chroma_qp +
7027                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7028                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7029                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7030                 }
7031             }
7032         }
7033     }
7034 }
7035
7036 static int decode_slice(H264Context *h){
7037     MpegEncContext * const s = &h->s;
7038     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7039
7040     s->mb_skip_run= -1;
7041
7042     if( h->pps.cabac ) {
7043         int i;
7044
7045         /* realign */
7046         align_get_bits( &s->gb );
7047
7048         /* init cabac */
7049         ff_init_cabac_states( &h->cabac);
7050         ff_init_cabac_decoder( &h->cabac,
7051                                s->gb.buffer + get_bits_count(&s->gb)/8,
7052                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7053         /* calculate pre-state */
7054         for( i= 0; i < 460; i++ ) {
7055             int pre;
7056             if( h->slice_type == I_TYPE )
7057                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7058             else
7059                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7060
7061             if( pre <= 63 )
7062                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7063             else
7064                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7065         }
7066
7067         for(;;){
7068 //START_TIMER
7069             int ret = decode_mb_cabac(h);
7070             int eos;
7071 //STOP_TIMER("decode_mb_cabac")
7072
7073             if(ret>=0) hl_decode_mb(h);
7074
7075             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7076                 s->mb_y++;
7077
7078                 if(ret>=0) ret = decode_mb_cabac(h);
7079
7080                 if(ret>=0) hl_decode_mb(h);
7081                 s->mb_y--;
7082             }
7083             eos = get_cabac_terminate( &h->cabac );
7084
7085             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
7086                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7087                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7088                 return -1;
7089             }
7090
7091             if( ++s->mb_x >= s->mb_width ) {
7092                 s->mb_x = 0;
7093                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7094                 ++s->mb_y;
7095                 if(FRAME_MBAFF) {
7096                     ++s->mb_y;
7097                 }
7098             }
7099
7100             if( eos || s->mb_y >= s->mb_height ) {
7101                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7102                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7103                 return 0;
7104             }
7105         }
7106
7107     } else {
7108         for(;;){
7109             int ret = decode_mb_cavlc(h);
7110
7111             if(ret>=0) hl_decode_mb(h);
7112
7113             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7114                 s->mb_y++;
7115                 ret = decode_mb_cavlc(h);
7116
7117                 if(ret>=0) hl_decode_mb(h);
7118                 s->mb_y--;
7119             }
7120
7121             if(ret<0){
7122                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7123                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7124
7125                 return -1;
7126             }
7127
7128             if(++s->mb_x >= s->mb_width){
7129                 s->mb_x=0;
7130                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7131                 ++s->mb_y;
7132                 if(FRAME_MBAFF) {
7133                     ++s->mb_y;
7134                 }
7135                 if(s->mb_y >= s->mb_height){
7136                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7137
7138                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7139                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7140
7141                         return 0;
7142                     }else{
7143                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7144
7145                         return -1;
7146                     }
7147                 }
7148             }
7149
7150             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7151                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7152                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7153                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7154
7155                     return 0;
7156                 }else{
7157                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7158
7159                     return -1;
7160                 }
7161             }
7162         }
7163     }
7164
7165 #if 0
7166     for(;s->mb_y < s->mb_height; s->mb_y++){
7167         for(;s->mb_x < s->mb_width; s->mb_x++){
7168             int ret= decode_mb(h);
7169
7170             hl_decode_mb(h);
7171
7172             if(ret<0){
7173                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7174                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7175
7176                 return -1;
7177             }
7178
7179             if(++s->mb_x >= s->mb_width){
7180                 s->mb_x=0;
7181                 if(++s->mb_y >= s->mb_height){
7182                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7183                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7184
7185                         return 0;
7186                     }else{
7187                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7188
7189                         return -1;
7190                     }
7191                 }
7192             }
7193
7194             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7195                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7196                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7197
7198                     return 0;
7199                 }else{
7200                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7201
7202                     return -1;
7203                 }
7204             }
7205         }
7206         s->mb_x=0;
7207         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7208     }
7209 #endif
7210     return -1; //not reached
7211 }
7212
7213 static int decode_unregistered_user_data(H264Context *h, int size){
7214     MpegEncContext * const s = &h->s;
7215     uint8_t user_data[16+256];
7216     int e, build, i;
7217
7218     if(size<16)
7219         return -1;
7220
7221     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7222         user_data[i]= get_bits(&s->gb, 8);
7223     }
7224
7225     user_data[i]= 0;
7226     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7227     if(e==1 && build>=0)
7228         h->x264_build= build;
7229
7230     if(s->avctx->debug & FF_DEBUG_BUGS)
7231         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7232
7233     for(; i<size; i++)
7234         skip_bits(&s->gb, 8);
7235
7236     return 0;
7237 }
7238
7239 static int decode_sei(H264Context *h){
7240     MpegEncContext * const s = &h->s;
7241
7242     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7243         int size, type;
7244
7245         type=0;
7246         do{
7247             type+= show_bits(&s->gb, 8);
7248         }while(get_bits(&s->gb, 8) == 255);
7249
7250         size=0;
7251         do{
7252             size+= show_bits(&s->gb, 8);
7253         }while(get_bits(&s->gb, 8) == 255);
7254
7255         switch(type){
7256         case 5:
7257             if(decode_unregistered_user_data(h, size) < 0)
7258                 return -1;
7259             break;
7260         default:
7261             skip_bits(&s->gb, 8*size);
7262         }
7263
7264         //FIXME check bits here
7265         align_get_bits(&s->gb);
7266     }
7267
7268     return 0;
7269 }
7270
7271 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7272     MpegEncContext * const s = &h->s;
7273     int cpb_count, i;
7274     cpb_count = get_ue_golomb(&s->gb) + 1;
7275     get_bits(&s->gb, 4); /* bit_rate_scale */
7276     get_bits(&s->gb, 4); /* cpb_size_scale */
7277     for(i=0; i<cpb_count; i++){
7278         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7279         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7280         get_bits1(&s->gb);     /* cbr_flag */
7281     }
7282     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7283     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7284     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7285     get_bits(&s->gb, 5); /* time_offset_length */
7286 }
7287
7288 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7289     MpegEncContext * const s = &h->s;
7290     int aspect_ratio_info_present_flag;
7291     unsigned int aspect_ratio_idc;
7292     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7293
7294     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7295
7296     if( aspect_ratio_info_present_flag ) {
7297         aspect_ratio_idc= get_bits(&s->gb, 8);
7298         if( aspect_ratio_idc == EXTENDED_SAR ) {
7299             sps->sar.num= get_bits(&s->gb, 16);
7300             sps->sar.den= get_bits(&s->gb, 16);
7301         }else if(aspect_ratio_idc < 14){
7302             sps->sar=  pixel_aspect[aspect_ratio_idc];
7303         }else{
7304             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7305             return -1;
7306         }
7307     }else{
7308         sps->sar.num=
7309         sps->sar.den= 0;
7310     }
7311 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7312
7313     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7314         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7315     }
7316
7317     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7318         get_bits(&s->gb, 3);    /* video_format */
7319         get_bits1(&s->gb);      /* video_full_range_flag */
7320         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7321             get_bits(&s->gb, 8); /* colour_primaries */
7322             get_bits(&s->gb, 8); /* transfer_characteristics */
7323             get_bits(&s->gb, 8); /* matrix_coefficients */
7324         }
7325     }
7326
7327     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7328         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7329         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7330     }
7331
7332     sps->timing_info_present_flag = get_bits1(&s->gb);
7333     if(sps->timing_info_present_flag){
7334         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7335         sps->time_scale = get_bits_long(&s->gb, 32);
7336         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7337     }
7338
7339     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7340     if(nal_hrd_parameters_present_flag)
7341         decode_hrd_parameters(h, sps);
7342     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7343     if(vcl_hrd_parameters_present_flag)
7344         decode_hrd_parameters(h, sps);
7345     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7346         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7347     get_bits1(&s->gb);         /* pic_struct_present_flag */
7348
7349     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7350     if(sps->bitstream_restriction_flag){
7351         unsigned int num_reorder_frames;
7352         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7353         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7354         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7355         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7356         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7357         num_reorder_frames= get_ue_golomb(&s->gb);
7358         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7359
7360         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7361             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7362             return -1;
7363         }
7364
7365         sps->num_reorder_frames= num_reorder_frames;
7366     }
7367
7368     return 0;
7369 }
7370
7371 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7372                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7373     MpegEncContext * const s = &h->s;
7374     int i, last = 8, next = 8;
7375     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7376     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7377         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7378     else
7379     for(i=0;i<size;i++){
7380         if(next)
7381             next = (last + get_se_golomb(&s->gb)) & 0xff;
7382         if(!i && !next){ /* matrix not written, we use the preset one */
7383             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7384             break;
7385         }
7386         last = factors[scan[i]] = next ? next : last;
7387     }
7388 }
7389
7390 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7391                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7392     MpegEncContext * const s = &h->s;
7393     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7394     const uint8_t *fallback[4] = {
7395         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7396         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7397         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7398         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7399     };
7400     if(get_bits1(&s->gb)){
7401         sps->scaling_matrix_present |= is_sps;
7402         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7403         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7404         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7405         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7406         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7407         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7408         if(is_sps || pps->transform_8x8_mode){
7409             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7410             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7411         }
7412     } else if(fallback_sps) {
7413         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7414         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7415     }
7416 }
7417
7418 /**
7419  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7420  */
7421 static void *
7422 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7423                     const size_t size, const char *name)
7424 {
7425     if(id>=max) {
7426         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7427         return NULL;
7428     }
7429
7430     if(!vec[id]) {
7431         vec[id] = av_mallocz(size);
7432         if(vec[id] == NULL)
7433             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7434     }
7435     return vec[id];
7436 }
7437
7438 static inline int decode_seq_parameter_set(H264Context *h){
7439     MpegEncContext * const s = &h->s;
7440     int profile_idc, level_idc;
7441     unsigned int sps_id, tmp, mb_width, mb_height;
7442     int i;
7443     SPS *sps;
7444
7445     profile_idc= get_bits(&s->gb, 8);
7446     get_bits1(&s->gb);   //constraint_set0_flag
7447     get_bits1(&s->gb);   //constraint_set1_flag
7448     get_bits1(&s->gb);   //constraint_set2_flag
7449     get_bits1(&s->gb);   //constraint_set3_flag
7450     get_bits(&s->gb, 4); // reserved
7451     level_idc= get_bits(&s->gb, 8);
7452     sps_id= get_ue_golomb(&s->gb);
7453
7454     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7455     if(sps == NULL)
7456         return -1;
7457
7458     sps->profile_idc= profile_idc;
7459     sps->level_idc= level_idc;
7460
7461     if(sps->profile_idc >= 100){ //high profile
7462         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7463             get_bits1(&s->gb);  //residual_color_transform_flag
7464         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7465         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7466         sps->transform_bypass = get_bits1(&s->gb);
7467         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7468     }else
7469         sps->scaling_matrix_present = 0;
7470
7471     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7472     sps->poc_type= get_ue_golomb(&s->gb);
7473
7474     if(sps->poc_type == 0){ //FIXME #define
7475         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7476     } else if(sps->poc_type == 1){//FIXME #define
7477         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7478         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7479         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7480         tmp= get_ue_golomb(&s->gb);
7481
7482         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7483             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7484             return -1;
7485         }
7486         sps->poc_cycle_length= tmp;
7487
7488         for(i=0; i<sps->poc_cycle_length; i++)
7489             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7490     }else if(sps->poc_type != 2){
7491         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7492         return -1;
7493     }
7494
7495     tmp= get_ue_golomb(&s->gb);
7496     if(tmp > MAX_PICTURE_COUNT-2){
7497         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7498     }
7499     sps->ref_frame_count= tmp;
7500     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7501     mb_width= get_ue_golomb(&s->gb) + 1;
7502     mb_height= get_ue_golomb(&s->gb) + 1;
7503     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7504        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7505         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7506         return -1;
7507     }
7508     sps->mb_width = mb_width;
7509     sps->mb_height= mb_height;
7510
7511     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7512     if(!sps->frame_mbs_only_flag)
7513         sps->mb_aff= get_bits1(&s->gb);
7514     else
7515         sps->mb_aff= 0;
7516
7517     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7518
7519 #ifndef ALLOW_INTERLACE
7520     if(sps->mb_aff)
7521         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7522 #endif
7523     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7524         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7525
7526     sps->crop= get_bits1(&s->gb);
7527     if(sps->crop){
7528         sps->crop_left  = get_ue_golomb(&s->gb);
7529         sps->crop_right = get_ue_golomb(&s->gb);
7530         sps->crop_top   = get_ue_golomb(&s->gb);
7531         sps->crop_bottom= get_ue_golomb(&s->gb);
7532         if(sps->crop_left || sps->crop_top){
7533             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7534         }
7535     }else{
7536         sps->crop_left  =
7537         sps->crop_right =
7538         sps->crop_top   =
7539         sps->crop_bottom= 0;
7540     }
7541
7542     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7543     if( sps->vui_parameters_present_flag )
7544         decode_vui_parameters(h, sps);
7545
7546     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7547         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7548                sps_id, sps->profile_idc, sps->level_idc,
7549                sps->poc_type,
7550                sps->ref_frame_count,
7551                sps->mb_width, sps->mb_height,
7552                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7553                sps->direct_8x8_inference_flag ? "8B8" : "",
7554                sps->crop_left, sps->crop_right,
7555                sps->crop_top, sps->crop_bottom,
7556                sps->vui_parameters_present_flag ? "VUI" : ""
7557                );
7558     }
7559     return 0;
7560 }
7561
7562 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7563     MpegEncContext * const s = &h->s;
7564     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7565     PPS *pps;
7566
7567     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7568     if(pps == NULL)
7569         return -1;
7570
7571     tmp= get_ue_golomb(&s->gb);
7572     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7573         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7574         return -1;
7575     }
7576     pps->sps_id= tmp;
7577
7578     pps->cabac= get_bits1(&s->gb);
7579     pps->pic_order_present= get_bits1(&s->gb);
7580     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7581     if(pps->slice_group_count > 1 ){
7582         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7583         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7584         switch(pps->mb_slice_group_map_type){
7585         case 0:
7586 #if 0
7587 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7588 |    run_length[ i ]                                |1  |ue(v)   |
7589 #endif
7590             break;
7591         case 2:
7592 #if 0
7593 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7594 |{                                                  |   |        |
7595 |    top_left_mb[ i ]                               |1  |ue(v)   |
7596 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7597 |   }                                               |   |        |
7598 #endif
7599             break;
7600         case 3:
7601         case 4:
7602         case 5:
7603 #if 0
7604 |   slice_group_change_direction_flag               |1  |u(1)    |
7605 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7606 #endif
7607             break;
7608         case 6:
7609 #if 0
7610 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7611 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7612 |)                                                  |   |        |
7613 |    slice_group_id[ i ]                            |1  |u(v)    |
7614 #endif
7615             break;
7616         }
7617     }
7618     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7619     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7620     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7621         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7622         pps->ref_count[0]= pps->ref_count[1]= 1;
7623         return -1;
7624     }
7625
7626     pps->weighted_pred= get_bits1(&s->gb);
7627     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7628     pps->init_qp= get_se_golomb(&s->gb) + 26;
7629     pps->init_qs= get_se_golomb(&s->gb) + 26;
7630     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7631     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7632     pps->constrained_intra_pred= get_bits1(&s->gb);
7633     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7634
7635     pps->transform_8x8_mode= 0;
7636     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7637     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7638     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7639
7640     if(get_bits_count(&s->gb) < bit_length){
7641         pps->transform_8x8_mode= get_bits1(&s->gb);
7642         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7643         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7644     }
7645
7646     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7647         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7648                pps_id, pps->sps_id,
7649                pps->cabac ? "CABAC" : "CAVLC",
7650                pps->slice_group_count,
7651                pps->ref_count[0], pps->ref_count[1],
7652                pps->weighted_pred ? "weighted" : "",
7653                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7654                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7655                pps->constrained_intra_pred ? "CONSTR" : "",
7656                pps->redundant_pic_cnt_present ? "REDU" : "",
7657                pps->transform_8x8_mode ? "8x8DCT" : ""
7658                );
7659     }
7660
7661     return 0;
7662 }
7663
7664 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7665     MpegEncContext * const s = &h->s;
7666     AVCodecContext * const avctx= s->avctx;
7667     int buf_index=0;
7668 #if 0
7669     int i;
7670     for(i=0; i<50; i++){
7671         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7672     }
7673 #endif
7674     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7675         h->slice_num = 0;
7676         s->current_picture_ptr= NULL;
7677     }
7678
7679     for(;;){
7680         int consumed;
7681         int dst_length;
7682         int bit_length;
7683         uint8_t *ptr;
7684         int i, nalsize = 0;
7685
7686       if(h->is_avc) {
7687         if(buf_index >= buf_size) break;
7688         nalsize = 0;
7689         for(i = 0; i < h->nal_length_size; i++)
7690             nalsize = (nalsize << 8) | buf[buf_index++];
7691         if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7692             if(nalsize == 1){
7693                 buf_index++;
7694                 continue;
7695             }else{
7696                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7697                 break;
7698             }
7699         }
7700       } else {
7701         // start code prefix search
7702         for(; buf_index + 3 < buf_size; buf_index++){
7703             // This should always succeed in the first iteration.
7704             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7705                 break;
7706         }
7707
7708         if(buf_index+3 >= buf_size) break;
7709
7710         buf_index+=3;
7711       }
7712
7713         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7714         if (ptr==NULL || dst_length < 0){
7715             return -1;
7716         }
7717         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7718             dst_length--;
7719         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7720
7721         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7722             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7723         }
7724
7725         if (h->is_avc && (nalsize != consumed))
7726             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7727
7728         buf_index += consumed;
7729
7730         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7731            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7732             continue;
7733
7734         switch(h->nal_unit_type){
7735         case NAL_IDR_SLICE:
7736             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7737         case NAL_SLICE:
7738             init_get_bits(&s->gb, ptr, bit_length);
7739             h->intra_gb_ptr=
7740             h->inter_gb_ptr= &s->gb;
7741             s->data_partitioning = 0;
7742
7743             if(decode_slice_header(h) < 0){
7744                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7745                 break;
7746             }
7747             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7748             if(h->redundant_pic_count==0 && s->hurry_up < 5
7749                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7750                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7751                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7752                && avctx->skip_frame < AVDISCARD_ALL)
7753                 decode_slice(h);
7754             break;
7755         case NAL_DPA:
7756             init_get_bits(&s->gb, ptr, bit_length);
7757             h->intra_gb_ptr=
7758             h->inter_gb_ptr= NULL;
7759             s->data_partitioning = 1;
7760
7761             if(decode_slice_header(h) < 0){
7762                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7763             }
7764             break;
7765         case NAL_DPB:
7766             init_get_bits(&h->intra_gb, ptr, bit_length);
7767             h->intra_gb_ptr= &h->intra_gb;
7768             break;
7769         case NAL_DPC:
7770             init_get_bits(&h->inter_gb, ptr, bit_length);
7771             h->inter_gb_ptr= &h->inter_gb;
7772
7773             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7774                && s->context_initialized
7775                && s->hurry_up < 5
7776                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7777                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7778                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7779                && avctx->skip_frame < AVDISCARD_ALL)
7780                 decode_slice(h);
7781             break;
7782         case NAL_SEI:
7783             init_get_bits(&s->gb, ptr, bit_length);
7784             decode_sei(h);
7785             break;
7786         case NAL_SPS:
7787             init_get_bits(&s->gb, ptr, bit_length);
7788             decode_seq_parameter_set(h);
7789
7790             if(s->flags& CODEC_FLAG_LOW_DELAY)
7791                 s->low_delay=1;
7792
7793             if(avctx->has_b_frames < 2)
7794                 avctx->has_b_frames= !s->low_delay;
7795             break;
7796         case NAL_PPS:
7797             init_get_bits(&s->gb, ptr, bit_length);
7798
7799             decode_picture_parameter_set(h, bit_length);
7800
7801             break;
7802         case NAL_AUD:
7803         case NAL_END_SEQUENCE:
7804         case NAL_END_STREAM:
7805         case NAL_FILLER_DATA:
7806         case NAL_SPS_EXT:
7807         case NAL_AUXILIARY_SLICE:
7808             break;
7809         default:
7810             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7811         }
7812     }
7813
7814     return buf_index;
7815 }
7816
7817 /**
7818  * returns the number of bytes consumed for building the current frame
7819  */
7820 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7821     if(s->flags&CODEC_FLAG_TRUNCATED){
7822         pos -= s->parse_context.last_index;
7823         if(pos<0) pos=0; // FIXME remove (unneeded?)
7824
7825         return pos;
7826     }else{
7827         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7828         if(pos+10>buf_size) pos=buf_size; // oops ;)
7829
7830         return pos;
7831     }
7832 }
7833
7834 static int decode_frame(AVCodecContext *avctx,
7835                              void *data, int *data_size,
7836                              uint8_t *buf, int buf_size)
7837 {
7838     H264Context *h = avctx->priv_data;
7839     MpegEncContext *s = &h->s;
7840     AVFrame *pict = data;
7841     int buf_index;
7842
7843     s->flags= avctx->flags;
7844     s->flags2= avctx->flags2;
7845
7846    /* no supplementary picture */
7847     if (buf_size == 0) {
7848         Picture *out;
7849         int i, out_idx;
7850
7851 //FIXME factorize this with the output code below
7852         out = h->delayed_pic[0];
7853         out_idx = 0;
7854         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7855             if(h->delayed_pic[i]->poc < out->poc){
7856                 out = h->delayed_pic[i];
7857                 out_idx = i;
7858             }
7859
7860         for(i=out_idx; h->delayed_pic[i]; i++)
7861             h->delayed_pic[i] = h->delayed_pic[i+1];
7862
7863         if(out){
7864             *data_size = sizeof(AVFrame);
7865             *pict= *(AVFrame*)out;
7866         }
7867
7868         return 0;
7869     }
7870
7871     if(s->flags&CODEC_FLAG_TRUNCATED){
7872         int next= ff_h264_find_frame_end(h, buf, buf_size);
7873
7874         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7875             return buf_size;
7876 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7877     }
7878
7879     if(h->is_avc && !h->got_avcC) {
7880         int i, cnt, nalsize;
7881         unsigned char *p = avctx->extradata;
7882         if(avctx->extradata_size < 7) {
7883             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7884             return -1;
7885         }
7886         if(*p != 1) {
7887             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7888             return -1;
7889         }
7890         /* sps and pps in the avcC always have length coded with 2 bytes,
7891            so put a fake nal_length_size = 2 while parsing them */
7892         h->nal_length_size = 2;
7893         // Decode sps from avcC
7894         cnt = *(p+5) & 0x1f; // Number of sps
7895         p += 6;
7896         for (i = 0; i < cnt; i++) {
7897             nalsize = AV_RB16(p) + 2;
7898             if(decode_nal_units(h, p, nalsize) < 0) {
7899                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7900                 return -1;
7901             }
7902             p += nalsize;
7903         }
7904         // Decode pps from avcC
7905         cnt = *(p++); // Number of pps
7906         for (i = 0; i < cnt; i++) {
7907             nalsize = AV_RB16(p) + 2;
7908             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7909                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7910                 return -1;
7911             }
7912             p += nalsize;
7913         }
7914         // Now store right nal length size, that will be use to parse all other nals
7915         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7916         // Do not reparse avcC
7917         h->got_avcC = 1;
7918     }
7919
7920     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7921         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7922             return -1;
7923     }
7924
7925     buf_index=decode_nal_units(h, buf, buf_size);
7926     if(buf_index < 0)
7927         return -1;
7928
7929     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7930         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7931         return -1;
7932     }
7933
7934     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7935         Picture *out = s->current_picture_ptr;
7936         Picture *cur = s->current_picture_ptr;
7937         Picture *prev = h->delayed_output_pic;
7938         int i, pics, cross_idr, out_of_order, out_idx;
7939
7940         s->mb_y= 0;
7941
7942         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7943         s->current_picture_ptr->pict_type= s->pict_type;
7944
7945         h->prev_frame_num_offset= h->frame_num_offset;
7946         h->prev_frame_num= h->frame_num;
7947         if(s->current_picture_ptr->reference){
7948             h->prev_poc_msb= h->poc_msb;
7949             h->prev_poc_lsb= h->poc_lsb;
7950         }
7951         if(s->current_picture_ptr->reference)
7952             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7953
7954         ff_er_frame_end(s);
7955
7956         MPV_frame_end(s);
7957
7958     //FIXME do something with unavailable reference frames
7959
7960 #if 0 //decode order
7961         *data_size = sizeof(AVFrame);
7962 #else
7963         /* Sort B-frames into display order */
7964
7965         if(h->sps.bitstream_restriction_flag
7966            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7967             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7968             s->low_delay = 0;
7969         }
7970
7971         pics = 0;
7972         while(h->delayed_pic[pics]) pics++;
7973
7974         assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7975
7976         h->delayed_pic[pics++] = cur;
7977         if(cur->reference == 0)
7978             cur->reference = 1;
7979
7980         cross_idr = 0;
7981         for(i=0; h->delayed_pic[i]; i++)
7982             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7983                 cross_idr = 1;
7984
7985         out = h->delayed_pic[0];
7986         out_idx = 0;
7987         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7988             if(h->delayed_pic[i]->poc < out->poc){
7989                 out = h->delayed_pic[i];
7990                 out_idx = i;
7991             }
7992
7993         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7994         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7995             { }
7996         else if(prev && pics <= s->avctx->has_b_frames)
7997             out = prev;
7998         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7999            || (s->low_delay &&
8000             ((!cross_idr && prev && out->poc > prev->poc + 2)
8001              || cur->pict_type == B_TYPE)))
8002         {
8003             s->low_delay = 0;
8004             s->avctx->has_b_frames++;
8005             out = prev;
8006         }
8007         else if(out_of_order)
8008             out = prev;
8009
8010         if(out_of_order || pics > s->avctx->has_b_frames){
8011             for(i=out_idx; h->delayed_pic[i]; i++)
8012                 h->delayed_pic[i] = h->delayed_pic[i+1];
8013         }
8014
8015         if(prev == out)
8016             *data_size = 0;
8017         else
8018             *data_size = sizeof(AVFrame);
8019         if(prev && prev != out && prev->reference == 1)
8020             prev->reference = 0;
8021         h->delayed_output_pic = out;
8022 #endif
8023
8024         if(out)
8025             *pict= *(AVFrame*)out;
8026         else
8027             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8028     }
8029
8030     assert(pict->data[0] || !*data_size);
8031     ff_print_debug_info(s, pict);
8032 //printf("out %d\n", (int)pict->data[0]);
8033 #if 0 //?
8034
8035     /* Return the Picture timestamp as the frame number */
8036     /* we subtract 1 because it is added in utils.c     */
8037     avctx->frame_number = s->picture_number - 1;
8038 #endif
8039     return get_consumed_bytes(s, buf_index, buf_size);
8040 }
8041 #if 0
8042 static inline void fill_mb_avail(H264Context *h){
8043     MpegEncContext * const s = &h->s;
8044     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
8045
8046     if(s->mb_y){
8047         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
8048         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
8049         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
8050     }else{
8051         h->mb_avail[0]=
8052         h->mb_avail[1]=
8053         h->mb_avail[2]= 0;
8054     }
8055     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
8056     h->mb_avail[4]= 1; //FIXME move out
8057     h->mb_avail[5]= 0; //FIXME move out
8058 }
8059 #endif
8060
8061 #if 0 //selftest
8062 #define COUNT 8000
8063 #define SIZE (COUNT*40)
8064 int main(){
8065     int i;
8066     uint8_t temp[SIZE];
8067     PutBitContext pb;
8068     GetBitContext gb;
8069 //    int int_temp[10000];
8070     DSPContext dsp;
8071     AVCodecContext avctx;
8072
8073     dsputil_init(&dsp, &avctx);
8074
8075     init_put_bits(&pb, temp, SIZE);
8076     printf("testing unsigned exp golomb\n");
8077     for(i=0; i<COUNT; i++){
8078         START_TIMER
8079         set_ue_golomb(&pb, i);
8080         STOP_TIMER("set_ue_golomb");
8081     }
8082     flush_put_bits(&pb);
8083
8084     init_get_bits(&gb, temp, 8*SIZE);
8085     for(i=0; i<COUNT; i++){
8086         int j, s;
8087
8088         s= show_bits(&gb, 24);
8089
8090         START_TIMER
8091         j= get_ue_golomb(&gb);
8092         if(j != i){
8093             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8094 //            return -1;
8095         }
8096         STOP_TIMER("get_ue_golomb");
8097     }
8098
8099
8100     init_put_bits(&pb, temp, SIZE);
8101     printf("testing signed exp golomb\n");
8102     for(i=0; i<COUNT; i++){
8103         START_TIMER
8104         set_se_golomb(&pb, i - COUNT/2);
8105         STOP_TIMER("set_se_golomb");
8106     }
8107     flush_put_bits(&pb);
8108
8109     init_get_bits(&gb, temp, 8*SIZE);
8110     for(i=0; i<COUNT; i++){
8111         int j, s;
8112
8113         s= show_bits(&gb, 24);
8114
8115         START_TIMER
8116         j= get_se_golomb(&gb);
8117         if(j != i - COUNT/2){
8118             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
8119 //            return -1;
8120         }
8121         STOP_TIMER("get_se_golomb");
8122     }
8123
8124     printf("testing 4x4 (I)DCT\n");
8125
8126     DCTELEM block[16];
8127     uint8_t src[16], ref[16];
8128     uint64_t error= 0, max_error=0;
8129
8130     for(i=0; i<COUNT; i++){
8131         int j;
8132 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8133         for(j=0; j<16; j++){
8134             ref[j]= random()%255;
8135             src[j]= random()%255;
8136         }
8137
8138         h264_diff_dct_c(block, src, ref, 4);
8139
8140         //normalize
8141         for(j=0; j<16; j++){
8142 //            printf("%d ", block[j]);
8143             block[j]= block[j]*4;
8144             if(j&1) block[j]= (block[j]*4 + 2)/5;
8145             if(j&4) block[j]= (block[j]*4 + 2)/5;
8146         }
8147 //        printf("\n");
8148
8149         s->dsp.h264_idct_add(ref, block, 4);
8150 /*        for(j=0; j<16; j++){
8151             printf("%d ", ref[j]);
8152         }
8153         printf("\n");*/
8154
8155         for(j=0; j<16; j++){
8156             int diff= FFABS(src[j] - ref[j]);
8157
8158             error+= diff*diff;
8159             max_error= FFMAX(max_error, diff);
8160         }
8161     }
8162     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8163 #if 0
8164     printf("testing quantizer\n");
8165     for(qp=0; qp<52; qp++){
8166         for(i=0; i<16; i++)
8167             src1_block[i]= src2_block[i]= random()%255;
8168
8169     }
8170 #endif
8171     printf("Testing NAL layer\n");
8172
8173     uint8_t bitstream[COUNT];
8174     uint8_t nal[COUNT*2];
8175     H264Context h;
8176     memset(&h, 0, sizeof(H264Context));
8177
8178     for(i=0; i<COUNT; i++){
8179         int zeros= i;
8180         int nal_length;
8181         int consumed;
8182         int out_length;
8183         uint8_t *out;
8184         int j;
8185
8186         for(j=0; j<COUNT; j++){
8187             bitstream[j]= (random() % 255) + 1;
8188         }
8189
8190         for(j=0; j<zeros; j++){
8191             int pos= random() % COUNT;
8192             while(bitstream[pos] == 0){
8193                 pos++;
8194                 pos %= COUNT;
8195             }
8196             bitstream[pos]=0;
8197         }
8198
8199         START_TIMER
8200
8201         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8202         if(nal_length<0){
8203             printf("encoding failed\n");
8204             return -1;
8205         }
8206
8207         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8208
8209         STOP_TIMER("NAL")
8210
8211         if(out_length != COUNT){
8212             printf("incorrect length %d %d\n", out_length, COUNT);
8213             return -1;
8214         }
8215
8216         if(consumed != nal_length){
8217             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8218             return -1;
8219         }
8220
8221         if(memcmp(bitstream, out, COUNT)){
8222             printf("mismatch\n");
8223             return -1;
8224         }
8225     }
8226
8227     printf("Testing RBSP\n");
8228
8229
8230     return 0;
8231 }
8232 #endif
8233
8234
8235 static int decode_end(AVCodecContext *avctx)
8236 {
8237     H264Context *h = avctx->priv_data;
8238     MpegEncContext *s = &h->s;
8239
8240     av_freep(&h->rbsp_buffer);
8241     free_tables(h); //FIXME cleanup init stuff perhaps
8242     MPV_common_end(s);
8243
8244 //    memset(h, 0, sizeof(H264Context));
8245
8246     return 0;
8247 }
8248
8249
8250 AVCodec h264_decoder = {
8251     "h264",
8252     CODEC_TYPE_VIDEO,
8253     CODEC_ID_H264,
8254     sizeof(H264Context),
8255     decode_init,
8256     NULL,
8257     decode_end,
8258     decode_frame,
8259     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8260     .flush= flush_dpb,
8261 };
8262
8263 #include "svq3.c"