git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38
  39 //#undef NDEBUG
  40 #include <assert.h>
  41
  42 /**
  43  * Value of Picture.reference when Picture is not a reference picture, but
  44  * is held for delayed output.
  45  */
  46 #define DELAYED_PIC_REF 4
   47
      /*
       * File-scope VLC tables. They are only declared here; the code that
       * builds and reads them is outside this part of the file.
       * NOTE(review): going by the names these look like the coefficient-token,
       * total-zeros and run-before tables of the CAVLC residual decoder -
       * confirm against the table initialisation code.
       */
   48 static VLC coeff_token_vlc[4];
   49 static VLC chroma_dc_coeff_token_vlc;
   50
   51 static VLC total_zeros_vlc[15];
   52 static VLC chroma_dc_total_zeros_vlc[3];
   53
   54 static VLC run_vlc[6];
   55 static VLC run7_vlc;
   56
      /* Prototypes of static functions whose definitions are not in this part of the file. */
   57 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
   58 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
   59 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   60 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  61
  62 static av_always_inline uint32_t pack16to32(int a, int b){
  63 #ifdef WORDS_BIGENDIAN
  64    return (b&0xFFFF) + (a<<16);
  65 #else
  66    return (a&0xFFFF) + (b<<16);
  67 #endif
  68 }
   69
      /**
       * Remainder table: ff_rem6[i] == i % 6 for i = 0..51.
       * NOTE(review): presumably indexed by a quantiser parameter - confirm
       * against the users of this table.
       */
   70 const uint8_t ff_rem6[52]={
   71 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   72 };
   73
      /**
       * Quotient table: ff_div6[i] == i / 6 (integer division) for i = 0..51.
       * NOTE(review): presumably indexed by a quantiser parameter - confirm
       * against the users of this table.
       */
   74 const uint8_t ff_div6[52]={
   75 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
   76 };
  77
   78
      /**
       * Loads the neighbour information needed for the current macroblock
       * (s->mb_x, s->mb_y) into the per-macroblock caches of H264Context.
       *
       * The function first works out the addresses of the top, top-left,
       * top-right and left neighbours (adjusting them for MBAFF macroblock
       * pairs), stores them in h->top_mb_xy / h->left_mb_xy[], and then copies
       * from the picture-wide tables into the caches: intra sample availability
       * masks, intra4x4 prediction modes, non-zero coefficient counts, CABAC
       * coded block patterns, motion vectors, reference indices, motion vector
       * differences and direct flags.
       *
       * @param h           decoder context; its caches are written in place
       * @param mb_type     type flags of the current macroblock
       * @param for_deblock non-zero when the caches are filled for the
       *                    deblocking filter. In that mode neighbours are taken
       *                    from any macroblock whose slice_table entry is below
       *                    255 (not only from the current slice) and the
       *                    top-left / top-right neighbours are treated as absent.
       */
   79 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
   80     MpegEncContext * const s = &h->s;
   81     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
   82     int topleft_xy, top_xy, topright_xy, left_xy[2];
   83     int topleft_type, top_type, topright_type, left_type[2];
   84     int left_block[8];
   85     int i;
   86
   87     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
   88
          /* When deblocking a non-MBAFF picture nothing is (re)loaded if slice_num
           * is 1 or the top macroblock has the same slice_table value as the
           * current one. */
   89     //FIXME deblocking could skip the intra and nnz parts.
   90     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
   91         return;
   92
   93     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
   94
          /* Default neighbour addresses. left_block[] selects which entries of the
           * left neighbour(s) are read further down: [0..3] are used for the
           * intra4x4 modes, the nnz values in cache column 3, the left cbp bits
           * and the motion rows; [4..7] for the nnz values copied into cache
           * column 0. */
   95     topleft_xy = top_xy - 1;
   96     topright_xy= top_xy + 1;
   97     left_xy[1] = left_xy[0] = mb_xy-1;
   98     left_block[0]= 0;
   99     left_block[1]= 1;
  100     left_block[2]= 2;
  101     left_block[3]= 3;
  102     left_block[4]= 7;
  103     left_block[5]= 10;
  104     left_block[6]= 8;
  105     left_block[7]= 11;
          /* MBAFF: neighbours are macroblock pairs, so the addresses depend on
           * whether the current macroblock and the neighbouring pairs are
           * interlaced (field) or not (frame). */
  106     if(FRAME_MBAFF){
  107         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
  108         const int top_pair_xy      = pair_xy     - s->mb_stride;
  109         const int topleft_pair_xy  = top_pair_xy - 1;
  110         const int topright_pair_xy = top_pair_xy + 1;
  111         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
  112         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
  113         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
  114         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
  115         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
  116         const int bottom = (s->mb_y & 1);
  117         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
  118         if (bottom
  119                 ? !curr_mb_frame_flag // bottom macroblock
  120                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
  121                 ) {
  122             top_xy -= s->mb_stride;
  123         }
  124         if (bottom
  125                 ? !curr_mb_frame_flag // bottom macroblock
  126                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
  127                 ) {
  128             topleft_xy -= s->mb_stride;
  129         }
  130         if (bottom
  131                 ? !curr_mb_frame_flag // bottom macroblock
  132                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
  133                 ) {
  134             topright_xy -= s->mb_stride;
  135         }
              /* The left pair is coded in the other frame/field mode than the
               * current macroblock: start from the top macroblock of the left
               * pair and remap left_block[]. */
  136         if (left_mb_frame_flag != curr_mb_frame_flag) {
  137             left_xy[1] = left_xy[0] = pair_xy - 1;
  138             if (curr_mb_frame_flag) {
  139                 if (bottom) {
  140                     left_block[0]= 2;
  141                     left_block[1]= 2;
  142                     left_block[2]= 3;
  143                     left_block[3]= 3;
  144                     left_block[4]= 8;
  145                     left_block[5]= 11;
  146                     left_block[6]= 8;
  147                     left_block[7]= 11;
  148                 } else {
  149                     left_block[0]= 0;
  150                     left_block[1]= 0;
  151                     left_block[2]= 1;
  152                     left_block[3]= 1;
  153                     left_block[4]= 7;
  154                     left_block[5]= 10;
  155                     left_block[6]= 7;
  156                     left_block[7]= 10;
  157                 }
  158             } else {
                      /* second left neighbour is the bottom macroblock of the left pair */
  159                 left_xy[1] += s->mb_stride;
  160                 //left_block[0]= 0;
  161                 left_block[1]= 2;
  162                 left_block[2]= 0;
  163                 left_block[3]= 2;
  164                 //left_block[4]= 7;
  165                 left_block[5]= 10;
  166                 left_block[6]= 7;
  167                 left_block[7]= 10;
  168             }
  169         }
  170     }
  171
          /* Publish the resolved neighbour addresses in the context. */
  172     h->top_mb_xy = top_xy;
  173     h->left_mb_xy[0] = left_xy[0];
  174     h->left_mb_xy[1] = left_xy[1];
          /* Neighbour types: 0 means "treat as absent". */
  175     if(for_deblock){
  176         topleft_type = 0;
  177         topright_type = 0;
  178         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
  179         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
  180         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
  181
              /* MBAFF, non-intra macroblock: rebuild the luma nnz cache as 0/1
               * flags from the 16-bit mask kept at non_zero_count[mb_xy][14] and
               * reload the mv/ref caches of the current macroblock from the
               * picture tables. */
  182         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
  183             int list;
  184             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
  185             for(i=0; i<16; i++)
  186                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
  187             for(list=0; list<h->list_count; list++){
  188                 if(USES_LIST(mb_type,list)){
  189                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
  190                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
  191                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
  192                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
  193                         dst[0] = src[0];
  194                         dst[1] = src[1];
  195                         dst[2] = src[2];
  196                         dst[3] = src[3];
  197                     }
  198                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
  199                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
  200                     ref += h->b8_stride;
  201                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
  202                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
  203                 }else{
  204                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
  205                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
  206                 }
  207             }
  208         }
  209     }else{
              /* Normal decoding: only neighbours of the current slice count. */
  210         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
  211         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
  212         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
  213         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
  214         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
  215     }
  216
          /* Intra macroblock: work out which neighbouring samples may be used for
           * prediction. A neighbour is masked out when it is not intra and either
           * absent (type 0) or constrained_intra_pred is set. */
  217     if(IS_INTRA(mb_type)){
  218         h->topleft_samples_available=
  219         h->top_samples_available=
  220         h->left_samples_available= 0xFFFF;
  221         h->topright_samples_available= 0xEEEA;
  222
  223         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
  224             h->topleft_samples_available= 0xB3FF;
  225             h->top_samples_available= 0x33FF;
  226             h->topright_samples_available= 0x26EA;
  227         }
  228         for(i=0; i<2; i++){
  229             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
  230                 h->topleft_samples_available&= 0xDF5F;
  231                 h->left_samples_available&= 0x5F5F;
  232             }
  233         }
  234
  235         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
  236             h->topleft_samples_available&= 0x7FFF;
  237
  238         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
  239             h->topright_samples_available&= 0xFBFF;
  240
              /* Intra4x4: fetch the neighbours' prediction modes. For a neighbour
               * that is not intra4x4 the cache gets -1 when it is absent (or inter
               * with constrained_intra_pred) and 2 otherwise. */
  241         if(IS_INTRA4x4(mb_type)){
  242             if(IS_INTRA4x4(top_type)){
  243                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
  244                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
  245                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
  246                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
  247             }else{
  248                 int pred;
  249                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
  250                     pred= -1;
  251                 else{
  252                     pred= 2;
  253                 }
  254                 h->intra4x4_pred_mode_cache[4+8*0]=
  255                 h->intra4x4_pred_mode_cache[5+8*0]=
  256                 h->intra4x4_pred_mode_cache[6+8*0]=
  257                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
  258             }
  259             for(i=0; i<2; i++){
  260                 if(IS_INTRA4x4(left_type[i])){
  261                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
  262                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
  263                 }else{
  264                     int pred;
  265                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
  266                         pred= -1;
  267                     else{
  268                         pred= 2;
  269                     }
  270                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
  271                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
  272                 }
  273             }
  274         }
  275     }
  276
  277
  278 /*
  279 0 . T T. T T T T
  280 1 L . .L . . . .
  281 2 L . .L . . . .
  282 3 . T TL . . . .
  283 4 L . .L . . . .
  284 5 L . .. . . . .
  285 */
          /* Non-zero coefficient counts of the top neighbour. When it is absent the
           * entries default to 0 (CABAC and current macroblock not intra) or to 64. */
  286 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
  287     if(top_type){
  288         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
  289         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
  290         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
  291         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
  292
  293         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
  294         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
  295
  296         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
  297         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
  298
  299     }else{
  300         h->non_zero_count_cache[4+8*0]=
  301         h->non_zero_count_cache[5+8*0]=
  302         h->non_zero_count_cache[6+8*0]=
  303         h->non_zero_count_cache[7+8*0]=
  304
  305         h->non_zero_count_cache[1+8*0]=
  306         h->non_zero_count_cache[2+8*0]=
  307
  308         h->non_zero_count_cache[1+8*3]=
  309         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  310
  311     }
  312
          /* Same for the two left neighbours, using the entries chosen by left_block[]. */
  313     for (i=0; i<2; i++) {
  314         if(left_type[i]){
  315             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
  316             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
  317             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
  318             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
  319         }else{
  320             h->non_zero_count_cache[3+8*1 + 2*8*i]=
  321             h->non_zero_count_cache[3+8*2 + 2*8*i]=
  322             h->non_zero_count_cache[0+8*1 +   8*i]=
  323             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  324         }
  325     }
  326
          /* CABAC only: coded block patterns of the top and left neighbours. An
           * absent neighbour gives 0x1C0 when the current macroblock is intra and
           * 0 otherwise. */
  327     if( h->pps.cabac ) {
  328         // top_cbp
  329         if(top_type) {
  330             h->top_cbp = h->cbp_table[top_xy];
  331         } else if(IS_INTRA(mb_type)) {
  332             h->top_cbp = 0x1C0;
  333         } else {
  334             h->top_cbp = 0;
  335         }
  336         // left_cbp
  337         if (left_type[0]) {
  338             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
  339         } else if(IS_INTRA(mb_type)) {
  340             h->left_cbp = 0x1C0;
  341         } else {
  342             h->left_cbp = 0;
  343         }
  344         if (left_type[0]) {
  345             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
  346         }
  347         if (left_type[1]) {
  348             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
  349         }
  350     }
  351
  352 #if 1
          /* Inter or direct macroblock: load the neighbours' motion vectors and
           * reference indices for each reference list. A list the macroblock does
           * not use is skipped unless the macroblock is direct or
           * h->deblocking_filter is set. */
  353     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
  354         int list;
  355         for(list=0; list<h->list_count; list++){
  356             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
  357                 /*if(!h->mv_cache_clean[list]){
  358                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
  359                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
  360                     h->mv_cache_clean[list]= 1;
  361                 }*/
  362                 continue;
  363             }
  364             h->mv_cache_clean[list]= 0;
  365
                  /* top neighbour: its bottom row of vectors and references */
  366             if(USES_LIST(top_type, list)){
  367                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  368                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
  369                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
  370                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
  371                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
  372                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
  373                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
  374                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
  375                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
  376                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
  377             }else{
  378                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
  379                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
  380                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
  381                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
  382                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
  383             }
  384
                  /* left neighbours: their right column, rows chosen by left_block[] */
  385             for(i=0; i<2; i++){
  386                 int cache_idx = scan8[0] - 1 + i*2*8;
  387                 if(USES_LIST(left_type[i], list)){
  388                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
  389                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
  390                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
  391                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
  392                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
  393                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
  394                 }else{
  395                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
  396                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
  397                     h->ref_cache[list][cache_idx  ]=
  398                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  399                 }
  400             }
  401
                  /* Outside MBAFF, the deblocking pass and direct macroblocks
                   * without direct_spatial_mv_pred stop here for this list. */
  402             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
  403                 continue;
  404
  405             if(USES_LIST(topleft_type, list)){
  406                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
  407                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
  408                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  409                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  410             }else{
  411                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
  412                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  413             }
  414
  415             if(USES_LIST(topright_type, list)){
  416                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
  417                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
  418                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  419                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  420             }else{
  421                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
  422                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  423             }
  424
                  /* Outside MBAFF, skip and direct macroblocks need nothing further. */
  425             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
  426                 continue;
  427
  428             h->ref_cache[list][scan8[5 ]+1] =
  429             h->ref_cache[list][scan8[7 ]+1] =
  430             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
  431             h->ref_cache[list][scan8[4 ]] =
  432             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
  433             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
  434             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
  435             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  436             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
  437             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
  438
  439             if( h->pps.cabac ) {
  440                 /* XXX beurk, Load mvd */
  441                 if(USES_LIST(top_type, list)){
  442                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  443                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
  444                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
  445                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
  446                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
  447                 }else{
  448                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
  449                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
  450                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
  451                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
  452                 }
  453                 if(USES_LIST(left_type[0], list)){
  454                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
  455                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
  456                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
  457                 }else{
  458                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
  459                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
  460                 }
  461                 if(USES_LIST(left_type[1], list)){
  462                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
  463                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
  464                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
  465                 }else{
  466                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
  467                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
  468                 }
  469                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
  470                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
  471                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  472                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
  473                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
  474
                      /* B slices: clear the direct flags of the current macroblock
                       * and load those of the top and left neighbours. */
  475                 if(h->slice_type == B_TYPE){
  476                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
  477
  478                     if(IS_DIRECT(top_type)){
  479                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
  480                     }else if(IS_8X8(top_type)){
  481                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
  482                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
  483                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
  484                     }else{
  485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
  486                     }
  487
  488                     if(IS_DIRECT(left_type[0]))
  489                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
  490                     else if(IS_8X8(left_type[0]))
  491                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
  492                     else
  493                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
  494
  495                     if(IS_DIRECT(left_type[1]))
  496                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
  497                     else if(IS_8X8(left_type[1]))
  498                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
  499                     else
  500                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
  501                 }
  502             }
  503
                  /* MBAFF: for every cached neighbour entry whose interlaced flag
                   * differs from the current macroblock's (MB_FIELD) and whose
                   * reference is valid (>= 0), rescale the reference index and the
                   * vertical mv / mvd component: ref doubled and vertical halved
                   * when the current macroblock is a field macroblock, the
                   * opposite otherwise. */
  504             if(FRAME_MBAFF){
  505 #define MAP_MVS\
  506                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
  507                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
  508                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
  509                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
  510                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
  511                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
  512                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
  513                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
  514                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
  515                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
  516                 if(MB_FIELD){
  517 #define MAP_F2F(idx, mb_type)\
  518                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  519                         h->ref_cache[list][idx] <<= 1;\
  520                         h->mv_cache[list][idx][1] /= 2;\
  521                         h->mvd_cache[list][idx][1] /= 2;\
  522                     }
  523                     MAP_MVS
  524 #undef MAP_F2F
  525                 }else{
  526 #define MAP_F2F(idx, mb_type)\
  527                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  528                         h->ref_cache[list][idx] >>= 1;\
  529                         h->mv_cache[list][idx][1] <<= 1;\
  530                         h->mvd_cache[list][idx][1] <<= 1;\
  531                     }
  532                     MAP_MVS
  533 #undef MAP_F2F
  534                 }
  535             }
  536         }
  537     }
  538 #endif
  539
          /* Counts how many of the top and first left neighbours are flagged IS_8x8DCT (0..2). */
  540     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
  541 }
 542
 543 static inline void write_back_intra_pred_mode(H264Context *h){
 544     MpegEncContext * const s = &h->s;
 545     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 546
 547     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 548     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 549     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 550     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 551     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 552     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 553     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 554 }
 555
 556 /**
 557  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 558  */
 559 static inline int check_intra4x4_pred_mode(H264Context *h){
 560     MpegEncContext * const s = &h->s;
 561     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 562     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 563     int i;
 564
 565     if(!(h->top_samples_available&0x8000)){
 566         for(i=0; i<4; i++){
 567             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 568             if(status<0){
 569                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 570                 return -1;
 571             } else if(status){
 572                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 573             }
 574         }
 575     }
 576
 577     if(!(h->left_samples_available&0x8000)){
 578         for(i=0; i<4; i++){
 579             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 580             if(status<0){
 581                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 582                 return -1;
 583             } else if(status){
 584                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 585             }
 586         }
 587     }
 588
 589     return 0;
 590 } //FIXME cleanup like next
 591
 592 /**
 593  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 594  */
 595 static inline int check_intra_pred_mode(H264Context *h, int mode){
 596     MpegEncContext * const s = &h->s;
 597     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 598     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 599
 600     if(mode > 6U) {
 601         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 602         return -1;
 603     }
 604
 605     if(!(h->top_samples_available&0x8000)){
 606         mode= top[ mode ];
 607         if(mode<0){
 608             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 609             return -1;
 610         }
 611     }
 612
 613     if(!(h->left_samples_available&0x8000)){
 614         mode= left[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     return mode;
 622 }
 623
 624 /**
 625  * gets the predicted intra4x4 prediction mode.
 626  */
 627 static inline int pred_intra_mode(H264Context *h, int n){
 628     const int index8= scan8[n];
 629     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 630     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 631     const int min= FFMIN(left, top);
 632
 633     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 634
 635     if(min<0) return DC_PRED;
 636     else      return min;
 637 }
 638
 639 static inline void write_back_non_zero_count(H264Context *h){
 640     MpegEncContext * const s = &h->s;
 641     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 642
 643     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 644     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 645     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 646     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 647     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 648     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 649     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 650
 651     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 652     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 653     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 654
 655     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 656     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 657     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 658
 659     if(FRAME_MBAFF){
 660         // store all luma nnzs, for deblocking
 661         int v = 0, i;
 662         for(i=0; i<16; i++)
 663             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 664         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 665     }
 666 }
 667
 668 /**
 669  * gets the predicted number of non zero coefficients.
 670  * @param n block index
 671  */
 672 static inline int pred_non_zero_count(H264Context *h, int n){
 673     const int index8= scan8[n];
 674     const int left= h->non_zero_count_cache[index8 - 1];
 675     const int top = h->non_zero_count_cache[index8 - 8];
 676     int i= left + top;
 677
 678     if(i<64) i= (i+1)>>1;
 679
 680     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 681
 682     return i&31;
 683 }
 684
     /**
      * Fetches the reference index and motion vector of the diagonal neighbour
      * ("C") of a partition.
      * Normally this is the cache entry at i - 8 + part_width (top right); when
      * that entry is PART_NOT_AVAILABLE the entry at i - 8 - 1 (top left) is used
      * instead. With FRAME_MBAFF some neighbour combinations are read straight
      * from the current picture; in those cases the adjusted mv is written to the
      * scratch slot mv_cache[list][scan8[0]-2] and the function returns from
      * inside SET_DIAG_MV.
      * @param C receives a pointer to the selected motion vector
      * @param i cache index of the partition (callers pass scan8[] values)
      * @param part_width width of the partition in cache entries
      * @return reference index belonging to *C
      */
 685 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 686     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 687     MpegEncContext *s = &h->s;
 688
 689     /* there is no consistent mapping of mvs to neighboring locations that will
 690      * make mbaff happy, so we can't move all this logic to fill_caches */
 691     if(FRAME_MBAFF){
 692         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 693         const int16_t *mv;
             /* clear the scratch slot and point *C at it; *C is overwritten below
              * unless one of the SET_DIAG_MV expansions returns first */
 694         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 695         *C = h->mv_cache[list][scan8[0]-2];
 696
 697         if(!MB_FIELD
 698            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 699             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 700             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV always returns: LIST_NOT_USED, or the picture's
                      * ref_index at (X4,Y4) with REF_OP applied, after storing the
                      * mv (vertical component modified by MV_OP) in the scratch slot */
 701 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 702                 const int x4 = X4, y4 = Y4;\
 703                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 704                 if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
 705                     return LIST_NOT_USED;\
 706                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 707                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 708                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 709                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 710
 711                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 712             }
 713         }
 714         if(topright_ref == PART_NOT_AVAILABLE
 715            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 716            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 717             if(!MB_FIELD
 718                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 720             }
 721             if(MB_FIELD
 722                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 723                && i >= scan8[0]+8){
 724                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 725                 SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 726             }
 727         }
 728 #undef SET_DIAG_MV
 729     }
 730
         /* regular path: take the top right entry from the caches, or the top left
          * one when the top right is not available */
 731     if(topright_ref != PART_NOT_AVAILABLE){
 732         *C= h->mv_cache[list][ i - 8 + part_width ];
 733         return topright_ref;
 734     }else{
 735         tprintf(s->avctx, "topright MV not available\n");
 736
 737         *C= h->mv_cache[list][ i - 8 - 1 ];
 738         return h->ref_cache[list][ i - 8 - 1 ];
 739     }
 740 }
 741
 742 /**
 743  * gets the predicted MV.
 744  * @param n the block index
 745  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 746  * @param mx the x component of the predicted motion vector
 747  * @param my the y component of the predicted motion vector
 748  */
 749 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 750     const int index8= scan8[n];
 751     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 752     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 753     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 754     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 755     const int16_t * C;
 756     int diagonal_ref, match_count;
 757
 758     assert(part_width==1 || part_width==2 || part_width==4);
 759
 760 /* mv_cache
 761   B . . A T T T T
 762   U . . L . . , .
 763   U . . L . . . .
 764   U . . L . . , .
 765   . . . L . . . .
 766 */
 767
 768     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 769     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 770     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 771     if(match_count > 1){ //most common
 772         *mx= mid_pred(A[0], B[0], C[0]);
 773         *my= mid_pred(A[1], B[1], C[1]);
 774     }else if(match_count==1){
 775         if(left_ref==ref){
 776             *mx= A[0];
 777             *my= A[1];
 778         }else if(top_ref==ref){
 779             *mx= B[0];
 780             *my= B[1];
 781         }else{
 782             *mx= C[0];
 783             *my= C[1];
 784         }
 785     }else{
 786         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 787             *mx= A[0];
 788             *my= A[1];
 789         }else{
 790             *mx= mid_pred(A[0], B[0], C[0]);
 791             *my= mid_pred(A[1], B[1], C[1]);
 792         }
 793     }
 794
 795     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 796 }
 797
 798 /**
 799  * gets the directionally predicted 16x8 MV.
 800  * @param n the block index
 801  * @param mx the x component of the predicted motion vector
 802  * @param my the y component of the predicted motion vector
 803  */
 804 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 805     if(n==0){
 806         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 807         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 808
 809         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 810
 811         if(top_ref == ref){
 812             *mx= B[0];
 813             *my= B[1];
 814             return;
 815         }
 816     }else{
 817         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 818         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 819
 820         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 821
 822         if(left_ref == ref){
 823             *mx= A[0];
 824             *my= A[1];
 825             return;
 826         }
 827     }
 828
 829     //RARE
 830     pred_motion(h, n, 4, list, ref, mx, my);
 831 }
 832
 833 /**
 834  * gets the directionally predicted 8x16 MV.
 835  * @param n the block index
 836  * @param mx the x component of the predicted motion vector
 837  * @param my the y component of the predicted motion vector
 838  */
 839 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 840     if(n==0){
 841         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 842         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 843
 844         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 845
 846         if(left_ref == ref){
 847             *mx= A[0];
 848             *my= A[1];
 849             return;
 850         }
 851     }else{
 852         const int16_t * C;
 853         int diagonal_ref;
 854
 855         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 856
 857         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 858
 859         if(diagonal_ref == ref){
 860             *mx= C[0];
 861             *my= C[1];
 862             return;
 863         }
 864     }
 865
 866     //RARE
 867     pred_motion(h, n, 2, list, ref, mx, my);
 868 }
 869
 870 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 871     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 872     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 873
 874     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 875
 876     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 877        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 878        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 879
 880         *mx = *my = 0;
 881         return;
 882     }
 883
 884     pred_motion(h, 0, 4, 0, 0, mx, my);
 885
 886     return;
 887 }
 888
 889 static inline void direct_dist_scale_factor(H264Context * const h){
 890     const int poc = h->s.current_picture_ptr->poc;
 891     const int poc1 = h->ref_list[1][0].poc;
 892     int i;
 893     for(i=0; i<h->ref_count[0]; i++){
 894         int poc0 = h->ref_list[0][i].poc;
 895         int td = av_clip(poc1 - poc0, -128, 127);
 896         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 897             h->dist_scale_factor[i] = 256;
 898         }else{
 899             int tb = av_clip(poc - poc0, -128, 127);
 900             int tx = (16384 + (FFABS(td) >> 1)) / td;
 901             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 902         }
 903     }
 904     if(FRAME_MBAFF){
 905         for(i=0; i<h->ref_count[0]; i++){
 906             h->dist_scale_factor_field[2*i] =
 907             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 908         }
 909     }
 910 }
 911 static inline void direct_ref_list_init(H264Context * const h){
 912     MpegEncContext * const s = &h->s;
 913     Picture * const ref1 = &h->ref_list[1][0];
 914     Picture * const cur = s->current_picture_ptr;
 915     int list, i, j;
 916     if(cur->pict_type == I_TYPE)
 917         cur->ref_count[0] = 0;
 918     if(cur->pict_type != B_TYPE)
 919         cur->ref_count[1] = 0;
 920     for(list=0; list<2; list++){
 921         cur->ref_count[list] = h->ref_count[list];
 922         for(j=0; j<h->ref_count[list]; j++)
 923             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 924     }
 925     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
 926         return;
 927     for(list=0; list<2; list++){
 928         for(i=0; i<ref1->ref_count[list]; i++){
 929             const int poc = ref1->ref_poc[list][i];
 930             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 931             for(j=0; j<h->ref_count[list]; j++)
 932                 if(h->ref_list[list][j].poc == poc){
 933                     h->map_col_to_list0[list][i] = j;
 934                     break;
 935                 }
 936         }
 937     }
 938     if(FRAME_MBAFF){
 939         for(list=0; list<2; list++){
 940             for(i=0; i<ref1->ref_count[list]; i++){
 941                 j = h->map_col_to_list0[list][i];
 942                 h->map_col_to_list0_field[list][2*i] = 2*j;
 943                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 944             }
 945         }
 946     }
 947 }
 948
     /**
      * Derives reference indices and motion vectors of a direct predicted
      * macroblock (or of its direct 8x8 sub blocks) and stores them in
      * h->ref_cache / h->mv_cache for both lists.
      * Depending on h->direct_spatial_mv_pred the values come from the spatial
      * neighbours in the caches or are scaled from the co-located macroblock of
      * the first list 1 reference picture (h->ref_list[1][0]).
      * @param mb_type in/out macroblock type: on entry IS_8X8(*mb_type) selects
      *                whether only sub blocks flagged IS_DIRECT in h->sub_mb_type[]
      *                are processed; on return it holds the partitioning and
      *                list usage chosen here. h->sub_mb_type[] is updated for
      *                every processed 8x8 block.
      */
 949 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 950     MpegEncContext * const s = &h->s;
 951     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
 952     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 953     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices stored in the first list 1
          * picture at the position of the current macroblock */
 954     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 955     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 956     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 957     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 958     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 959     const int is_b8x8 = IS_8X8(*mb_type);
 960     unsigned int sub_mb_type;
 961     int i8, i4;
 962
         /* NOTE: this macro is not #undef'ed at the end of the function */
 963 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
         /* choose the partitioning of the macroblock and of its sub blocks from the
          * co-located macroblock type */
 964     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 965         /* FIXME save sub mb types from previous frames (or derive from MVs)
 966          * so we know exactly what block size to use */
 967         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 968         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 969     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 970         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 971         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 972     }else{
 973         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 974         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 975     }
 976     if(!is_b8x8)
 977         *mb_type |= MB_TYPE_DIRECT2;
 978     if(MB_FIELD)
 979         *mb_type |= MB_TYPE_INTERLACED;
 980
 981     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 982
 983     if(h->direct_spatial_mv_pred){
 984         int ref[2];
 985         int mv[2][2];
 986         int list;
 987
 988         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 989
 990         /* ref = min(neighbors) */
 991         for(list=0; list<2; list++){
 992             int refa = h->ref_cache[list][scan8[0] - 1];
 993             int refb = h->ref_cache[list][scan8[0] - 8];
 994             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
 995             if(refc == -2)
 996                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
 997             ref[list] = refa;
 998             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
 999                 ref[list] = refb;
1000             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1001                 ref[list] = refc;
1002             if(ref[list] < 0)
1003                 ref[list] = -1;
1004         }
1005
             /* no usable neighbour reference in either list: use reference 0 with
              * zero vectors for both lists */
1006         if(ref[0] < 0 && ref[1] < 0){
1007             ref[0] = ref[1] = 0;
1008             mv[0][0] = mv[0][1] =
1009             mv[1][0] = mv[1][1] = 0;
1010         }else{
1011             for(list=0; list<2; list++){
1012                 if(ref[list] >= 0)
1013                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1014                 else
1015                     mv[list][0] = mv[list][1] = 0;
1016             }
1017         }
1018
             /* drop the list without a reference from the (sub) macroblock type */
1019         if(ref[1] < 0){
1020             *mb_type &= ~MB_TYPE_P0L1;
1021             sub_mb_type &= ~MB_TYPE_P0L1;
1022         }else if(ref[0] < 0){
1023             *mb_type &= ~MB_TYPE_P0L0;
1024             sub_mb_type &= ~MB_TYPE_P0L0;
1025         }
1026
1027         if(IS_16X16(*mb_type)){
                 /* a/b: packed mv for list 0/1; they stay 0 for a list whose ref is
                  * not > 0 when the co-located block is non-intra, uses reference 0
                  * and has both mv components within +-1 */
1028             int a=0, b=0;
1029
1030             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1031             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1032             if(!IS_INTRA(mb_type_col)
1033                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1034                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1035                        && (h->x264_build>33 || !h->x264_build)))){
1036                 if(ref[0] > 0)
1037                     a= pack16to32(mv[0][0],mv[0][1]);
1038                 if(ref[1] > 0)
1039                     b= pack16to32(mv[1][0],mv[1][1]);
1040             }else{
1041                 a= pack16to32(mv[0][0],mv[0][1]);
1042                 b= pack16to32(mv[1][0],mv[1][1]);
1043             }
1044             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1045             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1046         }else{
1047             for(i8=0; i8<4; i8++){
1048                 const int x8 = i8&1;
1049                 const int y8 = i8>>1;
1050
                     /* in an explicit 8x8 macroblock only the direct sub blocks are filled */
1051                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1052                     continue;
1053                 h->sub_mb_type[i8] = sub_mb_type;
1054
1055                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1056                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1057                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1058                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1059
1060                 /* col_zero_flag */
1061                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1062                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1063                                                   && (h->x264_build>33 || !h->x264_build)))){
1064                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1065                     if(IS_SUB_8X8(sub_mb_type)){
1066                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1067                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1068                             if(ref[0] == 0)
1069                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1070                             if(ref[1] == 0)
1071                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1072                         }
1073                     }else
1074                     for(i4=0; i4<4; i4++){
1075                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1076                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1077                             if(ref[0] == 0)
1078                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1079                             if(ref[1] == 0)
1080                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1081                         }
1082                     }
1083                 }
1084             }
1085         }
1086     }else{ /* direct temporal mv pred */
1087         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1088         const int *dist_scale_factor = h->dist_scale_factor;
1089
1090         if(FRAME_MBAFF){
                 /* interlaced macroblocks use the per-field mapping and scale tables */
1091             if(IS_INTERLACED(*mb_type)){
1092                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1093                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1094                 dist_scale_factor = h->dist_scale_factor_field;
1095             }
1096             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1097                 /* FIXME assumes direct_8x8_inference == 1 */
1098                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1099                 int mb_types_col[2];
1100                 int y_shift;
1101
1102                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1103                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1104                          | (*mb_type & MB_TYPE_INTERLACED);
1105                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1106
1107                 if(IS_INTERLACED(*mb_type)){
1108                     /* frame to field scaling */
1109                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1110                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1111                     if(s->mb_y&1){
1112                         l1ref0 -= 2*h->b8_stride;
1113                         l1ref1 -= 2*h->b8_stride;
1114                         l1mv0 -= 4*h->b_stride;
1115                         l1mv1 -= 4*h->b_stride;
1116                     }
1117                     y_shift = 0;
1118
1119                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1120                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1121                        && !is_b8x8)
1122                         *mb_type |= MB_TYPE_16x8;
1123                     else
1124                         *mb_type |= MB_TYPE_8x8;
1125                 }else{
1126                     /* field to frame scaling */
1127                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1128                      * but in MBAFF, top and bottom POC are equal */
1129                     int dy = (s->mb_y&1) ? 1 : 2;
1130                     mb_types_col[0] =
1131                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1132                     l1ref0 += dy*h->b8_stride;
1133                     l1ref1 += dy*h->b8_stride;
1134                     l1mv0 += 2*dy*h->b_stride;
1135                     l1mv1 += 2*dy*h->b_stride;
1136                     y_shift = 2;
1137
1138                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1139                        && !is_b8x8)
1140                         *mb_type |= MB_TYPE_16x16;
1141                     else
1142                         *mb_type |= MB_TYPE_8x8;
1143                 }
1144
1145                 for(i8=0; i8<4; i8++){
1146                     const int x8 = i8&1;
1147                     const int y8 = i8>>1;
1148                     int ref0, scale;
1149                     const int16_t (*l1mv)[2]= l1mv0;
1150
1151                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1152                         continue;
1153                     h->sub_mb_type[i8] = sub_mb_type;
1154
1155                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra co-located macroblock: reference 0 and zero vectors */
1156                     if(IS_INTRA(mb_types_col[y8])){
1157                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1158                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1159                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1160                         continue;
1161                     }
1162
                         /* take the co-located list 0 reference if there is one,
                          * else the list 1 reference together with the list 1 mvs */
1163                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1164                     if(ref0 >= 0)
1165                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1166                     else{
1167                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1168                         l1mv= l1mv1;
1169                     }
1170                     scale = dist_scale_factor[ref0];
1171                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1172
1173                     {
1174                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component: halved when y_shift is 0, doubled when it is 2 */
1175                         int my_col = (mv_col[1]<<y_shift)/2;
1176                         int mx = (scale * mv_col[0] + 128) >> 8;
1177                         int my = (scale * my_col + 128) >> 8;
1178                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1179                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1180                     }
1181                 }
1182                 return;
1183             }
1184         }
1185
1186         /* one-to-one mv scaling */
1187
1188         if(IS_16X16(*mb_type)){
1189             int ref, mv0, mv1;
1190
1191             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1192             if(IS_INTRA(mb_type_col)){
1193                 ref=mv0=mv1=0;
1194             }else{
1195                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1196                                                 : map_col_to_list0[1][l1ref1[0]];
1197                 const int scale = dist_scale_factor[ref0];
1198                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1199                 int mv_l0[2];
                     /* list 0 mv = scale * mv_col / 256 (rounded); list 1 mv = list 0 mv - mv_col */
1200                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1201                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1202                 ref= ref0;
1203                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1204                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1205             }
1206             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1207             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1208             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1209         }else{
1210             for(i8=0; i8<4; i8++){
1211                 const int x8 = i8&1;
1212                 const int y8 = i8>>1;
1213                 int ref0, scale;
1214                 const int16_t (*l1mv)[2]= l1mv0;
1215
1216                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1217                     continue;
1218                 h->sub_mb_type[i8] = sub_mb_type;
1219                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                 if(IS_INTRA(mb_type_col)){
1221                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1223                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1224                     continue;
1225                 }
1226
1227                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1228                 if(ref0 >= 0)
1229                     ref0 = map_col_to_list0[0][ref0];
1230                 else{
1231                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1232                     l1mv= l1mv1;
1233                 }
1234                 scale = dist_scale_factor[ref0];
1235
1236                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                     /* one scaled vector per 8x8 block, otherwise one per 4x4 block */
1237                 if(IS_SUB_8X8(sub_mb_type)){
1238                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1239                     int mx = (scale * mv_col[0] + 128) >> 8;
1240                     int my = (scale * mv_col[1] + 128) >> 8;
1241                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1242                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1243                 }else
1244                 for(i4=0; i4<4; i4++){
1245                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1246                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1247                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1248                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1249                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1250                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1251                 }
1252             }
1253         }
1254     }
1255 }
1256
1257 static inline void write_back_motion(H264Context *h, int mb_type){
1258     MpegEncContext * const s = &h->s;
1259     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1260     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1261     int list;
1262
1263     if(!USES_LIST(mb_type, 0))
1264         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1265
1266     for(list=0; list<h->list_count; list++){
1267         int y;
1268         if(!USES_LIST(mb_type, list))
1269             continue;
1270
1271         for(y=0; y<4; y++){
1272             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1273             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1274         }
1275         if( h->pps.cabac ) {
1276             if(IS_SKIP(mb_type))
1277                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1278             else
1279             for(y=0; y<4; y++){
1280                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1281                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1282             }
1283         }
1284
1285         {
1286             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1287             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1288             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1289             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1290             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1291         }
1292     }
1293
1294     if(h->slice_type == B_TYPE && h->pps.cabac){
1295         if(IS_8X8(mb_type)){
1296             uint8_t *direct_table = &h->direct_table[b8_xy];
1297             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1298             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1299             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1300         }
1301     }
1302 }
1303
1304 /**
1305  * Decodes a network abstraction layer unit.
1306  * @param consumed is the number of bytes used as input
1307  * @param length is the length of the array
1308  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1309  * @returns decoded bytes, might be src+1 if no escapes
1310  */
1311 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1312     int i, si, di;
1313     uint8_t *dst;
1314     int bufidx;
1315
1316 //    src[0]&0x80;                //forbidden bit
1317     h->nal_ref_idc= src[0]>>5;
1318     h->nal_unit_type= src[0]&0x1F;
1319
1320     src++; length--;
1321 #if 0
1322     for(i=0; i<length; i++)
1323         printf("%2X ", src[i]);
1324 #endif
1325     for(i=0; i+1<length; i+=2){
1326         if(src[i]) continue;
1327         if(i>0 && src[i-1]==0) i--;
1328         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1329             if(src[i+2]!=3){
1330                 /* startcode, so we must be past the end */
1331                 length=i;
1332             }
1333             break;
1334         }
1335     }
1336
1337     if(i>=length-1){ //no escaped 0
1338         *dst_length= length;
1339         *consumed= length+1; //+1 for the header
1340         return src;
1341     }
1342
1343     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1344     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1345     dst= h->rbsp_buffer[bufidx];
1346
1347     if (dst == NULL){
1348         return NULL;
1349     }
1350
1351 //printf("decoding esc\n");
1352     si=di=0;
1353     while(si<length){
1354         //remove escapes (very rare 1:2^22)
1355         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1356             if(src[si+2]==3){ //escape
1357                 dst[di++]= 0;
1358                 dst[di++]= 0;
1359                 si+=3;
1360                 continue;
1361             }else //next start code
1362                 break;
1363         }
1364
1365         dst[di++]= src[si++];
1366     }
1367
1368     *dst_length= di;
1369     *consumed= si + 1;//+1 for the header
1370 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1371     return dst;
1372 }
1373
1374 /**
1375  * identifies the exact end of the bitstream
1376  * @return the length of the trailing, or 0 if damaged
1377  */
1378 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1379     int v= *src;
1380     int r;
1381
1382     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1383
1384     for(r=1; r<9; r++){
1385         if(v&1) return r;
1386         v>>=1;
1387     }
1388     return 0;
1389 }
1390
1391 /**
1392  * idct tranforms the 16 dc values and dequantize them.
1393  * @param qp quantization parameter
1394  */
1395 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1396 #define stride 16
1397     int i;
1398     int temp[16]; //FIXME check if this is a good idea
1399     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1400     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1401
1402 //memset(block, 64, 2*256);
1403 //return;
1404     for(i=0; i<4; i++){
1405         const int offset= y_offset[i];
1406         const int z0= block[offset+stride*0] + block[offset+stride*4];
1407         const int z1= block[offset+stride*0] - block[offset+stride*4];
1408         const int z2= block[offset+stride*1] - block[offset+stride*5];
1409         const int z3= block[offset+stride*1] + block[offset+stride*5];
1410
1411         temp[4*i+0]= z0+z3;
1412         temp[4*i+1]= z1+z2;
1413         temp[4*i+2]= z1-z2;
1414         temp[4*i+3]= z0-z3;
1415     }
1416
1417     for(i=0; i<4; i++){
1418         const int offset= x_offset[i];
1419         const int z0= temp[4*0+i] + temp[4*2+i];
1420         const int z1= temp[4*0+i] - temp[4*2+i];
1421         const int z2= temp[4*1+i] - temp[4*3+i];
1422         const int z3= temp[4*1+i] + temp[4*3+i];
1423
1424         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1425         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1426         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1427         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1428     }
1429 }
1430
#if 0
/**
 * DCT transforms the 16 DC values (currently compiled out).
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: block -> temp */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04  + sum15;
        out[1]= diff04 + diff15;
        out[2]= diff04 - diff15;
        out[3]= sum04  - sum15;
    }

    /* second pass: temp -> block, halved */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1470
1471 #undef xStride
1472 #undef stride
1473
1474 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1475     const int stride= 16*2;
1476     const int xStride= 16;
1477     int a,b,c,d,e;
1478
1479     a= block[stride*0 + xStride*0];
1480     b= block[stride*0 + xStride*1];
1481     c= block[stride*1 + xStride*0];
1482     d= block[stride*1 + xStride*1];
1483
1484     e= a-b;
1485     a= a+b;
1486     b= c-d;
1487     c= c+d;
1488
1489     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1490     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1491     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1492     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1493 }
1494
#if 0
/**
 * Applies a 2x2 butterfly to the four chroma DC values stored in block at
 * offsets 0, 16, 32 and 48 (currently compiled out).
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top= block;
    DCTELEM * const bot= block + row_step;

    /* all four inputs are read before anything is written back */
    const int top_sum = top[0] + top[col_step];
    const int top_diff= top[0] - top[col_step];
    const int bot_sum = bot[0] + bot[col_step];
    const int bot_diff= bot[0] - bot[col_step];

    top[0]       = (top_sum  + bot_sum );
    top[col_step]= (top_diff + bot_diff);
    bot[0]       = (top_sum  - bot_sum );
    bot[col_step]= (top_diff - bot_diff);
}
#endif
1517
1518 /**
1519  * gets the chroma qp.
1520  */
1521 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1522     return h->pps.chroma_qp_table[t][qscale & 0xff];
1523 }
1524
1525 //FIXME need to check that this does not overflow signed 32 bit for low qp, i am not sure, it's very close
1526 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1527 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1528     int i;
1529     const int * const quant_table= quant_coeff[qscale];
1530     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1531     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1532     const unsigned int threshold2= (threshold1<<1);
1533     int last_non_zero;
1534
1535     if(separate_dc){
1536         if(qscale<=18){
1537             //avoid overflows
1538             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1539             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1540             const unsigned int dc_threshold2= (dc_threshold1<<1);
1541
1542             int level= block[0]*quant_coeff[qscale+18][0];
1543             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1544                 if(level>0){
1545                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1546                     block[0]= level;
1547                 }else{
1548                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1549                     block[0]= -level;
1550                 }
1551 //                last_non_zero = i;
1552             }else{
1553                 block[0]=0;
1554             }
1555         }else{
1556             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1557             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1558             const unsigned int dc_threshold2= (dc_threshold1<<1);
1559
1560             int level= block[0]*quant_table[0];
1561             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1562                 if(level>0){
1563                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1564                     block[0]= level;
1565                 }else{
1566                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1567                     block[0]= -level;
1568                 }
1569 //                last_non_zero = i;
1570             }else{
1571                 block[0]=0;
1572             }
1573         }
1574         last_non_zero= 0;
1575         i=1;
1576     }else{
1577         last_non_zero= -1;
1578         i=0;
1579     }
1580
1581     for(; i<16; i++){
1582         const int j= scantable[i];
1583         int level= block[j]*quant_table[j];
1584
1585 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1586 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1587         if(((unsigned)(level+threshold1))>threshold2){
1588             if(level>0){
1589                 level= (bias + level)>>QUANT_SHIFT;
1590                 block[j]= level;
1591             }else{
1592                 level= (bias - level)>>QUANT_SHIFT;
1593                 block[j]= -level;
1594             }
1595             last_non_zero = i;
1596         }else{
1597             block[j]=0;
1598         }
1599     }
1600
1601     return last_non_zero;
1602 }
1603
1604 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1605                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1606                            int src_x_offset, int src_y_offset,
1607                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1608     MpegEncContext * const s = &h->s;
1609     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1610     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1611     const int luma_xy= (mx&3) + ((my&3)<<2);
1612     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1613     uint8_t * src_cb, * src_cr;
1614     int extra_width= h->emu_edge_width;
1615     int extra_height= h->emu_edge_height;
1616     int emu=0;
1617     const int full_mx= mx>>2;
1618     const int full_my= my>>2;
1619     const int pic_width  = 16*s->mb_width;
1620     const int pic_height = 16*s->mb_height >> MB_FIELD;
1621
1622     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1623         return;
1624
1625     if(mx&7) extra_width -= 3;
1626     if(my&7) extra_height -= 3;
1627
1628     if(   full_mx < 0-extra_width
1629        || full_my < 0-extra_height
1630        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1631        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1632         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1633             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1634         emu=1;
1635     }
1636
1637     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1638     if(!square){
1639         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1640     }
1641
1642     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1643
1644     if(MB_FIELD){
1645         // chroma offset when predicting from a field of opposite parity
1646         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1647         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1648     }
1649     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1650     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1651
1652     if(emu){
1653         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1654             src_cb= s->edge_emu_buffer;
1655     }
1656     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1657
1658     if(emu){
1659         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1660             src_cr= s->edge_emu_buffer;
1661     }
1662     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1663 }
1664
1665 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1666                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1667                            int x_offset, int y_offset,
1668                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1669                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1670                            int list0, int list1){
1671     MpegEncContext * const s = &h->s;
1672     qpel_mc_func *qpix_op=  qpix_put;
1673     h264_chroma_mc_func chroma_op= chroma_put;
1674
1675     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1676     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1677     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1678     x_offset += 8*s->mb_x;
1679     y_offset += 8*(s->mb_y >> MB_FIELD);
1680
1681     if(list0){
1682         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1683         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1684                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1685                            qpix_op, chroma_op);
1686
1687         qpix_op=  qpix_avg;
1688         chroma_op= chroma_avg;
1689     }
1690
1691     if(list1){
1692         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1693         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1694                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1695                            qpix_op, chroma_op);
1696     }
1697 }
1698
1699 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1700                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1701                            int x_offset, int y_offset,
1702                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1703                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1704                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1705                            int list0, int list1){
1706     MpegEncContext * const s = &h->s;
1707
1708     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1709     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1710     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1711     x_offset += 8*s->mb_x;
1712     y_offset += 8*(s->mb_y >> MB_FIELD);
1713
1714     if(list0 && list1){
1715         /* don't optimize for luma-only case, since B-frames usually
1716          * use implicit weights => chroma too. */
1717         uint8_t *tmp_cb = s->obmc_scratchpad;
1718         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1719         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1720         int refn0 = h->ref_cache[0][ scan8[n] ];
1721         int refn1 = h->ref_cache[1][ scan8[n] ];
1722
1723         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1724                     dest_y, dest_cb, dest_cr,
1725                     x_offset, y_offset, qpix_put, chroma_put);
1726         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1727                     tmp_y, tmp_cb, tmp_cr,
1728                     x_offset, y_offset, qpix_put, chroma_put);
1729
1730         if(h->use_weight == 2){
1731             int weight0 = h->implicit_weight[refn0][refn1];
1732             int weight1 = 64 - weight0;
1733             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1734             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1735             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1736         }else{
1737             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1738                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1739                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1740             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1741                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1742                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1743             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1745                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1746         }
1747     }else{
1748         int list = list1 ? 1 : 0;
1749         int refn = h->ref_cache[list][ scan8[n] ];
1750         Picture *ref= &h->ref_list[list][refn];
1751         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1752                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1753                     qpix_put, chroma_put);
1754
1755         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1756                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1757         if(h->use_weight_chroma){
1758             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1759                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1760             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1761                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1762         }
1763     }
1764 }
1765
1766 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1767                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1768                            int x_offset, int y_offset,
1769                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1770                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1771                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1772                            int list0, int list1){
1773     if((h->use_weight==2 && list0 && list1
1774         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1775        || h->use_weight==1)
1776         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1777                          x_offset, y_offset, qpix_put, chroma_put,
1778                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1779     else
1780         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1781                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1782 }
1783
1784 static inline void prefetch_motion(H264Context *h, int list){
1785     /* fetch pixels for estimated mv 4 macroblocks ahead
1786      * optimized for 64byte cache lines */
1787     MpegEncContext * const s = &h->s;
1788     const int refn = h->ref_cache[list][scan8[0]];
1789     if(refn >= 0){
1790         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1791         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1792         uint8_t **src= h->ref_list[list][refn].data;
1793         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1794         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1795         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1796         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1797     }
1798 }
1799
1800 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1801                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1802                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1803                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1804     MpegEncContext * const s = &h->s;
1805     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1806     const int mb_type= s->current_picture.mb_type[mb_xy];
1807
1808     assert(IS_INTER(mb_type));
1809
1810     prefetch_motion(h, 0);
1811
1812     if(IS_16X16(mb_type)){
1813         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1814                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1815                 &weight_op[0], &weight_avg[0],
1816                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1817     }else if(IS_16X8(mb_type)){
1818         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1819                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1820                 &weight_op[1], &weight_avg[1],
1821                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1822         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1823                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1824                 &weight_op[1], &weight_avg[1],
1825                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1826     }else if(IS_8X16(mb_type)){
1827         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1828                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1829                 &weight_op[2], &weight_avg[2],
1830                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1831         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1832                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1833                 &weight_op[2], &weight_avg[2],
1834                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1835     }else{
1836         int i;
1837
1838         assert(IS_8X8(mb_type));
1839
1840         for(i=0; i<4; i++){
1841             const int sub_mb_type= h->sub_mb_type[i];
1842             const int n= 4*i;
1843             int x_offset= (i&1)<<2;
1844             int y_offset= (i&2)<<1;
1845
1846             if(IS_SUB_8X8(sub_mb_type)){
1847                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1848                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1849                     &weight_op[3], &weight_avg[3],
1850                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1851             }else if(IS_SUB_8X4(sub_mb_type)){
1852                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1853                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1854                     &weight_op[4], &weight_avg[4],
1855                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1856                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1857                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1858                     &weight_op[4], &weight_avg[4],
1859                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1860             }else if(IS_SUB_4X8(sub_mb_type)){
1861                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1862                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1863                     &weight_op[5], &weight_avg[5],
1864                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1865                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1866                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1867                     &weight_op[5], &weight_avg[5],
1868                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1869             }else{
1870                 int j;
1871                 assert(IS_SUB_4X4(sub_mb_type));
1872                 for(j=0; j<4; j++){
1873                     int sub_x_offset= x_offset + 2*(j&1);
1874                     int sub_y_offset= y_offset +   (j&2);
1875                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1876                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1877                         &weight_op[6], &weight_avg[6],
1878                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1879                 }
1880             }
1881         }
1882     }
1883
1884     prefetch_motion(h, 1);
1885 }
1886
1887 static void decode_init_vlc(void){
1888     static int done = 0;
1889
1890     if (!done) {
1891         int i;
1892         done = 1;
1893
1894         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1895                  &chroma_dc_coeff_token_len [0], 1, 1,
1896                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1897
1898         for(i=0; i<4; i++){
1899             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1900                      &coeff_token_len [i][0], 1, 1,
1901                      &coeff_token_bits[i][0], 1, 1, 1);
1902         }
1903
1904         for(i=0; i<3; i++){
1905             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1906                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1907                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1908         }
1909         for(i=0; i<15; i++){
1910             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1911                      &total_zeros_len [i][0], 1, 1,
1912                      &total_zeros_bits[i][0], 1, 1, 1);
1913         }
1914
1915         for(i=0; i<6; i++){
1916             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1917                      &run_len [i][0], 1, 1,
1918                      &run_bits[i][0], 1, 1, 1);
1919         }
1920         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1921                  &run_len [6][0], 1, 1,
1922                  &run_bits[6][0], 1, 1, 1);
1923     }
1924 }
1925
1926 static void free_tables(H264Context *h){
1927     int i;
1928     H264Context *hx;
1929     av_freep(&h->intra4x4_pred_mode);
1930     av_freep(&h->chroma_pred_mode_table);
1931     av_freep(&h->cbp_table);
1932     av_freep(&h->mvd_table[0]);
1933     av_freep(&h->mvd_table[1]);
1934     av_freep(&h->direct_table);
1935     av_freep(&h->non_zero_count);
1936     av_freep(&h->slice_table_base);
1937     h->slice_table= NULL;
1938
1939     av_freep(&h->mb2b_xy);
1940     av_freep(&h->mb2b8_xy);
1941
1942     for(i = 0; i < MAX_SPS_COUNT; i++)
1943         av_freep(h->sps_buffers + i);
1944
1945     for(i = 0; i < MAX_PPS_COUNT; i++)
1946         av_freep(h->pps_buffers + i);
1947
1948     for(i = 0; i < h->s.avctx->thread_count; i++) {
1949         hx = h->thread_context[i];
1950         if(!hx) continue;
1951         av_freep(&hx->top_borders[1]);
1952         av_freep(&hx->top_borders[0]);
1953         av_freep(&hx->s.obmc_scratchpad);
1954         av_freep(&hx->s.allocated_edge_emu_buffer);
1955     }
1956 }
1957
1958 static void init_dequant8_coeff_table(H264Context *h){
1959     int i,q,x;
1960     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1961     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1962     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1963
1964     for(i=0; i<2; i++ ){
1965         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1966             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1967             break;
1968         }
1969
1970         for(q=0; q<52; q++){
1971             int shift = ff_div6[q];
1972             int idx = ff_rem6[q];
1973             for(x=0; x<64; x++)
1974                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1975                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1976                     h->pps.scaling_matrix8[i][x]) << shift;
1977         }
1978     }
1979 }
1980
1981 static void init_dequant4_coeff_table(H264Context *h){
1982     int i,j,q,x;
1983     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1984     for(i=0; i<6; i++ ){
1985         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1986         for(j=0; j<i; j++){
1987             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1988                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1989                 break;
1990             }
1991         }
1992         if(j<i)
1993             continue;
1994
1995         for(q=0; q<52; q++){
1996             int shift = ff_div6[q] + 2;
1997             int idx = ff_rem6[q];
1998             for(x=0; x<16; x++)
1999                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2000                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2001                     h->pps.scaling_matrix4[i][x]) << shift;
2002         }
2003     }
2004 }
2005
2006 static void init_dequant_tables(H264Context *h){
2007     int i,x;
2008     init_dequant4_coeff_table(h);
2009     if(h->pps.transform_8x8_mode)
2010         init_dequant8_coeff_table(h);
2011     if(h->sps.transform_bypass){
2012         for(i=0; i<6; i++)
2013             for(x=0; x<16; x++)
2014                 h->dequant4_coeff[i][0][x] = 1<<6;
2015         if(h->pps.transform_8x8_mode)
2016             for(i=0; i<2; i++)
2017                 for(x=0; x<64; x++)
2018                     h->dequant8_coeff[i][0][x] = 1<<6;
2019     }
2020 }
2021
2022
2023 /**
2024  * allocates tables.
2025  * needs width/height
2026  */
2027 static int alloc_tables(H264Context *h){
2028     MpegEncContext * const s = &h->s;
2029     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2030     int x,y;
2031
2032     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2033
2034     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2035     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2036     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2037
2038     if( h->pps.cabac ) {
2039         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2040         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2041         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2042         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2043     }
2044
2045     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2046     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2047
2048     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2049     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2050     for(y=0; y<s->mb_height; y++){
2051         for(x=0; x<s->mb_width; x++){
2052             const int mb_xy= x + y*s->mb_stride;
2053             const int b_xy = 4*x + 4*y*h->b_stride;
2054             const int b8_xy= 2*x + 2*y*h->b8_stride;
2055
2056             h->mb2b_xy [mb_xy]= b_xy;
2057             h->mb2b8_xy[mb_xy]= b8_xy;
2058         }
2059     }
2060
2061     s->obmc_scratchpad = NULL;
2062
2063     if(!h->dequant4_coeff[0])
2064         init_dequant_tables(h);
2065
2066     return 0;
2067 fail:
2068     free_tables(h);
2069     return -1;
2070 }
2071
2072 /**
2073  * Mimic alloc_tables(), but for every context thread.
2074  */
2075 static void clone_tables(H264Context *dst, H264Context *src){
2076     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2077     dst->non_zero_count           = src->non_zero_count;
2078     dst->slice_table              = src->slice_table;
2079     dst->cbp_table                = src->cbp_table;
2080     dst->mb2b_xy                  = src->mb2b_xy;
2081     dst->mb2b8_xy                 = src->mb2b8_xy;
2082     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2083     dst->mvd_table[0]             = src->mvd_table[0];
2084     dst->mvd_table[1]             = src->mvd_table[1];
2085     dst->direct_table             = src->direct_table;
2086
2087     dst->s.obmc_scratchpad = NULL;
2088     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2089 }
2090
2091 /**
2092  * Init context
2093  * Allocate buffers which are not shared amongst multiple threads.
2094  */
2095 static int context_init(H264Context *h){
2096     MpegEncContext * const s = &h->s;
2097
2098     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2099     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2100
2101     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
2102     CHECKED_ALLOCZ(s->allocated_edge_emu_buffer,
2103                    (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
2104     s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
2105     return 0;
2106 fail:
2107     return -1; // free_tables will clean up for us
2108 }
2109
2110 static void common_init(H264Context *h){
2111     MpegEncContext * const s = &h->s;
2112
2113     s->width = s->avctx->width;
2114     s->height = s->avctx->height;
2115     s->codec_id= s->avctx->codec->id;
2116
2117     ff_h264_pred_init(&h->hpc, s->codec_id);
2118
2119     h->dequant_coeff_pps= -1;
2120     s->unrestricted_mv=1;
2121     s->decode=1; //FIXME
2122
2123     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2124     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2125 }
2126
2127 static int decode_init(AVCodecContext *avctx){
2128     H264Context *h= avctx->priv_data;
2129     MpegEncContext * const s = &h->s;
2130
2131     MPV_decode_defaults(s);
2132
2133     s->avctx = avctx;
2134     common_init(h);
2135
2136     s->out_format = FMT_H264;
2137     s->workaround_bugs= avctx->workaround_bugs;
2138
2139     // set defaults
2140 //    s->decode_mb= ff_h263_decode_mb;
2141     s->quarter_sample = 1;
2142     s->low_delay= 1;
2143     avctx->pix_fmt= PIX_FMT_YUV420P;
2144
2145     decode_init_vlc();
2146
2147     if(avctx->extradata_size > 0 && avctx->extradata &&
2148        *(char *)avctx->extradata == 1){
2149         h->is_avc = 1;
2150         h->got_avcC = 0;
2151     } else {
2152         h->is_avc = 0;
2153     }
2154
2155     h->thread_context[0] = h;
2156     return 0;
2157 }
2158
2159 static int frame_start(H264Context *h){
2160     MpegEncContext * const s = &h->s;
2161     int i;
2162
2163     if(MPV_frame_start(s, s->avctx) < 0)
2164         return -1;
2165     ff_er_frame_start(s);
2166     /*
2167      * MPV_frame_start uses pict_type to derive key_frame.
2168      * This is incorrect for H.264; IDR markings must be used.
2169      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2170      * See decode_nal_units().
2171      */
2172     s->current_picture_ptr->key_frame= 0;
2173
2174     assert(s->linesize && s->uvlinesize);
2175
2176     for(i=0; i<16; i++){
2177         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2178         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2179     }
2180     for(i=0; i<4; i++){
2181         h->block_offset[16+i]=
2182         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2183         h->block_offset[24+16+i]=
2184         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2185     }
2186
2187     /* can't be in alloc_tables because linesize isn't known there.
2188      * FIXME: redo bipred weight to not require extra buffer? */
2189     for(i = 0; i < s->avctx->thread_count; i++)
2190         if(!h->thread_context[i]->s.obmc_scratchpad)
2191             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2192
2193     /* some macroblocks will be accessed before they're available */
2194     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2195         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2196
2197 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2198     return 0;
2199 }
2200
2201 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2202     MpegEncContext * const s = &h->s;
2203     int i;
2204
2205     src_y  -=   linesize;
2206     src_cb -= uvlinesize;
2207     src_cr -= uvlinesize;
2208
2209     // There are two lines saved, the line above the the top macroblock of a pair,
2210     // and the line above the bottom macroblock
2211     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2212     for(i=1; i<17; i++){
2213         h->left_border[i]= src_y[15+i*  linesize];
2214     }
2215
2216     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2217     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2218
2219     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2220         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2221         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2222         for(i=1; i<9; i++){
2223             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2224             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2225         }
2226         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2227         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2228     }
2229 }
2230
2231 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2232     MpegEncContext * const s = &h->s;
2233     int temp8, i;
2234     uint64_t temp64;
2235     int deblock_left;
2236     int deblock_top;
2237     int mb_xy;
2238
2239     if(h->deblocking_filter == 2) {
2240         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2241         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2242         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2243     } else {
2244         deblock_left = (s->mb_x > 0);
2245         deblock_top =  (s->mb_y > 0);
2246     }
2247
2248     src_y  -=   linesize + 1;
2249     src_cb -= uvlinesize + 1;
2250     src_cr -= uvlinesize + 1;
2251
2252 #define XCHG(a,b,t,xchg)\
2253 t= a;\
2254 if(xchg)\
2255     a= b;\
2256 b= t;
2257
2258     if(deblock_left){
2259         for(i = !deblock_top; i<17; i++){
2260             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2261         }
2262     }
2263
2264     if(deblock_top){
2265         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2266         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2267         if(s->mb_x+1 < s->mb_width){
2268             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2269         }
2270     }
2271
2272     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2273         if(deblock_left){
2274             for(i = !deblock_top; i<9; i++){
2275                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2276                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2277             }
2278         }
2279         if(deblock_top){
2280             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2281             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2282         }
2283     }
2284 }
2285
2286 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2287     MpegEncContext * const s = &h->s;
2288     int i;
2289
2290     src_y  -= 2 *   linesize;
2291     src_cb -= 2 * uvlinesize;
2292     src_cr -= 2 * uvlinesize;
2293
2294     // There are two lines saved, the line above the the top macroblock of a pair,
2295     // and the line above the bottom macroblock
2296     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2297     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2298     for(i=2; i<34; i++){
2299         h->left_border[i]= src_y[15+i*  linesize];
2300     }
2301
2302     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2303     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2304     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2305     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2306
2307     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2308         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2309         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2310         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2311         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2312         for(i=2; i<18; i++){
2313             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2314             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2315         }
2316         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2317         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2318         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2319         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2320     }
2321 }
2322
2323 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2324     MpegEncContext * const s = &h->s;
2325     int temp8, i;
2326     uint64_t temp64;
2327     int deblock_left = (s->mb_x > 0);
2328     int deblock_top  = (s->mb_y > 1);
2329
2330     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2331
2332     src_y  -= 2 *   linesize + 1;
2333     src_cb -= 2 * uvlinesize + 1;
2334     src_cr -= 2 * uvlinesize + 1;
2335
2336 #define XCHG(a,b,t,xchg)\
2337 t= a;\
2338 if(xchg)\
2339     a= b;\
2340 b= t;
2341
2342     if(deblock_left){
2343         for(i = (!deblock_top)<<1; i<34; i++){
2344             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2345         }
2346     }
2347
2348     if(deblock_top){
2349         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2350         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2351         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2352         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2353         if(s->mb_x+1 < s->mb_width){
2354             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2355             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2356         }
2357     }
2358
2359     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2360         if(deblock_left){
2361             for(i = (!deblock_top) << 1; i<18; i++){
2362                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2363                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2364             }
2365         }
2366         if(deblock_top){
2367             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2368             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2369             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2370             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2371         }
2372     }
2373 }
2374
/**
 * Reconstruct the macroblock at (s->mb_x, s->mb_y) into s->current_picture:
 * intra prediction or motion compensation, addition of the residual and,
 * if h->deblocking_filter is set, deblocking.
 *
 * @param simple when nonzero, the checks for field/MBAFF macroblocks, intra
 *               PCM, non-H.264 codec ids, gray-only decoding and encoding
 *               are skipped (every such check below is guarded by !simple
 *               or short-circuited by simple). hl_decode_mb() selects this
 *               mode only when none of those cases applies.
 */
2375 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2376     MpegEncContext * const s = &h->s;
2377     const int mb_x= s->mb_x;
2378     const int mb_y= s->mb_y;
2379     const int mb_xy= mb_x + mb_y*s->mb_stride;
2380     const int mb_type= s->current_picture.mb_type[mb_xy];
2381     uint8_t  *dest_y, *dest_cb, *dest_cr;
2382     int linesize, uvlinesize /*dct_offset*/;
2383     int i;
2384     int *block_offset = &h->block_offset[0];
2385     const unsigned int bottom = mb_y & 1;
2386     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
2387     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2388     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2389
    /* top left sample of this macroblock in each plane (16x16 luma, 8x8 chroma) */
2390     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2391     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2392     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2393
2394     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2395     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2396
    /* field macroblock: line strides are doubled and the second half of
     * block_offset (entries 24..47) is used */
2397     if (!simple && MB_FIELD) {
2398         linesize   = h->mb_linesize   = s->linesize * 2;
2399         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2400         block_offset = &h->block_offset[24];
        /* odd mb_y: move the destination up to one line below the first
         * line of the macroblock above (16-15 luma lines, 8-7 chroma lines) */
2401         if(mb_y&1){ //FIXME move out of this func?
2402             dest_y -= s->linesize*15;
2403             dest_cb-= s->uvlinesize*7;
2404             dest_cr-= s->uvlinesize*7;
2405         }
        /* rewrite the cached reference indices as (16+ref) ^ (mb_y&1) */
2406         if(FRAME_MBAFF) {
2407             int list;
2408             for(list=0; list<h->list_count; list++){
2409                 if(!USES_LIST(mb_type, list))
2410                     continue;
2411                 if(IS_16X16(mb_type)){
2412                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2413                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
2414                 }else{
2415                     for(i=0; i<16; i+=4){
2416                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
2417                         int ref = h->ref_cache[list][scan8[i]];
2418                         if(ref >= 0)
2419                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
2420                     }
2421                 }
2422             }
2423         }
2424     } else {
2425         linesize   = h->mb_linesize   = s->linesize;
2426         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2427 //        dct_offset = s->linesize * 16;
2428     }
2429
    /* choose the luma residual functions: plain pixel addition for
     * transform bypass, otherwise the 8x8 or 4x4 IDCT */
2430     if(transform_bypass){
2431         idct_dc_add =
2432         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2433     }else if(IS_8x8DCT(mb_type)){
2434         idct_dc_add = s->dsp.h264_idct8_dc_add;
2435         idct_add = s->dsp.h264_idct8_add;
2436     }else{
2437         idct_dc_add = s->dsp.h264_idct_dc_add;
2438         idct_add = s->dsp.h264_idct_add;
2439     }
2440
    /* MBAFF intra macroblock with deblocking enabled: exchange the border
     * of the whole pair once, i.e. for the top macroblock, or for the
     * bottom one when the macroblock above it is not intra */
2441     if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
2442        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
2443         int mbt_y = mb_y&~1;
2444         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
2445         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2446         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
2447         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
2448     }
2449
2450     if (!simple && IS_INTRA_PCM(mb_type)) {
2451         unsigned int x, y;
2452
2453         // The pixels are stored in h->mb array in the same order as levels,
2454         // copy them in output in the correct order.
2455         for(i=0; i<16; i++) {
2456             for (y=0; y<4; y++) {
2457                 for (x=0; x<4; x++) {
2458                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2459                 }
2460             }
2461         }
2462         for(i=16; i<16+4; i++) {
2463             for (y=0; y<4; y++) {
2464                 for (x=0; x<4; x++) {
2465                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2466                 }
2467             }
2468         }
2469         for(i=20; i<20+4; i++) {
2470             for (y=0; y<4; y++) {
2471                 for (x=0; x<4; x++) {
2472                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2473                 }
2474             }
2475         }
2476     } else {
2477         if(IS_INTRA(mb_type)){
            /* exchange the borders before intra prediction (xchg=1); they
             * are exchanged again after prediction (xchg=0) */
2478             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2479                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2480
2481             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2482                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2483                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2484             }
2485
2486             if(IS_INTRA4x4(mb_type)){
2487                 if(simple || !s->encoding){
                    /* each block is predicted and its residual added before
                     * the next block is predicted */
2488                     if(IS_8x8DCT(mb_type)){
2489                         for(i=0; i<16; i+=4){
2490                             uint8_t * const ptr= dest_y + block_offset[i];
2491                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2492                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
2493                             h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2494                                                    (h->topright_samples_available<<i)&0x4000, linesize);
2495                             if(nnz){
2496                                 if(nnz == 1 && h->mb[i*16])
2497                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2498                                 else
2499                                     idct_add(ptr, h->mb + i*16, linesize);
2500                             }
2501                         }
2502                     }else
2503                     for(i=0; i<16; i++){
2504                         uint8_t * const ptr= dest_y + block_offset[i];
2505                         uint8_t *topright;
2506                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2507                         int nnz, tr;
2508
                        /* when the top right samples are unavailable, the
                         * last sample above the block is replicated 4 times
                         * into tr and used in their place */
2509                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2510                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2511                             assert(mb_y || linesize <= block_offset[i]);
2512                             if(!topright_avail){
2513                                 tr= ptr[3 - linesize]*0x01010101;
2514                                 topright= (uint8_t*) &tr;
2515                             }else
2516                                 topright= ptr + 4 - linesize;
2517                         }else
2518                             topright= NULL;
2519
2520                         h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2521                         nnz = h->non_zero_count_cache[ scan8[i] ];
2522                         if(nnz){
2523                             if(is_h264){
2524                                 if(nnz == 1 && h->mb[i*16])
2525                                     idct_dc_add(ptr, h->mb + i*16, linesize);
2526                                 else
2527                                     idct_add(ptr, h->mb + i*16, linesize);
2528                             }else
2529                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2530                         }
2531                     }
2532                 }
2533             }else{
                /* not intra 4x4: 16x16 prediction, then the luma DC
                 * coefficients are dequantized/transformed (skipped for
                 * transform bypass); the per-block residual is added below */
2534                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2535                 if(is_h264){
2536                     if(!transform_bypass)
2537                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2538                 }else
2539                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2540             }
2541             if(h->deblocking_filter && (simple || !FRAME_MBAFF))
2542                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2543         }else if(is_h264){
            /* inter macroblock: motion compensation */
2544             hl_motion(h, dest_y, dest_cb, dest_cr,
2545                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2546                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2547                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2548         }
2549
2550
        /* luma residual for everything except intra 4x4, which was already
         * handled block by block above */
2551         if(!IS_INTRA4x4(mb_type)){
2552             if(is_h264){
2553                 if(IS_INTRA16x16(mb_type)){
2554                     for(i=0; i<16; i++){
2555                         if(h->non_zero_count_cache[ scan8[i] ])
2556                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2557                         else if(h->mb[i*16])
2558                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2559                     }
2560                 }else{
                    /* with the 8x8 transform only every 4th index is visited */
2561                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2562                     for(i=0; i<16; i+=di){
2563                         int nnz = h->non_zero_count_cache[ scan8[i] ];
2564                         if(nnz){
2565                             if(nnz==1 && h->mb[i*16])
2566                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2567                             else
2568                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2569                         }
2570                     }
2571                 }
2572             }else{
2573                 for(i=0; i<16; i++){
2574                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2575                         uint8_t * const ptr= dest_y + block_offset[i];
2576                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2577                     }
2578                 }
2579             }
2580         }
2581
        /* chroma residual: blocks 16..19 go to cb, 20..23 to cr */
2582         if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2583             uint8_t *dest[2] = {dest_cb, dest_cr};
2584             if(transform_bypass){
2585                 idct_add = idct_dc_add = s->dsp.add_pixels4;
2586             }else{
2587                 idct_add = s->dsp.h264_idct_add;
2588                 idct_dc_add = s->dsp.h264_idct_dc_add;
2589                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2590                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2591             }
2592             if(is_h264){
2593                 for(i=16; i<16+8; i++){
2594                     if(h->non_zero_count_cache[ scan8[i] ])
2595                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2596                     else if(h->mb[i*16])
2597                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2598                 }
2599             }else{
2600                 for(i=16; i<16+8; i++){
2601                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2602                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2603                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2604                     }
2605                 }
2606             }
2607         }
2608     }
    /* deblocking: MBAFF pictures are filtered one macroblock pair at a time,
     * when the bottom macroblock of the pair is reached; everything else is
     * filtered macroblock by macroblock */
2609     if(h->deblocking_filter) {
2610         if (!simple && FRAME_MBAFF) {
2611             //FIXME try deblocking one mb at a time?
2612             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            /* NOTE: these locals shadow the outer mb_y/mb_xy and refer to
             * the row above the current macroblock */
2613             const int mb_y = s->mb_y - 1;
2614             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
2615             const int mb_xy= mb_x + mb_y*s->mb_stride;
2616             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
2617             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
2618             if (!bottom) return;
2619             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2620             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2621             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2622
2623             if(IS_INTRA(mb_type_top | mb_type_bottom))
2624                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
2625
2626             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
2627             // deblock a pair
2628             // top
            /* s->mb_y is temporarily decremented while the top macroblock is
             * filtered and restored before the bottom one */
2629             s->mb_y--;
2630             tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
2631             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
2632             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2633             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2634             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
2635             // bottom
2636             s->mb_y++;
2637             tprintf(h->s.avctx, "call mbaff filter_mb\n");
2638             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
2639             h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2640             h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
2641             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2642         } else {
2643             tprintf(h->s.avctx, "call filter_mb\n");
            /* save the unfiltered border samples before filtering */
2644             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2645             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2646             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2647         }
2648     }
2649 }
2650
2651 /**
2652  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2653  */
2654 static void hl_decode_mb_simple(H264Context *h){
2655     hl_decode_mb_internal(h, 1);
2656 }
2657
2658 /**
2659  * Process a macroblock; this handles edge cases, such as interlacing.
2660  */
2661 static void av_noinline hl_decode_mb_complex(H264Context *h){
2662     hl_decode_mb_internal(h, 0);
2663 }
2664
2665 static void hl_decode_mb(H264Context *h){
2666     MpegEncContext * const s = &h->s;
2667     const int mb_x= s->mb_x;
2668     const int mb_y= s->mb_y;
2669     const int mb_xy= mb_x + mb_y*s->mb_stride;
2670     const int mb_type= s->current_picture.mb_type[mb_xy];
2671     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2672
2673     if(!s->decode)
2674         return;
2675
2676     if (is_complex)
2677         hl_decode_mb_complex(h);
2678     else hl_decode_mb_simple(h);
2679 }
2680
2681 static void pic_as_field(Picture *pic, const int parity){
2682     int i;
2683     for (i = 0; i < 4; ++i) {
2684         if (parity == PICT_BOTTOM_FIELD)
2685             pic->data[i] += pic->linesize[i];
2686         pic->reference = parity;
2687         pic->linesize[i] *= 2;
2688     }
2689 }
2690
2691 static int split_field_copy(Picture *dest, Picture *src,
2692                             int parity, int id_add){
2693     int match = !!(src->reference & parity);
2694
2695     if (match) {
2696         *dest = *src;
2697         pic_as_field(dest, parity);
2698         dest->pic_id *= 2;
2699         dest->pic_id += id_add;
2700     }
2701
2702     return match;
2703 }
2704
2705 /**
2706  * Split one reference list into field parts, interleaving by parity
2707  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2708  * set to look at the actual start of data for that field.
2709  *
2710  * @param dest output list
2711  * @param dest_len maximum number of fields to put in dest
2712  * @param src the source reference list containing fields and/or field pairs
2713  *            (aka short_ref/long_ref, or
2714  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2715  * @param src_len number of Picture's in source (pairs and unmatched fields)
2716  * @param parity the parity of the picture being decoded/needing
2717  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2718  * @return number of fields placed in dest
2719  */
2720 static int split_field_half_ref_list(Picture *dest, int dest_len,
2721                                      Picture *src,  int src_len,  int parity){
2722     int same_parity   = 1;
2723     int same_i        = 0;
2724     int opp_i         = 0;
2725     int out_i;
2726     int field_output;
2727
2728     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2729         if (same_parity && same_i < src_len) {
2730             field_output = split_field_copy(dest + out_i, src + same_i,
2731                                             parity, 1);
2732             same_parity = !field_output;
2733             same_i++;
2734
2735         } else if (opp_i < src_len) {
2736             field_output = split_field_copy(dest + out_i, src + opp_i,
2737                                             PICT_FRAME - parity, 0);
2738             same_parity = field_output;
2739             opp_i++;
2740
2741         } else {
2742             break;
2743         }
2744     }
2745
2746     return out_i;
2747 }
2748
2749 /**
2750  * Split the reference frame list into a reference field list.
2751  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2752  * The input list contains both reference field pairs and
2753  * unmatched reference fields; it is ordered as spec describes
2754  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2755  * unmatched field pairs are also present. Conceptually this is equivalent
2756  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2757  *
2758  * @param dest output reference list where ordered fields are to be placed
2759  * @param dest_len max number of fields to place at dest
2760  * @param src source reference list, as described above
2761  * @param src_len number of pictures (pairs and unmatched fields) in src
2762  * @param parity parity of field being currently decoded
2763  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2764  * @param long_i index into src array that holds first long reference picture,
2765  *        or src_len if no long refs present.
2766  */
2767 static int split_field_ref_list(Picture *dest, int dest_len,
2768                                 Picture *src,  int src_len,
2769                                 int parity,    int long_i){
2770
2771     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2772     dest += i;
2773     dest_len -= i;
2774
2775     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2776                                    src_len - long_i, parity);
2777     return i;
2778 }
2779
2780 /**
2781  * fills the default_ref_list.
2782  */
2783 static int fill_default_ref_list(H264Context *h){
2784     MpegEncContext * const s = &h->s;
2785     int i;
2786     int smallest_poc_greater_than_current = -1;
2787     int structure_sel;
2788     Picture sorted_short_ref[32];
2789     Picture field_entry_list[2][32];
2790     Picture *frame_list[2];
2791
2792     if (FIELD_PICTURE) {
2793         structure_sel = PICT_FRAME;
2794         frame_list[0] = field_entry_list[0];
2795         frame_list[1] = field_entry_list[1];
2796     } else {
2797         structure_sel = 0;
2798         frame_list[0] = h->default_ref_list[0];
2799         frame_list[1] = h->default_ref_list[1];
2800     }
2801
2802     if(h->slice_type==B_TYPE){
2803         int list;
2804         int len[2];
2805         int short_len[2];
2806         int out_i;
2807         int limit= INT_MIN;
2808
2809         /* sort frame according to poc in B slice */
2810         for(out_i=0; out_i<h->short_ref_count; out_i++){
2811             int best_i=INT_MIN;
2812             int best_poc=INT_MAX;
2813
2814             for(i=0; i<h->short_ref_count; i++){
2815                 const int poc= h->short_ref[i]->poc;
2816                 if(poc > limit && poc < best_poc){
2817                     best_poc= poc;
2818                     best_i= i;
2819                 }
2820             }
2821
2822             assert(best_i != INT_MIN);
2823
2824             limit= best_poc;
2825             sorted_short_ref[out_i]= *h->short_ref[best_i];
2826             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2827             if (-1 == smallest_poc_greater_than_current) {
2828                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2829                     smallest_poc_greater_than_current = out_i;
2830                 }
2831             }
2832         }
2833
2834         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2835
2836         // find the largest poc
2837         for(list=0; list<2; list++){
2838             int index = 0;
2839             int j= -99;
2840             int step= list ? -1 : 1;
2841
2842             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2843                 int sel;
2844                 while(j<0 || j>= h->short_ref_count){
2845                     if(j != -99 && step == (list ? -1 : 1))
2846                         return -1;
2847                     step = -step;
2848                     j= smallest_poc_greater_than_current + (step>>1);
2849                 }
2850                 sel = sorted_short_ref[j].reference | structure_sel;
2851                 if(sel != PICT_FRAME) continue;
2852                 frame_list[list][index  ]= sorted_short_ref[j];
2853                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2854             }
2855             short_len[list] = index;
2856
2857             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2858                 int sel;
2859                 if(h->long_ref[i] == NULL) continue;
2860                 sel = h->long_ref[i]->reference | structure_sel;
2861                 if(sel != PICT_FRAME) continue;
2862
2863                 frame_list[ list ][index  ]= *h->long_ref[i];
2864                 frame_list[ list ][index++].pic_id= i;;
2865             }
2866             len[list] = index;
2867
2868             if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
2869                 // swap the two first elements of L1 when
2870                 // L0 and L1 are identical
2871                 Picture temp= frame_list[1][0];
2872                 frame_list[1][0] = frame_list[1][1];
2873                 frame_list[1][1] = temp;
2874             }
2875
2876         }
2877
2878         for(list=0; list<2; list++){
2879             if (FIELD_PICTURE)
2880                 len[list] = split_field_ref_list(h->default_ref_list[list],
2881                                                  h->ref_count[list],
2882                                                  frame_list[list],
2883                                                  len[list],
2884                                                  s->picture_structure,
2885                                                  short_len[list]);
2886
2887             if(len[list] < h->ref_count[ list ])
2888                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2889         }
2890
2891
2892     }else{
2893         int index=0;
2894         int short_len;
2895         for(i=0; i<h->short_ref_count; i++){
2896             int sel;
2897             sel = h->short_ref[i]->reference | structure_sel;
2898             if(sel != PICT_FRAME) continue;
2899             frame_list[0][index  ]= *h->short_ref[i];
2900             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2901         }
2902         short_len = index;
2903         for(i = 0; i < 16; i++){
2904             int sel;
2905             if(h->long_ref[i] == NULL) continue;
2906             sel = h->long_ref[i]->reference | structure_sel;
2907             if(sel != PICT_FRAME) continue;
2908             frame_list[0][index  ]= *h->long_ref[i];
2909             frame_list[0][index++].pic_id= i;;
2910         }
2911
2912         if (FIELD_PICTURE)
2913             index = split_field_ref_list(h->default_ref_list[0],
2914                                          h->ref_count[0], frame_list[0],
2915                                          index, s->picture_structure,
2916                                          short_len);
2917
2918         if(index < h->ref_count[0])
2919             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2920     }
2921 #ifdef TRACE
2922     for (i=0; i<h->ref_count[0]; i++) {
2923         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2924     }
2925     if(h->slice_type==B_TYPE){
2926         for (i=0; i<h->ref_count[1]; i++) {
2927             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
2928         }
2929     }
2930 #endif
2931     return 0;
2932 }
2933
2934 static void print_short_term(H264Context *h);
2935 static void print_long_term(H264Context *h);
2936
2937 /**
2938  * Extract structure information about the picture described by pic_num in
2939  * the current decoding context (frame or field). Note that pic_num is
2940  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2941  * @param pic_num picture number for which to extract structure information
2942  * @param structure one of PICT_XXX describing structure of picture
2943  *                      with pic_num
2944  * @return frame number (short term) or long term index of picture
2945  *         described by pic_num
2946  */
2947 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2948     MpegEncContext * const s = &h->s;
2949
2950     *structure = s->picture_structure;
2951     if(FIELD_PICTURE){
2952         if (!(pic_num & 1))
2953             /* opposite field */
2954             *structure ^= PICT_FRAME;
2955         pic_num >>= 1;
2956     }
2957
2958     return pic_num;
2959 }
2960
2961 static int decode_ref_pic_list_reordering(H264Context *h){
2962     MpegEncContext * const s = &h->s;
2963     int list, index, pic_structure;
2964
2965     print_short_term(h);
2966     print_long_term(h);
2967     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
2968
2969     for(list=0; list<h->list_count; list++){
2970         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2971
2972         if(get_bits1(&s->gb)){
2973             int pred= h->curr_pic_num;
2974
2975             for(index=0; ; index++){
2976                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2977                 unsigned int pic_id;
2978                 int i;
2979                 Picture *ref = NULL;
2980
2981                 if(reordering_of_pic_nums_idc==3)
2982                     break;
2983
2984                 if(index >= h->ref_count[list]){
2985                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2986                     return -1;
2987                 }
2988
2989                 if(reordering_of_pic_nums_idc<3){
2990                     if(reordering_of_pic_nums_idc<2){
2991                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2992                         int frame_num;
2993
2994                         if(abs_diff_pic_num > h->max_pic_num){
2995                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2996                             return -1;
2997                         }
2998
2999                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3000                         else                                pred+= abs_diff_pic_num;
3001                         pred &= h->max_pic_num - 1;
3002
3003                         frame_num = pic_num_extract(h, pred, &pic_structure);
3004
3005                         for(i= h->short_ref_count-1; i>=0; i--){
3006                             ref = h->short_ref[i];
3007                             assert(ref->reference);
3008                             assert(!ref->long_ref);
3009                             if(ref->data[0] != NULL &&
3010                                    ref->frame_num == frame_num &&
3011                                    (ref->reference & pic_structure) &&
3012                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3013                                 break;
3014                         }
3015                         if(i>=0)
3016                             ref->pic_id= pred;
3017                     }else{
3018                         int long_idx;
3019                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3020
3021                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3022
3023                         if(long_idx>31){
3024                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3025                             return -1;
3026                         }
3027                         ref = h->long_ref[long_idx];
3028                         assert(!(ref && !ref->reference));
3029                         if(ref && (ref->reference & pic_structure)){
3030                             ref->pic_id= pic_id;
3031                             assert(ref->long_ref);
3032                             i=0;
3033                         }else{
3034                             i=-1;
3035                         }
3036                     }
3037
3038                     if (i < 0) {
3039                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3040                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3041                     } else {
3042                         for(i=index; i+1<h->ref_count[list]; i++){
3043                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3044                                 break;
3045                         }
3046                         for(; i > index; i--){
3047                             h->ref_list[list][i]= h->ref_list[list][i-1];
3048                         }
3049                         h->ref_list[list][index]= *ref;
3050                         if (FIELD_PICTURE){
3051                             pic_as_field(&h->ref_list[list][index], pic_structure);
3052                         }
3053                     }
3054                 }else{
3055                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3056                     return -1;
3057                 }
3058             }
3059         }
3060     }
3061     for(list=0; list<h->list_count; list++){
3062         for(index= 0; index < h->ref_count[list]; index++){
3063             if(!h->ref_list[list][index].data[0])
3064                 h->ref_list[list][index]= s->current_picture;
3065         }
3066     }
3067
3068     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3069         direct_dist_scale_factor(h);
3070     direct_ref_list_init(h);
3071     return 0;
3072 }
3073
3074 static void fill_mbaff_ref_list(H264Context *h){
3075     int list, i, j;
3076     for(list=0; list<2; list++){ //FIXME try list_count
3077         for(i=0; i<h->ref_count[list]; i++){
3078             Picture *frame = &h->ref_list[list][i];
3079             Picture *field = &h->ref_list[list][16+2*i];
3080             field[0] = *frame;
3081             for(j=0; j<3; j++)
3082                 field[0].linesize[j] <<= 1;
3083             field[0].reference = PICT_TOP_FIELD;
3084             field[1] = field[0];
3085             for(j=0; j<3; j++)
3086                 field[1].data[j] += frame->linesize[j];
3087             field[1].reference = PICT_BOTTOM_FIELD;
3088
3089             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3090             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3091             for(j=0; j<2; j++){
3092                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3093                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3094             }
3095         }
3096     }
3097     for(j=0; j<h->ref_count[1]; j++){
3098         for(i=0; i<h->ref_count[0]; i++)
3099             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3100         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3101         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3102     }
3103 }
3104
3105 static int pred_weight_table(H264Context *h){
3106     MpegEncContext * const s = &h->s;
3107     int list, i;
3108     int luma_def, chroma_def;
3109
3110     h->use_weight= 0;
3111     h->use_weight_chroma= 0;
3112     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3113     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3114     luma_def = 1<<h->luma_log2_weight_denom;
3115     chroma_def = 1<<h->chroma_log2_weight_denom;
3116
3117     for(list=0; list<2; list++){
3118         for(i=0; i<h->ref_count[list]; i++){
3119             int luma_weight_flag, chroma_weight_flag;
3120
3121             luma_weight_flag= get_bits1(&s->gb);
3122             if(luma_weight_flag){
3123                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3124                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3125                 if(   h->luma_weight[list][i] != luma_def
3126                    || h->luma_offset[list][i] != 0)
3127                     h->use_weight= 1;
3128             }else{
3129                 h->luma_weight[list][i]= luma_def;
3130                 h->luma_offset[list][i]= 0;
3131             }
3132
3133             chroma_weight_flag= get_bits1(&s->gb);
3134             if(chroma_weight_flag){
3135                 int j;
3136                 for(j=0; j<2; j++){
3137                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3138                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3139                     if(   h->chroma_weight[list][i][j] != chroma_def
3140                        || h->chroma_offset[list][i][j] != 0)
3141                         h->use_weight_chroma= 1;
3142                 }
3143             }else{
3144                 int j;
3145                 for(j=0; j<2; j++){
3146                     h->chroma_weight[list][i][j]= chroma_def;
3147                     h->chroma_offset[list][i][j]= 0;
3148                 }
3149             }
3150         }
3151         if(h->slice_type != B_TYPE) break;
3152     }
3153     h->use_weight= h->use_weight || h->use_weight_chroma;
3154     return 0;
3155 }
3156
3157 static void implicit_weight_table(H264Context *h){
3158     MpegEncContext * const s = &h->s;
3159     int ref0, ref1;
3160     int cur_poc = s->current_picture_ptr->poc;
3161
3162     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3163        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3164         h->use_weight= 0;
3165         h->use_weight_chroma= 0;
3166         return;
3167     }
3168
3169     h->use_weight= 2;
3170     h->use_weight_chroma= 2;
3171     h->luma_log2_weight_denom= 5;
3172     h->chroma_log2_weight_denom= 5;
3173
3174     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3175         int poc0 = h->ref_list[0][ref0].poc;
3176         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3177             int poc1 = h->ref_list[1][ref1].poc;
3178             int td = av_clip(poc1 - poc0, -128, 127);
3179             if(td){
3180                 int tb = av_clip(cur_poc - poc0, -128, 127);
3181                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3182                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3183                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3184                     h->implicit_weight[ref0][ref1] = 32;
3185                 else
3186                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3187             }else
3188                 h->implicit_weight[ref0][ref1] = 32;
3189         }
3190     }
3191 }
3192
3193 /**
3194  * Mark a picture as no longer needed for reference. The refmask
3195  * argument allows unreferencing of individual fields or the whole frame.
3196  * If the picture becomes entirely unreferenced, but is being held for
3197  * display purposes, it is marked as such.
3198  * @param refmask mask of fields to unreference; the mask is bitwise
3199  *                anded with the reference marking of pic
3200  * @return non-zero if pic becomes entirely unreferenced (except possibly
3201  *         for display purposes) zero if one of the fields remains in
3202  *         reference
3203  */
3204 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3205     int i;
3206     if (pic->reference &= refmask) {
3207         return 0;
3208     } else {
3209         if(pic == h->delayed_output_pic)
3210             pic->reference=DELAYED_PIC_REF;
3211         else{
3212             for(i = 0; h->delayed_pic[i]; i++)
3213                 if(pic == h->delayed_pic[i]){
3214                     pic->reference=DELAYED_PIC_REF;
3215                     break;
3216                 }
3217         }
3218         return 1;
3219     }
3220 }
3221
3222 /**
3223  * instantaneous decoder refresh.
3224  */
3225 static void idr(H264Context *h){
3226     int i;
3227
3228     for(i=0; i<16; i++){
3229         if (h->long_ref[i] != NULL) {
3230             unreference_pic(h, h->long_ref[i], 0);
3231             h->long_ref[i]= NULL;
3232         }
3233     }
3234     h->long_ref_count=0;
3235
3236     for(i=0; i<h->short_ref_count; i++){
3237         unreference_pic(h, h->short_ref[i], 0);
3238         h->short_ref[i]= NULL;
3239     }
3240     h->short_ref_count=0;
3241 }
3242
3243 /* forget old pics after a seek */
3244 static void flush_dpb(AVCodecContext *avctx){
3245     H264Context *h= avctx->priv_data;
3246     int i;
3247     for(i=0; i<16; i++) {
3248         if(h->delayed_pic[i])
3249             h->delayed_pic[i]->reference= 0;
3250         h->delayed_pic[i]= NULL;
3251     }
3252     if(h->delayed_output_pic)
3253         h->delayed_output_pic->reference= 0;
3254     h->delayed_output_pic= NULL;
3255     idr(h);
3256     if(h->s.current_picture_ptr)
3257         h->s.current_picture_ptr->reference= 0;
3258     h->s.first_field= 0;
3259     ff_mpeg_flush(avctx);
3260 }
3261
3262 /**
3263  * Find a Picture in the short term reference list by frame number.
3264  * @param frame_num frame number to search for
3265  * @param idx the index into h->short_ref where returned picture is found
3266  *            undefined if no picture found.
3267  * @return pointer to the found picture, or NULL if no pic with the provided
3268  *                 frame number is found
3269  */
3270 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3271     MpegEncContext * const s = &h->s;
3272     int i;
3273
3274     for(i=0; i<h->short_ref_count; i++){
3275         Picture *pic= h->short_ref[i];
3276         if(s->avctx->debug&FF_DEBUG_MMCO)
3277             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3278         if(pic->frame_num == frame_num) {
3279             *idx = i;
3280             return pic;
3281         }
3282     }
3283     return NULL;
3284 }
3285
3286 /**
3287  * Remove a picture from the short term reference list by its index in
3288  * that list.  This does no checking on the provided index; it is assumed
3289  * to be valid. Other list entries are shifted down.
3290  * @param i index into h->short_ref of picture to remove.
3291  */
3292 static void remove_short_at_index(H264Context *h, int i){
3293     assert(i > 0 && i < h->short_ref_count);
3294     h->short_ref[i]= NULL;
3295     if (--h->short_ref_count)
3296         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3297 }
3298
3299 /**
3300  *
3301  * @return the removed picture or NULL if an error occurs
3302  */
3303 static Picture * remove_short(H264Context *h, int frame_num){
3304     MpegEncContext * const s = &h->s;
3305     Picture *pic;
3306     int i;
3307
3308     if(s->avctx->debug&FF_DEBUG_MMCO)
3309         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3310
3311     pic = find_short(h, frame_num, &i);
3312     if (pic)
3313         remove_short_at_index(h, i);
3314
3315     return pic;
3316 }
3317
3318 /**
3319  * Remove a picture from the long term reference list by its index in
3320  * that list.  This does no checking on the provided index; it is assumed
3321  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3322  * @param i index into h->long_ref of picture to remove.
3323  */
3324 static void remove_long_at_index(H264Context *h, int i){
3325     h->long_ref[i]= NULL;
3326     h->long_ref_count--;
3327 }
3328
3329 /**
3330  *
3331  * @return the removed picture or NULL if an error occurs
3332  */
3333 static Picture * remove_long(H264Context *h, int i){
3334     Picture *pic;
3335
3336     pic= h->long_ref[i];
3337     if (pic)
3338         remove_long_at_index(h, i);
3339
3340     return pic;
3341 }
3342
3343 /**
3344  * print short term list
3345  */
3346 static void print_short_term(H264Context *h) {
3347     uint32_t i;
3348     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3349         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3350         for(i=0; i<h->short_ref_count; i++){
3351             Picture *pic= h->short_ref[i];
3352             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3353         }
3354     }
3355 }
3356
3357 /**
3358  * print long term list
3359  */
3360 static void print_long_term(H264Context *h) {
3361     uint32_t i;
3362     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3363         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3364         for(i = 0; i < 16; i++){
3365             Picture *pic= h->long_ref[i];
3366             if (pic) {
3367                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3368             }
3369         }
3370     }
3371 }
3372
3373 /**
3374  * Executes the reference picture marking (memory management control operations).
3375  */
3376 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3377     MpegEncContext * const s = &h->s;
3378     int i, j;
3379     int current_ref_assigned=0;
3380     Picture *pic;
3381
3382     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3383         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3384
3385     for(i=0; i<mmco_count; i++){
3386         int structure, frame_num, unref_pic;
3387         if(s->avctx->debug&FF_DEBUG_MMCO)
3388             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3389
3390         switch(mmco[i].opcode){
3391         case MMCO_SHORT2UNUSED:
3392             if(s->avctx->debug&FF_DEBUG_MMCO)
3393                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3394             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3395             pic = find_short(h, frame_num, &j);
3396             if (pic) {
3397                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3398                     remove_short_at_index(h, j);
3399             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3400                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3401             break;
3402         case MMCO_SHORT2LONG:
3403             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3404                     h->long_ref[mmco[i].long_arg]->frame_num ==
3405                                               mmco[i].short_pic_num / 2) {
3406                 /* do nothing, we've already moved this field pair. */
3407             } else {
3408                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3409
3410                 pic= remove_long(h, mmco[i].long_arg);
3411                 if(pic) unreference_pic(h, pic, 0);
3412
3413                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3414                 if (h->long_ref[ mmco[i].long_arg ]){
3415                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3416                     h->long_ref_count++;
3417                 }
3418             }
3419             break;
3420         case MMCO_LONG2UNUSED:
3421             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3422             pic = h->long_ref[j];
3423             if (pic) {
3424                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3425                     remove_long_at_index(h, j);
3426             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3427                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3428             break;
3429         case MMCO_LONG:
3430             unref_pic = 1;
3431             if (FIELD_PICTURE && !s->first_field) {
3432                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3433                     /* Just mark second field as referenced */
3434                     unref_pic = 0;
3435                 } else if (s->current_picture_ptr->reference) {
3436                     /* First field in pair is in short term list or
3437                      * at a different long term index.
3438                      * This is not allowed; see 7.4.3, notes 2 and 3.
3439                      * Report the problem and keep the pair where it is,
3440                      * and mark this field valid.
3441                      */
3442                     av_log(h->s.avctx, AV_LOG_ERROR,
3443                         "illegal long term reference assignment for second "
3444                         "field in complementary field pair (first field is "
3445                         "short term or has non-matching long index)\n");
3446                     unref_pic = 0;
3447                 }
3448             }
3449
3450             if (unref_pic) {
3451                 pic= remove_long(h, mmco[i].long_arg);
3452                 if(pic) unreference_pic(h, pic, 0);
3453
3454                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3455                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3456                 h->long_ref_count++;
3457             }
3458
3459             s->current_picture_ptr->reference |= s->picture_structure;
3460             current_ref_assigned=1;
3461             break;
3462         case MMCO_SET_MAX_LONG:
3463             assert(mmco[i].long_arg <= 16);
3464             // just remove the long term which index is greater than new max
3465             for(j = mmco[i].long_arg; j<16; j++){
3466                 pic = remove_long(h, j);
3467                 if (pic) unreference_pic(h, pic, 0);
3468             }
3469             break;
3470         case MMCO_RESET:
3471             while(h->short_ref_count){
3472                 pic= remove_short(h, h->short_ref[0]->frame_num);
3473                 if(pic) unreference_pic(h, pic, 0);
3474             }
3475             for(j = 0; j < 16; j++) {
3476                 pic= remove_long(h, j);
3477                 if(pic) unreference_pic(h, pic, 0);
3478             }
3479             break;
3480         default: assert(0);
3481         }
3482     }
3483
3484     if (!current_ref_assigned && FIELD_PICTURE &&
3485             !s->first_field && s->current_picture_ptr->reference) {
3486
3487         /* Second field of complementary field pair; the first field of
3488          * which is already referenced. If short referenced, it
3489          * should be first entry in short_ref. If not, it must exist
3490          * in long_ref; trying to put it on the short list here is an
3491          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3492          */
3493         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3494             /* Just mark the second field valid */
3495             s->current_picture_ptr->reference = PICT_FRAME;
3496         } else if (s->current_picture_ptr->long_ref) {
3497             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3498                                              "assignment for second field "
3499                                              "in complementary field pair "
3500                                              "(first field is long term)\n");
3501         } else {
3502             /*
3503              * First field in reference, but not in any sensible place on our
3504              * reference lists. This shouldn't happen unless reference
3505              * handling somewhere else is wrong.
3506              */
3507             assert(0);
3508         }
3509         current_ref_assigned = 1;
3510     }
3511
3512     if(!current_ref_assigned){
3513         pic= remove_short(h, s->current_picture_ptr->frame_num);
3514         if(pic){
3515             unreference_pic(h, pic, 0);
3516             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3517         }
3518
3519         if(h->short_ref_count)
3520             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3521
3522         h->short_ref[0]= s->current_picture_ptr;
3523         h->short_ref[0]->long_ref=0;
3524         h->short_ref_count++;
3525         s->current_picture_ptr->reference |= s->picture_structure;
3526     }
3527
3528     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3529
3530         /* We have too many reference frames, probably due to corrupted
3531          * stream. Need to discard one frame. Prevents overrun of the
3532          * short_ref and long_ref buffers.
3533          */
3534         av_log(h->s.avctx, AV_LOG_ERROR,
3535                "number of reference frames exceeds max (probably "
3536                "corrupt input), discarding one\n");
3537
3538         if (h->long_ref_count) {
3539             for (i = 0; i < 16; ++i)
3540                 if (h->long_ref[i])
3541                     break;
3542
3543             assert(i < 16);
3544             pic = h->long_ref[i];
3545             remove_long_at_index(h, i);
3546         } else {
3547             pic = h->short_ref[h->short_ref_count - 1];
3548             remove_short_at_index(h, h->short_ref_count - 1);
3549         }
3550         unreference_pic(h, pic, 0);
3551     }
3552
3553     print_short_term(h);
3554     print_long_term(h);
3555     return 0;
3556 }
3557
3558 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3559     MpegEncContext * const s = &h->s;
3560     int i;
3561
3562     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3563         s->broken_link= get_bits1(gb) -1;
3564         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3565         if(h->mmco[0].long_arg == -1)
3566             h->mmco_index= 0;
3567         else{
3568             h->mmco[0].opcode= MMCO_LONG;
3569             h->mmco_index= 1;
3570         }
3571     }else{
3572         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3573             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3574                 MMCOOpcode opcode= get_ue_golomb(gb);
3575
3576                 h->mmco[i].opcode= opcode;
3577                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3578                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3579 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3580                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3581                         return -1;
3582                     }*/
3583                 }
3584                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3585                     unsigned int long_arg= get_ue_golomb(gb);
3586                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3587                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3588                         return -1;
3589                     }
3590                     h->mmco[i].long_arg= long_arg;
3591                 }
3592
3593                 if(opcode > (unsigned)MMCO_LONG){
3594                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3595                     return -1;
3596                 }
3597                 if(opcode == MMCO_END)
3598                     break;
3599             }
3600             h->mmco_index= i;
3601         }else{
3602             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3603
3604             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3605                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3606                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3607                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3608                 h->mmco_index= 1;
3609                 if (FIELD_PICTURE) {
3610                     h->mmco[0].short_pic_num *= 2;
3611                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3612                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3613                     h->mmco_index= 2;
3614                 }
3615             }else
3616                 h->mmco_index= 0;
3617         }
3618     }
3619
3620     return 0;
3621 }
3622
3623 static int init_poc(H264Context *h){
3624     MpegEncContext * const s = &h->s;
3625     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3626     int field_poc[2];
3627
3628     if(h->nal_unit_type == NAL_IDR_SLICE){
3629         h->frame_num_offset= 0;
3630     }else{
3631         if(h->frame_num < h->prev_frame_num)
3632             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3633         else
3634             h->frame_num_offset= h->prev_frame_num_offset;
3635     }
3636
3637     if(h->sps.poc_type==0){
3638         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3639
3640         if(h->nal_unit_type == NAL_IDR_SLICE){
3641              h->prev_poc_msb=
3642              h->prev_poc_lsb= 0;
3643         }
3644
3645         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3646             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3647         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3648             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3649         else
3650             h->poc_msb = h->prev_poc_msb;
3651 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3652         field_poc[0] =
3653         field_poc[1] = h->poc_msb + h->poc_lsb;
3654         if(s->picture_structure == PICT_FRAME)
3655             field_poc[1] += h->delta_poc_bottom;
3656     }else if(h->sps.poc_type==1){
3657         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3658         int i;
3659
3660         if(h->sps.poc_cycle_length != 0)
3661             abs_frame_num = h->frame_num_offset + h->frame_num;
3662         else
3663             abs_frame_num = 0;
3664
3665         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3666             abs_frame_num--;
3667
3668         expected_delta_per_poc_cycle = 0;
3669         for(i=0; i < h->sps.poc_cycle_length; i++)
3670             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3671
3672         if(abs_frame_num > 0){
3673             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3674             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3675
3676             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3677             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3678                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3679         } else
3680             expectedpoc = 0;
3681
3682         if(h->nal_ref_idc == 0)
3683             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3684
3685         field_poc[0] = expectedpoc + h->delta_poc[0];
3686         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3687
3688         if(s->picture_structure == PICT_FRAME)
3689             field_poc[1] += h->delta_poc[1];
3690     }else{
3691         int poc;
3692         if(h->nal_unit_type == NAL_IDR_SLICE){
3693             poc= 0;
3694         }else{
3695             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3696             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3697         }
3698         field_poc[0]= poc;
3699         field_poc[1]= poc;
3700     }
3701
3702     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3703         s->current_picture_ptr->field_poc[0]= field_poc[0];
3704         s->current_picture_ptr->poc = field_poc[0];
3705     }
3706     if(s->picture_structure != PICT_TOP_FIELD) {
3707         s->current_picture_ptr->field_poc[1]= field_poc[1];
3708         s->current_picture_ptr->poc = field_poc[1];
3709     }
3710     if(!FIELD_PICTURE || !s->first_field) {
3711         Picture *cur = s->current_picture_ptr;
3712         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3713     }
3714
3715     return 0;
3716 }
3717
3718
3719 /**
3720  * initialize scan tables
3721  */
3722 static void init_scan_tables(H264Context *h){
3723     MpegEncContext * const s = &h->s;
3724     int i;
3725     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3726         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3727         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3728     }else{
3729         for(i=0; i<16; i++){
3730 #define T(x) (x>>2) | ((x<<2) & 0xF)
3731             h->zigzag_scan[i] = T(zigzag_scan[i]);
3732             h-> field_scan[i] = T( field_scan[i]);
3733 #undef T
3734         }
3735     }
3736     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3737         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3738         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3739         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3740         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3741     }else{
3742         for(i=0; i<64; i++){
3743 #define T(x) (x>>3) | ((x&7)<<3)
3744             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3745             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3746             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3747             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3748 #undef T
3749         }
3750     }
3751     if(h->sps.transform_bypass){ //FIXME same ugly
3752         h->zigzag_scan_q0          = zigzag_scan;
3753         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3754         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3755         h->field_scan_q0           = field_scan;
3756         h->field_scan8x8_q0        = field_scan8x8;
3757         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3758     }else{
3759         h->zigzag_scan_q0          = h->zigzag_scan;
3760         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3761         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3762         h->field_scan_q0           = h->field_scan;
3763         h->field_scan8x8_q0        = h->field_scan8x8;
3764         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3765     }
3766 }
3767
3768 /**
3769  * Replicates H264 "master" context to thread contexts.
3770  */
3771 static void clone_slice(H264Context *dst, H264Context *src)
3772 {
3773     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3774     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3775     dst->s.current_picture      = src->s.current_picture;
3776     dst->s.linesize             = src->s.linesize;
3777     dst->s.uvlinesize           = src->s.uvlinesize;
3778     dst->s.first_field          = src->s.first_field;
3779
3780     dst->prev_poc_msb           = src->prev_poc_msb;
3781     dst->prev_poc_lsb           = src->prev_poc_lsb;
3782     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3783     dst->prev_frame_num         = src->prev_frame_num;
3784     dst->short_ref_count        = src->short_ref_count;
3785
3786     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3787     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3788     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3789     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3790
3791     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3792     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3793 }
3794
3795 /**
3796  * decodes a slice header.
3797  * this will allso call MPV_common_init() and frame_start() as needed
3798  *
3799  * @param h h264context
3800  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3801  *
3802  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3803  */
3804 static int decode_slice_header(H264Context *h, H264Context *h0){
3805     MpegEncContext * const s = &h->s;
3806     MpegEncContext * const s0 = &h0->s;
3807     unsigned int first_mb_in_slice;
3808     unsigned int pps_id;
3809     int num_ref_idx_active_override_flag;
3810     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3811     unsigned int slice_type, tmp, i;
3812     int default_ref_list_done = 0;
3813     int last_pic_structure;
3814
3815     s->dropable= h->nal_ref_idc == 0;
3816
3817     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3818         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3819         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3820     }else{
3821         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3822         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3823     }
3824
3825     first_mb_in_slice= get_ue_golomb(&s->gb);
3826
3827     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3828         h0->current_slice = 0;
3829         if (!s0->first_field)
3830             s->current_picture_ptr= NULL;
3831     }
3832
3833     slice_type= get_ue_golomb(&s->gb);
3834     if(slice_type > 9){
3835         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3836         return -1;
3837     }
3838     if(slice_type > 4){
3839         slice_type -= 5;
3840         h->slice_type_fixed=1;
3841     }else
3842         h->slice_type_fixed=0;
3843
3844     slice_type= slice_type_map[ slice_type ];
3845     if (slice_type == I_TYPE
3846         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3847         default_ref_list_done = 1;
3848     }
3849     h->slice_type= slice_type;
3850
3851     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3852     if (s->pict_type == B_TYPE && s->last_picture_ptr == NULL) {
3853         av_log(h->s.avctx, AV_LOG_ERROR,
3854                "B picture before any references, skipping\n");
3855         return -1;
3856     }
3857
3858     pps_id= get_ue_golomb(&s->gb);
3859     if(pps_id>=MAX_PPS_COUNT){
3860         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3861         return -1;
3862     }
3863     if(!h0->pps_buffers[pps_id]) {
3864         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3865         return -1;
3866     }
3867     h->pps= *h0->pps_buffers[pps_id];
3868
3869     if(!h0->sps_buffers[h->pps.sps_id]) {
3870         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3871         return -1;
3872     }
3873     h->sps = *h0->sps_buffers[h->pps.sps_id];
3874
3875     if(h == h0 && h->dequant_coeff_pps != pps_id){
3876         h->dequant_coeff_pps = pps_id;
3877         init_dequant_tables(h);
3878     }
3879
3880     s->mb_width= h->sps.mb_width;
3881     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3882
3883     h->b_stride=  s->mb_width*4;
3884     h->b8_stride= s->mb_width*2;
3885
3886     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3887     if(h->sps.frame_mbs_only_flag)
3888         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3889     else
3890         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3891
3892     if (s->context_initialized
3893         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3894         if(h != h0)
3895             return -1;   // width / height changed during parallelized decoding
3896         free_tables(h);
3897         MPV_common_end(s);
3898     }
3899     if (!s->context_initialized) {
3900         if(h != h0)
3901             return -1;  // we cant (re-)initialize context during parallel decoding
3902         if (MPV_common_init(s) < 0)
3903             return -1;
3904         s->first_field = 0;
3905
3906         init_scan_tables(h);
3907         alloc_tables(h);
3908
3909         for(i = 1; i < s->avctx->thread_count; i++) {
3910             H264Context *c;
3911             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3912             memcpy(c, h, sizeof(MpegEncContext));
3913             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3914             c->sps = h->sps;
3915             c->pps = h->pps;
3916             init_scan_tables(c);
3917             clone_tables(c, h);
3918         }
3919
3920         for(i = 0; i < s->avctx->thread_count; i++)
3921             if(context_init(h->thread_context[i]) < 0)
3922                 return -1;
3923
3924         s->avctx->width = s->width;
3925         s->avctx->height = s->height;
3926         s->avctx->sample_aspect_ratio= h->sps.sar;
3927         if(!s->avctx->sample_aspect_ratio.den)
3928             s->avctx->sample_aspect_ratio.den = 1;
3929
3930         if(h->sps.timing_info_present_flag){
3931             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3932             if(h->x264_build > 0 && h->x264_build < 44)
3933                 s->avctx->time_base.den *= 2;
3934             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3935                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3936         }
3937     }
3938
3939     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3940
3941     h->mb_mbaff = 0;
3942     h->mb_aff_frame = 0;
3943     last_pic_structure = s0->picture_structure;
3944     if(h->sps.frame_mbs_only_flag){
3945         s->picture_structure= PICT_FRAME;
3946     }else{
3947         if(get_bits1(&s->gb)) { //field_pic_flag
3948             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3949         } else {
3950             s->picture_structure= PICT_FRAME;
3951             h->mb_aff_frame = h->sps.mb_aff;
3952         }
3953     }
3954
3955     if(h0->current_slice == 0){
3956         /* See if we have a decoded first field looking for a pair... */
3957         if (s0->first_field) {
3958             assert(s0->current_picture_ptr);
3959             assert(s0->current_picture_ptr->data[0]);
3960             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3961
3962             /* figure out if we have a complementary field pair */
3963             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3964                 /*
3965                  * Previous field is unmatched. Don't display it, but let it
3966                  * remain for reference if marked as such.
3967                  */
3968                 s0->current_picture_ptr = NULL;
3969                 s0->first_field = FIELD_PICTURE;
3970
3971             } else {
3972                 if (h->nal_ref_idc &&
3973                         s0->current_picture_ptr->reference &&
3974                         s0->current_picture_ptr->frame_num != h->frame_num) {
3975                     /*
3976                      * This and previous field were reference, but had
3977                      * different frame_nums. Consider this field first in
3978                      * pair. Throw away previous field except for reference
3979                      * purposes.
3980                      */
3981                     s0->first_field = 1;
3982                     s0->current_picture_ptr = NULL;
3983
3984                 } else {
3985                     /* Second field in complementary pair */
3986                     s0->first_field = 0;
3987                 }
3988             }
3989
3990         } else {
3991             /* Frame or first field in a potentially complementary pair */
3992             assert(!s0->current_picture_ptr);
3993             s0->first_field = FIELD_PICTURE;
3994         }
3995
3996         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3997             s0->first_field = 0;
3998             return -1;
3999         }
4000     }
4001     if(h != h0)
4002         clone_slice(h, h0);
4003
4004     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4005
4006     assert(s->mb_num == s->mb_width * s->mb_height);
4007     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4008        first_mb_in_slice                    >= s->mb_num){
4009         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4010         return -1;
4011     }
4012     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4013     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4014     if (s->picture_structure == PICT_BOTTOM_FIELD)
4015         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4016     assert(s->mb_y < s->mb_height);
4017
4018     if(s->picture_structure==PICT_FRAME){
4019         h->curr_pic_num=   h->frame_num;
4020         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4021     }else{
4022         h->curr_pic_num= 2*h->frame_num + 1;
4023         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4024     }
4025
4026     if(h->nal_unit_type == NAL_IDR_SLICE){
4027         get_ue_golomb(&s->gb); /* idr_pic_id */
4028     }
4029
4030     if(h->sps.poc_type==0){
4031         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4032
4033         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4034             h->delta_poc_bottom= get_se_golomb(&s->gb);
4035         }
4036     }
4037
4038     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4039         h->delta_poc[0]= get_se_golomb(&s->gb);
4040
4041         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4042             h->delta_poc[1]= get_se_golomb(&s->gb);
4043     }
4044
4045     init_poc(h);
4046
4047     if(h->pps.redundant_pic_cnt_present){
4048         h->redundant_pic_count= get_ue_golomb(&s->gb);
4049     }
4050
4051     //set defaults, might be overriden a few line later
4052     h->ref_count[0]= h->pps.ref_count[0];
4053     h->ref_count[1]= h->pps.ref_count[1];
4054
4055     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4056         if(h->slice_type == B_TYPE){
4057             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4058             if(FIELD_OR_MBAFF_PICTURE && h->direct_spatial_mv_pred)
4059                 av_log(h->s.avctx, AV_LOG_ERROR, "Interlaced pictures + spatial direct mode is not implemented\n");
4060         }
4061         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4062
4063         if(num_ref_idx_active_override_flag){
4064             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4065             if(h->slice_type==B_TYPE)
4066                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4067
4068             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4069                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4070                 h->ref_count[0]= h->ref_count[1]= 1;
4071                 return -1;
4072             }
4073         }
4074         if(h->slice_type == B_TYPE)
4075             h->list_count= 2;
4076         else
4077             h->list_count= 1;
4078     }else
4079         h->list_count= 0;
4080
4081     if(!default_ref_list_done){
4082         fill_default_ref_list(h);
4083     }
4084
4085     if(decode_ref_pic_list_reordering(h) < 0)
4086         return -1;
4087
4088     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4089        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4090         pred_weight_table(h);
4091     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4092         implicit_weight_table(h);
4093     else
4094         h->use_weight = 0;
4095
4096     if(h->nal_ref_idc)
4097         decode_ref_pic_marking(h0, &s->gb);
4098
4099     if(FRAME_MBAFF)
4100         fill_mbaff_ref_list(h);
4101
4102     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac ){
4103         tmp = get_ue_golomb(&s->gb);
4104         if(tmp > 2){
4105             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4106             return -1;
4107         }
4108         h->cabac_init_idc= tmp;
4109     }
4110
4111     h->last_qscale_diff = 0;
4112     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4113     if(tmp>51){
4114         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4115         return -1;
4116     }
4117     s->qscale= tmp;
4118     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4119     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4120     //FIXME qscale / qp ... stuff
4121     if(h->slice_type == SP_TYPE){
4122         get_bits1(&s->gb); /* sp_for_switch_flag */
4123     }
4124     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4125         get_se_golomb(&s->gb); /* slice_qs_delta */
4126     }
4127
4128     h->deblocking_filter = 1;
4129     h->slice_alpha_c0_offset = 0;
4130     h->slice_beta_offset = 0;
4131     if( h->pps.deblocking_filter_parameters_present ) {
4132         tmp= get_ue_golomb(&s->gb);
4133         if(tmp > 2){
4134             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4135             return -1;
4136         }
4137         h->deblocking_filter= tmp;
4138         if(h->deblocking_filter < 2)
4139             h->deblocking_filter^= 1; // 1<->0
4140
4141         if( h->deblocking_filter ) {
4142             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4143             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4144         }
4145     }
4146
4147     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4148        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4149        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4150        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4151         h->deblocking_filter= 0;
4152
4153     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4154         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4155             /* Cheat slightly for speed:
4156                Do not bother to deblock across slices. */
4157             h->deblocking_filter = 2;
4158         } else {
4159             h0->max_contexts = 1;
4160             if(!h0->single_decode_warning) {
4161                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4162                 h0->single_decode_warning = 1;
4163             }
4164             if(h != h0)
4165                 return 1; // deblocking switched inside frame
4166         }
4167     }
4168
4169 #if 0 //FMO
4170     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4171         slice_group_change_cycle= get_bits(&s->gb, ?);
4172 #endif
4173
4174     h0->last_slice_type = slice_type;
4175     h->slice_num = ++h0->current_slice;
4176
4177     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4178     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4179
4180     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4181         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4182                h->slice_num,
4183                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4184                first_mb_in_slice,
4185                av_get_pict_type_char(h->slice_type),
4186                pps_id, h->frame_num,
4187                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4188                h->ref_count[0], h->ref_count[1],
4189                s->qscale,
4190                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4191                h->use_weight,
4192                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4193                );
4194     }
4195
4196     return 0;
4197 }
4198
4199 /**
4200  *
4201  */
4202 static inline int get_level_prefix(GetBitContext *gb){
4203     unsigned int buf;
4204     int log;
4205
4206     OPEN_READER(re, gb);
4207     UPDATE_CACHE(re, gb);
4208     buf=GET_CACHE(re, gb);
4209
4210     log= 32 - av_log2(buf);
4211 #ifdef TRACE
4212     print_bin(buf>>(32-log), log);
4213     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4214 #endif
4215
4216     LAST_SKIP_BITS(re, gb, log);
4217     CLOSE_READER(re, gb);
4218
4219     return log-1;
4220 }
4221
4222 static inline int get_dct8x8_allowed(H264Context *h){
4223     int i;
4224     for(i=0; i<4; i++){
4225         if(!IS_SUB_8X8(h->sub_mb_type[i])
4226            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4227             return 0;
4228     }
4229     return 1;
4230 }
4231
4232 /**
4233  * decodes a residual block.
4234  * @param n block index
4235  * @param scantable scantable
4236  * @param max_coeff number of coefficients in the block
4237  * @return <0 if an error occured
4238  */
4239 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4240     MpegEncContext * const s = &h->s;
4241     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4242     int level[16];
4243     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4244
4245     //FIXME put trailing_onex into the context
4246
4247     if(n == CHROMA_DC_BLOCK_INDEX){
4248         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4249         total_coeff= coeff_token>>2;
4250     }else{
4251         if(n == LUMA_DC_BLOCK_INDEX){
4252             total_coeff= pred_non_zero_count(h, 0);
4253             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4254             total_coeff= coeff_token>>2;
4255         }else{
4256             total_coeff= pred_non_zero_count(h, n);
4257             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4258             total_coeff= coeff_token>>2;
4259             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4260         }
4261     }
4262
4263     //FIXME set last_non_zero?
4264
4265     if(total_coeff==0)
4266         return 0;
4267     if(total_coeff > (unsigned)max_coeff) {
4268         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4269         return -1;
4270     }
4271
4272     trailing_ones= coeff_token&3;
4273     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4274     assert(total_coeff<=16);
4275
4276     for(i=0; i<trailing_ones; i++){
4277         level[i]= 1 - 2*get_bits1(gb);
4278     }
4279
4280     if(i<total_coeff) {
4281         int level_code, mask;
4282         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4283         int prefix= get_level_prefix(gb);
4284
4285         //first coefficient has suffix_length equal to 0 or 1
4286         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4287             if(suffix_length)
4288                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4289             else
4290                 level_code= (prefix<<suffix_length); //part
4291         }else if(prefix==14){
4292             if(suffix_length)
4293                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4294             else
4295                 level_code= prefix + get_bits(gb, 4); //part
4296         }else if(prefix==15){
4297             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4298             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4299         }else{
4300             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4301             return -1;
4302         }
4303
4304         if(trailing_ones < 3) level_code += 2;
4305
4306         suffix_length = 1;
4307         if(level_code > 5)
4308             suffix_length++;
4309         mask= -(level_code&1);
4310         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4311         i++;
4312
4313         //remaining coefficients have suffix_length > 0
4314         for(;i<total_coeff;i++) {
4315             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4316             prefix = get_level_prefix(gb);
4317             if(prefix<15){
4318                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4319             }else if(prefix==15){
4320                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4321             }else{
4322                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4323                 return -1;
4324             }
4325             mask= -(level_code&1);
4326             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4327             if(level_code > suffix_limit[suffix_length])
4328                 suffix_length++;
4329         }
4330     }
4331
4332     if(total_coeff == max_coeff)
4333         zeros_left=0;
4334     else{
4335         if(n == CHROMA_DC_BLOCK_INDEX)
4336             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4337         else
4338             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4339     }
4340
4341     coeff_num = zeros_left + total_coeff - 1;
4342     j = scantable[coeff_num];
4343     if(n > 24){
4344         block[j] = level[0];
4345         for(i=1;i<total_coeff;i++) {
4346             if(zeros_left <= 0)
4347                 run_before = 0;
4348             else if(zeros_left < 7){
4349                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4350             }else{
4351                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4352             }
4353             zeros_left -= run_before;
4354             coeff_num -= 1 + run_before;
4355             j= scantable[ coeff_num ];
4356
4357             block[j]= level[i];
4358         }
4359     }else{
4360         block[j] = (level[0] * qmul[j] + 32)>>6;
4361         for(i=1;i<total_coeff;i++) {
4362             if(zeros_left <= 0)
4363                 run_before = 0;
4364             else if(zeros_left < 7){
4365                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4366             }else{
4367                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4368             }
4369             zeros_left -= run_before;
4370             coeff_num -= 1 + run_before;
4371             j= scantable[ coeff_num ];
4372
4373             block[j]= (level[i] * qmul[j] + 32)>>6;
4374         }
4375     }
4376
4377     if(zeros_left<0){
4378         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4379         return -1;
4380     }
4381
4382     return 0;
4383 }
4384
4385 static void predict_field_decoding_flag(H264Context *h){
4386     MpegEncContext * const s = &h->s;
4387     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4388     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4389                 ? s->current_picture.mb_type[mb_xy-1]
4390                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4391                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4392                 : 0;
4393     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4394 }
4395
4396 /**
4397  * decodes a P_SKIP or B_SKIP macroblock
4398  */
4399 static void decode_mb_skip(H264Context *h){
4400     MpegEncContext * const s = &h->s;
4401     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4402     int mb_type=0;
4403
4404     memset(h->non_zero_count[mb_xy], 0, 16);
4405     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4406
4407     if(MB_FIELD)
4408         mb_type|= MB_TYPE_INTERLACED;
4409
4410     if( h->slice_type == B_TYPE )
4411     {
4412         // just for fill_caches. pred_direct_motion will set the real mb_type
4413         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4414
4415         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4416         pred_direct_motion(h, &mb_type);
4417         mb_type|= MB_TYPE_SKIP;
4418     }
4419     else
4420     {
4421         int mx, my;
4422         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4423
4424         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4425         pred_pskip_motion(h, &mx, &my);
4426         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4427         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4428     }
4429
4430     write_back_motion(h, mb_type);
4431     s->current_picture.mb_type[mb_xy]= mb_type;
4432     s->current_picture.qscale_table[mb_xy]= s->qscale;
4433     h->slice_table[ mb_xy ]= h->slice_num;
4434     h->prev_mb_skipped= 1;
4435 }
4436
4437 /**
4438  * decodes a macroblock
4439  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4440  */
4441 static int decode_mb_cavlc(H264Context *h){
4442     MpegEncContext * const s = &h->s;
4443     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4444     int partition_count;
4445     unsigned int mb_type, cbp;
4446     int dct8x8_allowed= h->pps.transform_8x8_mode;
4447
4448     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4449
4450     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4451     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4452                 down the code */
4453     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4454         if(s->mb_skip_run==-1)
4455             s->mb_skip_run= get_ue_golomb(&s->gb);
4456
4457         if (s->mb_skip_run--) {
4458             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4459                 if(s->mb_skip_run==0)
4460                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4461                 else
4462                     predict_field_decoding_flag(h);
4463             }
4464             decode_mb_skip(h);
4465             return 0;
4466         }
4467     }
4468     if(FRAME_MBAFF){
4469         if( (s->mb_y&1) == 0 )
4470             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4471     }else
4472         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4473
4474     h->prev_mb_skipped= 0;
4475
4476     mb_type= get_ue_golomb(&s->gb);
4477     if(h->slice_type == B_TYPE){
4478         if(mb_type < 23){
4479             partition_count= b_mb_type_info[mb_type].partition_count;
4480             mb_type=         b_mb_type_info[mb_type].type;
4481         }else{
4482             mb_type -= 23;
4483             goto decode_intra_mb;
4484         }
4485     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4486         if(mb_type < 5){
4487             partition_count= p_mb_type_info[mb_type].partition_count;
4488             mb_type=         p_mb_type_info[mb_type].type;
4489         }else{
4490             mb_type -= 5;
4491             goto decode_intra_mb;
4492         }
4493     }else{
4494        assert(h->slice_type == I_TYPE);
4495 decode_intra_mb:
4496         if(mb_type > 25){
4497             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4498             return -1;
4499         }
4500         partition_count=0;
4501         cbp= i_mb_type_info[mb_type].cbp;
4502         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4503         mb_type= i_mb_type_info[mb_type].type;
4504     }
4505
4506     if(MB_FIELD)
4507         mb_type |= MB_TYPE_INTERLACED;
4508
4509     h->slice_table[ mb_xy ]= h->slice_num;
4510
4511     if(IS_INTRA_PCM(mb_type)){
4512         unsigned int x, y;
4513
4514         // We assume these blocks are very rare so we do not optimize it.
4515         align_get_bits(&s->gb);
4516
4517         // The pixels are stored in the same order as levels in h->mb array.
4518         for(y=0; y<16; y++){
4519             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4520             for(x=0; x<16; x++){
4521                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4522                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4523             }
4524         }
4525         for(y=0; y<8; y++){
4526             const int index= 256 + 4*(y&3) + 32*(y>>2);
4527             for(x=0; x<8; x++){
4528                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4529                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4530             }
4531         }
4532         for(y=0; y<8; y++){
4533             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4534             for(x=0; x<8; x++){
4535                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4536                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4537             }
4538         }
4539
4540         // In deblocking, the quantizer is 0
4541         s->current_picture.qscale_table[mb_xy]= 0;
4542         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4543         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4544         // All coeffs are present
4545         memset(h->non_zero_count[mb_xy], 16, 16);
4546
4547         s->current_picture.mb_type[mb_xy]= mb_type;
4548         return 0;
4549     }
4550
4551     if(MB_MBAFF){
4552         h->ref_count[0] <<= 1;
4553         h->ref_count[1] <<= 1;
4554     }
4555
4556     fill_caches(h, mb_type, 0);
4557
4558     //mb_pred
4559     if(IS_INTRA(mb_type)){
4560             int pred_mode;
4561 //            init_top_left_availability(h);
4562             if(IS_INTRA4x4(mb_type)){
4563                 int i;
4564                 int di = 1;
4565                 if(dct8x8_allowed && get_bits1(&s->gb)){
4566                     mb_type |= MB_TYPE_8x8DCT;
4567                     di = 4;
4568                 }
4569
4570 //                fill_intra4x4_pred_table(h);
4571                 for(i=0; i<16; i+=di){
4572                     int mode= pred_intra_mode(h, i);
4573
4574                     if(!get_bits1(&s->gb)){
4575                         const int rem_mode= get_bits(&s->gb, 3);
4576                         mode = rem_mode + (rem_mode >= mode);
4577                     }
4578
4579                     if(di==4)
4580                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4581                     else
4582                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4583                 }
4584                 write_back_intra_pred_mode(h);
4585                 if( check_intra4x4_pred_mode(h) < 0)
4586                     return -1;
4587             }else{
4588                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4589                 if(h->intra16x16_pred_mode < 0)
4590                     return -1;
4591             }
4592
4593             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4594             if(pred_mode < 0)
4595                 return -1;
4596             h->chroma_pred_mode= pred_mode;
4597     }else if(partition_count==4){
4598         int i, j, sub_partition_count[4], list, ref[2][4];
4599
4600         if(h->slice_type == B_TYPE){
4601             for(i=0; i<4; i++){
4602                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4603                 if(h->sub_mb_type[i] >=13){
4604                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4605                     return -1;
4606                 }
4607                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4608                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4609             }
4610             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4611                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4612                 pred_direct_motion(h, &mb_type);
4613                 h->ref_cache[0][scan8[4]] =
4614                 h->ref_cache[1][scan8[4]] =
4615                 h->ref_cache[0][scan8[12]] =
4616                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4617             }
4618         }else{
4619             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4620             for(i=0; i<4; i++){
4621                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4622                 if(h->sub_mb_type[i] >=4){
4623                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4624                     return -1;
4625                 }
4626                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4627                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4628             }
4629         }
4630
4631         for(list=0; list<h->list_count; list++){
4632             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4633             for(i=0; i<4; i++){
4634                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4635                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4636                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4637                     if(tmp>=ref_count){
4638                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4639                         return -1;
4640                     }
4641                     ref[list][i]= tmp;
4642                 }else{
4643                  //FIXME
4644                     ref[list][i] = -1;
4645                 }
4646             }
4647         }
4648
4649         if(dct8x8_allowed)
4650             dct8x8_allowed = get_dct8x8_allowed(h);
4651
4652         for(list=0; list<h->list_count; list++){
4653             for(i=0; i<4; i++){
4654                 if(IS_DIRECT(h->sub_mb_type[i])) {
4655                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4656                     continue;
4657                 }
4658                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4659                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4660
4661                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4662                     const int sub_mb_type= h->sub_mb_type[i];
4663                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4664                     for(j=0; j<sub_partition_count[i]; j++){
4665                         int mx, my;
4666                         const int index= 4*i + block_width*j;
4667                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4668                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4669                         mx += get_se_golomb(&s->gb);
4670                         my += get_se_golomb(&s->gb);
4671                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4672
4673                         if(IS_SUB_8X8(sub_mb_type)){
4674                             mv_cache[ 1 ][0]=
4675                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4676                             mv_cache[ 1 ][1]=
4677                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4678                         }else if(IS_SUB_8X4(sub_mb_type)){
4679                             mv_cache[ 1 ][0]= mx;
4680                             mv_cache[ 1 ][1]= my;
4681                         }else if(IS_SUB_4X8(sub_mb_type)){
4682                             mv_cache[ 8 ][0]= mx;
4683                             mv_cache[ 8 ][1]= my;
4684                         }
4685                         mv_cache[ 0 ][0]= mx;
4686                         mv_cache[ 0 ][1]= my;
4687                     }
4688                 }else{
4689                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4690                     p[0] = p[1]=
4691                     p[8] = p[9]= 0;
4692                 }
4693             }
4694         }
4695     }else if(IS_DIRECT(mb_type)){
4696         pred_direct_motion(h, &mb_type);
4697         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4698     }else{
4699         int list, mx, my, i;
4700          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4701         if(IS_16X16(mb_type)){
4702             for(list=0; list<h->list_count; list++){
4703                     unsigned int val;
4704                     if(IS_DIR(mb_type, 0, list)){
4705                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4706                         if(val >= h->ref_count[list]){
4707                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4708                             return -1;
4709                         }
4710                     }else
4711                         val= LIST_NOT_USED&0xFF;
4712                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4713             }
4714             for(list=0; list<h->list_count; list++){
4715                 unsigned int val;
4716                 if(IS_DIR(mb_type, 0, list)){
4717                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4718                     mx += get_se_golomb(&s->gb);
4719                     my += get_se_golomb(&s->gb);
4720                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4721
4722                     val= pack16to32(mx,my);
4723                 }else
4724                     val=0;
4725                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4726             }
4727         }
4728         else if(IS_16X8(mb_type)){
4729             for(list=0; list<h->list_count; list++){
4730                     for(i=0; i<2; i++){
4731                         unsigned int val;
4732                         if(IS_DIR(mb_type, i, list)){
4733                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4734                             if(val >= h->ref_count[list]){
4735                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4736                                 return -1;
4737                             }
4738                         }else
4739                             val= LIST_NOT_USED&0xFF;
4740                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4741                     }
4742             }
4743             for(list=0; list<h->list_count; list++){
4744                 for(i=0; i<2; i++){
4745                     unsigned int val;
4746                     if(IS_DIR(mb_type, i, list)){
4747                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4748                         mx += get_se_golomb(&s->gb);
4749                         my += get_se_golomb(&s->gb);
4750                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4751
4752                         val= pack16to32(mx,my);
4753                     }else
4754                         val=0;
4755                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4756                 }
4757             }
4758         }else{
4759             assert(IS_8X16(mb_type));
4760             for(list=0; list<h->list_count; list++){
4761                     for(i=0; i<2; i++){
4762                         unsigned int val;
4763                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4764                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4765                             if(val >= h->ref_count[list]){
4766                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4767                                 return -1;
4768                             }
4769                         }else
4770                             val= LIST_NOT_USED&0xFF;
4771                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4772                     }
4773             }
4774             for(list=0; list<h->list_count; list++){
4775                 for(i=0; i<2; i++){
4776                     unsigned int val;
4777                     if(IS_DIR(mb_type, i, list)){
4778                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4779                         mx += get_se_golomb(&s->gb);
4780                         my += get_se_golomb(&s->gb);
4781                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4782
4783                         val= pack16to32(mx,my);
4784                     }else
4785                         val=0;
4786                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4787                 }
4788             }
4789         }
4790     }
4791
4792     if(IS_INTER(mb_type))
4793         write_back_motion(h, mb_type);
4794
4795     if(!IS_INTRA16x16(mb_type)){
4796         cbp= get_ue_golomb(&s->gb);
4797         if(cbp > 47){
4798             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4799             return -1;
4800         }
4801
4802         if(IS_INTRA4x4(mb_type))
4803             cbp= golomb_to_intra4x4_cbp[cbp];
4804         else
4805             cbp= golomb_to_inter_cbp[cbp];
4806     }
4807     h->cbp = cbp;
4808
4809     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4810         if(get_bits1(&s->gb))
4811             mb_type |= MB_TYPE_8x8DCT;
4812     }
4813     s->current_picture.mb_type[mb_xy]= mb_type;
4814
4815     if(cbp || IS_INTRA16x16(mb_type)){
4816         int i8x8, i4x4, chroma_idx;
4817         int dquant;
4818         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4819         const uint8_t *scan, *scan8x8, *dc_scan;
4820
4821 //        fill_non_zero_count_cache(h);
4822
4823         if(IS_INTERLACED(mb_type)){
4824             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4825             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4826             dc_scan= luma_dc_field_scan;
4827         }else{
4828             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4829             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4830             dc_scan= luma_dc_zigzag_scan;
4831         }
4832
4833         dquant= get_se_golomb(&s->gb);
4834
4835         if( dquant > 25 || dquant < -26 ){
4836             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4837             return -1;
4838         }
4839
4840         s->qscale += dquant;
4841         if(((unsigned)s->qscale) > 51){
4842             if(s->qscale<0) s->qscale+= 52;
4843             else            s->qscale-= 52;
4844         }
4845
4846         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4847         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4848         if(IS_INTRA16x16(mb_type)){
4849             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4850                 return -1; //FIXME continue if partitioned and other return -1 too
4851             }
4852
4853             assert((cbp&15) == 0 || (cbp&15) == 15);
4854
4855             if(cbp&15){
4856                 for(i8x8=0; i8x8<4; i8x8++){
4857                     for(i4x4=0; i4x4<4; i4x4++){
4858                         const int index= i4x4 + 4*i8x8;
4859                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4860                             return -1;
4861                         }
4862                     }
4863                 }
4864             }else{
4865                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4866             }
4867         }else{
4868             for(i8x8=0; i8x8<4; i8x8++){
4869                 if(cbp & (1<<i8x8)){
4870                     if(IS_8x8DCT(mb_type)){
4871                         DCTELEM *buf = &h->mb[64*i8x8];
4872                         uint8_t *nnz;
4873                         for(i4x4=0; i4x4<4; i4x4++){
4874                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4875                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4876                                 return -1;
4877                         }
4878                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4879                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4880                     }else{
4881                         for(i4x4=0; i4x4<4; i4x4++){
4882                             const int index= i4x4 + 4*i8x8;
4883
4884                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4885                                 return -1;
4886                             }
4887                         }
4888                     }
4889                 }else{
4890                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4891                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4892                 }
4893             }
4894         }
4895
4896         if(cbp&0x30){
4897             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4898                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4899                     return -1;
4900                 }
4901         }
4902
4903         if(cbp&0x20){
4904             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4905                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4906                 for(i4x4=0; i4x4<4; i4x4++){
4907                     const int index= 16 + 4*chroma_idx + i4x4;
4908                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4909                         return -1;
4910                     }
4911                 }
4912             }
4913         }else{
4914             uint8_t * const nnz= &h->non_zero_count_cache[0];
4915             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4916             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4917         }
4918     }else{
4919         uint8_t * const nnz= &h->non_zero_count_cache[0];
4920         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4921         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4922         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4923     }
4924     s->current_picture.qscale_table[mb_xy]= s->qscale;
4925     write_back_non_zero_count(h);
4926
4927     if(MB_MBAFF){
4928         h->ref_count[0] >>= 1;
4929         h->ref_count[1] >>= 1;
4930     }
4931
4932     return 0;
4933 }
4934
4935 static int decode_cabac_field_decoding_flag(H264Context *h) {
4936     MpegEncContext * const s = &h->s;
4937     const int mb_x = s->mb_x;
4938     const int mb_y = s->mb_y & ~1;
4939     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4940     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4941
4942     unsigned int ctx = 0;
4943
4944     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4945         ctx += 1;
4946     }
4947     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4948         ctx += 1;
4949     }
4950
4951     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4952 }
4953
4954 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4955     uint8_t *state= &h->cabac_state[ctx_base];
4956     int mb_type;
4957
4958     if(intra_slice){
4959         MpegEncContext * const s = &h->s;
4960         const int mba_xy = h->left_mb_xy[0];
4961         const int mbb_xy = h->top_mb_xy;
4962         int ctx=0;
4963         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4964             ctx++;
4965         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4966             ctx++;
4967         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4968             return 0;   /* I4x4 */
4969         state += 2;
4970     }else{
4971         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4972             return 0;   /* I4x4 */
4973     }
4974
4975     if( get_cabac_terminate( &h->cabac ) )
4976         return 25;  /* PCM */
4977
4978     mb_type = 1; /* I16x16 */
4979     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4980     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4981         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4982     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4983     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4984     return mb_type;
4985 }
4986
4987 static int decode_cabac_mb_type( H264Context *h ) {
4988     MpegEncContext * const s = &h->s;
4989
4990     if( h->slice_type == I_TYPE ) {
4991         return decode_cabac_intra_mb_type(h, 3, 1);
4992     } else if( h->slice_type == P_TYPE ) {
4993         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4994             /* P-type */
4995             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4996                 /* P_L0_D16x16, P_8x8 */
4997                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4998             } else {
4999                 /* P_L0_D8x16, P_L0_D16x8 */
5000                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5001             }
5002         } else {
5003             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5004         }
5005     } else if( h->slice_type == B_TYPE ) {
5006         const int mba_xy = h->left_mb_xy[0];
5007         const int mbb_xy = h->top_mb_xy;
5008         int ctx = 0;
5009         int bits;
5010
5011         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5012             ctx++;
5013         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5014             ctx++;
5015
5016         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5017             return 0; /* B_Direct_16x16 */
5018
5019         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5020             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5021         }
5022
5023         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5024         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5025         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5026         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5027         if( bits < 8 )
5028             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5029         else if( bits == 13 ) {
5030             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5031         } else if( bits == 14 )
5032             return 11; /* B_L1_L0_8x16 */
5033         else if( bits == 15 )
5034             return 22; /* B_8x8 */
5035
5036         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5037         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5038     } else {
5039         /* TODO SI/SP frames? */
5040         return -1;
5041     }
5042 }
5043
5044 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5045     MpegEncContext * const s = &h->s;
5046     int mba_xy, mbb_xy;
5047     int ctx = 0;
5048
5049     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5050         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5051         mba_xy = mb_xy - 1;
5052         if( (mb_y&1)
5053             && h->slice_table[mba_xy] == h->slice_num
5054             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5055             mba_xy += s->mb_stride;
5056         if( MB_FIELD ){
5057             mbb_xy = mb_xy - s->mb_stride;
5058             if( !(mb_y&1)
5059                 && h->slice_table[mbb_xy] == h->slice_num
5060                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5061                 mbb_xy -= s->mb_stride;
5062         }else
5063             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5064     }else{
5065         int mb_xy = mb_x + mb_y*s->mb_stride;
5066         mba_xy = mb_xy - 1;
5067         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5068     }
5069
5070     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5071         ctx++;
5072     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5073         ctx++;
5074
5075     if( h->slice_type == B_TYPE )
5076         ctx += 13;
5077     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5078 }
5079
5080 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5081     int mode = 0;
5082
5083     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5084         return pred_mode;
5085
5086     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5087     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5088     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5089
5090     if( mode >= pred_mode )
5091         return mode + 1;
5092     else
5093         return mode;
5094 }
5095
5096 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5097     const int mba_xy = h->left_mb_xy[0];
5098     const int mbb_xy = h->top_mb_xy;
5099
5100     int ctx = 0;
5101
5102     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5103     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5104         ctx++;
5105
5106     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5107         ctx++;
5108
5109     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5110         return 0;
5111
5112     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5113         return 1;
5114     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5115         return 2;
5116     else
5117         return 3;
5118 }
5119
5120 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5121     int cbp_b, cbp_a, ctx, cbp = 0;
5122
5123     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5124     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5125
5126     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5127     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5128     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5129     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5130     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5131     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5132     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5133     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5134     return cbp;
5135 }
5136 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5137     int ctx;
5138     int cbp_a, cbp_b;
5139
5140     cbp_a = (h->left_cbp>>4)&0x03;
5141     cbp_b = (h-> top_cbp>>4)&0x03;
5142
5143     ctx = 0;
5144     if( cbp_a > 0 ) ctx++;
5145     if( cbp_b > 0 ) ctx += 2;
5146     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5147         return 0;
5148
5149     ctx = 4;
5150     if( cbp_a == 2 ) ctx++;
5151     if( cbp_b == 2 ) ctx += 2;
5152     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5153 }
5154 static int decode_cabac_mb_dqp( H264Context *h) {
5155     int   ctx = 0;
5156     int   val = 0;
5157
5158     if( h->last_qscale_diff != 0 )
5159         ctx++;
5160
5161     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5162         if( ctx < 2 )
5163             ctx = 2;
5164         else
5165             ctx = 3;
5166         val++;
5167         if(val > 102) //prevent infinite loop
5168             return INT_MIN;
5169     }
5170
5171     if( val&0x01 )
5172         return (val + 1)/2;
5173     else
5174         return -(val + 1)/2;
5175 }
5176 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5177     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5178         return 0;   /* 8x8 */
5179     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5180         return 1;   /* 8x4 */
5181     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5182         return 2;   /* 4x8 */
5183     return 3;       /* 4x4 */
5184 }
5185 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5186     int type;
5187     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5188         return 0;   /* B_Direct_8x8 */
5189     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5190         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5191     type = 3;
5192     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5193         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5194             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5195         type += 4;
5196     }
5197     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5198     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5199     return type;
5200 }
5201
5202 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5203     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5204 }
5205
5206 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5207     int refa = h->ref_cache[list][scan8[n] - 1];
5208     int refb = h->ref_cache[list][scan8[n] - 8];
5209     int ref  = 0;
5210     int ctx  = 0;
5211
5212     if( h->slice_type == B_TYPE) {
5213         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5214             ctx++;
5215         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5216             ctx += 2;
5217     } else {
5218         if( refa > 0 )
5219             ctx++;
5220         if( refb > 0 )
5221             ctx += 2;
5222     }
5223
5224     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5225         ref++;
5226         if( ctx < 4 )
5227             ctx = 4;
5228         else
5229             ctx = 5;
5230         if(ref >= 32 /*h->ref_list[list]*/){
5231             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5232             return 0; //FIXME we should return -1 and check the return everywhere
5233         }
5234     }
5235     return ref;
5236 }
5237
5238 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5239     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5240                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5241     int ctxbase = (l == 0) ? 40 : 47;
5242     int ctx, mvd;
5243
5244     if( amvd < 3 )
5245         ctx = 0;
5246     else if( amvd > 32 )
5247         ctx = 2;
5248     else
5249         ctx = 1;
5250
5251     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5252         return 0;
5253
5254     mvd= 1;
5255     ctx= 3;
5256     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5257         mvd++;
5258         if( ctx < 6 )
5259             ctx++;
5260     }
5261
5262     if( mvd >= 9 ) {
5263         int k = 3;
5264         while( get_cabac_bypass( &h->cabac ) ) {
5265             mvd += 1 << k;
5266             k++;
5267             if(k>24){
5268                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5269                 return INT_MIN;
5270             }
5271         }
5272         while( k-- ) {
5273             if( get_cabac_bypass( &h->cabac ) )
5274                 mvd += 1 << k;
5275         }
5276     }
5277     return get_cabac_bypass_sign( &h->cabac, -mvd );
5278 }
5279
5280 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5281     int nza, nzb;
5282     int ctx = 0;
5283
5284     if( cat == 0 ) {
5285         nza = h->left_cbp&0x100;
5286         nzb = h-> top_cbp&0x100;
5287     } else if( cat == 1 || cat == 2 ) {
5288         nza = h->non_zero_count_cache[scan8[idx] - 1];
5289         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5290     } else if( cat == 3 ) {
5291         nza = (h->left_cbp>>(6+idx))&0x01;
5292         nzb = (h-> top_cbp>>(6+idx))&0x01;
5293     } else {
5294         assert(cat == 4);
5295         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5296         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5297     }
5298
5299     if( nza > 0 )
5300         ctx++;
5301
5302     if( nzb > 0 )
5303         ctx += 2;
5304
5305     return ctx + 4 * cat;
5306 }
5307
5308 static const attribute_used uint8_t last_coeff_flag_offset_8x8[63] = {
5309     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5310     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5311     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5312     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5313 };
5314
/**
 * Decodes the CABAC coded residual of one block into block[] and records
 * the result in h->non_zero_count_cache or h->cbp_table, depending on cat.
 * @param block     destination coefficients
 * @param cat       block category, 0..5 (see the table inside the function)
 * @param n         block index; its meaning depends on cat
 * @param scantable maps a scan position to the index written in block[]
 * @param qmul      dequantization factors indexed like block[], or NULL to
 *                  store the levels without scaling
 * @param max_coeff number of coefficients of the block
 */
5315 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5316     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
         /* first context of each syntax element, indexed by [MB_FIELD][cat] */
5317     static const int significant_coeff_flag_offset[2][6] = {
5318       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5319       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5320     };
5321     static const int last_coeff_flag_offset[2][6] = {
5322       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5323       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5324     };
5325     static const int coeff_abs_level_m1_offset[6] = {
5326         227+0, 227+10, 227+20, 227+30, 227+39, 426
5327     };
         /* per scan position context offset of the significance flag for
          * 8x8 blocks, indexed by [MB_FIELD][position] */
5328     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5329       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5330         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5331         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5332        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5333       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5334         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5335         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5336         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5337     };
5338
         /* scan positions of the significant coefficients, in decoding order */
5339     int index[64];
5340
5341     int av_unused last;
5342     int coeff_count = 0;
5343
         /* running state used to pick the level contexts: abslevel1 grows
          * with every level of magnitude 1, abslevelgt1 with every larger one */
5344     int abslevel1 = 1;
5345     int abslevelgt1 = 0;
5346
5347     uint8_t *significant_coeff_ctx_base;
5348     uint8_t *last_coeff_ctx_base;
5349     uint8_t *abs_level_m1_ctx_base;
5350
         /* On everything but ARCH_X86 the decoder works on a stack copy of
          * range/low/bytestream; the copy is written back to h->cabac before
          * each return. CC names whichever context is in use. */
5351 #ifndef ARCH_X86
5352 #define CABAC_ON_STACK
5353 #endif
5354 #ifdef CABAC_ON_STACK
5355 #define CC &cc
5356     CABACContext cc;
5357     cc.range     = h->cabac.range;
5358     cc.low       = h->cabac.low;
5359     cc.bytestream= h->cabac.bytestream;
5360 #else
5361 #define CC &h->cabac
5362 #endif
5363
5364
5365     /* cat: 0-> DC 16x16  n = 0
5366      *      1-> AC 16x16  n = luma4x4idx
5367      *      2-> Luma4x4   n = luma4x4idx
5368      *      3-> DC Chroma n = iCbCr
5369      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5370      *      5-> Luma8x8   n = 4 * luma8x8idx
5371      */
5372
5373     /* read coded block flag */
         /* no flag is read for cat 5; a clear flag means an empty block: the
          * non-zero count is reset for cat 1, 2 and 4 and nothing else is read */
5374     if( cat != 5 ) {
5375         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5376             if( cat == 1 || cat == 2 )
5377                 h->non_zero_count_cache[scan8[n]] = 0;
5378             else if( cat == 4 )
5379                 h->non_zero_count_cache[scan8[16+n]] = 0;
5380 #ifdef CABAC_ON_STACK
5381             h->cabac.range     = cc.range     ;
5382             h->cabac.low       = cc.low       ;
5383             h->cabac.bytestream= cc.bytestream;
5384 #endif
5385             return;
5386         }
5387     }
5388
5389     significant_coeff_ctx_base = h->cabac_state
5390         + significant_coeff_flag_offset[MB_FIELD][cat];
5391     last_coeff_ctx_base = h->cabac_state
5392         + last_coeff_flag_offset[MB_FIELD][cat];
5393     abs_level_m1_ctx_base = h->cabac_state
5394         + coeff_abs_level_m1_offset[cat];
5395
         /* Significance map. DECODE_SIGNIFICANCE walks the scan positions,
          * stores every significant one in index[] and stops at the first set
          * "last" flag; if no "last" flag was seen the position max_coeff-1 is
          * added without reading a flag. The if/else below is split across the
          * preprocessor branches: one build calls the decode_significance*_x86
          * helpers, the other expands the macro. */
5396     if( cat == 5 ) {
5397 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5398         for(last= 0; last < coefs; last++) { \
5399             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5400             if( get_cabac( CC, sig_ctx )) { \
5401                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5402                 index[coeff_count++] = last; \
5403                 if( get_cabac( CC, last_ctx ) ) { \
5404                     last= max_coeff; \
5405                     break; \
5406                 } \
5407             } \
5408         }\
5409         if( last == max_coeff -1 ) {\
5410             index[coeff_count++] = last;\
5411         }
5412         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5413 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5414         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5415     } else {
5416         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5417 #else
5418         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5419     } else {
5420         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5421 #endif
5422     }
5423     assert(coeff_count > 0);
5424
         /* publish the result: a cbp_table bit for the DC categories (0 and 3),
          * the coefficient count in non_zero_count_cache for the others (a 2x2
          * area for cat 5) */
5425     if( cat == 0 )
5426         h->cbp_table[mb_xy] |= 0x100;
5427     else if( cat == 1 || cat == 2 )
5428         h->non_zero_count_cache[scan8[n]] = coeff_count;
5429     else if( cat == 3 )
5430         h->cbp_table[mb_xy] |= 0x40 << n;
5431     else if( cat == 4 )
5432         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5433     else {
5434         assert( cat == 5 );
5435         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5436     }
5437
         /* levels are decoded from the last significant position back to the
          * first; coeff_count is reused as the index into index[] */
5438     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5439         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5440         int j= scantable[index[coeff_count]];
5441
             /* first bin clear: magnitude 1, only the sign is read (bypass) */
5442         if( get_cabac( CC, ctx ) == 0 ) {
5443             if( !qmul ) {
5444                 block[j] = get_cabac_bypass_sign( CC, -1);
5445             }else{
5446                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;;
5447             }
5448
5449             abslevel1++;
5450         } else {
                 /* magnitude >= 2: context coded unary part, capped at 15 */
5451             int coeff_abs = 2;
5452             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5453             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5454                 coeff_abs++;
5455             }
5456
                 /* escape: count the leading 1 bypass bins, then read that many
                  * bits below an implicit leading 1 and add 14.
                  * NOTE(review): the counting loop has no upper bound on j, and
                  * this j shadows the block index j declared above -- confirm
                  * the bypass reader cannot return 1 indefinitely on damaged
                  * input. */
5457             if( coeff_abs >= 15 ) {
5458                 int j = 0;
5459                 while( get_cabac_bypass( CC ) ) {
5460                     j++;
5461                 }
5462
5463                 coeff_abs=1;
5464                 while( j-- ) {
5465                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5466                 }
5467                 coeff_abs+= 14;
5468             }
5469
                 /* sign bin (bypass): set means negative; with qmul the level is
                  * scaled and rounded with (x + 32) >> 6 */
5470             if( !qmul ) {
5471                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5472                 else                                block[j] =  coeff_abs;
5473             }else{
5474                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5475                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5476             }
5477
5478             abslevelgt1++;
5479         }
5480     }
         /* hand the stack copy of the decoder state back to the context */
5481 #ifdef CABAC_ON_STACK
5482             h->cabac.range     = cc.range     ;
5483             h->cabac.low       = cc.low       ;
5484             h->cabac.bytestream= cc.bytestream;
5485 #endif
5486
5487 }
5488
5489 static inline void compute_mb_neighbors(H264Context *h)
5490 {
5491     MpegEncContext * const s = &h->s;
5492     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5493     h->top_mb_xy     = mb_xy - s->mb_stride;
5494     h->left_mb_xy[0] = mb_xy - 1;
5495     if(FRAME_MBAFF){
5496         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5497         const int top_pair_xy      = pair_xy     - s->mb_stride;
5498         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5499         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5500         const int curr_mb_frame_flag = !MB_FIELD;
5501         const int bottom = (s->mb_y & 1);
5502         if (bottom
5503                 ? !curr_mb_frame_flag // bottom macroblock
5504                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5505                 ) {
5506             h->top_mb_xy -= s->mb_stride;
5507         }
5508         if (left_mb_frame_flag != curr_mb_frame_flag) {
5509             h->left_mb_xy[0] = pair_xy - 1;
5510         }
5511     } else if (FIELD_PICTURE) {
5512         h->top_mb_xy -= s->mb_stride;
5513     }
5514     return;
5515 }
5516
5517 /**
5518  * decodes a macroblock
5519  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5520  */
5521 static int decode_mb_cabac(H264Context *h) {
5522     MpegEncContext * const s = &h->s;
5523     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5524     int mb_type, partition_count, cbp = 0;
5525     int dct8x8_allowed= h->pps.transform_8x8_mode;
5526
5527     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5528
5529     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5530     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5531         int skip;
5532         /* a skipped mb needs the aff flag from the following mb */
5533         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5534             predict_field_decoding_flag(h);
5535         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5536             skip = h->next_mb_skipped;
5537         else
5538             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5539         /* read skip flags */
5540         if( skip ) {
5541             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5542                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5543                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5544                 if(h->next_mb_skipped)
5545                     predict_field_decoding_flag(h);
5546                 else
5547                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5548             }
5549
5550             decode_mb_skip(h);
5551
5552             h->cbp_table[mb_xy] = 0;
5553             h->chroma_pred_mode_table[mb_xy] = 0;
5554             h->last_qscale_diff = 0;
5555
5556             return 0;
5557
5558         }
5559     }
5560     if(FRAME_MBAFF){
5561         if( (s->mb_y&1) == 0 )
5562             h->mb_mbaff =
5563             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5564     }else
5565         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5566
5567     h->prev_mb_skipped = 0;
5568
5569     compute_mb_neighbors(h);
5570     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5571         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5572         return -1;
5573     }
5574
5575     if( h->slice_type == B_TYPE ) {
5576         if( mb_type < 23 ){
5577             partition_count= b_mb_type_info[mb_type].partition_count;
5578             mb_type=         b_mb_type_info[mb_type].type;
5579         }else{
5580             mb_type -= 23;
5581             goto decode_intra_mb;
5582         }
5583     } else if( h->slice_type == P_TYPE ) {
5584         if( mb_type < 5) {
5585             partition_count= p_mb_type_info[mb_type].partition_count;
5586             mb_type=         p_mb_type_info[mb_type].type;
5587         } else {
5588             mb_type -= 5;
5589             goto decode_intra_mb;
5590         }
5591     } else {
5592        assert(h->slice_type == I_TYPE);
5593 decode_intra_mb:
5594         partition_count = 0;
5595         cbp= i_mb_type_info[mb_type].cbp;
5596         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5597         mb_type= i_mb_type_info[mb_type].type;
5598     }
5599     if(MB_FIELD)
5600         mb_type |= MB_TYPE_INTERLACED;
5601
5602     h->slice_table[ mb_xy ]= h->slice_num;
5603
5604     if(IS_INTRA_PCM(mb_type)) {
5605         const uint8_t *ptr;
5606         unsigned int x, y;
5607
5608         // We assume these blocks are very rare so we do not optimize it.
5609         // FIXME The two following lines get the bitstream position in the cabac
5610         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5611         ptr= h->cabac.bytestream;
5612         if(h->cabac.low&0x1) ptr--;
5613         if(CABAC_BITS==16){
5614             if(h->cabac.low&0x1FF) ptr--;
5615         }
5616
5617         // The pixels are stored in the same order as levels in h->mb array.
5618         for(y=0; y<16; y++){
5619             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5620             for(x=0; x<16; x++){
5621                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5622                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5623             }
5624         }
5625         for(y=0; y<8; y++){
5626             const int index= 256 + 4*(y&3) + 32*(y>>2);
5627             for(x=0; x<8; x++){
5628                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5629                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5630             }
5631         }
5632         for(y=0; y<8; y++){
5633             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5634             for(x=0; x<8; x++){
5635                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5636                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5637             }
5638         }
5639
5640         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5641
5642         // All blocks are present
5643         h->cbp_table[mb_xy] = 0x1ef;
5644         h->chroma_pred_mode_table[mb_xy] = 0;
5645         // In deblocking, the quantizer is 0
5646         s->current_picture.qscale_table[mb_xy]= 0;
5647         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5648         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5649         // All coeffs are present
5650         memset(h->non_zero_count[mb_xy], 16, 16);
5651         s->current_picture.mb_type[mb_xy]= mb_type;
5652         return 0;
5653     }
5654
5655     if(MB_MBAFF){
5656         h->ref_count[0] <<= 1;
5657         h->ref_count[1] <<= 1;
5658     }
5659
5660     fill_caches(h, mb_type, 0);
5661
5662     if( IS_INTRA( mb_type ) ) {
5663         int i, pred_mode;
5664         if( IS_INTRA4x4( mb_type ) ) {
5665             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5666                 mb_type |= MB_TYPE_8x8DCT;
5667                 for( i = 0; i < 16; i+=4 ) {
5668                     int pred = pred_intra_mode( h, i );
5669                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5670                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5671                 }
5672             } else {
5673                 for( i = 0; i < 16; i++ ) {
5674                     int pred = pred_intra_mode( h, i );
5675                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5676
5677                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5678                 }
5679             }
5680             write_back_intra_pred_mode(h);
5681             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5682         } else {
5683             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5684             if( h->intra16x16_pred_mode < 0 ) return -1;
5685         }
5686         h->chroma_pred_mode_table[mb_xy] =
5687         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5688
5689         pred_mode= check_intra_pred_mode( h, pred_mode );
5690         if( pred_mode < 0 ) return -1;
5691         h->chroma_pred_mode= pred_mode;
5692     } else if( partition_count == 4 ) {
5693         int i, j, sub_partition_count[4], list, ref[2][4];
5694
5695         if( h->slice_type == B_TYPE ) {
5696             for( i = 0; i < 4; i++ ) {
5697                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5698                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5699                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5700             }
5701             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5702                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5703                 pred_direct_motion(h, &mb_type);
5704                 h->ref_cache[0][scan8[4]] =
5705                 h->ref_cache[1][scan8[4]] =
5706                 h->ref_cache[0][scan8[12]] =
5707                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5708                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5709                     for( i = 0; i < 4; i++ )
5710                         if( IS_DIRECT(h->sub_mb_type[i]) )
5711                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5712                 }
5713             }
5714         } else {
5715             for( i = 0; i < 4; i++ ) {
5716                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5717                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5718                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5719             }
5720         }
5721
5722         for( list = 0; list < h->list_count; list++ ) {
5723                 for( i = 0; i < 4; i++ ) {
5724                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5725                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5726                         if( h->ref_count[list] > 1 )
5727                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5728                         else
5729                             ref[list][i] = 0;
5730                     } else {
5731                         ref[list][i] = -1;
5732                     }
5733                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5734                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5735                 }
5736         }
5737
5738         if(dct8x8_allowed)
5739             dct8x8_allowed = get_dct8x8_allowed(h);
5740
5741         for(list=0; list<h->list_count; list++){
5742             for(i=0; i<4; i++){
5743                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5744                 if(IS_DIRECT(h->sub_mb_type[i])){
5745                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5746                     continue;
5747                 }
5748
5749                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5750                     const int sub_mb_type= h->sub_mb_type[i];
5751                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5752                     for(j=0; j<sub_partition_count[i]; j++){
5753                         int mpx, mpy;
5754                         int mx, my;
5755                         const int index= 4*i + block_width*j;
5756                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5757                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5758                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5759
5760                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5761                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5762                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5763
5764                         if(IS_SUB_8X8(sub_mb_type)){
5765                             mv_cache[ 1 ][0]=
5766                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5767                             mv_cache[ 1 ][1]=
5768                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5769
5770                             mvd_cache[ 1 ][0]=
5771                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5772                             mvd_cache[ 1 ][1]=
5773                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5774                         }else if(IS_SUB_8X4(sub_mb_type)){
5775                             mv_cache[ 1 ][0]= mx;
5776                             mv_cache[ 1 ][1]= my;
5777
5778                             mvd_cache[ 1 ][0]= mx - mpx;
5779                             mvd_cache[ 1 ][1]= my - mpy;
5780                         }else if(IS_SUB_4X8(sub_mb_type)){
5781                             mv_cache[ 8 ][0]= mx;
5782                             mv_cache[ 8 ][1]= my;
5783
5784                             mvd_cache[ 8 ][0]= mx - mpx;
5785                             mvd_cache[ 8 ][1]= my - mpy;
5786                         }
5787                         mv_cache[ 0 ][0]= mx;
5788                         mv_cache[ 0 ][1]= my;
5789
5790                         mvd_cache[ 0 ][0]= mx - mpx;
5791                         mvd_cache[ 0 ][1]= my - mpy;
5792                     }
5793                 }else{
5794                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5795                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5796                     p[0] = p[1] = p[8] = p[9] = 0;
5797                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5798                 }
5799             }
5800         }
5801     } else if( IS_DIRECT(mb_type) ) {
5802         pred_direct_motion(h, &mb_type);
5803         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5804         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5805         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5806     } else {
5807         int list, mx, my, i, mpx, mpy;
5808         if(IS_16X16(mb_type)){
5809             for(list=0; list<h->list_count; list++){
5810                 if(IS_DIR(mb_type, 0, list)){
5811                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5812                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5813                 }else
5814                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5815             }
5816             for(list=0; list<h->list_count; list++){
5817                 if(IS_DIR(mb_type, 0, list)){
5818                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5819
5820                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5821                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5822                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5823
5824                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5825                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5826                 }else
5827                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5828             }
5829         }
5830         else if(IS_16X8(mb_type)){
5831             for(list=0; list<h->list_count; list++){
5832                     for(i=0; i<2; i++){
5833                         if(IS_DIR(mb_type, i, list)){
5834                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5835                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5836                         }else
5837                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5838                     }
5839             }
5840             for(list=0; list<h->list_count; list++){
5841                 for(i=0; i<2; i++){
5842                     if(IS_DIR(mb_type, i, list)){
5843                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5844                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5845                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5846                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5847
5848                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5849                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5850                     }else{
5851                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5852                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5853                     }
5854                 }
5855             }
5856         }else{
5857             assert(IS_8X16(mb_type));
5858             for(list=0; list<h->list_count; list++){
5859                     for(i=0; i<2; i++){
5860                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5861                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5862                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5863                         }else
5864                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5865                     }
5866             }
5867             for(list=0; list<h->list_count; list++){
5868                 for(i=0; i<2; i++){
5869                     if(IS_DIR(mb_type, i, list)){
5870                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5871                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5872                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5873
5874                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5875                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5876                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5877                     }else{
5878                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5879                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5880                     }
5881                 }
5882             }
5883         }
5884     }
5885
5886    if( IS_INTER( mb_type ) ) {
5887         h->chroma_pred_mode_table[mb_xy] = 0;
5888         write_back_motion( h, mb_type );
5889    }
5890
5891     if( !IS_INTRA16x16( mb_type ) ) {
5892         cbp  = decode_cabac_mb_cbp_luma( h );
5893         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5894     }
5895
5896     h->cbp_table[mb_xy] = h->cbp = cbp;
5897
5898     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5899         if( decode_cabac_mb_transform_size( h ) )
5900             mb_type |= MB_TYPE_8x8DCT;
5901     }
5902     s->current_picture.mb_type[mb_xy]= mb_type;
5903
5904     if( cbp || IS_INTRA16x16( mb_type ) ) {
5905         const uint8_t *scan, *scan8x8, *dc_scan;
5906         const uint32_t *qmul;
5907         int dqp;
5908
5909         if(IS_INTERLACED(mb_type)){
5910             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5911             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5912             dc_scan= luma_dc_field_scan;
5913         }else{
5914             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5915             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5916             dc_scan= luma_dc_zigzag_scan;
5917         }
5918
5919         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5920         if( dqp == INT_MIN ){
5921             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5922             return -1;
5923         }
5924         s->qscale += dqp;
5925         if(((unsigned)s->qscale) > 51){
5926             if(s->qscale<0) s->qscale+= 52;
5927             else            s->qscale-= 52;
5928         }
5929         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5930         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5931
5932         if( IS_INTRA16x16( mb_type ) ) {
5933             int i;
5934             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5935             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5936
5937             if( cbp&15 ) {
5938                 qmul = h->dequant4_coeff[0][s->qscale];
5939                 for( i = 0; i < 16; i++ ) {
5940                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5941                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5942                 }
5943             } else {
5944                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5945             }
5946         } else {
5947             int i8x8, i4x4;
5948             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5949                 if( cbp & (1<<i8x8) ) {
5950                     if( IS_8x8DCT(mb_type) ) {
5951                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5952                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5953                     } else {
5954                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5955                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5956                             const int index = 4*i8x8 + i4x4;
5957                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5958 //START_TIMER
5959                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5960 //STOP_TIMER("decode_residual")
5961                         }
5962                     }
5963                 } else {
5964                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5965                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5966                 }
5967             }
5968         }
5969
5970         if( cbp&0x30 ){
5971             int c;
5972             for( c = 0; c < 2; c++ ) {
5973                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5974                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5975             }
5976         }
5977
5978         if( cbp&0x20 ) {
5979             int c, i;
5980             for( c = 0; c < 2; c++ ) {
5981                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5982                 for( i = 0; i < 4; i++ ) {
5983                     const int index = 16 + 4 * c + i;
5984                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5985                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5986                 }
5987             }
5988         } else {
5989             uint8_t * const nnz= &h->non_zero_count_cache[0];
5990             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5991             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5992         }
5993     } else {
5994         uint8_t * const nnz= &h->non_zero_count_cache[0];
5995         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5996         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5997         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5998         h->last_qscale_diff = 0;
5999     }
6000
6001     s->current_picture.qscale_table[mb_xy]= s->qscale;
6002     write_back_non_zero_count(h);
6003
6004     if(MB_MBAFF){
6005         h->ref_count[0] >>= 1;
6006         h->ref_count[1] >>= 1;
6007     }
6008
6009     return 0;
6010 }
6011
6012
6013 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6014     int i, d;
6015     const int index_a = qp + h->slice_alpha_c0_offset;
6016     const int alpha = (alpha_table+52)[index_a];
6017     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6018
6019     if( bS[0] < 4 ) {
6020         int8_t tc[4];
6021         for(i=0; i<4; i++)
6022             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6023         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6024     } else {
6025         /* 16px edge length, because bS=4 is triggered by being at
6026          * the edge of an intra MB, so all 4 bS are the same */
6027             for( d = 0; d < 16; d++ ) {
6028                 const int p0 = pix[-1];
6029                 const int p1 = pix[-2];
6030                 const int p2 = pix[-3];
6031
6032                 const int q0 = pix[0];
6033                 const int q1 = pix[1];
6034                 const int q2 = pix[2];
6035
6036                 if( FFABS( p0 - q0 ) < alpha &&
6037                     FFABS( p1 - p0 ) < beta &&
6038                     FFABS( q1 - q0 ) < beta ) {
6039
6040                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6041                         if( FFABS( p2 - p0 ) < beta)
6042                         {
6043                             const int p3 = pix[-4];
6044                             /* p0', p1', p2' */
6045                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6046                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6047                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6048                         } else {
6049                             /* p0' */
6050                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6051                         }
6052                         if( FFABS( q2 - q0 ) < beta)
6053                         {
6054                             const int q3 = pix[3];
6055                             /* q0', q1', q2' */
6056                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6057                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6058                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6059                         } else {
6060                             /* q0' */
6061                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6062                         }
6063                     }else{
6064                         /* p0', q0' */
6065                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6066                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6067                     }
6068                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6069                 }
6070                 pix += stride;
6071             }
6072     }
6073 }
6074 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6075     int i;
6076     const int index_a = qp + h->slice_alpha_c0_offset;
6077     const int alpha = (alpha_table+52)[index_a];
6078     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6079
6080     if( bS[0] < 4 ) {
6081         int8_t tc[4];
6082         for(i=0; i<4; i++)
6083             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6084         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6085     } else {
6086         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6087     }
6088 }
6089
6090 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6091     int i;
6092     for( i = 0; i < 16; i++, pix += stride) {
6093         int index_a;
6094         int alpha;
6095         int beta;
6096
6097         int qp_index;
6098         int bS_index = (i >> 1);
6099         if (!MB_FIELD) {
6100             bS_index &= ~1;
6101             bS_index |= (i & 1);
6102         }
6103
6104         if( bS[bS_index] == 0 ) {
6105             continue;
6106         }
6107
6108         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6109         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6110         alpha = (alpha_table+52)[index_a];
6111         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6112
6113         if( bS[bS_index] < 4 ) {
6114             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6115             const int p0 = pix[-1];
6116             const int p1 = pix[-2];
6117             const int p2 = pix[-3];
6118             const int q0 = pix[0];
6119             const int q1 = pix[1];
6120             const int q2 = pix[2];
6121
6122             if( FFABS( p0 - q0 ) < alpha &&
6123                 FFABS( p1 - p0 ) < beta &&
6124                 FFABS( q1 - q0 ) < beta ) {
6125                 int tc = tc0;
6126                 int i_delta;
6127
6128                 if( FFABS( p2 - p0 ) < beta ) {
6129                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6130                     tc++;
6131                 }
6132                 if( FFABS( q2 - q0 ) < beta ) {
6133                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6134                     tc++;
6135                 }
6136
6137                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6138                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6139                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6140                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6141             }
6142         }else{
6143             const int p0 = pix[-1];
6144             const int p1 = pix[-2];
6145             const int p2 = pix[-3];
6146
6147             const int q0 = pix[0];
6148             const int q1 = pix[1];
6149             const int q2 = pix[2];
6150
6151             if( FFABS( p0 - q0 ) < alpha &&
6152                 FFABS( p1 - p0 ) < beta &&
6153                 FFABS( q1 - q0 ) < beta ) {
6154
6155                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6156                     if( FFABS( p2 - p0 ) < beta)
6157                     {
6158                         const int p3 = pix[-4];
6159                         /* p0', p1', p2' */
6160                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6161                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6162                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6163                     } else {
6164                         /* p0' */
6165                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6166                     }
6167                     if( FFABS( q2 - q0 ) < beta)
6168                     {
6169                         const int q3 = pix[3];
6170                         /* q0', q1', q2' */
6171                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6172                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6173                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6174                     } else {
6175                         /* q0' */
6176                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6177                     }
6178                 }else{
6179                     /* p0', q0' */
6180                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6181                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6182                 }
6183                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6184             }
6185         }
6186     }
6187 }
6188 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6189     int i;
6190     for( i = 0; i < 8; i++, pix += stride) {
6191         int index_a;
6192         int alpha;
6193         int beta;
6194
6195         int qp_index;
6196         int bS_index = i;
6197
6198         if( bS[bS_index] == 0 ) {
6199             continue;
6200         }
6201
6202         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6203         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6204         alpha = (alpha_table+52)[index_a];
6205         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6206
6207         if( bS[bS_index] < 4 ) {
6208             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6209             const int p0 = pix[-1];
6210             const int p1 = pix[-2];
6211             const int q0 = pix[0];
6212             const int q1 = pix[1];
6213
6214             if( FFABS( p0 - q0 ) < alpha &&
6215                 FFABS( p1 - p0 ) < beta &&
6216                 FFABS( q1 - q0 ) < beta ) {
6217                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6218
6219                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6220                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6221                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6222             }
6223         }else{
6224             const int p0 = pix[-1];
6225             const int p1 = pix[-2];
6226             const int q0 = pix[0];
6227             const int q1 = pix[1];
6228
6229             if( FFABS( p0 - q0 ) < alpha &&
6230                 FFABS( p1 - p0 ) < beta &&
6231                 FFABS( q1 - q0 ) < beta ) {
6232
6233                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6234                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6235                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6236             }
6237         }
6238     }
6239 }
6240
6241 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6242     int i, d;
6243     const int index_a = qp + h->slice_alpha_c0_offset;
6244     const int alpha = (alpha_table+52)[index_a];
6245     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6246     const int pix_next  = stride;
6247
6248     if( bS[0] < 4 ) {
6249         int8_t tc[4];
6250         for(i=0; i<4; i++)
6251             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6252         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6253     } else {
6254         /* 16px edge length, see filter_mb_edgev */
6255             for( d = 0; d < 16; d++ ) {
6256                 const int p0 = pix[-1*pix_next];
6257                 const int p1 = pix[-2*pix_next];
6258                 const int p2 = pix[-3*pix_next];
6259                 const int q0 = pix[0];
6260                 const int q1 = pix[1*pix_next];
6261                 const int q2 = pix[2*pix_next];
6262
6263                 if( FFABS( p0 - q0 ) < alpha &&
6264                     FFABS( p1 - p0 ) < beta &&
6265                     FFABS( q1 - q0 ) < beta ) {
6266
6267                     const int p3 = pix[-4*pix_next];
6268                     const int q3 = pix[ 3*pix_next];
6269
6270                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6271                         if( FFABS( p2 - p0 ) < beta) {
6272                             /* p0', p1', p2' */
6273                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6274                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6275                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6276                         } else {
6277                             /* p0' */
6278                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6279                         }
6280                         if( FFABS( q2 - q0 ) < beta) {
6281                             /* q0', q1', q2' */
6282                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6283                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6284                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6285                         } else {
6286                             /* q0' */
6287                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6288                         }
6289                     }else{
6290                         /* p0', q0' */
6291                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6292                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6293                     }
6294                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6295                 }
6296                 pix++;
6297             }
6298     }
6299 }
6300
6301 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6302     int i;
6303     const int index_a = qp + h->slice_alpha_c0_offset;
6304     const int alpha = (alpha_table+52)[index_a];
6305     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6306
6307     if( bS[0] < 4 ) {
6308         int8_t tc[4];
6309         for(i=0; i<4; i++)
6310             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6311         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6312     } else {
6313         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6314     }
6315 }
6316
6317 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6318     MpegEncContext * const s = &h->s;
6319     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6320     int mb_xy, mb_type;
6321     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6322
6323     mb_xy = mb_x + mb_y*s->mb_stride;
6324
6325     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6326        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6327                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6328         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6329         return;
6330     }
6331     assert(!FRAME_MBAFF);
6332
6333     mb_type = s->current_picture.mb_type[mb_xy];
6334     qp = s->current_picture.qscale_table[mb_xy];
6335     qp0 = s->current_picture.qscale_table[mb_xy-1];
6336     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6337     qpc = get_chroma_qp( h, 0, qp );
6338     qpc0 = get_chroma_qp( h, 0, qp0 );
6339     qpc1 = get_chroma_qp( h, 0, qp1 );
6340     qp0 = (qp + qp0 + 1) >> 1;
6341     qp1 = (qp + qp1 + 1) >> 1;
6342     qpc0 = (qpc + qpc0 + 1) >> 1;
6343     qpc1 = (qpc + qpc1 + 1) >> 1;
6344     qp_thresh = 15 - h->slice_alpha_c0_offset;
6345     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6346        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6347         return;
6348
6349     if( IS_INTRA(mb_type) ) {
6350         int16_t bS4[4] = {4,4,4,4};
6351         int16_t bS3[4] = {3,3,3,3};
6352         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6353         if( IS_8x8DCT(mb_type) ) {
6354             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6355             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6356             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6357             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6358         } else {
6359             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6360             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6361             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6362             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6363             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6364             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6365             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6366             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6367         }
6368         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6369         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6370         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6371         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6372         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6373         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6374         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6375         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6376         return;
6377     } else {
6378         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6379         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6380         int edges;
6381         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6382             edges = 4;
6383             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6384         } else {
6385             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6386                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6387             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6388                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6389                              ? 3 : 0;
6390             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6391             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6392             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6393                                               (h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
6394         }
6395         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6396             bSv[0][0] = 0x0004000400040004ULL;
6397         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6398             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6399
6400 #define FILTER(hv,dir,edge)\
6401         if(bSv[dir][edge]) {\
6402             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6403             if(!(edge&1)) {\
6404                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6405                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6406             }\
6407         }
6408         if( edges == 1 ) {
6409             FILTER(v,0,0);
6410             FILTER(h,1,0);
6411         } else if( IS_8x8DCT(mb_type) ) {
6412             FILTER(v,0,0);
6413             FILTER(v,0,2);
6414             FILTER(h,1,0);
6415             FILTER(h,1,2);
6416         } else {
6417             FILTER(v,0,0);
6418             FILTER(v,0,1);
6419             FILTER(v,0,2);
6420             FILTER(v,0,3);
6421             FILTER(h,1,0);
6422             FILTER(h,1,1);
6423             FILTER(h,1,2);
6424             FILTER(h,1,3);
6425         }
6426 #undef FILTER
6427     }
6428 }
6429
6430 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6431     MpegEncContext * const s = &h->s;
6432     const int mb_xy= mb_x + mb_y*s->mb_stride;
6433     const int mb_type = s->current_picture.mb_type[mb_xy];
6434     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6435     int first_vertical_edge_done = 0;
6436     int dir;
6437     /* FIXME: A given frame may occupy more than one position in
6438      * the reference list. So ref2frm should be populated with
6439      * frame numbers, not indices. */
6440     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6441                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6442
6443     //for sufficiently low qp, filtering wouldn't do anything
6444     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6445     if(!FRAME_MBAFF){
6446         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6447         int qp = s->current_picture.qscale_table[mb_xy];
6448         if(qp <= qp_thresh
6449            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6450            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6451             return;
6452         }
6453     }
6454
6455     if (FRAME_MBAFF
6456             // left mb is in picture
6457             && h->slice_table[mb_xy-1] != 255
6458             // and current and left pair do not have the same interlaced type
6459             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6460             // and left mb is in the same slice if deblocking_filter == 2
6461             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6462         /* First vertical edge is different in MBAFF frames
6463          * There are 8 different bS to compute and 2 different Qp
6464          */
6465         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6466         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6467         int16_t bS[8];
6468         int qp[2];
6469         int bqp[2];
6470         int rqp[2];
6471         int mb_qp, mbn0_qp, mbn1_qp;
6472         int i;
6473         first_vertical_edge_done = 1;
6474
6475         if( IS_INTRA(mb_type) )
6476             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6477         else {
6478             for( i = 0; i < 8; i++ ) {
6479                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6480
6481                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6482                     bS[i] = 4;
6483                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6484                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6485                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6486                     bS[i] = 2;
6487                 else
6488                     bS[i] = 1;
6489             }
6490         }
6491
6492         mb_qp = s->current_picture.qscale_table[mb_xy];
6493         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6494         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6495         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6496         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6497                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6498         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6499                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6500         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6501         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6502                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6503         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6504                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6505
6506         /* Filter edge */
6507         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6508         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6509         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6510         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6511         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6512     }
6513     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6514     for( dir = 0; dir < 2; dir++ )
6515     {
6516         int edge;
6517         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6518         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6519         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6520
6521         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6522                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6523         // how often to recheck mv-based bS when iterating between edges
6524         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6525                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6526         // how often to recheck mv-based bS when iterating along each edge
6527         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6528
6529         if (first_vertical_edge_done) {
6530             start = 1;
6531             first_vertical_edge_done = 0;
6532         }
6533
6534         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6535             start = 1;
6536
6537         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6538             && !IS_INTERLACED(mb_type)
6539             && IS_INTERLACED(mbm_type)
6540             ) {
6541             // This is a special case in the norm where the filtering must
6542             // be done twice (one each of the field) even if we are in a
6543             // frame macroblock.
6544             //
6545             static const int nnz_idx[4] = {4,5,6,3};
6546             unsigned int tmp_linesize   = 2 *   linesize;
6547             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6548             int mbn_xy = mb_xy - 2 * s->mb_stride;
6549             int qp;
6550             int i, j;
6551             int16_t bS[4];
6552
6553             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6554                 if( IS_INTRA(mb_type) ||
6555                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6556                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6557                 } else {
6558                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6559                     for( i = 0; i < 4; i++ ) {
6560                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6561                             mbn_nnz[nnz_idx[i]] != 0 )
6562                             bS[i] = 2;
6563                         else
6564                             bS[i] = 1;
6565                     }
6566                 }
6567                 // Do not use s->qscale as luma quantizer because it has not the same
6568                 // value in IPCM macroblocks.
6569                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6570                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6571                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6572                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6573                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6574                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6575                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6576                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6577             }
6578
6579             start = 1;
6580         }
6581
6582         /* Calculate bS */
6583         for( edge = start; edge < edges; edge++ ) {
6584             /* mbn_xy: neighbor macroblock */
6585             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6586             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6587             int16_t bS[4];
6588             int qp;
6589
6590             if( (edge&1) && IS_8x8DCT(mb_type) )
6591                 continue;
6592
6593             if( IS_INTRA(mb_type) ||
6594                 IS_INTRA(mbn_type) ) {
6595                 int value;
6596                 if (edge == 0) {
6597                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6598                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6599                     ) {
6600                         value = 4;
6601                     } else {
6602                         value = 3;
6603                     }
6604                 } else {
6605                     value = 3;
6606                 }
6607                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6608             } else {
6609                 int i, l;
6610                 int mv_done;
6611
6612                 if( edge & mask_edge ) {
6613                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6614                     mv_done = 1;
6615                 }
6616                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6617                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6618                     mv_done = 1;
6619                 }
6620                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6621                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6622                     int bn_idx= b_idx - (dir ? 8:1);
6623                     int v = 0;
6624                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6625                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6626                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6627                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6628                     }
6629                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6630                     mv_done = 1;
6631                 }
6632                 else
6633                     mv_done = 0;
6634
6635                 for( i = 0; i < 4; i++ ) {
6636                     int x = dir == 0 ? edge : i;
6637                     int y = dir == 0 ? i    : edge;
6638                     int b_idx= 8 + 4 + x + 8*y;
6639                     int bn_idx= b_idx - (dir ? 8:1);
6640
6641                     if( h->non_zero_count_cache[b_idx] != 0 ||
6642                         h->non_zero_count_cache[bn_idx] != 0 ) {
6643                         bS[i] = 2;
6644                     }
6645                     else if(!mv_done)
6646                     {
6647                         bS[i] = 0;
6648                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6649                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6650                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6651                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6652                                 bS[i] = 1;
6653                                 break;
6654                             }
6655                         }
6656                     }
6657                 }
6658
6659                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6660                     continue;
6661             }
6662
6663             /* Filter edge */
6664             // Do not use s->qscale as luma quantizer because it has not the same
6665             // value in IPCM macroblocks.
6666             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6667             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6668             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6669             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6670             if( dir == 0 ) {
6671                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6672                 if( (edge&1) == 0 ) {
6673                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6674                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6675                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6676                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6677                 }
6678             } else {
6679                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6680                 if( (edge&1) == 0 ) {
6681                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6682                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6683                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6684                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6685                 }
6686             }
6687         }
6688     }
6689 }
6690
6691 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6692     MpegEncContext * const s = &h->s;
6693     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6694
6695     s->mb_skip_run= -1;
6696
6697     if( h->pps.cabac ) {
6698         int i;
6699
6700         /* realign */
6701         align_get_bits( &s->gb );
6702
6703         /* init cabac */
6704         ff_init_cabac_states( &h->cabac);
6705         ff_init_cabac_decoder( &h->cabac,
6706                                s->gb.buffer + get_bits_count(&s->gb)/8,
6707                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6708         /* calculate pre-state */
6709         for( i= 0; i < 460; i++ ) {
6710             int pre;
6711             if( h->slice_type == I_TYPE )
6712                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6713             else
6714                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6715
6716             if( pre <= 63 )
6717                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6718             else
6719                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6720         }
6721
6722         for(;;){
6723 //START_TIMER
6724             int ret = decode_mb_cabac(h);
6725             int eos;
6726 //STOP_TIMER("decode_mb_cabac")
6727
6728             if(ret>=0) hl_decode_mb(h);
6729
6730             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6731                 s->mb_y++;
6732
6733                 if(ret>=0) ret = decode_mb_cabac(h);
6734
6735                 if(ret>=0) hl_decode_mb(h);
6736                 s->mb_y--;
6737             }
6738             eos = get_cabac_terminate( &h->cabac );
6739
6740             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6741                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6742                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6743                 return -1;
6744             }
6745
6746             if( ++s->mb_x >= s->mb_width ) {
6747                 s->mb_x = 0;
6748                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6749                 ++s->mb_y;
6750                 if(FIELD_OR_MBAFF_PICTURE) {
6751                     ++s->mb_y;
6752                 }
6753             }
6754
6755             if( eos || s->mb_y >= s->mb_height ) {
6756                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6757                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6758                 return 0;
6759             }
6760         }
6761
6762     } else {
6763         for(;;){
6764             int ret = decode_mb_cavlc(h);
6765
6766             if(ret>=0) hl_decode_mb(h);
6767
6768             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6769                 s->mb_y++;
6770                 ret = decode_mb_cavlc(h);
6771
6772                 if(ret>=0) hl_decode_mb(h);
6773                 s->mb_y--;
6774             }
6775
6776             if(ret<0){
6777                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6778                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6779
6780                 return -1;
6781             }
6782
6783             if(++s->mb_x >= s->mb_width){
6784                 s->mb_x=0;
6785                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6786                 ++s->mb_y;
6787                 if(FIELD_OR_MBAFF_PICTURE) {
6788                     ++s->mb_y;
6789                 }
6790                 if(s->mb_y >= s->mb_height){
6791                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6792
6793                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6794                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6795
6796                         return 0;
6797                     }else{
6798                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6799
6800                         return -1;
6801                     }
6802                 }
6803             }
6804
6805             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6806                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6807                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6808                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6809
6810                     return 0;
6811                 }else{
6812                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6813
6814                     return -1;
6815                 }
6816             }
6817         }
6818     }
6819
6820 #if 0
6821     for(;s->mb_y < s->mb_height; s->mb_y++){
6822         for(;s->mb_x < s->mb_width; s->mb_x++){
6823             int ret= decode_mb(h);
6824
6825             hl_decode_mb(h);
6826
6827             if(ret<0){
6828                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6829                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6830
6831                 return -1;
6832             }
6833
6834             if(++s->mb_x >= s->mb_width){
6835                 s->mb_x=0;
6836                 if(++s->mb_y >= s->mb_height){
6837                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6838                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6839
6840                         return 0;
6841                     }else{
6842                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6843
6844                         return -1;
6845                     }
6846                 }
6847             }
6848
6849             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6850                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6851                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6852
6853                     return 0;
6854                 }else{
6855                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6856
6857                     return -1;
6858                 }
6859             }
6860         }
6861         s->mb_x=0;
6862         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6863     }
6864 #endif
6865     return -1; //not reached
6866 }
6867
6868 static int decode_unregistered_user_data(H264Context *h, int size){
6869     MpegEncContext * const s = &h->s;
6870     uint8_t user_data[16+256];
6871     int e, build, i;
6872
6873     if(size<16)
6874         return -1;
6875
6876     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6877         user_data[i]= get_bits(&s->gb, 8);
6878     }
6879
6880     user_data[i]= 0;
6881     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6882     if(e==1 && build>=0)
6883         h->x264_build= build;
6884
6885     if(s->avctx->debug & FF_DEBUG_BUGS)
6886         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6887
6888     for(; i<size; i++)
6889         skip_bits(&s->gb, 8);
6890
6891     return 0;
6892 }
6893
6894 static int decode_sei(H264Context *h){
6895     MpegEncContext * const s = &h->s;
6896
6897     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6898         int size, type;
6899
6900         type=0;
6901         do{
6902             type+= show_bits(&s->gb, 8);
6903         }while(get_bits(&s->gb, 8) == 255);
6904
6905         size=0;
6906         do{
6907             size+= show_bits(&s->gb, 8);
6908         }while(get_bits(&s->gb, 8) == 255);
6909
6910         switch(type){
6911         case 5:
6912             if(decode_unregistered_user_data(h, size) < 0)
6913                 return -1;
6914             break;
6915         default:
6916             skip_bits(&s->gb, 8*size);
6917         }
6918
6919         //FIXME check bits here
6920         align_get_bits(&s->gb);
6921     }
6922
6923     return 0;
6924 }
6925
6926 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6927     MpegEncContext * const s = &h->s;
6928     int cpb_count, i;
6929     cpb_count = get_ue_golomb(&s->gb) + 1;
6930     get_bits(&s->gb, 4); /* bit_rate_scale */
6931     get_bits(&s->gb, 4); /* cpb_size_scale */
6932     for(i=0; i<cpb_count; i++){
6933         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6934         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6935         get_bits1(&s->gb);     /* cbr_flag */
6936     }
6937     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6938     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6939     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6940     get_bits(&s->gb, 5); /* time_offset_length */
6941 }
6942
6943 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6944     MpegEncContext * const s = &h->s;
6945     int aspect_ratio_info_present_flag;
6946     unsigned int aspect_ratio_idc;
6947     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6948
6949     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6950
6951     if( aspect_ratio_info_present_flag ) {
6952         aspect_ratio_idc= get_bits(&s->gb, 8);
6953         if( aspect_ratio_idc == EXTENDED_SAR ) {
6954             sps->sar.num= get_bits(&s->gb, 16);
6955             sps->sar.den= get_bits(&s->gb, 16);
6956         }else if(aspect_ratio_idc < 14){
6957             sps->sar=  pixel_aspect[aspect_ratio_idc];
6958         }else{
6959             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6960             return -1;
6961         }
6962     }else{
6963         sps->sar.num=
6964         sps->sar.den= 0;
6965     }
6966 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6967
6968     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6969         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6970     }
6971
6972     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6973         get_bits(&s->gb, 3);    /* video_format */
6974         get_bits1(&s->gb);      /* video_full_range_flag */
6975         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6976             get_bits(&s->gb, 8); /* colour_primaries */
6977             get_bits(&s->gb, 8); /* transfer_characteristics */
6978             get_bits(&s->gb, 8); /* matrix_coefficients */
6979         }
6980     }
6981
6982     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6983         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6984         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6985     }
6986
6987     sps->timing_info_present_flag = get_bits1(&s->gb);
6988     if(sps->timing_info_present_flag){
6989         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6990         sps->time_scale = get_bits_long(&s->gb, 32);
6991         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6992     }
6993
6994     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6995     if(nal_hrd_parameters_present_flag)
6996         decode_hrd_parameters(h, sps);
6997     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6998     if(vcl_hrd_parameters_present_flag)
6999         decode_hrd_parameters(h, sps);
7000     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7001         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7002     get_bits1(&s->gb);         /* pic_struct_present_flag */
7003
7004     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7005     if(sps->bitstream_restriction_flag){
7006         unsigned int num_reorder_frames;
7007         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7008         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7009         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7010         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7011         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7012         num_reorder_frames= get_ue_golomb(&s->gb);
7013         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7014
7015         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7016             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
7017             return -1;
7018         }
7019
7020         sps->num_reorder_frames= num_reorder_frames;
7021     }
7022
7023     return 0;
7024 }
7025
7026 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7027                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7028     MpegEncContext * const s = &h->s;
7029     int i, last = 8, next = 8;
7030     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7031     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7032         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7033     else
7034     for(i=0;i<size;i++){
7035         if(next)
7036             next = (last + get_se_golomb(&s->gb)) & 0xff;
7037         if(!i && !next){ /* matrix not written, we use the preset one */
7038             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7039             break;
7040         }
7041         last = factors[scan[i]] = next ? next : last;
7042     }
7043 }
7044
7045 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7046                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7047     MpegEncContext * const s = &h->s;
7048     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7049     const uint8_t *fallback[4] = {
7050         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7051         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7052         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7053         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7054     };
7055     if(get_bits1(&s->gb)){
7056         sps->scaling_matrix_present |= is_sps;
7057         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7058         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7059         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7060         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7061         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7062         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7063         if(is_sps || pps->transform_8x8_mode){
7064             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7065             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7066         }
7067     } else if(fallback_sps) {
7068         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7069         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7070     }
7071 }
7072
7073 /**
7074  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7075  */
7076 static void *
7077 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7078                     const size_t size, const char *name)
7079 {
7080     if(id>=max) {
7081         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7082         return NULL;
7083     }
7084
7085     if(!vec[id]) {
7086         vec[id] = av_mallocz(size);
7087         if(vec[id] == NULL)
7088             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7089     }
7090     return vec[id];
7091 }
7092
7093 static inline int decode_seq_parameter_set(H264Context *h){
7094     MpegEncContext * const s = &h->s;
7095     int profile_idc, level_idc;
7096     unsigned int sps_id, tmp, mb_width, mb_height;
7097     int i;
7098     SPS *sps;
7099
7100     profile_idc= get_bits(&s->gb, 8);
7101     get_bits1(&s->gb);   //constraint_set0_flag
7102     get_bits1(&s->gb);   //constraint_set1_flag
7103     get_bits1(&s->gb);   //constraint_set2_flag
7104     get_bits1(&s->gb);   //constraint_set3_flag
7105     get_bits(&s->gb, 4); // reserved
7106     level_idc= get_bits(&s->gb, 8);
7107     sps_id= get_ue_golomb(&s->gb);
7108
7109     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7110     if(sps == NULL)
7111         return -1;
7112
7113     sps->profile_idc= profile_idc;
7114     sps->level_idc= level_idc;
7115
7116     if(sps->profile_idc >= 100){ //high profile
7117         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7118             get_bits1(&s->gb);  //residual_color_transform_flag
7119         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7120         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7121         sps->transform_bypass = get_bits1(&s->gb);
7122         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7123     }else
7124         sps->scaling_matrix_present = 0;
7125
7126     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7127     sps->poc_type= get_ue_golomb(&s->gb);
7128
7129     if(sps->poc_type == 0){ //FIXME #define
7130         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7131     } else if(sps->poc_type == 1){//FIXME #define
7132         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7133         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7134         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7135         tmp= get_ue_golomb(&s->gb);
7136
7137         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7138             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7139             return -1;
7140         }
7141         sps->poc_cycle_length= tmp;
7142
7143         for(i=0; i<sps->poc_cycle_length; i++)
7144             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7145     }else if(sps->poc_type != 2){
7146         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7147         return -1;
7148     }
7149
7150     tmp= get_ue_golomb(&s->gb);
7151     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7152         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7153         return -1;
7154     }
7155     sps->ref_frame_count= tmp;
7156     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7157     mb_width= get_ue_golomb(&s->gb) + 1;
7158     mb_height= get_ue_golomb(&s->gb) + 1;
7159     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7160        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7161         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7162         return -1;
7163     }
7164     sps->mb_width = mb_width;
7165     sps->mb_height= mb_height;
7166
7167     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7168     if(!sps->frame_mbs_only_flag)
7169         sps->mb_aff= get_bits1(&s->gb);
7170     else
7171         sps->mb_aff= 0;
7172
7173     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7174
7175 #ifndef ALLOW_INTERLACE
7176     if(sps->mb_aff)
7177         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7178 #endif
7179     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7180         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7181
7182     sps->crop= get_bits1(&s->gb);
7183     if(sps->crop){
7184         sps->crop_left  = get_ue_golomb(&s->gb);
7185         sps->crop_right = get_ue_golomb(&s->gb);
7186         sps->crop_top   = get_ue_golomb(&s->gb);
7187         sps->crop_bottom= get_ue_golomb(&s->gb);
7188         if(sps->crop_left || sps->crop_top){
7189             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7190         }
7191     }else{
7192         sps->crop_left  =
7193         sps->crop_right =
7194         sps->crop_top   =
7195         sps->crop_bottom= 0;
7196     }
7197
7198     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7199     if( sps->vui_parameters_present_flag )
7200         decode_vui_parameters(h, sps);
7201
7202     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7203         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7204                sps_id, sps->profile_idc, sps->level_idc,
7205                sps->poc_type,
7206                sps->ref_frame_count,
7207                sps->mb_width, sps->mb_height,
7208                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7209                sps->direct_8x8_inference_flag ? "8B8" : "",
7210                sps->crop_left, sps->crop_right,
7211                sps->crop_top, sps->crop_bottom,
7212                sps->vui_parameters_present_flag ? "VUI" : ""
7213                );
7214     }
7215     return 0;
7216 }
7217
7218 static void
7219 build_qp_table(PPS *pps, int t, int index)
7220 {
7221     int i;
7222     for(i = 0; i < 255; i++)
7223         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7224 }
7225
7226 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7227     MpegEncContext * const s = &h->s;
7228     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7229     PPS *pps;
7230
7231     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7232     if(pps == NULL)
7233         return -1;
7234
7235     tmp= get_ue_golomb(&s->gb);
7236     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7237         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7238         return -1;
7239     }
7240     pps->sps_id= tmp;
7241
7242     pps->cabac= get_bits1(&s->gb);
7243     pps->pic_order_present= get_bits1(&s->gb);
7244     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7245     if(pps->slice_group_count > 1 ){
7246         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7247         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7248         switch(pps->mb_slice_group_map_type){
7249         case 0:
7250 #if 0
7251 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7252 |    run_length[ i ]                                |1  |ue(v)   |
7253 #endif
7254             break;
7255         case 2:
7256 #if 0
7257 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7258 |{                                                  |   |        |
7259 |    top_left_mb[ i ]                               |1  |ue(v)   |
7260 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7261 |   }                                               |   |        |
7262 #endif
7263             break;
7264         case 3:
7265         case 4:
7266         case 5:
7267 #if 0
7268 |   slice_group_change_direction_flag               |1  |u(1)    |
7269 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7270 #endif
7271             break;
7272         case 6:
7273 #if 0
7274 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7275 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7276 |)                                                  |   |        |
7277 |    slice_group_id[ i ]                            |1  |u(v)    |
7278 #endif
7279             break;
7280         }
7281     }
7282     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7283     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7284     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7285         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7286         pps->ref_count[0]= pps->ref_count[1]= 1;
7287         return -1;
7288     }
7289
7290     pps->weighted_pred= get_bits1(&s->gb);
7291     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7292     pps->init_qp= get_se_golomb(&s->gb) + 26;
7293     pps->init_qs= get_se_golomb(&s->gb) + 26;
7294     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7295     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7296     pps->constrained_intra_pred= get_bits1(&s->gb);
7297     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7298
7299     pps->transform_8x8_mode= 0;
7300     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7301     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7302     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7303
7304     if(get_bits_count(&s->gb) < bit_length){
7305         pps->transform_8x8_mode= get_bits1(&s->gb);
7306         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7307         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7308     } else {
7309         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7310     }
7311
7312     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7313     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7314         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7315         h->pps.chroma_qp_diff= 1;
7316     } else
7317         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7318
7319     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7320         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7321                pps_id, pps->sps_id,
7322                pps->cabac ? "CABAC" : "CAVLC",
7323                pps->slice_group_count,
7324                pps->ref_count[0], pps->ref_count[1],
7325                pps->weighted_pred ? "weighted" : "",
7326                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7327                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7328                pps->constrained_intra_pred ? "CONSTR" : "",
7329                pps->redundant_pic_cnt_present ? "REDU" : "",
7330                pps->transform_8x8_mode ? "8x8DCT" : ""
7331                );
7332     }
7333
7334     return 0;
7335 }
7336
7337 /**
7338  * Call decode_slice() for each context.
7339  *
7340  * @param h h264 master context
7341  * @param context_count number of contexts to execute
7342  */
7343 static void execute_decode_slices(H264Context *h, int context_count){
7344     MpegEncContext * const s = &h->s;
7345     AVCodecContext * const avctx= s->avctx;
7346     H264Context *hx;
7347     int i;
7348
7349     if(context_count == 1) {
7350         decode_slice(avctx, h);
7351     } else {
7352         for(i = 1; i < context_count; i++) {
7353             hx = h->thread_context[i];
7354             hx->s.error_resilience = avctx->error_resilience;
7355             hx->s.error_count = 0;
7356         }
7357
7358         avctx->execute(avctx, (void *)decode_slice,
7359                        (void **)h->thread_context, NULL, context_count);
7360
7361         /* pull back stuff from slices to master context */
7362         hx = h->thread_context[context_count - 1];
7363         s->mb_x = hx->s.mb_x;
7364         s->mb_y = hx->s.mb_y;
7365         s->dropable = hx->s.dropable;
7366         s->picture_structure = hx->s.picture_structure;
7367         for(i = 1; i < context_count; i++)
7368             h->s.error_count += h->thread_context[i]->s.error_count;
7369     }
7370 }
7371
7372
7373 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7374     MpegEncContext * const s = &h->s;
7375     AVCodecContext * const avctx= s->avctx;
7376     int buf_index=0;
7377     H264Context *hx; ///< thread context
7378     int context_count = 0;
7379
7380     h->max_contexts = avctx->thread_count;
7381 #if 0
7382     int i;
7383     for(i=0; i<50; i++){
7384         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7385     }
7386 #endif
7387     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7388         h->current_slice = 0;
7389         if (!s->first_field)
7390             s->current_picture_ptr= NULL;
7391     }
7392
7393     for(;;){
7394         int consumed;
7395         int dst_length;
7396         int bit_length;
7397         uint8_t *ptr;
7398         int i, nalsize = 0;
7399         int err;
7400
7401         if(h->is_avc) {
7402             if(buf_index >= buf_size) break;
7403             nalsize = 0;
7404             for(i = 0; i < h->nal_length_size; i++)
7405                 nalsize = (nalsize << 8) | buf[buf_index++];
7406             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7407                 if(nalsize == 1){
7408                     buf_index++;
7409                     continue;
7410                 }else{
7411                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7412                     break;
7413                 }
7414             }
7415         } else {
7416             // start code prefix search
7417             for(; buf_index + 3 < buf_size; buf_index++){
7418                 // This should always succeed in the first iteration.
7419                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7420                     break;
7421             }
7422
7423             if(buf_index+3 >= buf_size) break;
7424
7425             buf_index+=3;
7426         }
7427
7428         hx = h->thread_context[context_count];
7429
7430         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7431         if (ptr==NULL || dst_length < 0){
7432             return -1;
7433         }
7434         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7435             dst_length--;
7436         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7437
7438         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7439             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7440         }
7441
7442         if (h->is_avc && (nalsize != consumed))
7443             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7444
7445         buf_index += consumed;
7446
7447         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7448            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7449             continue;
7450
7451       again:
7452         err = 0;
7453         switch(hx->nal_unit_type){
7454         case NAL_IDR_SLICE:
7455             if (h->nal_unit_type != NAL_IDR_SLICE) {
7456                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7457                 return -1;
7458             }
7459             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7460         case NAL_SLICE:
7461             init_get_bits(&hx->s.gb, ptr, bit_length);
7462             hx->intra_gb_ptr=
7463             hx->inter_gb_ptr= &hx->s.gb;
7464             hx->s.data_partitioning = 0;
7465
7466             if((err = decode_slice_header(hx, h)))
7467                break;
7468
7469             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7470             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7471                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7472                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7473                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7474                && avctx->skip_frame < AVDISCARD_ALL)
7475                 context_count++;
7476             break;
7477         case NAL_DPA:
7478             init_get_bits(&hx->s.gb, ptr, bit_length);
7479             hx->intra_gb_ptr=
7480             hx->inter_gb_ptr= NULL;
7481             hx->s.data_partitioning = 1;
7482
7483             err = decode_slice_header(hx, h);
7484             break;
7485         case NAL_DPB:
7486             init_get_bits(&hx->intra_gb, ptr, bit_length);
7487             hx->intra_gb_ptr= &hx->intra_gb;
7488             break;
7489         case NAL_DPC:
7490             init_get_bits(&hx->inter_gb, ptr, bit_length);
7491             hx->inter_gb_ptr= &hx->inter_gb;
7492
7493             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7494                && s->context_initialized
7495                && s->hurry_up < 5
7496                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7497                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=B_TYPE)
7498                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==I_TYPE)
7499                && avctx->skip_frame < AVDISCARD_ALL)
7500                 context_count++;
7501             break;
7502         case NAL_SEI:
7503             init_get_bits(&s->gb, ptr, bit_length);
7504             decode_sei(h);
7505             break;
7506         case NAL_SPS:
7507             init_get_bits(&s->gb, ptr, bit_length);
7508             decode_seq_parameter_set(h);
7509
7510             if(s->flags& CODEC_FLAG_LOW_DELAY)
7511                 s->low_delay=1;
7512
7513             if(avctx->has_b_frames < 2)
7514                 avctx->has_b_frames= !s->low_delay;
7515             break;
7516         case NAL_PPS:
7517             init_get_bits(&s->gb, ptr, bit_length);
7518
7519             decode_picture_parameter_set(h, bit_length);
7520
7521             break;
7522         case NAL_AUD:
7523         case NAL_END_SEQUENCE:
7524         case NAL_END_STREAM:
7525         case NAL_FILLER_DATA:
7526         case NAL_SPS_EXT:
7527         case NAL_AUXILIARY_SLICE:
7528             break;
7529         default:
7530             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7531         }
7532
7533         if(context_count == h->max_contexts) {
7534             execute_decode_slices(h, context_count);
7535             context_count = 0;
7536         }
7537
7538         if (err < 0)
7539             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7540         else if(err == 1) {
7541             /* Slice could not be decoded in parallel mode, copy down
7542              * NAL unit stuff to context 0 and restart. Note that
7543              * rbsp_buffer is not transfered, but since we no longer
7544              * run in parallel mode this should not be an issue. */
7545             h->nal_unit_type = hx->nal_unit_type;
7546             h->nal_ref_idc   = hx->nal_ref_idc;
7547             hx = h;
7548             goto again;
7549         }
7550     }
7551     if(context_count)
7552         execute_decode_slices(h, context_count);
7553     return buf_index;
7554 }
7555
7556 /**
7557  * returns the number of bytes consumed for building the current frame
7558  */
7559 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7560     if(s->flags&CODEC_FLAG_TRUNCATED){
7561         pos -= s->parse_context.last_index;
7562         if(pos<0) pos=0; // FIXME remove (unneeded?)
7563
7564         return pos;
7565     }else{
7566         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7567         if(pos+10>buf_size) pos=buf_size; // oops ;)
7568
7569         return pos;
7570     }
7571 }
7572
7573 static int decode_frame(AVCodecContext *avctx,
7574                              void *data, int *data_size,
7575                              uint8_t *buf, int buf_size)
7576 {
7577     H264Context *h = avctx->priv_data;
7578     MpegEncContext *s = &h->s;
7579     AVFrame *pict = data;
7580     int buf_index;
7581
7582     s->flags= avctx->flags;
7583     s->flags2= avctx->flags2;
7584
7585    /* no supplementary picture */
7586     if (buf_size == 0) {
7587         Picture *out;
7588         int i, out_idx;
7589
7590 //FIXME factorize this with the output code below
7591         out = h->delayed_pic[0];
7592         out_idx = 0;
7593         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7594             if(h->delayed_pic[i]->poc < out->poc){
7595                 out = h->delayed_pic[i];
7596                 out_idx = i;
7597             }
7598
7599         for(i=out_idx; h->delayed_pic[i]; i++)
7600             h->delayed_pic[i] = h->delayed_pic[i+1];
7601
7602         if(out){
7603             *data_size = sizeof(AVFrame);
7604             *pict= *(AVFrame*)out;
7605         }
7606
7607         return 0;
7608     }
7609
7610     if(s->flags&CODEC_FLAG_TRUNCATED){
7611         int next= ff_h264_find_frame_end(h, buf, buf_size);
7612
7613         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7614             return buf_size;
7615 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7616     }
7617
7618     if(h->is_avc && !h->got_avcC) {
7619         int i, cnt, nalsize;
7620         unsigned char *p = avctx->extradata;
7621         if(avctx->extradata_size < 7) {
7622             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7623             return -1;
7624         }
7625         if(*p != 1) {
7626             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7627             return -1;
7628         }
7629         /* sps and pps in the avcC always have length coded with 2 bytes,
7630            so put a fake nal_length_size = 2 while parsing them */
7631         h->nal_length_size = 2;
7632         // Decode sps from avcC
7633         cnt = *(p+5) & 0x1f; // Number of sps
7634         p += 6;
7635         for (i = 0; i < cnt; i++) {
7636             nalsize = AV_RB16(p) + 2;
7637             if(decode_nal_units(h, p, nalsize) < 0) {
7638                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7639                 return -1;
7640             }
7641             p += nalsize;
7642         }
7643         // Decode pps from avcC
7644         cnt = *(p++); // Number of pps
7645         for (i = 0; i < cnt; i++) {
7646             nalsize = AV_RB16(p) + 2;
7647             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7648                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7649                 return -1;
7650             }
7651             p += nalsize;
7652         }
7653         // Now store right nal length size, that will be use to parse all other nals
7654         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7655         // Do not reparse avcC
7656         h->got_avcC = 1;
7657     }
7658
7659     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7660         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7661             return -1;
7662     }
7663
7664     buf_index=decode_nal_units(h, buf, buf_size);
7665     if(buf_index < 0)
7666         return -1;
7667
7668     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7669         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7670         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7671         return -1;
7672     }
7673
7674     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7675         Picture *out = s->current_picture_ptr;
7676         Picture *cur = s->current_picture_ptr;
7677         Picture *prev = h->delayed_output_pic;
7678         int i, pics, cross_idr, out_of_order, out_idx;
7679
7680         s->mb_y= 0;
7681
7682         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7683         s->current_picture_ptr->pict_type= s->pict_type;
7684
7685         h->prev_frame_num_offset= h->frame_num_offset;
7686         h->prev_frame_num= h->frame_num;
7687         if(!s->dropable) {
7688             h->prev_poc_msb= h->poc_msb;
7689             h->prev_poc_lsb= h->poc_lsb;
7690             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7691         }
7692
7693         /*
7694          * FIXME: Error handling code does not seem to support interlaced
7695          * when slices span multiple rows
7696          * The ff_er_add_slice calls don't work right for bottom
7697          * fields; they cause massive erroneous error concealing
7698          * Error marking covers both fields (top and bottom).
7699          * This causes a mismatched s->error_count
7700          * and a bad error table. Further, the error count goes to
7701          * INT_MAX when called for bottom field, because mb_y is
7702          * past end by one (callers fault) and resync_mb_y != 0
7703          * causes problems for the first MB line, too.
7704          */
7705         if (!FIELD_PICTURE)
7706             ff_er_frame_end(s);
7707
7708         MPV_frame_end(s);
7709
7710         if (s->first_field) {
7711             /* Wait for second field. */
7712             *data_size = 0;
7713
7714         } else {
7715             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7716             /* Derive top_field_first from field pocs. */
7717             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7718
7719         //FIXME do something with unavailable reference frames
7720
7721 #if 0 //decode order
7722             *data_size = sizeof(AVFrame);
7723 #else
7724             /* Sort B-frames into display order */
7725
7726             if(h->sps.bitstream_restriction_flag
7727                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7728                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7729                 s->low_delay = 0;
7730             }
7731
7732             pics = 0;
7733             while(h->delayed_pic[pics]) pics++;
7734
7735             assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7736
7737             h->delayed_pic[pics++] = cur;
7738             if(cur->reference == 0)
7739                 cur->reference = DELAYED_PIC_REF;
7740
7741             cross_idr = 0;
7742             for(i=0; h->delayed_pic[i]; i++)
7743                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7744                     cross_idr = 1;
7745
7746             out = h->delayed_pic[0];
7747             out_idx = 0;
7748             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7749                 if(h->delayed_pic[i]->poc < out->poc){
7750                     out = h->delayed_pic[i];
7751                     out_idx = i;
7752                 }
7753
7754             out_of_order = !cross_idr && prev && out->poc < prev->poc;
7755             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7756                 { }
7757             else if(prev && pics <= s->avctx->has_b_frames)
7758                 out = prev;
7759             else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7760                || (s->low_delay &&
7761                 ((!cross_idr && prev && out->poc > prev->poc + 2)
7762                  || cur->pict_type == B_TYPE)))
7763             {
7764                 s->low_delay = 0;
7765                 s->avctx->has_b_frames++;
7766                 out = prev;
7767             }
7768             else if(out_of_order)
7769                 out = prev;
7770
7771             if(out_of_order || pics > s->avctx->has_b_frames){
7772                 for(i=out_idx; h->delayed_pic[i]; i++)
7773                     h->delayed_pic[i] = h->delayed_pic[i+1];
7774             }
7775
7776             if(prev == out)
7777                 *data_size = 0;
7778             else
7779                 *data_size = sizeof(AVFrame);
7780             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7781                 prev->reference = 0;
7782             h->delayed_output_pic = out;
7783 #endif
7784
7785             if(out)
7786                 *pict= *(AVFrame*)out;
7787             else
7788                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7789         }
7790     }
7791
7792     assert(pict->data[0] || !*data_size);
7793     ff_print_debug_info(s, pict);
7794 //printf("out %d\n", (int)pict->data[0]);
7795 #if 0 //?
7796
7797     /* Return the Picture timestamp as the frame number */
7798     /* we subtract 1 because it is added on utils.c     */
7799     avctx->frame_number = s->picture_number - 1;
7800 #endif
7801     return get_consumed_bytes(s, buf_index, buf_size);
7802 }
7803 #if 0
7804 static inline void fill_mb_avail(H264Context *h){
7805     MpegEncContext * const s = &h->s;
7806     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7807
7808     if(s->mb_y){
7809         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7810         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7811         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7812     }else{
7813         h->mb_avail[0]=
7814         h->mb_avail[1]=
7815         h->mb_avail[2]= 0;
7816     }
7817     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7818     h->mb_avail[4]= 1; //FIXME move out
7819     h->mb_avail[5]= 0; //FIXME move out
7820 }
7821 #endif
7822
7823 #if 0 //selftest
7824 #undef random
7825 #define COUNT 8000
7826 #define SIZE (COUNT*40)
7827 int main(void){
7828     int i;
7829     uint8_t temp[SIZE];
7830     PutBitContext pb;
7831     GetBitContext gb;
7832 //    int int_temp[10000];
7833     DSPContext dsp;
7834     AVCodecContext avctx;
7835
7836     dsputil_init(&dsp, &avctx);
7837
7838     init_put_bits(&pb, temp, SIZE);
7839     printf("testing unsigned exp golomb\n");
7840     for(i=0; i<COUNT; i++){
7841         START_TIMER
7842         set_ue_golomb(&pb, i);
7843         STOP_TIMER("set_ue_golomb");
7844     }
7845     flush_put_bits(&pb);
7846
7847     init_get_bits(&gb, temp, 8*SIZE);
7848     for(i=0; i<COUNT; i++){
7849         int j, s;
7850
7851         s= show_bits(&gb, 24);
7852
7853         START_TIMER
7854         j= get_ue_golomb(&gb);
7855         if(j != i){
7856             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7857 //            return -1;
7858         }
7859         STOP_TIMER("get_ue_golomb");
7860     }
7861
7862
7863     init_put_bits(&pb, temp, SIZE);
7864     printf("testing signed exp golomb\n");
7865     for(i=0; i<COUNT; i++){
7866         START_TIMER
7867         set_se_golomb(&pb, i - COUNT/2);
7868         STOP_TIMER("set_se_golomb");
7869     }
7870     flush_put_bits(&pb);
7871
7872     init_get_bits(&gb, temp, 8*SIZE);
7873     for(i=0; i<COUNT; i++){
7874         int j, s;
7875
7876         s= show_bits(&gb, 24);
7877
7878         START_TIMER
7879         j= get_se_golomb(&gb);
7880         if(j != i - COUNT/2){
7881             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7882 //            return -1;
7883         }
7884         STOP_TIMER("get_se_golomb");
7885     }
7886
7887     printf("testing 4x4 (I)DCT\n");
7888
7889     DCTELEM block[16];
7890     uint8_t src[16], ref[16];
7891     uint64_t error= 0, max_error=0;
7892
7893     for(i=0; i<COUNT; i++){
7894         int j;
7895 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7896         for(j=0; j<16; j++){
7897             ref[j]= random()%255;
7898             src[j]= random()%255;
7899         }
7900
7901         h264_diff_dct_c(block, src, ref, 4);
7902
7903         //normalize
7904         for(j=0; j<16; j++){
7905 //            printf("%d ", block[j]);
7906             block[j]= block[j]*4;
7907             if(j&1) block[j]= (block[j]*4 + 2)/5;
7908             if(j&4) block[j]= (block[j]*4 + 2)/5;
7909         }
7910 //        printf("\n");
7911
7912         s->dsp.h264_idct_add(ref, block, 4);
7913 /*        for(j=0; j<16; j++){
7914             printf("%d ", ref[j]);
7915         }
7916         printf("\n");*/
7917
7918         for(j=0; j<16; j++){
7919             int diff= FFABS(src[j] - ref[j]);
7920
7921             error+= diff*diff;
7922             max_error= FFMAX(max_error, diff);
7923         }
7924     }
7925     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7926 #if 0
7927     printf("testing quantizer\n");
7928     for(qp=0; qp<52; qp++){
7929         for(i=0; i<16; i++)
7930             src1_block[i]= src2_block[i]= random()%255;
7931
7932     }
7933 #endif
7934     printf("Testing NAL layer\n");
7935
7936     uint8_t bitstream[COUNT];
7937     uint8_t nal[COUNT*2];
7938     H264Context h;
7939     memset(&h, 0, sizeof(H264Context));
7940
7941     for(i=0; i<COUNT; i++){
7942         int zeros= i;
7943         int nal_length;
7944         int consumed;
7945         int out_length;
7946         uint8_t *out;
7947         int j;
7948
7949         for(j=0; j<COUNT; j++){
7950             bitstream[j]= (random() % 255) + 1;
7951         }
7952
7953         for(j=0; j<zeros; j++){
7954             int pos= random() % COUNT;
7955             while(bitstream[pos] == 0){
7956                 pos++;
7957                 pos %= COUNT;
7958             }
7959             bitstream[pos]=0;
7960         }
7961
7962         START_TIMER
7963
7964         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7965         if(nal_length<0){
7966             printf("encoding failed\n");
7967             return -1;
7968         }
7969
7970         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7971
7972         STOP_TIMER("NAL")
7973
7974         if(out_length != COUNT){
7975             printf("incorrect length %d %d\n", out_length, COUNT);
7976             return -1;
7977         }
7978
7979         if(consumed != nal_length){
7980             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7981             return -1;
7982         }
7983
7984         if(memcmp(bitstream, out, COUNT)){
7985             printf("mismatch\n");
7986             return -1;
7987         }
7988     }
7989
7990     printf("Testing RBSP\n");
7991
7992
7993     return 0;
7994 }
7995 #endif
7996
7997
7998 static int decode_end(AVCodecContext *avctx)
7999 {
8000     H264Context *h = avctx->priv_data;
8001     MpegEncContext *s = &h->s;
8002
8003     av_freep(&h->rbsp_buffer[0]);
8004     av_freep(&h->rbsp_buffer[1]);
8005     free_tables(h); //FIXME cleanup init stuff perhaps
8006     MPV_common_end(s);
8007
8008 //    memset(h, 0, sizeof(H264Context));
8009
8010     return 0;
8011 }
8012
8013
8014 AVCodec h264_decoder = {
8015     "h264",
8016     CODEC_TYPE_VIDEO,
8017     CODEC_ID_H264,
8018     sizeof(H264Context),
8019     decode_init,
8020     NULL,
8021     decode_end,
8022     decode_frame,
8023     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8024     .flush= flush_dpb,
8025 };
8026
8027 #include "svq3.c"