]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/mpegvideo.c
cosmetics: Move one code block to save an #ifdef in the next commit.
[ffmpeg] / libavcodec / mpegvideo.c
index 743e992dacfed88f561e1f3885366e88563c7589..f0069fa1e814dd8ee2f7584b180a5501080b77f8 100644 (file)
@@ -3,19 +3,21 @@
  * Copyright (c) 2000,2001 Fabrice Bellard.
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This library is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
- * This library is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * 4MV & hq & b-frame encoding stuff by Michael Niedermayer <michaelni@gmx.at>
  */
 #include <limits.h>
 
 #ifdef USE_FASTMEMCPY
-#include "fastmemcpy.h"
+#include "libvo/fastmemcpy.h"
 #endif
 
 //#undef NDEBUG
 //#include <assert.h>
 
 #ifdef CONFIG_ENCODERS
-static void encode_picture(MpegEncContext *s, int picture_number);
+static int encode_picture(MpegEncContext *s, int picture_number);
 #endif //CONFIG_ENCODERS
 static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
@@ -47,6 +49,8 @@ static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
 static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
+static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
+                                   DCTELEM *block, int n, int qscale);
 static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale);
 static void dct_unquantize_h263_intra_c(MpegEncContext *s,
@@ -103,7 +107,7 @@ static const uint8_t ff_default_chroma_qscale_table[32]={
 };
 
 #ifdef CONFIG_ENCODERS
-static uint8_t (*default_mv_penalty)[MAX_MV*2+1]=NULL;
+static uint8_t default_mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
 static uint8_t default_fcode_tab[MAX_MV*2+1];
 
 enum PixelFormat ff_yuv420p_list[2]= {PIX_FMT_YUV420P, -1};
@@ -128,7 +132,7 @@ static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[
                 /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
                 /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
 
-                qmat[qscale][i] = (int)((uint64_t_C(1) << QMAT_SHIFT) /
+                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) /
                                 (qscale * quant_matrix[j]));
             }
         } else if (dsp->fdct == fdct_ifast
@@ -143,7 +147,7 @@ static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[
                 /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
                 /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
 
-                qmat[qscale][i] = (int)((uint64_t_C(1) << (QMAT_SHIFT + 14)) /
+                qmat[qscale][i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) /
                                 (aanscales[i] * qscale * quant_matrix[j]));
             }
         } else {
@@ -154,7 +158,7 @@ static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[
                    so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905
                    so 32768        >= (1<<19) / (qscale * quant_matrix[i]) >= 67
                 */
-                qmat[qscale][i] = (int)((uint64_t_C(1) << QMAT_SHIFT) / (qscale * quant_matrix[j]));
+                qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) / (qscale * quant_matrix[j]));
 //                qmat  [qscale][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]);
                 qmat16[qscale][0][i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[j]);
 
@@ -178,13 +182,13 @@ static void convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[
         }
     }
     if(shift){
-        av_log(NULL, AV_LOG_INFO, "Warning, QMAT_SHIFT is larger then %d, overflows possible\n", QMAT_SHIFT - shift);
+        av_log(NULL, AV_LOG_INFO, "Warning, QMAT_SHIFT is larger than %d, overflows possible\n", QMAT_SHIFT - shift);
     }
 }
 
 static inline void update_qscale(MpegEncContext *s){
     s->qscale= (s->lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
-    s->qscale= clip(s->qscale, s->avctx->qmin, s->avctx->qmax);
+    s->qscale= av_clip(s->qscale, s->avctx->qmin, s->avctx->qmax);
 
     s->lambda2= (s->lambda*s->lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
 }
@@ -215,7 +219,7 @@ void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_s
 }
 
 #ifdef CONFIG_ENCODERS
-void ff_write_quant_matrix(PutBitContext *pb, int16_t *matrix){
+void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix){
     int i;
 
     if(matrix){
@@ -228,6 +232,36 @@ void ff_write_quant_matrix(PutBitContext *pb, int16_t *matrix){
 }
 #endif //CONFIG_ENCODERS
 
+const uint8_t *ff_find_start_code(const uint8_t * restrict p, const uint8_t *end, uint32_t * restrict state){
+    int i;
+
+    assert(p<=end);
+    if(p>=end)
+        return end;
+
+    for(i=0; i<3; i++){
+        uint32_t tmp= *state << 8;
+        *state= tmp + *(p++);
+        if(tmp == 0x100 || p==end)
+            return p;
+    }
+
+    while(p<end){
+        if     (p[-1] > 1      ) p+= 3;
+        else if(p[-2]          ) p+= 2;
+        else if(p[-3]|(p[-1]-1)) p++;
+        else{
+            p++;
+            break;
+        }
+    }
+
+    p= FFMIN(p, end)-4;
+    *state=  be2me_32(unaligned32(p));
+
+    return p+4;
+}
+
 /* init common dct for both encoder and decoder */
 int DCT_common_init(MpegEncContext *s)
 {
@@ -236,6 +270,8 @@ int DCT_common_init(MpegEncContext *s)
     s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_c;
     s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_c;
     s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_c;
+    if(s->flags & CODEC_FLAG_BITEXACT)
+        s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_bitexact;
     s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_c;
 
 #ifdef CONFIG_ENCODERS
@@ -292,6 +328,7 @@ static void copy_picture(Picture *dst, Picture *src){
     dst->type= FF_BUFFER_TYPE_COPY;
 }
 
+#ifdef CONFIG_ENCODERS
 static void copy_picture_attributes(MpegEncContext *s, AVFrame *dst, AVFrame *src){
     int i;
 
@@ -330,6 +367,7 @@ static void copy_picture_attributes(MpegEncContext *s, AVFrame *dst, AVFrame *sr
         }
     }
 }
+#endif
 
 /**
  * allocates a Picture
@@ -354,7 +392,7 @@ static int alloc_picture(MpegEncContext *s, Picture *pic, int shared){
         r= s->avctx->get_buffer(s->avctx, (AVFrame*)pic);
 
         if(r<0 || !pic->age || !pic->type || !pic->data[0]){
-           av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed (%d %d %d %p)\n", r, pic->age, pic->type, pic->data[0]);
+            av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed (%d %d %d %p)\n", r, pic->age, pic->type, pic->data[0]);
             return -1;
         }
 
@@ -453,8 +491,8 @@ static int init_duplicate_context(MpegEncContext *s, MpegEncContext *base){
     int i;
 
     // edge emu needs blocksize + filter length - 1 (=17x17 for halfpel / 21x21 for h264)
-    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*17*2); //(width + edge + align)*interlaced*MBsize*tolerance
-    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*17;
+    CHECKED_ALLOCZ(s->allocated_edge_emu_buffer, (s->width+64)*2*21*2); //(width + edge + align)*interlaced*MBsize*tolerance
+    s->edge_emu_buffer= s->allocated_edge_emu_buffer + (s->width+64)*2*21;
 
      //FIXME should be linesize instead of s->width*2 but that isnt known before get_buffer()
     CHECKED_ALLOCZ(s->me.scratchpad,  (s->width+64)*4*16*2*sizeof(uint8_t))
@@ -531,6 +569,7 @@ void ff_update_duplicate_context(MpegEncContext *dst, MpegEncContext *src){
 //STOP_TIMER("update_duplicate_context") //about 10k cycles / 0.01 sec for 1000frames on 1ghz with 2 threads
 }
 
+#ifdef CONFIG_ENCODERS
 static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContext *src){
 #define COPY(a) dst->a= src->a
     COPY(pict_type);
@@ -547,6 +586,7 @@ static void update_duplicate_context_after_me(MpegEncContext *dst, MpegEncContex
     COPY(partitioned_frame); //FIXME don't set in encode_header
 #undef COPY
 }
+#endif
 
 /**
  * sets the given MpegEncContext to common defaults (same for encoding and decoding).
@@ -585,20 +625,11 @@ void MPV_decode_defaults(MpegEncContext *s){
 
 #ifdef CONFIG_ENCODERS
 static void MPV_encode_defaults(MpegEncContext *s){
-    static int done=0;
-
+    int i;
     MPV_common_defaults(s);
 
-    if(!done){
-        int i;
-        done=1;
-
-        default_mv_penalty= av_mallocz( sizeof(uint8_t)*(MAX_FCODE+1)*(2*MAX_MV+1) );
-        memset(default_fcode_tab , 0, sizeof(uint8_t)*(2*MAX_MV+1));
-
-        for(i=-16; i<16; i++){
-            default_fcode_tab[i + MAX_MV]= 1;
-        }
+    for(i=-16; i<16; i++){
+        default_fcode_tab[i + MAX_MV]= 1;
     }
     s->me.mv_penalty= default_mv_penalty;
     s->fcode_tab= default_fcode_tab;
@@ -613,7 +644,9 @@ int MPV_common_init(MpegEncContext *s)
 {
     int y_size, c_size, yc_size, i, mb_array_size, mv_table_size, x, y;
 
-    if(s->avctx->thread_count > MAX_THREADS || (16*s->avctx->thread_count > s->height && s->height)){
+    s->mb_height = (s->height + 15) / 16;
+
+    if(s->avctx->thread_count > MAX_THREADS || (s->avctx->thread_count > s->mb_height && s->mb_height)){
         av_log(s->avctx, AV_LOG_ERROR, "too many threads\n");
         return -1;
     }
@@ -628,7 +661,6 @@ int MPV_common_init(MpegEncContext *s)
     s->flags2= s->avctx->flags2;
 
     s->mb_width  = (s->width  + 15) / 16;
-    s->mb_height = (s->height + 15) / 16;
     s->mb_stride = s->mb_width + 1;
     s->b8_stride = s->mb_width*2 + 1;
     s->b4_stride = s->mb_width*4 + 1;
@@ -657,12 +689,12 @@ int MPV_common_init(MpegEncContext *s)
     yc_size = y_size + 2 * c_size;
 
     /* convert fourcc to upper case */
-    s->avctx->codec_tag=   toupper( s->avctx->codec_tag     &0xFF)
+    s->codec_tag=          toupper( s->avctx->codec_tag     &0xFF)
                         + (toupper((s->avctx->codec_tag>>8 )&0xFF)<<8 )
                         + (toupper((s->avctx->codec_tag>>16)&0xFF)<<16)
                         + (toupper((s->avctx->codec_tag>>24)&0xFF)<<24);
 
-    s->avctx->stream_codec_tag=   toupper( s->avctx->stream_codec_tag     &0xFF)
+    s->stream_codec_tag=          toupper( s->avctx->stream_codec_tag     &0xFF)
                                + (toupper((s->avctx->stream_codec_tag>>8 )&0xFF)<<8 )
                                + (toupper((s->avctx->stream_codec_tag>>16)&0xFF)<<16)
                                + (toupper((s->avctx->stream_codec_tag>>24)&0xFF)<<24);
@@ -892,28 +924,45 @@ int MPV_encode_init(AVCodecContext *avctx)
 
     MPV_encode_defaults(s);
 
-    if(avctx->pix_fmt != PIX_FMT_YUVJ420P && avctx->pix_fmt != PIX_FMT_YUV420P){
-        av_log(avctx, AV_LOG_ERROR, "only YUV420 is supported\n");
-        return -1;
-    }
-
-    if(avctx->codec_id == CODEC_ID_MJPEG || avctx->codec_id == CODEC_ID_LJPEG){
-        if(avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL && avctx->pix_fmt != PIX_FMT_YUVJ420P){
+    switch (avctx->codec_id) {
+    case CODEC_ID_MPEG2VIDEO:
+        if(avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUV422P){
+            av_log(avctx, AV_LOG_ERROR, "only YUV420 and YUV422 are supported\n");
+            return -1;
+        }
+        break;
+    case CODEC_ID_LJPEG:
+    case CODEC_ID_MJPEG:
+        if(avctx->pix_fmt != PIX_FMT_YUVJ420P && avctx->pix_fmt != PIX_FMT_YUVJ422P &&
+           ((avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUV422P) || avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL)){
             av_log(avctx, AV_LOG_ERROR, "colorspace not supported in jpeg\n");
             return -1;
         }
-    }else{
-        if(avctx->strict_std_compliance>FF_COMPLIANCE_INOFFICIAL && avctx->pix_fmt != PIX_FMT_YUV420P){
-            av_log(avctx, AV_LOG_ERROR, "colorspace not supported\n");
+        break;
+    default:
+        if(avctx->pix_fmt != PIX_FMT_YUV420P){
+            av_log(avctx, AV_LOG_ERROR, "only YUV420 is supported\n");
             return -1;
         }
     }
 
+    switch (avctx->pix_fmt) {
+    case PIX_FMT_YUVJ422P:
+    case PIX_FMT_YUV422P:
+        s->chroma_format = CHROMA_422;
+        break;
+    case PIX_FMT_YUVJ420P:
+    case PIX_FMT_YUV420P:
+    default:
+        s->chroma_format = CHROMA_420;
+        break;
+    }
+
     s->bit_rate = avctx->bit_rate;
     s->width = avctx->width;
     s->height = avctx->height;
-    if(avctx->gop_size > 600){
-       av_log(avctx, AV_LOG_ERROR, "Warning keyframe interval too large! reducing it ...\n");
+    if(avctx->gop_size > 600 && avctx->strict_std_compliance>FF_COMPLIANCE_EXPERIMENTAL){
+        av_log(avctx, AV_LOG_ERROR, "Warning keyframe interval too large! reducing it ...\n");
         avctx->gop_size=600;
     }
     s->gop_size = avctx->gop_size;
@@ -956,6 +1005,8 @@ int MPV_encode_init(AVCodecContext *avctx)
     s->obmc= !!(s->flags & CODEC_FLAG_OBMC);
     s->loop_filter= !!(s->flags & CODEC_FLAG_LOOP_FILTER);
     s->alternate_scan= !!(s->flags & CODEC_FLAG_ALT_SCAN);
+    s->intra_vlc_format= !!(s->flags2 & CODEC_FLAG2_INTRA_VLC);
+    s->q_scale_type= !!(s->flags2 & CODEC_FLAG2_NON_LINEAR_QUANT);
 
     if(avctx->rc_max_rate && !avctx->rc_buffer_size){
         av_log(avctx, AV_LOG_ERROR, "a vbv buffer size is needed, for encoding with a maximum bitrate\n");
@@ -1036,10 +1087,37 @@ int MPV_encode_init(AVCodecContext *avctx)
     }
 
     if(s->avctx->scenechange_threshold < 1000000000 && (s->flags & CODEC_FLAG_CLOSED_GOP)){
-        av_log(avctx, AV_LOG_ERROR, "closed gop with scene change detection arent supported yet\n");
+        av_log(avctx, AV_LOG_ERROR, "closed gop with scene change detection arent supported yet, set threshold to 1000000000\n");
         return -1;
     }
 
+    if((s->flags2 & CODEC_FLAG2_INTRA_VLC) && s->codec_id != CODEC_ID_MPEG2VIDEO){
+        av_log(avctx, AV_LOG_ERROR, "intra vlc table not supported by codec\n");
+        return -1;
+    }
+
+    if(s->flags & CODEC_FLAG_LOW_DELAY){
+        if (s->codec_id != CODEC_ID_MPEG2VIDEO && s->codec_id != CODEC_ID_MPEG1VIDEO){
+            av_log(avctx, AV_LOG_ERROR, "low delay forcing is only available for mpeg1/2\n");
+            return -1;
+        }
+        if (s->max_b_frames != 0){
+            av_log(avctx, AV_LOG_ERROR, "b frames cannot be used with low delay\n");
+            return -1;
+        }
+    }
+
+    if(s->q_scale_type == 1){
+        if(s->codec_id != CODEC_ID_MPEG2VIDEO){
+            av_log(avctx, AV_LOG_ERROR, "non linear quant is only available for mpeg2\n");
+            return -1;
+        }
+        if(avctx->qmax > 12){
+            av_log(avctx, AV_LOG_ERROR, "non linear quant only supports qmax <= 12 currently\n");
+            return -1;
+        }
+    }
+
     if(s->avctx->thread_count > 1 && s->codec_id != CODEC_ID_MPEG4
        && s->codec_id != CODEC_ID_MPEG1VIDEO && s->codec_id != CODEC_ID_MPEG2VIDEO
        && (s->codec_id != CODEC_ID_H263P || !(s->flags & CODEC_FLAG_H263P_SLICE_STRUCT))){
@@ -1066,8 +1144,8 @@ int MPV_encode_init(AVCodecContext *avctx)
     }
 
     if(avctx->b_frame_strategy && (avctx->flags&CODEC_FLAG_PASS2)){
-        av_log(avctx, AV_LOG_ERROR, "b_frame_strategy must be 0 on the second pass");
-        return -1;
+        av_log(avctx, AV_LOG_INFO, "notice: b_frame_strategy only affects the first pass\n");
+        avctx->b_frame_strategy = 0;
     }
 
     i= ff_gcd(avctx->time_base.den, avctx->time_base.num);
@@ -1105,61 +1183,68 @@ int MPV_encode_init(AVCodecContext *avctx)
     switch(avctx->codec->id) {
     case CODEC_ID_MPEG1VIDEO:
         s->out_format = FMT_MPEG1;
-        s->low_delay= 0; //s->max_b_frames ? 0 : 1;
+        s->low_delay= !!(s->flags & CODEC_FLAG_LOW_DELAY);
         avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
         break;
     case CODEC_ID_MPEG2VIDEO:
         s->out_format = FMT_MPEG1;
-        s->low_delay= 0; //s->max_b_frames ? 0 : 1;
+        s->low_delay= !!(s->flags & CODEC_FLAG_LOW_DELAY);
         avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
         s->rtp_mode= 1;
         break;
     case CODEC_ID_LJPEG:
+    case CODEC_ID_JPEGLS:
     case CODEC_ID_MJPEG:
         s->out_format = FMT_MJPEG;
         s->intra_only = 1; /* force intra only for jpeg */
-        s->mjpeg_write_tables = 1; /* write all tables */
-       s->mjpeg_data_only_frames = 0; /* write all the needed headers */
-        s->mjpeg_vsample[0] = 1<<chroma_v_shift;
-        s->mjpeg_vsample[1] = 1;
-        s->mjpeg_vsample[2] = 1;
-        s->mjpeg_hsample[0] = 1<<chroma_h_shift;
-        s->mjpeg_hsample[1] = 1;
-        s->mjpeg_hsample[2] = 1;
+        s->mjpeg_write_tables = avctx->codec->id != CODEC_ID_JPEGLS;
+        s->mjpeg_data_only_frames = 0; /* write all the needed headers */
+        s->mjpeg_vsample[0] = 2;
+        s->mjpeg_vsample[1] = 2>>chroma_v_shift;
+        s->mjpeg_vsample[2] = 2>>chroma_v_shift;
+        s->mjpeg_hsample[0] = 2;
+        s->mjpeg_hsample[1] = 2>>chroma_h_shift;
+        s->mjpeg_hsample[2] = 2>>chroma_h_shift;
         if (mjpeg_init(s) < 0)
             return -1;
         avctx->delay=0;
         s->low_delay=1;
         break;
+#ifdef CONFIG_H261_ENCODER
     case CODEC_ID_H261:
+        if (ff_h261_get_picture_format(s->width, s->height) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "The specified picture size of %dx%d is not valid for the H.261 codec.\nValid sizes are 176x144, 352x288\n", s->width, s->height);
+            return -1;
+        }
         s->out_format = FMT_H261;
         avctx->delay=0;
         s->low_delay=1;
         break;
+#endif
     case CODEC_ID_H263:
         if (h263_get_picture_format(s->width, s->height) == 7) {
-            av_log(avctx, AV_LOG_INFO, "Input picture size isn't suitable for h263 codec! try h263+\n");
+            av_log(avctx, AV_LOG_INFO, "The specified picture size of %dx%d is not valid for the H.263 codec.\nValid sizes are 128x96, 176x144, 352x288, 704x576, and 1408x1152. Try H.263+.\n", s->width, s->height);
             return -1;
         }
         s->out_format = FMT_H263;
-       s->obmc= (avctx->flags & CODEC_FLAG_OBMC) ? 1:0;
+        s->obmc= (avctx->flags & CODEC_FLAG_OBMC) ? 1:0;
         avctx->delay=0;
         s->low_delay=1;
         break;
     case CODEC_ID_H263P:
         s->out_format = FMT_H263;
         s->h263_plus = 1;
-       /* Fx */
+        /* Fx */
         s->umvplus = (avctx->flags & CODEC_FLAG_H263P_UMV) ? 1:0;
-       s->h263_aic= (avctx->flags & CODEC_FLAG_H263P_AIC) ? 1:0;
-       s->modified_quant= s->h263_aic;
-       s->alt_inter_vlc= (avctx->flags & CODEC_FLAG_H263P_AIV) ? 1:0;
-       s->obmc= (avctx->flags & CODEC_FLAG_OBMC) ? 1:0;
-       s->loop_filter= (avctx->flags & CODEC_FLAG_LOOP_FILTER) ? 1:0;
-       s->unrestricted_mv= s->obmc || s->loop_filter || s->umvplus;
+        s->h263_aic= (avctx->flags & CODEC_FLAG_AC_PRED) ? 1:0;
+        s->modified_quant= s->h263_aic;
+        s->alt_inter_vlc= (avctx->flags & CODEC_FLAG_H263P_AIV) ? 1:0;
+        s->obmc= (avctx->flags & CODEC_FLAG_OBMC) ? 1:0;
+        s->loop_filter= (avctx->flags & CODEC_FLAG_LOOP_FILTER) ? 1:0;
+        s->unrestricted_mv= s->obmc || s->loop_filter || s->umvplus;
         s->h263_slice_structured= (s->flags & CODEC_FLAG_H263P_SLICE_STRUCT) ? 1:0;
 
-       /* /Fx */
+        /* /Fx */
         /* These are just to be sure */
         avctx->delay=0;
         s->low_delay=1;
@@ -1257,7 +1342,7 @@ int MPV_encode_init(AVCodecContext *avctx)
     if(s->modified_quant)
         s->chroma_qscale_table= ff_h263_chroma_qscale_table;
     s->progressive_frame=
-    s->progressive_sequence= !(avctx->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME));
+    s->progressive_sequence= !(avctx->flags & (CODEC_FLAG_INTERLACED_DCT|CODEC_FLAG_INTERLACED_ME|CODEC_FLAG_ALT_SCAN));
     s->quant_precision=5;
 
     ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp);
@@ -1313,10 +1398,6 @@ int MPV_encode_end(AVCodecContext *avctx)
 {
     MpegEncContext *s = avctx->priv_data;
 
-#ifdef STATS
-    print_stats();
-#endif
-
     ff_rate_control_uninit(s);
 
     MPV_common_end(s);
@@ -1330,14 +1411,14 @@ int MPV_encode_end(AVCodecContext *avctx)
 
 #endif //CONFIG_ENCODERS
 
-void init_rl(RLTable *rl, int use_static)
+void init_rl(RLTable *rl, uint8_t static_store[2][2*MAX_RUN + MAX_LEVEL + 3])
 {
     int8_t max_level[MAX_RUN+1], max_run[MAX_LEVEL+1];
     uint8_t index_run[MAX_RUN+1];
     int last, run, level, start, end, i;
 
     /* If table is static, we can quit if rl->max_level[0] is not NULL */
-    if(use_static && rl->max_level[0])
+    if(static_store && rl->max_level[0])
         return;
 
     /* compute max_level[], max_run[] and index_run[] */
@@ -1363,18 +1444,18 @@ void init_rl(RLTable *rl, int use_static)
             if (run > max_run[level])
                 max_run[level] = run;
         }
-        if(use_static)
-            rl->max_level[last] = av_mallocz_static(MAX_RUN + 1);
+        if(static_store)
+            rl->max_level[last] = static_store[last];
         else
             rl->max_level[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->max_level[last], max_level, MAX_RUN + 1);
-        if(use_static)
-            rl->max_run[last] = av_mallocz_static(MAX_LEVEL + 1);
+        if(static_store)
+            rl->max_run[last] = static_store[last] + MAX_RUN + 1;
         else
             rl->max_run[last] = av_malloc(MAX_LEVEL + 1);
         memcpy(rl->max_run[last], max_run, MAX_LEVEL + 1);
-        if(use_static)
-            rl->index_run[last] = av_mallocz_static(MAX_RUN + 1);
+        if(static_store)
+            rl->index_run[last] = static_store[last] + MAX_RUN + MAX_LEVEL + 2;
         else
             rl->index_run[last] = av_malloc(MAX_RUN + 1);
         memcpy(rl->index_run[last], index_run, MAX_RUN + 1);
@@ -1460,6 +1541,7 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
 
     /* mark&release old frames */
     if (s->pict_type != B_TYPE && s->last_picture_ptr && s->last_picture_ptr != s->next_picture_ptr && s->last_picture_ptr->data[0]) {
+      if(s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3){
         avctx->release_buffer(avctx, (AVFrame*)s->last_picture_ptr);
 
         /* release forgotten pictures */
@@ -1472,6 +1554,7 @@ int MPV_frame_start(MpegEncContext *s, AVCodecContext *avctx)
                 }
             }
         }
+      }
     }
 alloc:
     if(!s->encoding){
@@ -1509,7 +1592,6 @@ alloc:
 
     copy_picture(&s->current_picture, s->current_picture_ptr);
 
-  if(s->out_format != FMT_H264 || s->codec_id == CODEC_ID_SVQ3){
     if (s->pict_type != B_TYPE) {
         s->last_picture_ptr= s->next_picture_ptr;
         if(!s->dropable)
@@ -1524,7 +1606,7 @@ alloc:
     if(s->last_picture_ptr) copy_picture(&s->last_picture, s->last_picture_ptr);
     if(s->next_picture_ptr) copy_picture(&s->next_picture, s->next_picture_ptr);
 
-    if(s->pict_type != I_TYPE && (s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL)){
+    if(s->pict_type != I_TYPE && (s->last_picture_ptr==NULL || s->last_picture_ptr->data[0]==NULL) && !s->dropable){
         av_log(avctx, AV_LOG_ERROR, "warning: first frame is no keyframe\n");
         assert(s->pict_type != B_TYPE); //these should have been dropped if we don't have a reference
         goto alloc;
@@ -1543,7 +1625,6 @@ alloc:
             s->next_picture.linesize[i] *=2;
         }
     }
-  }
 
     s->hurry_up= s->avctx->hurry_up;
     s->error_resilience= avctx->error_resilience;
@@ -1593,6 +1674,7 @@ void MPV_frame_end(MpegEncContext *s)
     emms_c();
 
     s->last_pict_type    = s->pict_type;
+    s->last_lambda_for[s->pict_type]= s->current_picture_ptr->quality;
     if(s->pict_type!=B_TYPE){
         s->last_non_b_pict_type= s->pict_type;
     }
@@ -1632,19 +1714,19 @@ void MPV_frame_end(MpegEncContext *s)
  * @param color color of the arrow
  */
 static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){
-    int t, x, y, fr, f;
+    int x, y, fr, f;
 
-    sx= clip(sx, 0, w-1);
-    sy= clip(sy, 0, h-1);
-    ex= clip(ex, 0, w-1);
-    ey= clip(ey, 0, h-1);
+    sx= av_clip(sx, 0, w-1);
+    sy= av_clip(sy, 0, h-1);
+    ex= av_clip(ex, 0, w-1);
+    ey= av_clip(ey, 0, h-1);
 
     buf[sy*stride + sx]+= color;
 
-    if(ABS(ex - sx) > ABS(ey - sy)){
+    if(FFABS(ex - sx) > FFABS(ey - sy)){
         if(sx > ex){
-            t=sx; sx=ex; ex=t;
-            t=sy; sy=ey; ey=t;
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
         }
         buf+= sx + sy*stride;
         ex-= sx;
@@ -1657,8 +1739,8 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
         }
     }else{
         if(sy > ey){
-            t=sx; sx=ex; ex=t;
-            t=sy; sy=ey; ey=t;
+            FFSWAP(int, sx, ex);
+            FFSWAP(int, sy, ey);
         }
         buf+= sx + sy*stride;
         ey-= sy;
@@ -1683,10 +1765,10 @@ static void draw_line(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h
 static void draw_arrow(uint8_t *buf, int sx, int sy, int ex, int ey, int w, int h, int stride, int color){
     int dx,dy;
 
-    sx= clip(sx, -100, w+100);
-    sy= clip(sy, -100, h+100);
-    ex= clip(ex, -100, w+100);
-    ey= clip(ey, -100, h+100);
+    sx= av_clip(sx, -100, w+100);
+    sy= av_clip(sy, -100, h+100);
+    ex= av_clip(ex, -100, w+100);
+    ey= av_clip(ey, -100, h+100);
 
     dx= ex - sx;
     dy= ey - sy;
@@ -1798,7 +1880,7 @@ void ff_print_debug_info(MpegEncContext *s, AVFrame *pict){
         const int width = s->avctx->width;
         const int height= s->avctx->height;
         const int mv_sample_log2= 4 - pict->motion_subsample_log2;
-        const int mv_stride= (s->mb_width << mv_sample_log2) + 1;
+        const int mv_stride= (s->mb_width << mv_sample_log2) + (s->codec_id == CODEC_ID_H264 ? 0 : 1);
         s->low_delay=0; //needed to see the vectors without trashing the buffers
 
         avcodec_get_chroma_sub_sample(s->avctx->pix_fmt, &h_chroma_shift, &v_chroma_shift);
@@ -1975,7 +2057,7 @@ static int get_sae(uint8_t *src, int ref, int stride){
 
     for(y=0; y<16; y++){
         for(x=0; x<16; x++){
-            acc+= ABS(src[x+y*stride] - ref);
+            acc+= FFABS(src[x+y*stride] - ref);
         }
     }
 
@@ -2056,7 +2138,6 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
         }
         alloc_picture(s, (Picture*)pic, 1);
     }else{
-        int offset= 16;
         i= ff_find_unused_picture(s, 0);
 
         pic= (AVFrame*)&s->picture[i];
@@ -2064,9 +2145,9 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
 
         alloc_picture(s, (Picture*)pic, 0);
 
-        if(   pic->data[0] + offset == pic_arg->data[0]
-           && pic->data[1] + offset == pic_arg->data[1]
-           && pic->data[2] + offset == pic_arg->data[2]){
+        if(   pic->data[0] + INPLACE_OFFSET == pic_arg->data[0]
+           && pic->data[1] + INPLACE_OFFSET == pic_arg->data[1]
+           && pic->data[2] + INPLACE_OFFSET == pic_arg->data[2]){
        // empty
         }else{
             int h_chroma_shift, v_chroma_shift;
@@ -2080,7 +2161,10 @@ static int load_input_picture(MpegEncContext *s, AVFrame *pic_arg){
                 int w= s->width >>h_shift;
                 int h= s->height>>v_shift;
                 uint8_t *src= pic_arg->data[i];
-                uint8_t *dst= pic->data[i] + offset;
+                uint8_t *dst= pic->data[i];
+
+                if(!s->avctx->rc_buffer_size)
+                    dst +=INPLACE_OFFSET;
 
                 if(src_stride==dst_stride)
                     memcpy(dst, src, src_stride*h);
@@ -2122,9 +2206,9 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref){
 
                 switch(s->avctx->frame_skip_exp){
                     case 0: score= FFMAX(score, v); break;
-                    case 1: score+= ABS(v);break;
+                    case 1: score+= FFABS(v);break;
                     case 2: score+= v*v;break;
-                    case 3: score64+= ABS(v*v*(int64_t)v);break;
+                    case 3: score64+= FFABS(v*v*(int64_t)v);break;
                     case 4: score64+= v*v*(int64_t)(v*v);break;
                 }
             }
@@ -2140,6 +2224,115 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref){
     return 0;
 }
 
+static int estimate_best_b_count(MpegEncContext *s){
+    AVCodec *codec= avcodec_find_encoder(s->avctx->codec_id);
+    AVCodecContext *c= avcodec_alloc_context();
+    AVFrame input[FF_MAX_B_FRAMES+2];
+    const int scale= s->avctx->brd_scale;
+    int i, j, out_size, p_lambda, b_lambda, lambda2;
+    int outbuf_size= s->width * s->height; //FIXME
+    uint8_t *outbuf= av_malloc(outbuf_size);
+    int64_t best_rd= INT64_MAX;
+    int best_b_count= -1;
+
+    assert(scale>=0 && scale <=3);
+
+//    emms_c();
+    p_lambda= s->last_lambda_for[P_TYPE]; //s->next_picture_ptr->quality;
+    b_lambda= s->last_lambda_for[B_TYPE]; //p_lambda *FFABS(s->avctx->b_quant_factor) + s->avctx->b_quant_offset;
+    if(!b_lambda) b_lambda= p_lambda; //FIXME we should do this somewhere else
+    lambda2= (b_lambda*b_lambda + (1<<FF_LAMBDA_SHIFT)/2 ) >> FF_LAMBDA_SHIFT;
+
+    c->width = s->width >> scale;
+    c->height= s->height>> scale;
+    c->flags= CODEC_FLAG_QSCALE | CODEC_FLAG_PSNR | CODEC_FLAG_INPUT_PRESERVED /*| CODEC_FLAG_EMU_EDGE*/;
+    c->flags|= s->avctx->flags & CODEC_FLAG_QPEL;
+    c->mb_decision= s->avctx->mb_decision;
+    c->me_cmp= s->avctx->me_cmp;
+    c->mb_cmp= s->avctx->mb_cmp;
+    c->me_sub_cmp= s->avctx->me_sub_cmp;
+    c->pix_fmt = PIX_FMT_YUV420P;
+    c->time_base= s->avctx->time_base;
+    c->max_b_frames= s->max_b_frames;
+
+    if (avcodec_open(c, codec) < 0)
+        return -1;
+
+    for(i=0; i<s->max_b_frames+2; i++){
+        int ysize= c->width*c->height;
+        int csize= (c->width/2)*(c->height/2);
+        Picture pre_input, *pre_input_ptr= i ? s->input_picture[i-1] : s->next_picture_ptr;
+
+        avcodec_get_frame_defaults(&input[i]);
+        input[i].data[0]= av_malloc(ysize + 2*csize);
+        input[i].data[1]= input[i].data[0] + ysize;
+        input[i].data[2]= input[i].data[1] + csize;
+        input[i].linesize[0]= c->width;
+        input[i].linesize[1]=
+        input[i].linesize[2]= c->width/2;
+
+        if(pre_input_ptr && (!i || s->input_picture[i-1])) {
+            pre_input= *pre_input_ptr;
+
+            if(pre_input.type != FF_BUFFER_TYPE_SHARED && i) {
+                pre_input.data[0]+=INPLACE_OFFSET;
+                pre_input.data[1]+=INPLACE_OFFSET;
+                pre_input.data[2]+=INPLACE_OFFSET;
+            }
+
+            s->dsp.shrink[scale](input[i].data[0], input[i].linesize[0], pre_input.data[0], pre_input.linesize[0], c->width, c->height);
+            s->dsp.shrink[scale](input[i].data[1], input[i].linesize[1], pre_input.data[1], pre_input.linesize[1], c->width>>1, c->height>>1);
+            s->dsp.shrink[scale](input[i].data[2], input[i].linesize[2], pre_input.data[2], pre_input.linesize[2], c->width>>1, c->height>>1);
+        }
+    }
+
+    for(j=0; j<s->max_b_frames+1; j++){
+        int64_t rd=0;
+
+        if(!s->input_picture[j])
+            break;
+
+        c->error[0]= c->error[1]= c->error[2]= 0;
+
+        input[0].pict_type= I_TYPE;
+        input[0].quality= 1 * FF_QP2LAMBDA;
+        out_size = avcodec_encode_video(c, outbuf, outbuf_size, &input[0]);
+//        rd += (out_size * lambda2) >> FF_LAMBDA_SHIFT;
+
+        for(i=0; i<s->max_b_frames+1; i++){
+            int is_p= i % (j+1) == j || i==s->max_b_frames;
+
+            input[i+1].pict_type= is_p ? P_TYPE : B_TYPE;
+            input[i+1].quality= is_p ? p_lambda : b_lambda;
+            out_size = avcodec_encode_video(c, outbuf, outbuf_size, &input[i+1]);
+            rd += (out_size * lambda2) >> (FF_LAMBDA_SHIFT - 3);
+        }
+
+        /* get the delayed frames */
+        while(out_size){
+            out_size = avcodec_encode_video(c, outbuf, outbuf_size, NULL);
+            rd += (out_size * lambda2) >> (FF_LAMBDA_SHIFT - 3);
+        }
+
+        rd += c->error[0] + c->error[1] + c->error[2];
+
+        if(rd < best_rd){
+            best_rd= rd;
+            best_b_count= j;
+        }
+    }
+
+    av_freep(&outbuf);
+    avcodec_close(c);
+    av_freep(&c);
+
+    for(i=0; i<s->max_b_frames+2; i++){
+        av_freep(&input[i].data[0]);
+    }
+
+    return best_b_count;
+}
+
 static void select_input_picture(MpegEncContext *s){
     int i;
 
@@ -2159,7 +2352,7 @@ static void select_input_picture(MpegEncContext *s){
             if(s->avctx->frame_skip_threshold || s->avctx->frame_skip_factor){
                 if(s->picture_in_gop_number < s->gop_size && skip_check(s, s->input_picture[0], s->next_picture_ptr)){
                 //FIXME check that te gop check above is +-1 correct
-//av_log(NULL, AV_LOG_DEBUG, "skip %p %Ld\n", s->input_picture[0]->data[0], s->input_picture[0]->pts);
+//av_log(NULL, AV_LOG_DEBUG, "skip %p %"PRId64"\n", s->input_picture[0]->data[0], s->input_picture[0]->pts);
 
                     if(s->input_picture[0]->type == FF_BUFFER_TYPE_SHARED){
                         for(i=0; i<4; i++)
@@ -2207,7 +2400,7 @@ static void select_input_picture(MpegEncContext *s){
                     }
                 }
                 for(i=0; i<s->max_b_frames+1; i++){
-                    if(s->input_picture[i]==NULL || s->input_picture[i]->b_frame_score - 1 > s->mb_num/40) break;
+                    if(s->input_picture[i]==NULL || s->input_picture[i]->b_frame_score - 1 > s->mb_num/s->avctx->b_sensitivity) break;
                 }
 
                 b_frames= FFMAX(0, i-1);
@@ -2216,6 +2409,8 @@ static void select_input_picture(MpegEncContext *s){
                 for(i=0; i<b_frames+1; i++){
                     s->input_picture[i]->b_frame_score=0;
                 }
+            }else if(s->avctx->b_frame_strategy==2){
+                b_frames= estimate_best_b_count(s);
             }else{
                 av_log(s->avctx, AV_LOG_ERROR, "illegal b frame strategy\n");
                 b_frames=0;
@@ -2267,21 +2462,22 @@ no_output_pic:
 
         copy_picture(&s->new_picture, s->reordered_input_picture[0]);
 
-        if(s->reordered_input_picture[0]->type == FF_BUFFER_TYPE_SHARED){
+        if(s->reordered_input_picture[0]->type == FF_BUFFER_TYPE_SHARED || s->avctx->rc_buffer_size){
             // input is a shared pix, so we can't modifiy it -> alloc a new one & ensure that the shared one is reuseable
 
             int i= ff_find_unused_picture(s, 0);
             Picture *pic= &s->picture[i];
 
+            pic->reference              = s->reordered_input_picture[0]->reference;
+            alloc_picture(s, pic, 0);
+
             /* mark us unused / free shared pic */
+            if(s->reordered_input_picture[0]->type == FF_BUFFER_TYPE_INTERNAL)
+                s->avctx->release_buffer(s->avctx, (AVFrame*)s->reordered_input_picture[0]);
             for(i=0; i<4; i++)
                 s->reordered_input_picture[0]->data[i]= NULL;
             s->reordered_input_picture[0]->type= 0;
 
-            pic->reference              = s->reordered_input_picture[0]->reference;
-
-            alloc_picture(s, pic, 0);
-
             copy_picture_attributes(s, (AVFrame*)pic, (AVFrame*)s->reordered_input_picture[0]);
 
             s->current_picture_ptr= pic;
@@ -2293,7 +2489,7 @@ no_output_pic:
 
             s->current_picture_ptr= s->reordered_input_picture[0];
             for(i=0; i<4; i++){
-                s->new_picture.data[i]+=16;
+                s->new_picture.data[i]+= INPLACE_OFFSET;
             }
         }
         copy_picture(&s->current_picture, s->current_picture_ptr);
@@ -2312,11 +2508,6 @@ int MPV_encode_picture(AVCodecContext *avctx,
     AVFrame *pic_arg = data;
     int i, stuffing_count;
 
-    if(avctx->pix_fmt != PIX_FMT_YUV420P && avctx->pix_fmt != PIX_FMT_YUVJ420P){
-        av_log(avctx, AV_LOG_ERROR, "this codec supports only YUV420P\n");
-        return -1;
-    }
-
     for(i=0; i<avctx->thread_count; i++){
         int start_y= s->thread_context[i]->start_mb_y;
         int   end_y= s->thread_context[i]->  end_mb_y;
@@ -2340,8 +2531,9 @@ int MPV_encode_picture(AVCodecContext *avctx,
 //emms_c();
 //printf("qs:%f %f %d\n", s->new_picture.quality, s->current_picture.quality, s->qscale);
         MPV_frame_start(s, avctx);
-
-        encode_picture(s, s->picture_number);
+vbv_retry:
+        if (encode_picture(s, s->picture_number) < 0)
+            return -1;
 
         avctx->real_pict_num  = s->picture_number;
         avctx->header_bits = s->header_bits;
@@ -2358,6 +2550,37 @@ int MPV_encode_picture(AVCodecContext *avctx,
         if (s->out_format == FMT_MJPEG)
             mjpeg_picture_trailer(s);
 
+        if(avctx->rc_buffer_size){
+            RateControlContext *rcc= &s->rc_context;
+            int max_size= rcc->buffer_index/3;
+
+            if(put_bits_count(&s->pb) > max_size && s->lambda < s->avctx->lmax){
+                s->next_lambda= FFMAX(s->lambda+1, s->lambda*(s->qscale+1) / s->qscale);
+                if(s->adaptive_quant){
+                    int i;
+                    for(i=0; i<s->mb_height*s->mb_stride; i++)
+                        s->lambda_table[i]= FFMAX(s->lambda_table[i]+1, s->lambda_table[i]*(s->qscale+1) / s->qscale);
+                }
+                s->mb_skipped = 0;        //done in MPV_frame_start()
+                if(s->pict_type==P_TYPE){ //done in encode_picture() so we must undo it
+                    if(s->flipflop_rounding || s->codec_id == CODEC_ID_H263P || s->codec_id == CODEC_ID_MPEG4)
+                        s->no_rounding ^= 1;
+                }
+                if(s->pict_type!=B_TYPE){
+                    s->time_base= s->last_time_base;
+                    s->last_non_b_time= s->time - s->pp_time;
+                }
+//                av_log(NULL, AV_LOG_ERROR, "R:%d ", s->next_lambda);
+                for(i=0; i<avctx->thread_count; i++){
+                    PutBitContext *pb= &s->thread_context[i]->pb;
+                    init_put_bits(pb, pb->buf, pb->buf_end - pb->buf);
+                }
+                goto vbv_retry;
+            }
+
+            assert(s->avctx->rc_max_rate);
+        }
+
         if(s->flags&CODEC_FLAG_PASS1)
             ff_write_pass1_stats(s);
 
@@ -2444,10 +2667,10 @@ static inline void gmc1_motion(MpegEncContext *s,
     src_y = s->mb_y * 16 + (motion_y >> (s->sprite_warping_accuracy+1));
     motion_x<<=(3-s->sprite_warping_accuracy);
     motion_y<<=(3-s->sprite_warping_accuracy);
-    src_x = clip(src_x, -16, s->width);
+    src_x = av_clip(src_x, -16, s->width);
     if (src_x == s->width)
         motion_x =0;
-    src_y = clip(src_y, -16, s->height);
+    src_y = av_clip(src_y, -16, s->height);
     if (src_y == s->height)
         motion_y =0;
 
@@ -2472,7 +2695,7 @@ static inline void gmc1_motion(MpegEncContext *s,
 
         dxy= ((motion_x>>3)&1) | ((motion_y>>2)&2);
         if (s->no_rounding){
-           s->dsp.put_no_rnd_pixels_tab[0][dxy](dest_y, ptr, linesize, 16);
+            s->dsp.put_no_rnd_pixels_tab[0][dxy](dest_y, ptr, linesize, 16);
         }else{
             s->dsp.put_pixels_tab       [0][dxy](dest_y, ptr, linesize, 16);
         }
@@ -2486,10 +2709,10 @@ static inline void gmc1_motion(MpegEncContext *s,
     src_y = s->mb_y * 8 + (motion_y >> (s->sprite_warping_accuracy+1));
     motion_x<<=(3-s->sprite_warping_accuracy);
     motion_y<<=(3-s->sprite_warping_accuracy);
-    src_x = clip(src_x, -8, s->width>>1);
+    src_x = av_clip(src_x, -8, s->width>>1);
     if (src_x == s->width>>1)
         motion_x =0;
-    src_y = clip(src_y, -8, s->height>>1);
+    src_y = av_clip(src_y, -8, s->height>>1);
     if (src_y == s->height>>1)
         motion_y =0;
 
@@ -2659,10 +2882,10 @@ static inline int hpel_motion(MpegEncContext *s,
     src_y += motion_y >> 1;
 
     /* WARNING: do no forget half pels */
-    src_x = clip(src_x, -16, width); //FIXME unneeded for emu?
+    src_x = av_clip(src_x, -16, width); //FIXME unneeded for emu?
     if (src_x == width)
         dxy &= ~1;
-    src_y = clip(src_y, -16, height);
+    src_y = av_clip(src_y, -16, height);
     if (src_y == height)
         dxy &= ~2;
     src += src_y * stride + src_x;
@@ -2725,7 +2948,7 @@ static inline int hpel_motion_lowres(MpegEncContext *s,
 }
 
 /* apply one mpeg motion vector to the three components */
-static always_inline void mpeg_motion(MpegEncContext *s,
+static av_always_inline void mpeg_motion(MpegEncContext *s,
                                uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                int field_based, int bottom_field, int field_select,
                                uint8_t **ref_picture, op_pixels_func (*pix_op)[4],
@@ -2842,7 +3065,7 @@ if(s->quarter_sample)
 }
 
 /* apply one mpeg motion vector to the three components */
-static always_inline void mpeg_motion_lowres(MpegEncContext *s,
+static av_always_inline void mpeg_motion_lowres(MpegEncContext *s,
                                uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                int field_based, int bottom_field, int field_select,
                                uint8_t **ref_picture, h264_chroma_mc_func *pix_op,
@@ -3138,10 +3361,10 @@ static inline void chroma_4mv_motion(MpegEncContext *s,
 
     src_x = s->mb_x * 8 + mx;
     src_y = s->mb_y * 8 + my;
-    src_x = clip(src_x, -8, s->width/2);
+    src_x = av_clip(src_x, -8, s->width/2);
     if (src_x == s->width/2)
         dxy &= ~1;
-    src_y = clip(src_y, -8, s->height/2);
+    src_y = av_clip(src_y, -8, s->height/2);
     if (src_y == s->height/2)
         dxy &= ~2;
 
@@ -3215,6 +3438,18 @@ static inline void chroma_4mv_motion_lowres(MpegEncContext *s,
     pix_op[lowres](dest_cr, ptr, s->uvlinesize, block_s, sx, sy);
 }
 
+static inline void prefetch_motion(MpegEncContext *s, uint8_t **pix, int dir){
+    /* fetch pixels for estimated mv 4 macroblocks ahead
+     * optimized for 64byte cache lines */
+    const int shift = s->quarter_sample ? 2 : 1;
+    const int mx= (s->mv[dir][0][0]>>shift) + 16*s->mb_x + 8;
+    const int my= (s->mv[dir][0][1]>>shift) + 16*s->mb_y;
+    int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
+    s->dsp.prefetch(pix[0]+off, s->linesize, 4);
+    off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
+    s->dsp.prefetch(pix[1]+off, pix[2]-pix[1], 2);
+}
+
 /**
  * motion compensation of a single macroblock
  * @param s context
@@ -3239,6 +3474,8 @@ static inline void MPV_motion(MpegEncContext *s,
     mb_x = s->mb_x;
     mb_y = s->mb_y;
 
+    prefetch_motion(s, ref_picture, dir);
+
     if(s->obmc && s->pict_type != B_TYPE){
         int16_t mv_cache[4][4][2];
         const int xy= s->mb_x + s->mb_y*s->mb_stride;
@@ -3340,10 +3577,10 @@ static inline void MPV_motion(MpegEncContext *s,
                 src_y = mb_y * 16 + (motion_y >> 2) + (i >>1) * 8;
 
                 /* WARNING: do no forget half pels */
-                src_x = clip(src_x, -16, s->width);
+                src_x = av_clip(src_x, -16, s->width);
                 if (src_x == s->width)
                     dxy &= ~3;
-                src_y = clip(src_y, -16, s->height);
+                src_y = av_clip(src_y, -16, s->height);
                 if (src_y == s->height)
                     dxy &= ~12;
 
@@ -3660,7 +3897,7 @@ void ff_clean_intra_table_entries(MpegEncContext *s)
    s->mv       : motion vector
    s->interlaced_dct : true if interlaced dct used (mpeg2)
  */
-static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM block[12][64], int lowres_flag)
+static av_always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM block[12][64], int lowres_flag)
 {
     int mb_x, mb_y;
     const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
@@ -3699,7 +3936,7 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
     else if (s->h263_pred || s->h263_aic)
         s->mbintra_table[mb_xy]=1;
 
-    if ((s->flags&CODEC_FLAG_PSNR) || !(s->encoding && (s->intra_only || s->pict_type==B_TYPE))) { //FIXME precalc
+    if ((s->flags&CODEC_FLAG_PSNR) || !(s->encoding && (s->intra_only || s->pict_type==B_TYPE) && s->avctx->mb_decision != FF_MB_DECISION_RD)) { //FIXME precalc
         uint8_t *dest_y, *dest_cb, *dest_cr;
         int dct_linesize, dct_offset;
         op_pixels_func (*op_pix)[4];
@@ -3764,17 +4001,16 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
                         MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix);
                     }
                 }else{
+                    op_qpix= s->me.qpel_put;
                     if ((!s->no_rounding) || s->pict_type==B_TYPE){
                         op_pix = s->dsp.put_pixels_tab;
-                        op_qpix= s->dsp.put_qpel_pixels_tab;
                     }else{
                         op_pix = s->dsp.put_no_rnd_pixels_tab;
-                        op_qpix= s->dsp.put_no_rnd_qpel_pixels_tab;
                     }
                     if (s->mv_dir & MV_DIR_FORWARD) {
                         MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix, op_qpix);
                         op_pix = s->dsp.avg_pixels_tab;
-                        op_qpix= s->dsp.avg_qpel_pixels_tab;
+                        op_qpix= s->me.qpel_avg;
                     }
                     if (s->mv_dir & MV_DIR_BACKWARD) {
                         MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix, op_qpix);
@@ -3800,8 +4036,17 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
                 add_dequant_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    add_dequant_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
-                    add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    if (s->chroma_y_shift){
+                        add_dequant_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    }else{
+                        dct_linesize >>= 1;
+                        dct_offset >>=1;
+                        add_dequant_dct(s, block[4], 4, dest_cb,              dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[5], 5, dest_cr,              dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[6], 6, dest_cb + dct_offset, dct_linesize, s->chroma_qscale);
+                        add_dequant_dct(s, block[7], 7, dest_cr + dct_offset, dct_linesize, s->chroma_qscale);
+                    }
                 }
             } else if(s->codec_id != CODEC_ID_WMV2){
                 add_dct(s, block[0], 0, dest_y                          , dct_linesize);
@@ -3843,8 +4088,17 @@ static always_inline void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM bloc
                 put_dct(s, block[3], 3, dest_y + dct_offset + block_size, dct_linesize, s->qscale);
 
                 if(!(s->flags&CODEC_FLAG_GRAY)){
-                    put_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
-                    put_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    if(s->chroma_y_shift){
+                        put_dct(s, block[4], 4, dest_cb, uvlinesize, s->chroma_qscale);
+                        put_dct(s, block[5], 5, dest_cr, uvlinesize, s->chroma_qscale);
+                    }else{
+                        dct_offset >>=1;
+                        dct_linesize >>=1;
+                        put_dct(s, block[4], 4, dest_cb,              dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[5], 5, dest_cr,              dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[6], 6, dest_cb + dct_offset, dct_linesize, s->chroma_qscale);
+                        put_dct(s, block[7], 7, dest_cr + dct_offset, dct_linesize, s->chroma_qscale);
+                    }
                 }
             }else{
                 s->dsp.idct_put(dest_y                          , dct_linesize, block[0]);
@@ -3920,7 +4174,7 @@ static inline void dct_single_coeff_elimination(MpegEncContext *s, int n, int th
 
     for(i=0; i<=last_index; i++){
         const int j = s->intra_scantable.permutated[i];
-        const int level = ABS(block[j]);
+        const int level = FFABS(block[j]);
         if(level==1){
             if(skip_dc && i==0) continue;
             score+= tab[run];
@@ -4066,19 +4320,19 @@ static void get_vissual_weight(int16_t *weight, uint8_t *ptr, int stride){
     }
 }
 
-static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
+static av_always_inline void encode_mb_internal(MpegEncContext *s, int motion_x, int motion_y, int mb_block_height, int mb_block_count)
 {
-    int16_t weight[6][64];
-    DCTELEM orig[6][64];
+    int16_t weight[8][64];
+    DCTELEM orig[8][64];
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
     int i;
-    int skip_dct[6];
+    int skip_dct[8];
     int dct_offset   = s->linesize*8; //default for progressive frames
     uint8_t *ptr_y, *ptr_cb, *ptr_cr;
     int wrap_y, wrap_c;
 
-    for(i=0; i<6; i++) skip_dct[i]=0;
+    for(i=0; i<mb_block_count; i++) skip_dct[i]=s->skipdct;
 
     if(s->adaptive_quant){
         const int last_qp= s->qscale;
@@ -4088,17 +4342,16 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         update_qscale(s);
 
         if(!(s->flags&CODEC_FLAG_QP_RD)){
+            s->qscale= s->current_picture_ptr->qscale_table[mb_xy];
             s->dquant= s->qscale - last_qp;
 
             if(s->out_format==FMT_H263){
-                s->dquant= clip(s->dquant, -2, 2); //FIXME RD
+                s->dquant= av_clip(s->dquant, -2, 2);
 
                 if(s->codec_id==CODEC_ID_MPEG4){
                     if(!s->mb_intra){
                         if(s->pict_type == B_TYPE){
-                            if(s->dquant&1)
-                                s->dquant= (s->dquant/2)*2;
-                            if(s->mv_dir&MV_DIRECT)
+                            if(s->dquant&1 || s->mv_dir&MV_DIRECT)
                                 s->dquant= 0;
                         }
                         if(s->mv_type==MV_TYPE_8X8)
@@ -4114,16 +4367,16 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     wrap_y = s->linesize;
     wrap_c = s->uvlinesize;
     ptr_y = s->new_picture.data[0] + (mb_y * 16 * wrap_y) + mb_x * 16;
-    ptr_cb = s->new_picture.data[1] + (mb_y * 8 * wrap_c) + mb_x * 8;
-    ptr_cr = s->new_picture.data[2] + (mb_y * 8 * wrap_c) + mb_x * 8;
+    ptr_cb = s->new_picture.data[1] + (mb_y * mb_block_height * wrap_c) + mb_x * 8;
+    ptr_cr = s->new_picture.data[2] + (mb_y * mb_block_height * wrap_c) + mb_x * 8;
 
     if(mb_x*16+16 > s->width || mb_y*16+16 > s->height){
         uint8_t *ebuf= s->edge_emu_buffer + 32;
         ff_emulated_edge_mc(ebuf            , ptr_y , wrap_y,16,16,mb_x*16,mb_y*16, s->width   , s->height);
         ptr_y= ebuf;
-        ff_emulated_edge_mc(ebuf+18*wrap_y  , ptr_cb, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ff_emulated_edge_mc(ebuf+18*wrap_y  , ptr_cb, wrap_c, 8, mb_block_height, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
         ptr_cb= ebuf+18*wrap_y;
-        ff_emulated_edge_mc(ebuf+18*wrap_y+8, ptr_cr, wrap_c, 8, 8, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
+        ff_emulated_edge_mc(ebuf+18*wrap_y+8, ptr_cr, wrap_c, 8, mb_block_height, mb_x*8, mb_y*8, s->width>>1, s->height>>1);
         ptr_cr= ebuf+18*wrap_y+8;
     }
 
@@ -4143,11 +4396,13 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
                     dct_offset= wrap_y;
                     wrap_y<<=1;
+                    if (s->chroma_format == CHROMA_422)
+                        wrap_c<<=1;
                 }
             }
         }
 
-       s->dsp.get_pixels(s->block[0], ptr_y                 , wrap_y);
+        s->dsp.get_pixels(s->block[0], ptr_y                 , wrap_y);
         s->dsp.get_pixels(s->block[1], ptr_y              + 8, wrap_y);
         s->dsp.get_pixels(s->block[2], ptr_y + dct_offset    , wrap_y);
         s->dsp.get_pixels(s->block[3], ptr_y + dct_offset + 8, wrap_y);
@@ -4156,8 +4411,12 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             skip_dct[4]= 1;
             skip_dct[5]= 1;
         }else{
-           s->dsp.get_pixels(s->block[4], ptr_cb, wrap_c);
+            s->dsp.get_pixels(s->block[4], ptr_cb, wrap_c);
             s->dsp.get_pixels(s->block[5], ptr_cr, wrap_c);
+            if(!s->chroma_y_shift){ /* 422 */
+                s->dsp.get_pixels(s->block[6], ptr_cb + (dct_offset>>1), wrap_c);
+                s->dsp.get_pixels(s->block[7], ptr_cr + (dct_offset>>1), wrap_c);
+            }
         }
     }else{
         op_pixels_func (*op_pix)[4];
@@ -4169,7 +4428,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         dest_cr = s->dest[2];
 
         if ((!s->no_rounding) || s->pict_type==B_TYPE){
-           op_pix = s->dsp.put_pixels_tab;
+            op_pix = s->dsp.put_pixels_tab;
             op_qpix= s->dsp.put_qpel_pixels_tab;
         }else{
             op_pix = s->dsp.put_no_rnd_pixels_tab;
@@ -4203,11 +4462,13 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
                     dct_offset= wrap_y;
                     wrap_y<<=1;
+                    if (s->chroma_format == CHROMA_422)
+                        wrap_c<<=1;
                 }
             }
         }
 
-       s->dsp.diff_pixels(s->block[0], ptr_y                 , dest_y                 , wrap_y);
+        s->dsp.diff_pixels(s->block[0], ptr_y                 , dest_y                 , wrap_y);
         s->dsp.diff_pixels(s->block[1], ptr_y              + 8, dest_y              + 8, wrap_y);
         s->dsp.diff_pixels(s->block[2], ptr_y + dct_offset    , dest_y + dct_offset    , wrap_y);
         s->dsp.diff_pixels(s->block[3], ptr_y + dct_offset + 8, dest_y + dct_offset + 8, wrap_y);
@@ -4218,16 +4479,24 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         }else{
             s->dsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
             s->dsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
+            if(!s->chroma_y_shift){ /* 422 */
+                s->dsp.diff_pixels(s->block[6], ptr_cb + (dct_offset>>1), dest_cb + (dct_offset>>1), wrap_c);
+                s->dsp.diff_pixels(s->block[7], ptr_cr + (dct_offset>>1), dest_cr + (dct_offset>>1), wrap_c);
+            }
         }
         /* pre quantization */
         if(s->current_picture.mc_mb_var[s->mb_stride*mb_y+ mb_x]<2*s->qscale*s->qscale){
             //FIXME optimize
-           if(s->dsp.sad[1](NULL, ptr_y               , dest_y               , wrap_y, 8) < 20*s->qscale) skip_dct[0]= 1;
+            if(s->dsp.sad[1](NULL, ptr_y               , dest_y               , wrap_y, 8) < 20*s->qscale) skip_dct[0]= 1;
             if(s->dsp.sad[1](NULL, ptr_y            + 8, dest_y            + 8, wrap_y, 8) < 20*s->qscale) skip_dct[1]= 1;
             if(s->dsp.sad[1](NULL, ptr_y +dct_offset   , dest_y +dct_offset   , wrap_y, 8) < 20*s->qscale) skip_dct[2]= 1;
             if(s->dsp.sad[1](NULL, ptr_y +dct_offset+ 8, dest_y +dct_offset+ 8, wrap_y, 8) < 20*s->qscale) skip_dct[3]= 1;
             if(s->dsp.sad[1](NULL, ptr_cb              , dest_cb              , wrap_c, 8) < 20*s->qscale) skip_dct[4]= 1;
             if(s->dsp.sad[1](NULL, ptr_cr              , dest_cr              , wrap_c, 8) < 20*s->qscale) skip_dct[5]= 1;
+            if(!s->chroma_y_shift){ /* 422 */
+                if(s->dsp.sad[1](NULL, ptr_cb +(dct_offset>>1), dest_cb +(dct_offset>>1), wrap_c, 8) < 20*s->qscale) skip_dct[6]= 1;
+                if(s->dsp.sad[1](NULL, ptr_cr +(dct_offset>>1), dest_cr +(dct_offset>>1), wrap_c, 8) < 20*s->qscale) skip_dct[7]= 1;
+            }
         }
     }
 
@@ -4238,13 +4507,17 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
         if(!skip_dct[3]) get_vissual_weight(weight[3], ptr_y + dct_offset + 8, wrap_y);
         if(!skip_dct[4]) get_vissual_weight(weight[4], ptr_cb                , wrap_c);
         if(!skip_dct[5]) get_vissual_weight(weight[5], ptr_cr                , wrap_c);
-        memcpy(orig[0], s->block[0], sizeof(DCTELEM)*64*6);
+        if(!s->chroma_y_shift){ /* 422 */
+            if(!skip_dct[6]) get_vissual_weight(weight[6], ptr_cb + (dct_offset>>1), wrap_c);
+            if(!skip_dct[7]) get_vissual_weight(weight[7], ptr_cr + (dct_offset>>1), wrap_c);
+        }
+        memcpy(orig[0], s->block[0], sizeof(DCTELEM)*64*mb_block_count);
     }
 
     /* DCT & quantize */
     assert(s->out_format!=FMT_MJPEG || s->qscale==8);
     {
-        for(i=0;i<6;i++) {
+        for(i=0;i<mb_block_count;i++) {
             if(!skip_dct[i]){
                 int overflow;
                 s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, s->qscale, &overflow);
@@ -4256,7 +4529,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
                 s->block_last_index[i]= -1;
         }
         if(s->avctx->quantizer_noise_shaping){
-            for(i=0;i<6;i++) {
+            for(i=0;i<mb_block_count;i++) {
                 if(!skip_dct[i]){
                     s->block_last_index[i] = dct_quantize_refine(s, s->block[i], weight[i], orig[i], i, s->qscale);
                 }
@@ -4267,11 +4540,11 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
             for(i=0; i<4; i++)
                 dct_single_coeff_elimination(s, i, s->luma_elim_threshold);
         if(s->chroma_elim_threshold && !s->mb_intra)
-            for(i=4; i<6; i++)
+            for(i=4; i<mb_block_count; i++)
                 dct_single_coeff_elimination(s, i, s->chroma_elim_threshold);
 
         if(s->flags & CODEC_FLAG_CBP_RD){
-            for(i=0;i<6;i++) {
+            for(i=0;i<mb_block_count;i++) {
                 if(s->block_last_index[i] == -1)
                     s->coded_score[i]= INT_MAX/256;
             }
@@ -4287,7 +4560,7 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
 
     //non c quantize code returns incorrect block_last_index FIXME
     if(s->alternate_scan && s->dct_quantize != dct_quantize_c){
-        for(i=0; i<6; i++){
+        for(i=0; i<mb_block_count; i++){
             int j;
             if(s->block_last_index[i]>0){
                 for(j=63; j>0; j--){
@@ -4328,6 +4601,12 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
     }
 }
 
+static av_always_inline void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
+{
+    if (s->chroma_format == CHROMA_420) encode_mb_internal(s, motion_x, motion_y,  8, 6);
+    else                                encode_mb_internal(s, motion_x, motion_y, 16, 8);
+}
+
 #endif //CONFIG_ENCODERS
 
 void ff_mpeg_flush(AVCodecContext *avctx){
@@ -4353,6 +4632,7 @@ void ff_mpeg_flush(AVCodecContext *avctx){
     s->parse_context.index= 0;
     s->parse_context.last_index= 0;
     s->bitstream_buffer_size=0;
+    s->pp_time=0;
 }
 
 #ifdef CONFIG_ENCODERS
@@ -4437,7 +4717,7 @@ static inline void copy_context_after_encode(MpegEncContext *d, MpegEncContext *
         d->tex_pb= s->tex_pb;
     }
     d->block= s->block;
-    for(i=0; i<6; i++)
+    for(i=0; i<8; i++)
         d->block_last_index[i]= s->block_last_index[i];
     d->interlaced_dct= s->interlaced_dct;
     d->qscale= s->qscale;
@@ -4495,7 +4775,7 @@ static inline void encode_mb_hq(MpegEncContext *s, MpegEncContext *backup, MpegE
 }
 
 static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride){
-    uint32_t *sq = squareTbl + 256;
+    uint32_t *sq = ff_squareTbl + 256;
     int acc=0;
     int x,y;
 
@@ -4560,6 +4840,8 @@ static int pre_estimate_motion_thread(AVCodecContext *c, void *arg){
 static int estimate_motion_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= arg;
 
+    ff_check_alignment();
+
     s->me.dia_size= s->avctx->dia_size;
     s->first_slice_line=1;
     for(s->mb_y= s->start_mb_y; s->mb_y < s->end_mb_y; s->mb_y++) {
@@ -4586,6 +4868,8 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
     MpegEncContext *s= arg;
     int mb_x, mb_y;
 
+    ff_check_alignment();
+
     for(mb_y=s->start_mb_y; mb_y < s->end_mb_y; mb_y++) {
         for(mb_x=0; mb_x < s->mb_width; mb_x++) {
             int xx = mb_x * 16;
@@ -4633,6 +4917,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
     PutBitContext pb[2], pb2[2], tex_pb[2];
 //printf("%d->%d\n", s->resync_mb_y, s->end_mb_y);
 
+    ff_check_alignment();
+
     for(i=0; i<2; i++){
         init_put_bits(&pb    [i], bit_buf    [i], MAX_MB_BYTES);
         init_put_bits(&pb2   [i], bit_buf2   [i], MAX_MB_BYTES);
@@ -4888,16 +5174,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_BIDIR, pb, pb2, tex_pb,
                                  &dmin, &next_block, 0, 0);
                 }
-                if(mb_type&CANDIDATE_MB_TYPE_DIRECT){
-                    int mx= s->b_direct_mv_table[xy][0];
-                    int my= s->b_direct_mv_table[xy][1];
-
-                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
-                    s->mb_intra= 0;
-                    ff_mpeg4_set_direct_mv(s, mx, my);
-                    encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_DIRECT, pb, pb2, tex_pb,
-                                 &dmin, &next_block, mx, my);
-                }
                 if(mb_type&CANDIDATE_MB_TYPE_FORWARD_I){
                     s->mv_dir = MV_DIR_FORWARD;
                     s->mv_type = MV_TYPE_FIELD;
@@ -4952,12 +5228,13 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     }
                 }
 
-                if(s->flags & CODEC_FLAG_QP_RD){
-                    if(best_s.mv_type==MV_TYPE_16X16 && !(best_s.mv_dir&MV_DIRECT)){
+                if((s->flags & CODEC_FLAG_QP_RD) && dmin < INT_MAX){
+                    if(best_s.mv_type==MV_TYPE_16X16){ //FIXME move 4mv after QPRD
                         const int last_qp= backup_s.qscale;
-                        int dquant, dir, qp, dc[6];
+                        int qpi, qp, dc[6];
                         DCTELEM ac[6][16];
                         const int mvdir= (best_s.mv_dir&MV_DIR_BACKWARD) ? 1 : 0;
+                        static const int dquant_tab[4]={-1,1,-2,2};
 
                         assert(backup_s.dquant == 0);
 
@@ -4970,12 +5247,12 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->mv[1][0][0] = best_s.mv[1][0][0];
                         s->mv[1][0][1] = best_s.mv[1][0][1];
 
-                        dir= s->pict_type == B_TYPE ? 2 : 1;
-                        if(last_qp + dir > s->avctx->qmax) dir= -dir;
-                        for(dquant= dir; dquant<=2 && dquant>=-2; dquant += dir){
+                        qpi = s->pict_type == B_TYPE ? 2 : 0;
+                        for(; qpi<4; qpi++){
+                            int dquant= dquant_tab[qpi];
                             qp= last_qp + dquant;
                             if(qp < s->avctx->qmin || qp > s->avctx->qmax)
-                                break;
+                                continue;
                             backup_s.dquant= dquant;
                             if(s->mb_intra && s->dc_val[0]){
                                 for(i=0; i<6; i++){
@@ -4993,17 +5270,63 @@ static int encode_thread(AVCodecContext *c, void *arg){
                                         memcpy(s->ac_val[0][s->block_index[i]], ac[i], sizeof(DCTELEM)*16);
                                     }
                                 }
-                                if(dir > 0 && dquant==dir){
-                                    dquant= 0;
-                                    dir= -dir;
-                                }else
-                                    break;
                             }
                         }
-                        qp= best_s.qscale;
-                        s->current_picture.qscale_table[xy]= qp;
                     }
                 }
+                if(mb_type&CANDIDATE_MB_TYPE_DIRECT){
+                    int mx= s->b_direct_mv_table[xy][0];
+                    int my= s->b_direct_mv_table[xy][1];
+
+                    backup_s.dquant = 0;
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mb_intra= 0;
+                    ff_mpeg4_set_direct_mv(s, mx, my);
+                    encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_DIRECT, pb, pb2, tex_pb,
+                                 &dmin, &next_block, mx, my);
+                }
+                if(mb_type&CANDIDATE_MB_TYPE_DIRECT0){
+                    backup_s.dquant = 0;
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mb_intra= 0;
+                    ff_mpeg4_set_direct_mv(s, 0, 0);
+                    encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_DIRECT, pb, pb2, tex_pb,
+                                 &dmin, &next_block, 0, 0);
+                }
+                if(!best_s.mb_intra && s->flags2&CODEC_FLAG2_SKIP_RD){
+                    int coded=0;
+                    for(i=0; i<6; i++)
+                        coded |= s->block_last_index[i];
+                    if(coded){
+                        int mx,my;
+                        memcpy(s->mv, best_s.mv, sizeof(s->mv));
+                        if(best_s.mv_dir & MV_DIRECT){
+                            mx=my=0; //FIXME find the one we actually used
+                            ff_mpeg4_set_direct_mv(s, mx, my);
+                        }else if(best_s.mv_dir&MV_DIR_BACKWARD){
+                            mx= s->mv[1][0][0];
+                            my= s->mv[1][0][1];
+                        }else{
+                            mx= s->mv[0][0][0];
+                            my= s->mv[0][0][1];
+                        }
+
+                        s->mv_dir= best_s.mv_dir;
+                        s->mv_type = best_s.mv_type;
+                        s->mb_intra= 0;
+/*                        s->mv[0][0][0] = best_s.mv[0][0][0];
+                        s->mv[0][0][1] = best_s.mv[0][0][1];
+                        s->mv[1][0][0] = best_s.mv[1][0][0];
+                        s->mv[1][0][1] = best_s.mv[1][0][1];*/
+                        backup_s.dquant= 0;
+                        s->skipdct=1;
+                        encode_mb_hq(s, &backup_s, &best_s, CANDIDATE_MB_TYPE_INTER /* wrong but unused */, pb, pb2, tex_pb,
+                                        &dmin, &next_block, mx, my);
+                        s->skipdct=0;
+                    }
+                }
+
+                s->current_picture.qscale_table[xy]= best_s.qscale;
 
                 copy_context_after_encode(s, &best_s, -1);
 
@@ -5037,7 +5360,7 @@ static int encode_thread(AVCodecContext *c, void *arg){
                 if(s->avctx->mb_decision == FF_MB_DECISION_BITS)
                     MPV_decode_mb(s, s->block);
             } else {
-                int motion_x, motion_y;
+                int motion_x = 0, motion_y = 0;
                 s->mv_type=MV_TYPE_16X16;
                 // only one MB-Type possible
 
@@ -5063,7 +5386,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->mv[0][i][0] = s->p_field_mv_table[i][j][xy][0];
                         s->mv[0][i][1] = s->p_field_mv_table[i][j][xy][1];
                     }
-                    motion_x = motion_y = 0;
                     break;
                 case CANDIDATE_MB_TYPE_INTER4V:
                     s->mv_dir = MV_DIR_FORWARD;
@@ -5073,7 +5395,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->mv[0][i][0] = s->current_picture.motion_val[0][s->block_index[i]][0];
                         s->mv[0][i][1] = s->current_picture.motion_val[0][s->block_index[i]][1];
                     }
-                    motion_x= motion_y= 0;
                     break;
                 case CANDIDATE_MB_TYPE_DIRECT:
                     s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
@@ -5082,11 +5403,14 @@ static int encode_thread(AVCodecContext *c, void *arg){
                     motion_y=s->b_direct_mv_table[xy][1];
                     ff_mpeg4_set_direct_mv(s, motion_x, motion_y);
                     break;
+                case CANDIDATE_MB_TYPE_DIRECT0:
+                    s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD | MV_DIRECT;
+                    s->mb_intra= 0;
+                    ff_mpeg4_set_direct_mv(s, 0, 0);
+                    break;
                 case CANDIDATE_MB_TYPE_BIDIR:
                     s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
                     s->mb_intra= 0;
-                    motion_x=0;
-                    motion_y=0;
                     s->mv[0][0][0] = s->b_bidir_forw_mv_table[xy][0];
                     s->mv[0][0][1] = s->b_bidir_forw_mv_table[xy][1];
                     s->mv[1][0][0] = s->b_bidir_back_mv_table[xy][0];
@@ -5114,7 +5438,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->mv[0][i][0] = s->b_field_mv_table[0][i][j][xy][0];
                         s->mv[0][i][1] = s->b_field_mv_table[0][i][j][xy][1];
                     }
-                    motion_x=motion_y=0;
                     break;
                 case CANDIDATE_MB_TYPE_BACKWARD_I:
                     s->mv_dir = MV_DIR_BACKWARD;
@@ -5125,7 +5448,6 @@ static int encode_thread(AVCodecContext *c, void *arg){
                         s->mv[1][i][0] = s->b_field_mv_table[1][i][j][xy][0];
                         s->mv[1][i][1] = s->b_field_mv_table[1][i][j][xy][1];
                     }
-                    motion_x=motion_y=0;
                     break;
                 case CANDIDATE_MB_TYPE_BIDIR_I:
                     s->mv_dir = MV_DIR_FORWARD | MV_DIR_BACKWARD;
@@ -5138,10 +5460,8 @@ static int encode_thread(AVCodecContext *c, void *arg){
                             s->mv[dir][i][1] = s->b_field_mv_table[dir][i][j][xy][1];
                         }
                     }
-                    motion_x=motion_y=0;
                     break;
                 default:
-                    motion_x=motion_y=0; //gcc warning fix
                     av_log(s->avctx, AV_LOG_ERROR, "illegal MB type\n");
                 }
 
@@ -5244,7 +5564,40 @@ static void merge_context_after_encode(MpegEncContext *dst, MpegEncContext *src)
     flush_put_bits(&dst->pb);
 }
 
-static void encode_picture(MpegEncContext *s, int picture_number)
+static int estimate_qp(MpegEncContext *s, int dry_run){
+    if (s->next_lambda){
+        s->current_picture_ptr->quality=
+        s->current_picture.quality = s->next_lambda;
+        if(!dry_run) s->next_lambda= 0;
+    } else if (!s->fixed_qscale) {
+        s->current_picture_ptr->quality=
+        s->current_picture.quality = ff_rate_estimate_qscale(s, dry_run);
+        if (s->current_picture.quality < 0)
+            return -1;
+    }
+
+    if(s->adaptive_quant){
+        switch(s->codec_id){
+        case CODEC_ID_MPEG4:
+            ff_clean_mpeg4_qscales(s);
+            break;
+        case CODEC_ID_H263:
+        case CODEC_ID_H263P:
+        case CODEC_ID_FLV1:
+            ff_clean_h263_qscales(s);
+            break;
+        }
+
+        s->lambda= s->lambda_table[0];
+        //FIXME broken
+    }else
+        s->lambda= s->current_picture.quality;
+//printf("%d %d\n", s->avctx->global_quality, s->current_picture.quality);
+    update_qscale(s);
+    return 0;
+}
+
+static int encode_picture(MpegEncContext *s, int picture_number)
 {
     int i;
     int bits;
@@ -5272,6 +5625,18 @@ static void encode_picture(MpegEncContext *s, int picture_number)
             s->no_rounding ^= 1;
     }
 
+    if(s->flags & CODEC_FLAG_PASS2){
+        if (estimate_qp(s,1) < 0)
+            return -1;
+        ff_get_2pass_fcode(s);
+    }else if(!(s->flags & CODEC_FLAG_QSCALE)){
+        if(s->pict_type==B_TYPE)
+            s->lambda= s->last_lambda_for[s->pict_type];
+        else
+            s->lambda= s->last_lambda_for[s->last_non_b_pict_type];
+        update_qscale(s);
+    }
+
     s->mb_intra=0; //for the rate distortion & bit compare functions
     for(i=1; i<s->avctx->thread_count; i++){
         ff_update_duplicate_context(s->thread_context[i], s);
@@ -5282,7 +5647,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
     /* Estimate motion for every MB */
     if(s->pict_type != I_TYPE){
         s->lambda = (s->lambda * s->avctx->me_penalty_compensation + 128)>>8;
-        s->lambda2= (s->lambda2* s->avctx->me_penalty_compensation + 128)>>8;
+        s->lambda2= (s->lambda2* (int64_t)s->avctx->me_penalty_compensation + 128)>>8;
         if(s->pict_type != B_TYPE && s->avctx->me_threshold==0){
             if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
                 s->avctx->execute(s->avctx, pre_estimate_motion_thread, (void**)&(s->thread_context[0]), NULL, s->avctx->thread_count);
@@ -5368,28 +5733,8 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         }
     }
 
-    if (!s->fixed_qscale)
-        s->current_picture_ptr->quality=
-        s->current_picture.quality = ff_rate_estimate_qscale(s);
-
-    if(s->adaptive_quant){
-        switch(s->codec_id){
-        case CODEC_ID_MPEG4:
-            ff_clean_mpeg4_qscales(s);
-            break;
-        case CODEC_ID_H263:
-        case CODEC_ID_H263P:
-        case CODEC_ID_FLV1:
-            ff_clean_h263_qscales(s);
-            break;
-        }
-
-        s->lambda= s->lambda_table[0];
-        //FIXME broken
-    }else
-        s->lambda= s->current_picture.quality;
-//printf("%d %d\n", s->avctx->global_quality, s->current_picture.quality);
-    update_qscale(s);
+    if (estimate_qp(s, 0) < 0)
+        return -1;
 
     if(s->qscale < 3 && s->max_qcoeff<=128 && s->pict_type==I_TYPE && !(s->flags & CODEC_FLAG_QSCALE))
         s->qscale= 3; //reduce clipping problems
@@ -5400,7 +5745,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         for(i=1;i<64;i++){
             int j= s->dsp.idct_permutation[i];
 
-            s->intra_matrix[j] = clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3) & 0xFF;
+            s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
         }
         convert_matrix(&s->dsp, s->q_intra_matrix, s->q_intra_matrix16,
                        s->intra_matrix, s->intra_quant_bias, 8, 8, 1);
@@ -5465,10 +5810,9 @@ static void encode_picture(MpegEncContext *s, int picture_number)
         merge_context_after_encode(s, s->thread_context[i]);
     }
     emms_c();
+    return 0;
 }
 
-#endif //CONFIG_ENCODERS
-
 static void  denoise_dct_c(MpegEncContext *s, DCTELEM *block){
     const int intra= s->mb_intra;
     int i;
@@ -5493,8 +5837,6 @@ static void  denoise_dct_c(MpegEncContext *s, DCTELEM *block){
     }
 }
 
-#ifdef CONFIG_ENCODERS
-
 static int dct_quantize_trellis_c(MpegEncContext *s,
                         DCTELEM *block, int n,
                         int qscale, int *overflow){
@@ -5613,13 +5955,13 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 
     for(i=start_i; i<=last_non_zero; i++){
         int level_index, j;
-        const int dct_coeff= ABS(block[ scantable[i] ]);
+        const int dct_coeff= FFABS(block[ scantable[i] ]);
         const int zero_distoration= dct_coeff*dct_coeff;
         int best_score=256*256*256*120;
         for(level_index=0; level_index < coeff_count[i]; level_index++){
             int distoration;
             int level= coeff[level_index][i];
-            const int alevel= ABS(level);
+            const int alevel= FFABS(level);
             int unquant_coeff;
 
             assert(level);
@@ -5729,7 +6071,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 
     s->coded_score[n] = last_score;
 
-    dc= ABS(block[0]);
+    dc= FFABS(block[0]);
     last_non_zero= last_i - 1;
     memset(block + start_i, 0, (64-start_i)*sizeof(DCTELEM));
 
@@ -5742,7 +6084,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
 
         for(i=0; i<coeff_count[0]; i++){
             int level= coeff[i][0];
-            int alevel= ABS(level);
+            int alevel= FFABS(level);
             int unquant_coeff, score, distortion;
 
             if(s->out_format == FMT_H263){
@@ -5809,7 +6151,7 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
                         DCTELEM *block, int16_t *weight, DCTELEM *orig,
                         int n, int qscale){
     int16_t rem[64];
-    DCTELEM d1[64] __align16;
+    DECLARE_ALIGNED_16(DCTELEM, d1[64]);
     const int *qmat;
     const uint8_t *scantable= s->intra_scantable.scantable;
     const uint8_t *perm_scantable= s->intra_scantable.permutated;
@@ -5822,7 +6164,7 @@ static int dct_quantize_refine(MpegEncContext *s, //FIXME breaks denoise?
     uint8_t * length;
     uint8_t * last_length;
     int lambda;
-    int rle_index, run, q, sum;
+    int rle_index, run, q = 1, sum; //q is only used when s->mb_intra is true
 #ifdef REFINE_STATS
 static int count=0;
 static int after_last=0;
@@ -5884,7 +6226,7 @@ STOP_TIMER("memset rem[]")}
         int qns=4;
         int w;
 
-        w= ABS(weight[i]) + qns*one;
+        w= FFABS(weight[i]) + qns*one;
         w= 15 + (48*qns*one + w/2)/w; // 16 .. 63
 
         weight[i] = w;
@@ -6008,7 +6350,7 @@ STOP_TIMER("dct")}
                 int score, new_coeff, unquant_change;
 
                 score=0;
-                if(s->avctx->quantizer_noise_shaping < 2 && ABS(new_level) > ABS(level))
+                if(s->avctx->quantizer_noise_shaping < 2 && FFABS(new_level) > FFABS(level))
                    continue;
 
                 if(new_level){
@@ -6028,7 +6370,7 @@ STOP_TIMER("dct")}
                                          - last_length[UNI_AC_ENC_INDEX(run, level+64)];
                         }
                     }else{
-                        assert(ABS(new_level)==1);
+                        assert(FFABS(new_level)==1);
 
                         if(analyze_gradient){
                             int g= d1[ scantable[i] ];
@@ -6061,7 +6403,7 @@ STOP_TIMER("dct")}
                     }
                 }else{
                     new_coeff=0;
-                    assert(ABS(level)==1);
+                    assert(FFABS(level)==1);
 
                     if(i < last_non_zero){
                         int next_i= i + run2 + 1;
@@ -6129,7 +6471,7 @@ after_last++;
 #ifdef REFINE_STATS
 if(block[j]){
     if(block[j] - best_change){
-        if(ABS(block[j]) > ABS(block[j] - best_change)){
+        if(FFABS(block[j]) > FFABS(block[j] - best_change)){
             raise++;
         }else{
             lower++;
@@ -6256,7 +6598,7 @@ static int dct_quantize_c(MpegEncContext *s,
 
     /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
     if (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)
-       ff_block_permute(block, s->dsp.idct_permutation, scantable, last_non_zero);
+        ff_block_permute(block, s->dsp.idct_permutation, scantable, last_non_zero);
 
     return last_non_zero;
 }
@@ -6354,6 +6696,39 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     }
 }
 
+static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
+                                   DCTELEM *block, int n, int qscale)
+{
+    int i, level, nCoeffs;
+    const uint16_t *quant_matrix;
+    int sum=-1;
+
+    if(s->alternate_scan) nCoeffs= 63;
+    else nCoeffs= s->block_last_index[n];
+
+    if (n < 4)
+        block[0] = block[0] * s->y_dc_scale;
+    else
+        block[0] = block[0] * s->c_dc_scale;
+    quant_matrix = s->intra_matrix;
+    for(i=1;i<=nCoeffs;i++) {
+        int j= s->intra_scantable.permutated[i];
+        level = block[j];
+        if (level) {
+            if (level < 0) {
+                level = -level;
+                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+                level = -level;
+            } else {
+                level = (int)(level * qscale * quant_matrix[j]) >> 3;
+            }
+            block[j] = level;
+            sum+=level;
+        }
+    }
+    block[63]^=sum&1;
+}
+
 static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale)
 {
@@ -6568,7 +6943,7 @@ AVCodec mjpeg_encoder = {
     MPV_encode_init,
     MPV_encode_picture,
     MPV_encode_end,
-    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUVJ420P, -1},
+    .pix_fmts= (enum PixelFormat[]){PIX_FMT_YUVJ420P, PIX_FMT_YUVJ422P, -1},
 };
 
 #endif //CONFIG_ENCODERS