From: Michael Niedermayer Date: Thu, 12 May 2011 02:51:24 +0000 (+0200) Subject: Merge remote branch 'qatar/master' X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=612122b187d711257eecd517e4049cef3bb0b7f0;hp=4ea216e761e02d3f6973b316feaf3484be91a14f;p=ffmpeg Merge remote branch 'qatar/master' * qatar/master: (32 commits) 10-bit H.264 x86 chroma v loopfilter asm Port SMPTE S302M audio decoder from FFmbc 0.3. [Copyright headers corrected] Fix crash of interlaced MPEG2 decoding h264pred: fix one more aliasing violation. doc/APIchanges: fill in missing hashes and dates. flacenc: use proper initializers for AVOption default values. lavc: deprecate named constants for deprecated antialias_algo. aac: workaround for compilation on cygwin swscale: extend YUV422p support to 10bits depth tiff: add support for inverted FillOrder for uncompressed data Remove unused softfloat implementation. h264pred: fix aliasing violations. rotozoom: Eliminate French variable name. rotozoom: Check return value of fread(). rotozoom: Return an error value instead of calling exit(). rotozoom: Make init_demo() return int and check for errors on invocation. rotozoom: Drop silly UINT8 typedef. rotozoom: Drop some unnecessary parentheses. rotozoom: K&R coding style cosmetics rtsp: Only do keepalive using GET_PARAMETER if the server supports it ... Conflicts: Changelog cmdutils.c doc/APIchanges doc/general.texi ffmpeg.c ffplay.c libavcodec/h264pred_template.c libavcodec/resample.c libavutil/pixfmt.h libavutil/softfloat.c libavutil/softfloat.h tests/rotozoom.c Merged-by: Michael Niedermayer --- diff --git a/Changelog b/Changelog index 6425a494088..95756812806 100644 --- a/Changelog +++ b/Changelog @@ -12,6 +12,8 @@ version : - Lots of deprecated API cruft removed - fft and imdct optimizations for AVX (Sandy Bridge) processors - showinfo filter added +- DPX image encoder +- SMPTE 302M AES3 audio decoder version 0.7_beta1: diff --git a/doc/APIchanges b/doc/APIchanges index aa2827af080..6635ec1f30d 100644 --- a/doc/APIchanges +++ b/doc/APIchanges @@ -13,7 +13,7 @@ libavutil: 2011-04-18 API changes, most recent first: -2011-05-10 - xxxxxxx - lavc 53.3.0 - avcodec.h +2011-05-10 - 188dea1 - lavc 53.3.0 - avcodec.h Deprecate AVLPCType and the following fields in AVCodecContext: lpc_coeff_precision, prediction_order_method, min_partition_order, max_partition_order, lpc_type, lpc_passes. @@ -43,15 +43,15 @@ API changes, most recent first: Add av_dynarray_add function for adding an element to a dynamic array. -2011-04-XX - bebe72f - lavu 51.1.0 - avutil.h +2011-04-26 - bebe72f - lavu 51.1.0 - avutil.h Add AVPictureType enum and av_get_picture_type_char(), deprecate FF_*_TYPE defines and av_get_pict_type_char() defined in libavcodec/avcodec.h. -2011-04-xx - 10d3940 - lavfi 2.3.0 - avfilter.h +2011-04-26 - 10d3940 - lavfi 2.3.0 - avfilter.h Add pict_type and key_frame fields to AVFilterBufferRefVideo. -2011-04-xx - 7a11c82 - lavfi 2.2.0 - vsrc_buffer +2011-04-26 - 7a11c82 - lavfi 2.2.0 - vsrc_buffer Add sample_aspect_ratio fields to vsrc_buffer arguments 2011-04-21 - 94f7451 - lavc 53.1.0 - avcodec.h diff --git a/doc/avutil.txt b/doc/avutil.txt index 210bd072641..0847683d1d2 100644 --- a/doc/avutil.txt +++ b/doc/avutil.txt @@ -19,7 +19,6 @@ integer.c 128bit integer math lls.c mathematics.c greatest common divisor, integer sqrt, integer log2, ... 
mem.c memory allocation routines with guaranteed alignment -softfloat.c Headers: bswap.h big/little/native-endian conversion code diff --git a/doc/general.texi b/doc/general.texi index 676064ce559..a18dc6606e9 100644 --- a/doc/general.texi +++ b/doc/general.texi @@ -677,6 +677,7 @@ following image formats are supported: @item Sierra VMD audio @tab @tab X @tab Used in Sierra VMD files. @item Smacker audio @tab @tab X +@item SMPTE 302M AES3 audio @tab @tab X @item Sonic @tab X @tab X @tab experimental codec @item Sonic lossless @tab X @tab X diff --git a/ffmpeg.c b/ffmpeg.c index 2875d8aec82..23413ce5dcc 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -663,11 +663,11 @@ static void choose_pixel_fmt(AVStream *st, AVCodec *codec) } if (*p == -1) { if(st->codec->pix_fmt != PIX_FMT_NONE) - av_log(NULL, AV_LOG_WARNING, - "Incompatible pixel format '%s' for codec '%s', auto-selecting format '%s'\n", - av_pix_fmt_descriptors[st->codec->pix_fmt].name, - codec->name, - av_pix_fmt_descriptors[codec->pix_fmts[0]].name); + av_log(NULL, AV_LOG_WARNING, + "Incompatible pixel format '%s' for codec '%s', auto-selecting format '%s'\n", + av_pix_fmt_descriptors[st->codec->pix_fmt].name, + codec->name, + av_pix_fmt_descriptors[codec->pix_fmts[0]].name); st->codec->pix_fmt = codec->pix_fmts[0]; } } diff --git a/libavcodec/Makefile b/libavcodec/Makefile index a4c4d198245..e293438e452 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -329,6 +329,7 @@ OBJS-$(CONFIG_RV30_DECODER) += rv30.o rv34.o rv30dsp.o \ mpegvideo.o error_resilience.o OBJS-$(CONFIG_RV40_DECODER) += rv40.o rv34.o rv40dsp.o \ mpegvideo.o error_resilience.o +OBJS-$(CONFIG_S302M_DECODER) += s302m.o OBJS-$(CONFIG_SGI_DECODER) += sgidec.o OBJS-$(CONFIG_SGI_ENCODER) += sgienc.o rle.o OBJS-$(CONFIG_SHORTEN_DECODER) += shorten.o diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c index 6d55acbc459..187b4ad9724 100644 --- a/libavcodec/aaccoder.c +++ b/libavcodec/aaccoder.c @@ -30,6 +30,8 @@ * add sane pulse detection ***********************************/ +#include "libavutil/libm.h" // brought forward to work around cygwin header breakage + #include #include #include "avcodec.h" @@ -37,7 +39,6 @@ #include "aac.h" #include "aacenc.h" #include "aactab.h" -#include "libavutil/libm.h" /** bits needed to code codebook run value for long windows */ static const uint8_t run_value_bits_long[64] = { diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index c321596d4ae..fc74eeaf8c5 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -184,6 +184,7 @@ void avcodec_register_all(void) REGISTER_ENCDEC (RV20, rv20); REGISTER_DECODER (RV30, rv30); REGISTER_DECODER (RV40, rv40); + REGISTER_DECODER (S302M, s302m); REGISTER_ENCDEC (SGI, sgi); REGISTER_DECODER (SMACKER, smacker); REGISTER_DECODER (SMC, smc); diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 00e9dd5e2f4..2fbf9cfc2a9 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -232,6 +232,7 @@ enum CodecID { CODEC_ID_PCM_F64LE, CODEC_ID_PCM_BLURAY, CODEC_ID_PCM_LXF, + CODEC_ID_S302M, /* various ADPCM codecs */ CODEC_ID_ADPCM_IMA_QT= 0x11000, diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c index 811a75a06f5..c8d3af55465 100644 --- a/libavcodec/flacenc.c +++ b/libavcodec/flacenc.c @@ -1352,22 +1352,22 @@ static av_cold int flac_encode_close(AVCodecContext *avctx) #define FLAGS AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM static const AVOption options[] = { -{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, 
options.lpc_coeff_precision), FF_OPT_TYPE_INT, 15, 0, MAX_LPC_PRECISION, FLAGS }, -{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, -{ "none", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_NONE, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_FIXED, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_LEVINSON, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, FF_LPC_TYPE_CHOLESKY, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, -{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, -1, INT_MIN, INT_MAX, FLAGS }, -{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, -1, -1, MAX_PARTITION_ORDER, FLAGS }, -{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, -1, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, -{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_EST, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "2level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_2LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "4level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_4LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "8level", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_8LEVEL, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "search", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_SEARCH, INT_MIN, INT_MAX, FLAGS, "predm" }, -{ "log", NULL, 0, FF_OPT_TYPE_CONST, ORDER_METHOD_LOG, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "lpc_coeff_precision", "LPC coefficient precision", offsetof(FlacEncodeContext, options.lpc_coeff_precision), FF_OPT_TYPE_INT, {.dbl = 15 }, 0, MAX_LPC_PRECISION, FLAGS }, +{ "lpc_type", "LPC algorithm", offsetof(FlacEncodeContext, options.lpc_type), FF_OPT_TYPE_INT, {.dbl = FF_LPC_TYPE_DEFAULT }, FF_LPC_TYPE_DEFAULT, FF_LPC_TYPE_NB-1, FLAGS, "lpc_type" }, +{ "none", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_NONE }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "fixed", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_FIXED }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "levinson", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_LEVINSON }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "cholesky", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_LPC_TYPE_CHOLESKY }, INT_MIN, INT_MAX, FLAGS, "lpc_type" }, +{ "lpc_passes", "Number of passes to use for Cholesky factorization during LPC analysis", offsetof(FlacEncodeContext, options.lpc_passes), FF_OPT_TYPE_INT, {.dbl = -1 }, INT_MIN, INT_MAX, FLAGS }, +{ "min_partition_order", NULL, offsetof(FlacEncodeContext, options.min_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "max_partition_order", NULL, offsetof(FlacEncodeContext, options.max_partition_order), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, MAX_PARTITION_ORDER, FLAGS }, +{ "prediction_order_method", "Search method for selecting prediction order", offsetof(FlacEncodeContext, options.prediction_order_method), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, ORDER_METHOD_LOG, FLAGS, "predm" }, +{ "estimation", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_EST }, INT_MIN, INT_MAX, FLAGS, "predm" }, 
+{ "2level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_2LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "4level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_4LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "8level", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_8LEVEL }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "search", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_SEARCH }, INT_MIN, INT_MAX, FLAGS, "predm" }, +{ "log", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = ORDER_METHOD_LOG }, INT_MIN, INT_MAX, FLAGS, "predm" }, { NULL }, }; diff --git a/libavcodec/options.c b/libavcodec/options.c index de256351619..a2dbb0ba73d 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -305,11 +305,11 @@ static const AVOption options[]={ {"error", NULL, OFFSET(error_rate), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E}, #if FF_API_ANTIALIAS_ALGO {"antialias", "MP3 antialias algorithm", OFFSET(antialias_algo), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|D, "aa"}, -#endif {"auto", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_AUTO }, INT_MIN, INT_MAX, V|D, "aa"}, {"fastint", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FASTINT }, INT_MIN, INT_MAX, V|D, "aa"}, {"int", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_INT }, INT_MIN, INT_MAX, V|D, "aa"}, {"float", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_AA_FLOAT }, INT_MIN, INT_MAX, V|D, "aa"}, +#endif {"qns", "quantizer noise shaping", OFFSET(quantizer_noise_shaping), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX, V|E}, {"threads", NULL, OFFSET(thread_count), FF_OPT_TYPE_INT, {.dbl = 1 }, INT_MIN, INT_MAX, V|E|D}, {"me_threshold", "motion estimaton threshold", OFFSET(me_threshold), FF_OPT_TYPE_INT, {.dbl = DEFAULT }, INT_MIN, INT_MAX}, diff --git a/libavcodec/resample.c b/libavcodec/resample.c index d3c12f6354a..9e6defefdf6 100644 --- a/libavcodec/resample.c +++ b/libavcodec/resample.c @@ -29,6 +29,8 @@ #include "libavutil/opt.h" #include "libavutil/samplefmt.h" +#define MAX_CHANNELS 8 + struct AVResampleContext; static const char *context_to_name(void *ptr) @@ -37,20 +39,22 @@ static const char *context_to_name(void *ptr) } static const AVOption options[] = {{NULL}}; -static const AVClass audioresample_context_class = { "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT }; +static const AVClass audioresample_context_class = { + "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT +}; struct ReSampleContext { struct AVResampleContext *resample_context; - short *temp[2]; + short *temp[MAX_CHANNELS]; int temp_len; float ratio; /* channel convert */ int input_channels, output_channels, filter_channels; AVAudioConvert *convert_ctx[2]; enum AVSampleFormat sample_fmt[2]; ///< input and output sample format - unsigned sample_size[2]; ///< size of one sample in sample_fmt - short *buffer[2]; ///< buffers used for conversion to S16 - unsigned buffer_size[2]; ///< sizes of allocated buffers + unsigned sample_size[2]; ///< size of one sample in sample_fmt + short *buffer[2]; ///< buffers used for conversion to S16 + unsigned buffer_size[2]; ///< sizes of allocated buffers }; /* n1: number of samples */ @@ -104,41 +108,42 @@ static void mono_to_stereo(short *output, short *input, int n1) } } -/* XXX: should use more abstract 'N' channels system */ -static void stereo_split(short *output1, short *output2, short *input, int n) +static void deinterleave(short **output, short *input, int channels, int samples) { - int i; + int i, j; - for(i=0;i 2) - { - av_log(NULL, AV_LOG_ERROR, "Resampling with input 
channels greater than 2 unsupported.\n"); + if (input_channels > MAX_CHANNELS) { + av_log(NULL, AV_LOG_ERROR, + "Resampling with input channels greater than %d is unsupported.\n", + MAX_CHANNELS); + return NULL; + } + if (output_channels > 2 && + !(output_channels == 6 && input_channels == 2) && + output_channels != input_channels) { + av_log(NULL, AV_LOG_ERROR, + "Resampling output channel count must be 1 or 2 for mono input; 1, 2 or 6 for stereo input; or N for N channel input.\n"); return NULL; - } + } s = av_mallocz(sizeof(ReSampleContext)); - if (!s) - { + if (!s) { av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n"); return NULL; - } + } s->ratio = (float)output_rate / (float)input_rate; @@ -173,10 +185,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, if (s->output_channels < s->filter_channels) s->filter_channels = s->output_channels; - s->sample_fmt [0] = sample_fmt_in; - s->sample_fmt [1] = sample_fmt_out; - s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0])>>3; - s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1])>>3; + s->sample_fmt[0] = sample_fmt_in; + s->sample_fmt[1] = sample_fmt_out; + s->sample_size[0] = av_get_bits_per_sample_fmt(s->sample_fmt[0]) >> 3; + s->sample_size[1] = av_get_bits_per_sample_fmt(s->sample_fmt[1]) >> 3; if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) { if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1, @@ -201,17 +213,10 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, } } -/* - * AC-3 output is the only case where filter_channels could be greater than 2. - * input channels can't be greater than 2, so resample the 2 channels and then - * expand to 6 channels after the resampling. 
- */ - if(s->filter_channels>2) - s->filter_channels = 2; - #define TAPS 16 - s->resample_context= av_resample_init(output_rate, input_rate, - filter_length, log2_phase_count, linear, cutoff); + s->resample_context = av_resample_init(output_rate, input_rate, + filter_length, log2_phase_count, + linear, cutoff); *(const AVClass**)s->resample_context = &audioresample_context_class; @@ -223,9 +228,9 @@ ReSampleContext *av_audio_resample_init(int output_channels, int input_channels, int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples) { int i, nb_samples1; - short *bufin[2]; - short *bufout[2]; - short *buftmp2[2], *buftmp3[2]; + short *bufin[MAX_CHANNELS]; + short *bufout[MAX_CHANNELS]; + short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS]; short *output_bak = NULL; int lenout; @@ -240,7 +245,7 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl int ostride[1] = { 2 }; const void *ibuf[1] = { input }; void *obuf[1]; - unsigned input_size = nb_samples*s->input_channels*2; + unsigned input_size = nb_samples * s->input_channels * 2; if (!s->buffer_size[0] || s->buffer_size[0] < input_size) { av_free(s->buffer[0]); @@ -255,12 +260,13 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl obuf[0] = s->buffer[0]; if (av_audio_convert(s->convert_ctx[0], obuf, ostride, - ibuf, istride, nb_samples*s->input_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format conversion failed\n"); + ibuf, istride, nb_samples * s->input_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format conversion failed\n"); return 0; } - input = s->buffer[0]; + input = s->buffer[0]; } lenout= 2*s->output_channels*nb_samples * s->ratio + 16; @@ -282,52 +288,50 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl } /* XXX: move those malloc to resample init code */ - for(i=0; ifilter_channels; i++){ - bufin[i]= av_malloc( (nb_samples + s->temp_len) * sizeof(short) ); + for (i = 0; i < s->filter_channels; i++) { + bufin[i] = av_malloc((nb_samples + s->temp_len) * sizeof(short)); memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short)); buftmp2[i] = bufin[i] + s->temp_len; + bufout[i] = av_malloc(lenout * sizeof(short)); } - /* make some zoom to avoid round pb */ - bufout[0]= av_malloc( lenout * sizeof(short) ); - bufout[1]= av_malloc( lenout * sizeof(short) ); - - if (s->input_channels == 2 && - s->output_channels == 1) { + if (s->input_channels == 2 && s->output_channels == 1) { buftmp3[0] = output; stereo_to_mono(buftmp2[0], input, nb_samples); } else if (s->output_channels >= 2 && s->input_channels == 1) { buftmp3[0] = bufout[0]; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); - } else if (s->output_channels >= 2) { - buftmp3[0] = bufout[0]; - buftmp3[1] = bufout[1]; - stereo_split(buftmp2[0], buftmp2[1], input, nb_samples); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); + } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) { + for (i = 0; i < s->input_channels; i++) { + buftmp3[i] = bufout[i]; + } + deinterleave(buftmp2, input, s->input_channels, nb_samples); } else { buftmp3[0] = output; - memcpy(buftmp2[0], input, nb_samples*sizeof(short)); + memcpy(buftmp2[0], input, nb_samples * sizeof(short)); } nb_samples += s->temp_len; /* resample each channel */ nb_samples1 = 0; /* avoid warning */ - for(i=0;ifilter_channels;i++) { + for (i = 0; i < s->filter_channels; i++) { int consumed; - int is_last= i+1 == 
s->filter_channels; + int is_last = i + 1 == s->filter_channels; - nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], &consumed, nb_samples, lenout, is_last); - s->temp_len= nb_samples - consumed; - s->temp[i]= av_realloc(s->temp[i], s->temp_len*sizeof(short)); - memcpy(s->temp[i], bufin[i] + consumed, s->temp_len*sizeof(short)); + nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i], + &consumed, nb_samples, lenout, is_last); + s->temp_len = nb_samples - consumed; + s->temp[i] = av_realloc(s->temp[i], s->temp_len * sizeof(short)); + memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short)); } if (s->output_channels == 2 && s->input_channels == 1) { mono_to_stereo(output, buftmp3[0], nb_samples1); - } else if (s->output_channels == 2) { - stereo_mux(output, buftmp3[0], buftmp3[1], nb_samples1); - } else if (s->output_channels == 6) { + } else if (s->output_channels == 6 && s->input_channels == 2) { ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1); + } else if (s->output_channels == s->input_channels && s->input_channels >= 2) { + interleave(output, buftmp3, s->output_channels, nb_samples1); } if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) { @@ -337,25 +341,27 @@ int audio_resample(ReSampleContext *s, short *output, short *input, int nb_sampl void *obuf[1] = { output_bak }; if (av_audio_convert(s->convert_ctx[1], obuf, ostride, - ibuf, istride, nb_samples1*s->output_channels) < 0) { - av_log(s->resample_context, AV_LOG_ERROR, "Audio sample format convertion failed\n"); + ibuf, istride, nb_samples1 * s->output_channels) < 0) { + av_log(s->resample_context, AV_LOG_ERROR, + "Audio sample format convertion failed\n"); return 0; } } - for(i=0; ifilter_channels; i++) + for (i = 0; i < s->filter_channels; i++) { av_free(bufin[i]); + av_free(bufout[i]); + } - av_free(bufout[0]); - av_free(bufout[1]); return nb_samples1; } void audio_resample_close(ReSampleContext *s) { + int i; av_resample_close(s->resample_context); - av_freep(&s->temp[0]); - av_freep(&s->temp[1]); + for (i = 0; i < s->filter_channels; i++) + av_freep(&s->temp[i]); av_freep(&s->buffer[0]); av_freep(&s->buffer[1]); av_audio_convert_free(s->convert_ctx[0]); diff --git a/libavcodec/s302m.c b/libavcodec/s302m.c new file mode 100644 index 00000000000..dd0ec2ee190 --- /dev/null +++ b/libavcodec/s302m.c @@ -0,0 +1,141 @@ +/* + * SMPTE 302M decoder + * Copyright (c) 2008 Laurent Aimar + * Copyright (c) 2009 Baptiste Coudurier + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/intreadwrite.h" +#include "avcodec.h" + +#define AES3_HEADER_LEN 4 + +static int s302m_parse_frame_header(AVCodecContext *avctx, const uint8_t *buf, + int buf_size) +{ + uint32_t h; + int frame_size, channels, id, bits; + + if (buf_size <= AES3_HEADER_LEN) { + av_log(avctx, AV_LOG_ERROR, "frame is too short\n"); + return AVERROR_INVALIDDATA; + } + + /* + * AES3 header : + * size: 16 + * number channels 2 + * channel_id 8 + * bits per samples 2 + * alignments 4 + */ + + h = AV_RB32(buf); + frame_size = (h >> 16) & 0xffff; + channels = ((h >> 14) & 0x0003) * 2 + 2; + id = (h >> 6) & 0x00ff; + bits = ((h >> 4) & 0x0003) * 4 + 16; + + if (AES3_HEADER_LEN + frame_size != buf_size || bits > 24) { + av_log(avctx, AV_LOG_ERROR, "frame has invalid header\n"); + return AVERROR_INVALIDDATA; + } + + /* Set output properties */ + avctx->bits_per_coded_sample = bits; + if (bits > 16) + avctx->sample_fmt = SAMPLE_FMT_S32; + else + avctx->sample_fmt = SAMPLE_FMT_S16; + + avctx->channels = channels; + avctx->sample_rate = 48000; + avctx->bit_rate = 48000 * avctx->channels * (avctx->bits_per_coded_sample + 4) + + 32 * (48000 / (buf_size * 8 / + (avctx->channels * + (avctx->bits_per_coded_sample + 4)))); + + return frame_size; +} + +static int s302m_decode_frame(AVCodecContext *avctx, void *data, + int *data_size, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + + int frame_size = s302m_parse_frame_header(avctx, buf, buf_size); + if (frame_size < 0) + return frame_size; + + buf_size -= AES3_HEADER_LEN; + buf += AES3_HEADER_LEN; + + if (*data_size < 4 * buf_size * 8 / (avctx->bits_per_coded_sample + 4)) + return -1; + + if (avctx->bits_per_coded_sample == 24) { + uint32_t *o = data; + for (; buf_size > 6; buf_size -= 7) { + *o++ = (av_reverse[buf[2]] << 24) | + (av_reverse[buf[1]] << 16) | + (av_reverse[buf[0]] << 8); + *o++ = (av_reverse[buf[6] & 0xf0] << 28) | + (av_reverse[buf[5]] << 20) | + (av_reverse[buf[4]] << 12) | + (av_reverse[buf[3] & 0x0f] << 8); + buf += 7; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } else if (avctx->bits_per_coded_sample == 20) { + uint32_t *o = data; + for (; buf_size > 5; buf_size -= 6) { + *o++ = (av_reverse[buf[2] & 0xf0] << 28) | + (av_reverse[buf[1]] << 20) | + (av_reverse[buf[0]] << 12); + *o++ = (av_reverse[buf[5] & 0xf0] << 28) | + (av_reverse[buf[4]] << 20) | + (av_reverse[buf[3]] << 12); + buf += 6; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } else { + uint16_t *o = data; + for (; buf_size > 4; buf_size -= 5) { + *o++ = (av_reverse[buf[1]] << 8) | + av_reverse[buf[0]]; + *o++ = (av_reverse[buf[4] & 0xf0] << 12) | + (av_reverse[buf[3]] << 4) | + av_reverse[buf[2] & 0x0f]; + buf += 5; + } + *data_size = (uint8_t*) o - (uint8_t*) data; + } + + return buf - avpkt->data; +} + + +AVCodec ff_s302m_decoder = { + .name = "s302m", + .type = AVMEDIA_TYPE_AUDIO, + .id = CODEC_ID_S302M, + .priv_data_size = 0, + .decode = s302m_decode_frame, + .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"), +}; diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c index f252913b62f..7e7ae0f0073 100644 --- a/libavcodec/tiff.c +++ b/libavcodec/tiff.c @@ -168,7 +168,13 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin } switch(s->compr){ case TIFF_RAW: - memcpy(dst, 
src, width); + if (!s->fill_order) { + memcpy(dst, src, width); + } else { + int i; + for (i = 0; i < width; i++) + dst[i] = av_reverse[src[i]]; + } src += width; break; case TIFF_PACKBITS: diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 6416e600a3a..47758536971 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ + x86/h264_deblock_10bit.o \ x86/h264_weight.o \ x86/h264_idct.o \ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 1a5413bda4e..d867dc3e6a2 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index fb9cacfd11a..dbf3daa02ea 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -1,10 +1,11 @@ ;***************************************************************************** -;* MMX/SSE2-optimized H.264 deblocking code +;* MMX/SSE2/AVX-optimized H.264 deblocking code ;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project +;* Copyright (C) 2005-2011 x264 project ;* ;* Authors: Loren Merritt ;* Jason Garrett-Glaser +;* Oskar Arvidsson ;* ;* This file is part of FFmpeg. 
;* @@ -26,96 +27,94 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA +SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pb_A1 -SECTION .text - ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] -; in: 8 rows of 4 bytes in %1..%8 +%define PASS8ROWS(base, base3, stride, stride3, offset) \ + PASS8ROWS(base+offset, base3+offset, stride, stride3) + +; in: 8 rows of 4 bytes in %4..%11 ; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 +%macro TRANSPOSE4x8_LOAD 11 + movh m0, %4 + movh m2, %5 + movh m1, %6 + movh m3, %7 + punpckl%1 m0, m2 + punpckl%1 m1, m3 + mova m2, m0 + punpckl%2 m0, m1 + punpckh%2 m2, m1 + + movh m4, %8 + movh m6, %9 + movh m5, %10 + movh m7, %11 + punpckl%1 m4, m6 + punpckl%1 m5, m7 + mova m6, m4 + punpckl%2 m4, m5 + punpckh%2 m6, m5 + + punpckh%3 m1, m0, m4 + punpckh%3 m3, m2, m6 + punpckl%3 m0, m4 + punpckl%3 m2, m6 %endmacro ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 +%macro TRANSPOSE8x4B_STORE 8 + punpckhdq m4, m0, m0 + punpckhdq m5, m1, m1 + punpckhdq m6, m2, m2 punpcklbw m0, m1 punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + movh %1, m1 punpckhdq m1, m1 - movd %4, m1 + movh %2, m1 + movh %3, m0 + punpckhdq m0, m0 + movh %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + movh %5, m5 punpckhdq m5, m5 - movd %8, m5 + movh %6, m5 + movh %7, m4 + punpckhdq m4, m4 + movh %8, m4 +%endmacro + +%macro TRANSPOSE4x8B_LOAD 8 + TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 %endmacro %macro SBUTTERFLY3 4 - movq %4, %2 + punpckh%1 %4, %2, %3 punpckl%1 %2, %3 - punpckh%1 %4, %3 %endmacro ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -123,30 +122,32 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY3 bw, m6, %8, m5 - SBUTTERFLY3 wd, m0, m2, m1 - SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + movq [%9+0x10], m3 + SBUTTERFLY3 bw, m6, %8, m7 + SBUTTERFLY wd, 0, 2, 3 + SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY3 wd, m7, [%9+0x10], m6 - SBUTTERFLY3 wd, m3, m5, m4 - SBUTTERFLY3 dq, m7, m3, m0 - SBUTTERFLY3 dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 + SBUTTERFLY3 wd, m1, [%9+0x10], m3 + SBUTTERFLY wd, 5, 7, 0 + SBUTTERFLY dq, 1, 5, 0 + SBUTTERFLY dq, 2, 6, 0 + punpckldq m3, m7 + movq [%9+0x10], m2 + movq [%9+0x20], m6 + movq [%9+0x30], m1 + movq [%9+0x40], m5 + movq [%9+0x50], m3 + RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 + RESET_MM_PERMUTATION movq m0, %1 movq m1, %2 movq m2, %3 @@ -154,38 +155,44 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY3 bw, m0, m1, m7 - SBUTTERFLY3 bw, m2, m3, m1 - SBUTTERFLY3 bw, m4, m5, m3 - SBUTTERFLY3 bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY3 wd, m0, m2, m3 - SBUTTERFLY3 wd, m4, m6, m2 - SBUTTERFLY3 wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY3 wd, m2, m5, m1 - SBUTTERFLY3 dq, m0, m4, m5 - SBUTTERFLY3 dq, m7, m2, m4 + SBUTTERFLY bw, 0, 1, 7 + SBUTTERFLY bw, 2, 3, 7 + SBUTTERFLY bw, 4, 5, 7 + SBUTTERFLY3 bw, m6, %8, m7 + movq %9, m5 + SBUTTERFLY wd, 0, 2, 5 + SBUTTERFLY wd, 4, 6, 5 + SBUTTERFLY wd, 1, 3, 5 + movq %11, m6 + movq m6, %9 + SBUTTERFLY wd, 6, 7, 5 + SBUTTERFLY dq, 0, 4, 5 + SBUTTERFLY dq, 1, 6, 5 movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY3 dq, m3, %11, m0 - SBUTTERFLY3 dq, m6, m1, m5 - movq %11, m3 + movq %10, m4 + movq %13, m1 + movq %14, m6 + SBUTTERFLY3 dq, m2, %11, m0 + SBUTTERFLY dq, 3, 7, 4 + movq %11, m2 movq %12, m0 - movq %15, m6 - movq %16, m5 + movq %15, m3 + movq %16, m7 + RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 +%if avx_enabled == 0 mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%else + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%endif por %4, %5 psubusb %4, %3 %endmacro @@ -193,32 +200,28 @@ SECTION .text ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5 +%ifdef ARCH_X86_64 + psubusb %5, %2, %1 + psubusb %4, %1, %2 +%else mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 +%endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 movd m4, %1 movd m5, %2 - SPLATW m4 - SPLATW m5 + SPLATW m4, m4 + SPLATW m5, m5 packuswb m4, m4 ; 16x alpha-1 packuswb m5, m5 ; 16x beta-1 %if %0>2 @@ -237,8 +240,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 + pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 @@ -264,14 +266,12 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - mova %6, m1 - 
pavgb %6, m2 + pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 + psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 @@ -280,10 +280,10 @@ SECTION .text %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +%macro DEBLOCK_LUMA 1 +cglobal deblock_v_luma_8_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -307,8 +307,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - mova m7, m8 - psubb m7, m6 + psubb m7, m8, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -326,10 +325,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_8_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -350,13 +349,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_8_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -365,7 +364,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +374,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -383,14 +382,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7 add rsp, 0x68 %endif RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA sse2 +INIT_AVX +DEBLOCK_LUMA avx %else %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_8_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -419,8 +424,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 + psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -441,10 +445,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5 RET 
;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_8_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -467,11 +471,11 @@ cglobal x264_deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_8_%1 %endif ADD esp, 20 @@ -484,7 +488,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +496,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -502,22 +506,34 @@ INIT_MMX DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 +INIT_AVX +DEBLOCK_LUMA avx, v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory +%ifdef ARCH_X86_64 + pavgb t0, p2, p1 + pavgb t1, p0, q0 +%else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 +%endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 +%ifdef ARCH_X86_64 + paddb t2, p2, p1 + paddb t3, p0, q0 +%else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 +%endif paddb t2, t3 mova t3, t2 mova t4, t2 @@ -527,10 +543,15 @@ DEBLOCK_LUMA sse2, v, 16 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; +%ifdef ARCH_X86_64 + pavgb t1, p2, q1 + psubb t2, p2, q1 +%else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 +%endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 @@ -543,10 +564,8 @@ DEBLOCK_LUMA sse2, v, 16 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 + pxor t3, p0, q1 + pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -560,9 +579,8 @@ DEBLOCK_LUMA sse2, v, 16 mova %1, t1 ; store p0 mova t1, %4 ; p3 - mova t2, t1 + paddb t2, t1, p2 pavgb t1, p2 - paddb t2, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 @@ -624,9 +642,9 @@ DEBLOCK_LUMA sse2, v, 16 %endif ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_8_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -686,9 +704,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal 
x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_8_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -704,7 +722,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_8_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -717,7 +735,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_8_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -736,10 +754,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_8_%1 %endif ADD esp, 16 @@ -760,13 +778,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 INIT_XMM DEBLOCK_LUMA_INTRA sse2, v +INIT_AVX +DEBLOCK_LUMA_INTRA avx , v %ifndef ARCH_X86_64 INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif - - INIT_MMX %macro CHROMA_V_START 0 @@ -790,23 +808,23 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_8_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_8_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -815,17 +833,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 %define buf1 r2m %endif CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq m0, buf0 movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_inter_body_mmxext: +ff_chroma_inter_body_mmxext: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 @@ -850,31 +868,31 @@ x264_chroma_inter_body_mmxext: %define t6 r5 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 +cglobal deblock_v_chroma_intra_8_mmxext, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET 
;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 +cglobal deblock_h_chroma_intra_8_mmxext, 4,6 CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) - call x264_chroma_intra_body_mmxext - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) + call ff_chroma_intra_body_mmxext + TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -x264_chroma_intra_body_mmxext: +ff_chroma_intra_body_mmxext: LOAD_MASK r2d, r3d movq m5, m1 movq m6, m2 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm new file mode 100644 index 00000000000..c253d029540 --- /dev/null +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -0,0 +1,910 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Oskar Arvidsson +;* Loren Merritt +;* Jason Garrett-Glaser +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +pw_pixel_max: times 8 dw ((1 << 10)-1) + +SECTION .text + +cextern pw_2 +cextern pw_3 +cextern pw_4 + +; out: %4 = |%1-%2|-%3 +; clobbers: %5 +%macro ABS_SUB 5 + psubusw %5, %2, %1 + psubusw %4, %1, %2 + por %4, %5 + psubw %4, %3 +%endmacro + +; out: %4 = |%1-%2|<%3 +%macro DIFF_LT 5 + psubusw %4, %2, %1 + psubusw %5, %1, %2 + por %5, %4 ; |%1-%2| + pxor %4, %4 + psubw %5, %3 ; |%1-%2|-%3 + pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 +%endmacro + +%macro LOAD_AB 4 + movd %1, %3 + movd %2, %4 + SPLATW %1, %1 + SPLATW %2, %2 +%endmacro + +; in: %2=tc reg +; out: %1=splatted tc +%macro LOAD_TC 2 + movd %1, [%2] + punpcklbw %1, %1 +%if mmsize == 8 + pshufw %1, %1, 0 +%else + pshuflw %1, %1, 01010000b + pshufd %1, %1, 01010000b +%endif + psraw %1, 6 +%endmacro + +; in: %1=p1, %2=p0, %3=q0, %4=q1 +; %5=alpha, %6=beta, %7-%9=tmp +; out: %7=mask +%macro LOAD_MASK 9 + ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha + ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta + pand %8, %9 + ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta + pxor %7, %7 + pand %8, %9 + pcmpgtw %7, %8 +%endmacro + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', m2=q0' +%macro DEBLOCK_P0_Q0 7 + psubw %3, %4 + pxor %7, %7 + paddw %3, [pw_4] + psubw %7, %5 + psubw %6, %2, %1 + psllw %6, 2 + paddw %3, %6 + psraw %3, 3 + mova %6, [pw_pixel_max] + CLIPW %3, %7, %5 + pxor %7, %7 + paddw %1, %3 + psubw %2, %3 + CLIPW %1, %7, %6 + CLIPW %2, %7, %6 +%endmacro + +; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp +%macro LUMA_Q1 6 + pavgw %6, %3, %4 ; (p0+q0+1)>>1 + paddw %1, %6 + pxor %6, %6 + psraw %1, 1 + psubw %6, %5 + psubw %1, %2 + CLIPW %1, %6, %5 + paddw %1, %2 +%endmacro + +%macro LUMA_DEBLOCK_ONE 3 + DIFF_LT m5, %1, bm, m4, m6 + pxor m6, m6 + mova %3, m4 + pcmpgtw m6, tcm + pand m4, tcm + pandn m6, m7 + pand m4, m6 + LUMA_Q1 m5, %2, m1, m2, m4, m6 +%endmacro + +%macro LUMA_H_STORE 2 +%if mmsize == 8 + movq [r0-4], m0 + movq [r0+r1-4], m1 + movq [r0+r1*2-4], m2 + movq [r0+%2-4], m3 +%else + movq [r0-4], m0 + movhps [r0+r1-4], m0 + movq [r0+r1*2-4], m1 + movhps [%1-4], m1 + movq [%1+r1-4], m2 + movhps [%1+r1*2-4], m2 + movq [%1+%2-4], m3 + movhps [%1+r1*4-4], m3 +%endif +%endmacro + +%macro DEBLOCK_LUMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) + %assign pad 5*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define am [rsp+mmsize*3] + %define bm [rsp+mmsize*4] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, 32/mmsize + mov r2, r0 + sub r0, r1 + mova am, m4 + sub r0, r1 + mova bm, m5 + sub r0, r1 +.loop: + mova m0, [r0+r1] + mova m1, [r0+r1*2] + mova m2, [r2] + mova m3, [r2+r1] + + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + mova m5, [r0] + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova [r0+r1], m5 + + mova m5, [r2+r1*2] + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova [r2+r1], m5 + + pxor m5, m5 + mova m6, tcm + 
pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova [r0+r1*2], m1 + mova [r2], m2 + + add r0, mmsize + add r2, mmsize + add r4, mmsize/8 + dec r3 + jg .loop + ADD rsp, pad + RET + +cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) + %assign pad 7*mmsize+12-(stack_offset&15) + %define tcm [rsp] + %define ms1 [rsp+mmsize] + %define ms2 [rsp+mmsize*2] + %define p1m [rsp+mmsize*3] + %define p2m [rsp+mmsize*4] + %define am [rsp+mmsize*5] + %define bm [rsp+mmsize*6] + SUB rsp, pad + shl r2d, 2 + shl r3d, 2 + LOAD_AB m4, m5, r2, r3 + mov r3, r1 + mova am, m4 + add r3, r1 + mov r5, 32/mmsize + mova bm, m5 + add r3, r1 +%if mmsize == 16 + mov r2, r0 + add r2, r3 +%endif +.loop: +%if mmsize == 8 + movq m2, [r0-8] ; y q2 q1 q0 + movq m7, [r0+0] + movq m5, [r0+r1-8] + movq m3, [r0+r1+0] + movq m0, [r0+r1*2-8] + movq m6, [r0+r1*2+0] + movq m1, [r0+r3-8] + TRANSPOSE4x4W 2, 5, 0, 1, 4 + SWAP 2, 7 + movq m7, [r0+r3] + TRANSPOSE4x4W 2, 3, 6, 7, 4 +%else + movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m3, [r2-8] + TRANSPOSE4x4W 5, 0, 2, 3, 6 + mova tcm, m3 + + movu m4, [r2+r1-8] + movu m1, [r2+r1*2-8] + movu m3, [r2+r3-8] + movu m7, [r2+r1*4-8] + TRANSPOSE4x4W 4, 1, 3, 7, 6 + + mova m6, tcm + punpcklqdq m6, m7 + punpckhqdq m5, m4 + SBUTTERFLY qdq, 0, 1, 7 + SBUTTERFLY qdq, 2, 3, 7 +%endif + + mova p2m, m6 + LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 + LOAD_TC m6, r4 + mova tcm, m6 + + LUMA_DEBLOCK_ONE m1, m0, ms1 + mova p1m, m5 + + mova m5, p2m + LUMA_DEBLOCK_ONE m2, m3, ms2 + mova p2m, m5 + + pxor m5, m5 + mova m6, tcm + pcmpgtw m5, tcm + psubw m6, ms1 + pandn m5, m7 + psubw m6, ms2 + pand m5, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 + mova m0, p1m + mova m3, p2m + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r2, r3 + + add r4, mmsize/8 + lea r0, [r0+r1*(mmsize/2)] + lea r2, [r2+r1*(mmsize/2)] + dec r5 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +%ifdef ARCH_X86_64 +; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 +; m12=alpha, m13=beta +; out: m0=p1', m3=q1', m1=p0', m2=q0' +; clobbers: m4, m5, m6, m7, m10, m11, m14 +%macro DEBLOCK_LUMA_INTER_SSE2 0 + LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 + LOAD_TC m6, r4 + DIFF_LT m8, m1, m13, m10, m4 + DIFF_LT m9, m2, m13, m11, m4 + pand m6, m7 + + mova m14, m6 + pxor m4, m4 + pcmpgtw m6, m4 + pand m6, m14 + + mova m5, m10 + pand m5, m6 + LUMA_Q1 m8, m0, m1, m2, m5, m4 + + mova m5, m11 + pand m5, m6 + LUMA_Q1 m9, m3, m1, m2, m5, m4 + + pxor m4, m4 + psubw m6, m10 + pcmpgtw m4, m14 + pandn m4, m7 + psubw m6, m11 + pand m4, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 + + SWAP 0, 8 + SWAP 3, 9 +%endmacro + +%macro DEBLOCK_LUMA_64 1 +cglobal deblock_v_luma_10_%1, 5,5,15 + %define p2 m8 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define q2 m9 + %define mask0 m7 + %define mask1 m10 + %define mask2 m11 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r0 + sub r0, r1 + sub r0, r1 + sub r0, r1 + mov r3, 2 +.loop: + mova p2, [r0] + mova p1, [r0+r1] + mova p0, [r0+r1*2] + mova q0, [r2] + mova q1, [r2+r1] + mova q2, [r2+r1*2] + DEBLOCK_LUMA_INTER_SSE2 + mova [r0+r1], p1 + mova [r0+r1*2], p0 + mova [r2], q0 + mova [r2+r1], q1 + add r0, mmsize + add r2, mmsize + add r4, 2 + dec r3 + jg .loop + REP_RET + +cglobal deblock_h_luma_10_%1, 5,7,15 + shl r2d, 2 + shl r3d, 2 + LOAD_AB m12, m13, r2, r3 + mov r2, r1 + add r2, r1 + add r2, r1 + mov r5, r0 + add r5, r2 + mov r6, 2 +.loop: + movu m8, [r0-8] ; y q2 q1 q0 
p0 p1 p2 x + movu m0, [r0+r1-8] + movu m2, [r0+r1*2-8] + movu m9, [r5-8] + movu m5, [r5+r1-8] + movu m1, [r5+r1*2-8] + movu m3, [r5+r2-8] + movu m7, [r5+r1*4-8] + + TRANSPOSE4x4W 8, 0, 2, 9, 10 + TRANSPOSE4x4W 5, 1, 3, 7, 10 + + punpckhqdq m8, m5 + SBUTTERFLY qdq, 0, 1, 10 + SBUTTERFLY qdq, 2, 3, 10 + punpcklqdq m9, m7 + + DEBLOCK_LUMA_INTER_SSE2 + + TRANSPOSE4x4W 0, 1, 2, 3, 4 + LUMA_H_STORE r5, r2 + add r4, 2 + lea r0, [r0+r1*8] + lea r5, [r5+r1*8] + dec r6 + jg .loop + REP_RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_64 avx +%endif + +%macro SWAPMOVA 2 +%ifid %1 + SWAP %1, %2 +%else + mova %1, %2 +%endif +%endmacro + +; in: t0-t2: tmp registers +; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 +; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' +%macro LUMA_INTRA_P012 12 ; p0..p3 in memory +%ifdef ARCH_X86_64 + paddw t0, %3, %2 + mova t2, %4 + paddw t2, %3 +%else + mova t0, %3 + mova t2, %4 + paddw t0, %2 + paddw t2, %3 +%endif + paddw t0, %1 + paddw t2, t2 + paddw t0, %5 + paddw t2, %9 + paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) + paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) + + psrlw t2, 3 + psrlw t1, t0, 2 + psubw t2, %3 + psubw t1, %2 + pand t2, %8 + pand t1, %8 + paddw t2, %3 + paddw t1, %2 + SWAPMOVA %11, t1 + + psubw t1, t0, %3 + paddw t0, t0 + psubw t1, %5 + psubw t0, %3 + paddw t1, %6 + paddw t1, %2 + paddw t0, %6 + psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 + psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 + + pxor t0, t1 + pxor t1, %1 + pand t0, %8 + pand t1, %7 + pxor t0, t1 + pxor t0, %1 + SWAPMOVA %10, t0 + SWAPMOVA %12, t2 +%endmacro + +%macro LUMA_INTRA_INIT 1 + %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 + %assign i 4 +%rep %1 + CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] + %assign i i+1 +%endrep + SUB rsp, pad +%endmacro + +; in: %1-%3=tmp, %4=p2, %5=q2 +%macro LUMA_INTRA_INTER 5 + LOAD_AB t0, t1, r2d, r3d + mova %1, t0 + LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 +%ifdef ARCH_X86_64 + mova %2, t0 ; mask0 + psrlw t3, %1, 2 +%else + mova t3, %1 + mova %2, t0 ; mask0 + psrlw t3, 2 +%endif + paddw t3, [pw_2] ; alpha/4+2 + DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 + pand t2, %2 + mova t3, %5 ; q2 + mova %1, t2 ; mask1 + DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta + pand t2, %1 + mova t3, %4 ; p2 + mova %3, t2 ; mask1q + DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta + pand t2, %1 + mova %1, t2 ; mask1p +%endmacro + +%macro LUMA_H_INTRA_LOAD 0 +%if mmsize == 8 + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r4-8] + TRANSPOSE4x4W 4, 5, 0, 1, 2 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + + movu m2, [r0] + movu m3, [r0+r1] + movu t0, [r0+r1*2] + movu t1, [r0+r4] + TRANSPOSE4x4W 2, 3, 4, 5, 6 + mova t6, t0 ; q2 + mova t7, t1 ; q3 +%else + movu t0, [r0-8] + movu t1, [r0+r1-8] + movu m0, [r0+r1*2-8] + movu m1, [r0+r5-8] + movu m2, [r4-8] + movu m3, [r4+r1-8] + movu t2, [r4+r1*2-8] + movu t3, [r4+r5-8] + TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 + mova t4, t0 ; p3 + mova t5, t1 ; p2 + mova t6, t2 ; q2 + mova t7, t3 ; q3 +%endif +%endmacro + +; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp +%macro LUMA_H_INTRA_STORE 9 +%if mmsize == 8 + TRANSPOSE4x4W %1, %2, %3, %4, %9 + movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r4-8], m%4 + movq m%1, %8 + TRANSPOSE4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r4], m%1 +%else + TRANSPOSE2x4x4W %1, %2, %3, %4, %9 + 
movq [r0-8], m%1 + movq [r0+r1-8], m%2 + movq [r0+r1*2-8], m%3 + movq [r0+r5-8], m%4 + movhps [r4-8], m%1 + movhps [r4+r1-8], m%2 + movhps [r4+r1*2-8], m%3 + movhps [r4+r5-8], m%4 +%ifnum %8 + SWAP %1, %8 +%else + mova m%1, %8 +%endif + TRANSPOSE2x4x4W %5, %6, %7, %1, %9 + movq [r0], m%5 + movq [r0+r1], m%6 + movq [r0+r1*2], m%7 + movq [r0+r5], m%1 + movhps [r4], m%5 + movhps [r4+r1], m%6 + movhps [r4+r1*2], m%7 + movhps [r4+r5], m%1 +%endif +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +%macro DEBLOCK_LUMA_INTRA_64 1 +cglobal deblock_v_luma_intra_10_%1, 4,7,16 + %define t0 m1 + %define t1 m2 + %define t2 m4 + %define p2 m8 + %define p1 m9 + %define p0 m10 + %define q0 m11 + %define q1 m12 + %define q2 m13 + %define aa m5 + %define bb m14 + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + neg r4 + add r4, r0 ; pix-4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 + LOAD_AB aa, bb, r2d, r3d +.loop + mova p2, [r4+r1] + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] + mova q2, [r0+2*r1] + + LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 + mova t2, aa + psrlw t2, 2 + paddw t2, m0 ; alpha/4+2 + DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta + DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] + LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + REP_RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,16 + %define t0 m15 + %define t1 m14 + %define t2 m2 + %define q3 m5 + %define q2 m8 + %define q1 m9 + %define q0 m10 + %define p0 m11 + %define p1 m12 + %define p2 m13 + %define p3 m4 + %define spill [rsp] + %assign pad 24-(stack_offset&15) + SUB rsp, pad + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 2 + mova m0, [pw_2] + shl r2d, 2 + shl r3d, 2 +.loop + movu q3, [r0-8] + movu q2, [r0+r1-8] + movu q1, [r0+r1*2-8] + movu q0, [r0+r5-8] + movu p0, [r4-8] + movu p1, [r4+r1-8] + movu p2, [r4+r1*2-8] + movu p3, [r4+r5-8] + TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 + + LOAD_AB m1, m2, r2d, r3d + LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 + psrlw m1, 2 + paddw m1, m0 ; alpha/4+2 + DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 + DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta + DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta + pand m6, m3 + pand m7, m6 + pand m6, t1 + + mova spill, q3 + LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 + LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 + mova m7, spill + + LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 + + lea r0, [r0+r1*8] + lea r4, [r4+r1*8] + dec r6 + jg .loop + ADD rsp, pad + RET +%endmacro + +INIT_XMM +DEBLOCK_LUMA_INTRA_64 sse2 +INIT_AVX +DEBLOCK_LUMA_INTRA_64 avx + +%endif + +%macro DEBLOCK_LUMA_INTRA 1 +;----------------------------------------------------------------------------- +; void deblock_v_luma_intra( uint16_t 
*pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 3 + lea r4, [r1*4] + lea r5, [r1*3] + neg r4 + add r4, r0 + mov r6, 32/mmsize + shl r2d, 2 + shl r3d, 2 +.loop: + mova m0, [r4+r1*2] ; p1 + mova m1, [r4+r5] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] + LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] + mova t3, [r0+r1*2] ; q2 + LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] + add r0, mmsize + add r4, mmsize + dec r6 + jg .loop + ADD rsp, pad + RET + +;----------------------------------------------------------------------------- +; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) + LUMA_INTRA_INIT 8 +%if mmsize == 8 + lea r4, [r1*3] + mov r5, 32/mmsize +%else + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + add r4, r0 ; pix+4*stride + mov r6, 32/mmsize +%endif + shl r2d, 2 + shl r3d, 2 +.loop: + LUMA_H_INTRA_LOAD + LUMA_INTRA_INTER t8, t9, t10, t5, t6 + + LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 + mova t3, t6 ; q2 + LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 + + mova m2, t4 + mova m0, t11 + mova m1, t5 + mova m3, t8 + mova m6, t6 + + LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 + + lea r0, [r0+r1*(mmsize/2)] +%if mmsize == 8 + dec r5 +%else + lea r4, [r4+r1*(mmsize/2)] + dec r6 +%endif + jg .loop + ADD rsp, pad + RET +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_LUMA mmxext +DEBLOCK_LUMA_INTRA mmxext +INIT_XMM +DEBLOCK_LUMA sse2 +DEBLOCK_LUMA_INTRA sse2 +INIT_AVX +DEBLOCK_LUMA avx +DEBLOCK_LUMA_INTRA avx +%endif + +; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp +; out: %1=p0', %2=q0' +%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 + mova %6, [pw_2] + paddw %6, %3 + paddw %6, %4 + paddw %7, %6, %2 + paddw %6, %1 + paddw %6, %3 + paddw %7, %4 + psraw %6, 2 + psraw %7, 2 + psubw %6, %1 + psubw %7, %2 + pand %6, %5 + pand %7, %5 + paddw %1, %6 + paddw %2, %7 +%endmacro + +%macro CHROMA_V_LOAD 1 + mova m0, [r0] ; p1 + mova m1, [r0+r1] ; p0 + mova m2, [%1] ; q0 + mova m3, [%1+r1] ; q1 +%endmacro + +%macro CHROMA_V_STORE 0 + mova [r0+1*r1], m1 + mova [r0+2*r1], m2 +%endmacro + +%macro DEBLOCK_CHROMA 1 +;----------------------------------------------------------------------------- +; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) + mov r5, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r6, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r5 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + pxor m4, m4 + LOAD_TC m6, r4 + psubw m6, [pw_3] + pmaxsw m6, m4 + pand m7, m6 + DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r5, mmsize + add r4, mmsize/8 + dec r6 + jg .loop + REP_RET +%else + RET +%endif + +;----------------------------------------------------------------------------- +; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- 
+cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) + mov r4, r0 + sub r0, r1 + sub r0, r1 + shl r2d, 2 + shl r3d, 2 +%if mmsize < 16 + mov r5, 16/mmsize +.loop: +%endif + CHROMA_V_LOAD r4 + LOAD_AB m4, m5, r2, r3 + LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 + CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 + CHROMA_V_STORE +%if mmsize < 16 + add r0, mmsize + add r4, mmsize + dec r5 + jg .loop + REP_RET +%else + RET +%endif +%endmacro + +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_CHROMA mmxext +%endif +INIT_XMM +DEBLOCK_CHROMA sse2 +INIT_AVX +DEBLOCK_CHROMA avx diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 7657a858908..b331f94b5e7 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -218,41 +218,57 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] ); } -#define LF_FUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta, int8_t *tc0); -#define LF_IFUNC(DIR, TYPE, OPT) \ -void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ - int alpha, int beta); - -LF_FUNC (h, chroma, mmxext) -LF_IFUNC(h, chroma_intra, mmxext) -LF_FUNC (v, chroma, mmxext) -LF_IFUNC(v, chroma_intra, mmxext) - -LF_FUNC (h, luma, mmxext) -LF_IFUNC(h, luma_intra, mmxext) -#if HAVE_YASM && ARCH_X86_32 -LF_FUNC (v8, luma, mmxext) -static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); +#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); + +#define LF_FUNCS(type, depth)\ +LF_FUNC (h, chroma, depth, mmxext)\ +LF_IFUNC(h, chroma_intra, depth, mmxext)\ +LF_FUNC (v, chroma, depth, mmxext)\ +LF_IFUNC(v, chroma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, mmxext)\ +LF_IFUNC(h, luma_intra, depth, mmxext)\ +LF_FUNC (h, luma, depth, sse2)\ +LF_IFUNC(h, luma_intra, depth, sse2)\ +LF_FUNC (v, luma, depth, sse2)\ +LF_IFUNC(v, luma_intra, depth, sse2)\ +LF_FUNC (h, chroma, depth, sse2)\ +LF_IFUNC(h, chroma_intra, depth, sse2)\ +LF_FUNC (v, chroma, depth, sse2)\ +LF_IFUNC(v, chroma_intra, depth, sse2)\ +LF_FUNC (h, luma, depth, avx)\ +LF_IFUNC(h, luma_intra, depth, avx)\ +LF_FUNC (v, luma, depth, avx)\ +LF_IFUNC(v, luma_intra, depth, avx)\ +LF_FUNC (h, chroma, depth, avx)\ +LF_IFUNC(h, chroma_intra, depth, avx)\ +LF_FUNC (v, chroma, depth, avx)\ +LF_IFUNC(v, chroma_intra, depth, avx) + +LF_FUNCS( uint8_t, 8) +LF_FUNCS(uint16_t, 10) + +LF_FUNC (v8, luma, 8, mmxext) +static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2); } -LF_IFUNC(v8, luma_intra, mmxext) -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +LF_IFUNC(v8, luma_intra, 8, mmxext) +static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - 
ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta); } -#endif -LF_FUNC (h, luma, sse2) -LF_IFUNC(h, luma_intra, sse2) -LF_FUNC (v, luma, sse2) -LF_IFUNC(v, luma_intra, sse2) +LF_FUNC (v, luma, 10, mmxext) +LF_IFUNC(v, luma_intra, 10, mmxext) /***********************************/ /* weighted prediction */ @@ -314,15 +330,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; - c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext; + c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; - c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -360,10 +376,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; #if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; #endif c->h264_idct_add16 = ff_h264_idct_add16_sse2; @@ -377,6 +393,49 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; } + if (mm_flags&AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; +#endif + } + } + } +#endif + } else if (bit_depth == 10) { +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX2) { +#if ARCH_X86_32 + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext; + 
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
+#endif
+            if (mm_flags&AV_CPU_FLAG_SSE2) {
+                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
+                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif
+            }
+            if (mm_flags&AV_CPU_FLAG_AVX) {
+                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
+                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
+#if HAVE_ALIGNED_STACK
+                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
+                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
+#endif
+            }
         }
     }
 #endif
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index b28a6198f70..a96b0655092 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -24,16 +24,20 @@
 ;******************************************************************************

 %macro SBUTTERFLY 4
+%if avx_enabled == 0
     mova m%4, m%2
     punpckl%1 m%2, m%3
     punpckh%1 m%4, m%3
+%else
+    punpckh%1 m%4, m%2, m%3
+    punpckl%1 m%2, m%3
+%endif
     SWAP %3, %4
 %endmacro

 %macro SBUTTERFLY2 4
-    mova m%4, m%2
-    punpckh%1 m%2, m%3
-    punpckl%1 m%4, m%3
+    punpckl%1 m%4, m%2, m%3
+    punpckh%1 m%2, m%2, m%3
     SWAP %2, %4, %3
 %endmacro

@@ -444,3 +448,17 @@
 %macro PMINUB_MMXEXT 3 ; dst, src, ignored
     pminub %1, %2
 %endmacro
+
+%macro SPLATW 2-3 0
+%if mmsize == 16
+    pshuflw %1, %2, (%3)*0x55
+    punpcklqdq %1, %1
+%else
+    pshufw %1, %2, (%3)*0x55
+%endif
+%endmacro
+
+%macro CLIPW 3 ;(dst, min, max)
+    pmaxsw %1, %2
+    pminsw %1, %3
+%endmacro
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index d03fe19ad20..f59b7c2ed2c 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -524,6 +524,7 @@ static const StreamType MISC_types[] = {
 static const StreamType REGD_types[] = {
     { MKTAG('d','r','a','c'), AVMEDIA_TYPE_VIDEO, CODEC_ID_DIRAC },
     { MKTAG('A','C','-','3'), AVMEDIA_TYPE_AUDIO, CODEC_ID_AC3 },
+    { MKTAG('B','S','S','D'), AVMEDIA_TYPE_AUDIO, CODEC_ID_S302M },
     { 0 },
 };
diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index b6ed0c651bc..3fdf494ad84 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -808,6 +808,10 @@ void ff_rtsp_parse_line(RTSPMessageHeader *reply, const char *buf,
         p += strspn(p, SPACE_CHARS);
         if (method && !strcmp(method, "PLAY"))
             rtsp_parse_rtp_info(rt, p);
+    } else if (av_stristart(p, "Public:", &p) && rt) {
+        if (strstr(p, "GET_PARAMETER") &&
+            method && !strcmp(method, "OPTIONS"))
+            rt->get_parameter_supported = 1;
     }
 }
diff --git a/libavformat/rtsp.h b/libavformat/rtsp.h
index 0fec3cc9919..56160cefc2f 100644
--- a/libavformat/rtsp.h
+++ b/libavformat/rtsp.h
@@ -331,6 +331,11 @@ typedef struct RTSPState {
      * Polling array for udp
      */
     struct pollfd *p;
+
+    /**
+     * Whether the server supports the GET_PARAMETER method.
+     */
+    int get_parameter_supported;
 } RTSPState;

 /**
diff --git a/libavformat/rtspdec.c b/libavformat/rtspdec.c
index 5833a5209ac..454a31c3f92 100644
--- a/libavformat/rtspdec.c
+++ b/libavformat/rtspdec.c
@@ -341,7 +341,9 @@ retry:
         /* send dummy request to keep TCP connection alive */
         if ((av_gettime() - rt->last_cmd_time) / 1000000 >= rt->timeout / 2) {
-            if (rt->server_type != RTSP_SERVER_REAL) {
+            if (rt->server_type == RTSP_SERVER_WMS ||
+                (rt->server_type != RTSP_SERVER_REAL &&
+                 rt->get_parameter_supported)) {
                 ff_rtsp_send_cmd_async(s, "GET_PARAMETER", rt->control_uri, NULL);
             } else {
                 ff_rtsp_send_cmd_async(s, "OPTIONS", "*", NULL);
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 8c3cc60dabf..1386ebb190e 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -75,7 +75,7 @@ OBJS-$(ARCH_ARM) += arm/cpu.o
 OBJS-$(ARCH_PPC) += ppc/cpu.o
 OBJS-$(ARCH_X86) += x86/cpu.o

-TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha softfloat tree
+TESTPROGS = adler32 aes base64 cpu crc des lls md5 pca sha tree
 TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo

 DIRS = arm bfin sh4 x86
diff --git a/tests/rotozoom.c b/tests/rotozoom.c
index 47da1b05262..822c2bce5e0 100644
--- a/tests/rotozoom.c
+++ b/tests/rotozoom.c
@@ -24,47 +24,52 @@
 #include
 #include

-#define FIXP (1<<16)
-#define MY_PI 205887 //(M_PI*FIX)
+#define FIXP (1 << 16)
+#define MY_PI 205887 //(M_PI * FIX)

-static int64_t int_pow(int64_t a, int p){
-    int64_t v= FIXP;
+static int64_t int_pow(int64_t a, int p)
+{
+    int64_t v = FIXP;

-    for(; p; p--){
-        v*= a;
-        v/= FIXP;
+    for (; p; p--) {
+        v *= a;
+        v /= FIXP;
     }
     return v;
 }

-static int64_t int_sin(int64_t a){
-    if(a<0) a= MY_PI-a; // 0..inf
-    a %= 2*MY_PI; // 0..2PI
+static int64_t int_sin(int64_t a)
+{
+    if (a < 0)
+        a = MY_PI - a; // 0..inf
+    a %= 2 * MY_PI; // 0..2PI

-    if(a>=MY_PI*3/2) a -= 2*MY_PI; // -PI/2 .. 3PI/2
-    if(a>=MY_PI/2 ) a = MY_PI - a; // -PI/2 .. PI/2
+    if (a >= MY_PI * 3 / 2)
+        a -= 2 * MY_PI; // -PI / 2 .. 3PI / 2
+    if (a >= MY_PI /2)
+        a = MY_PI - a; // -PI / 2 ..
PI / 2 - return a - int_pow(a, 3)/6 + int_pow(a, 5)/120 - int_pow(a, 7)/5040; + return a - int_pow(a, 3) / 6 + int_pow(a, 5) / 120 - int_pow(a, 7) / 5040; } #define SCALEBITS 8 #define ONE_HALF (1 << (SCALEBITS - 1)) -#define FIX(x) ((int) ((x) * (1L<> SCALEBITS; - p += wrap3; + p += wrap3; lum += wrap; r = p[0]; @@ -104,14 +109,14 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, cb[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; cr[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - - FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; + FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 128; cb++; cr++; - p += -wrap3 + 2 * 3; - lum += -wrap + 2; + p += -wrap3 + 2 * 3; + lum += -wrap + 2; } - p += wrap3; + p += wrap3; lum += wrap; } } @@ -119,7 +124,7 @@ static void rgb24_to_yuv420p(UINT8 *lum, UINT8 *cb, UINT8 *cr, /* cif format */ #define DEFAULT_WIDTH 352 #define DEFAULT_HEIGHT 288 -#define DEFAULT_NB_PICT 50 +#define DEFAULT_NB_PICT 50 static void pgmyuv_save(const char *filename, int w, int h, unsigned char *rgb_tab) @@ -130,19 +135,19 @@ static void pgmyuv_save(const char *filename, int w, int h, unsigned char *lum_tab, *cb_tab, *cr_tab; lum_tab = malloc(w * h); - cb_tab = malloc((w * h) / 4); - cr_tab = malloc((w * h) / 4); + cb_tab = malloc(w * h / 4); + cr_tab = malloc(w * h / 4); rgb24_to_yuv420p(lum_tab, cb_tab, cr_tab, rgb_tab, w, h); - f = fopen(filename,"wb"); - fprintf(f, "P5\n%d %d\n%d\n", w, (h * 3) / 2, 255); + f = fopen(filename, "wb"); + fprintf(f, "P5\n%d %d\n%d\n", w, h * 3 / 2, 255); fwrite(lum_tab, 1, w * h, f); h2 = h / 2; w2 = w / 2; cb = cb_tab; cr = cr_tab; - for(i=0;i>16; - int int_y= y>>16; - int frac_x= x&0xFFFF; - int frac_y= y&0xFFFF; - int s00= src[ ( int_x &255) + 256*( int_y &255) ]; - int s01= src[ ((int_x+1)&255) + 256*( int_y &255) ]; - int s10= src[ ( int_x &255) + 256*((int_y+1)&255) ]; - int s11= src[ ((int_x+1)&255) + 256*((int_y+1)&255) ]; - int s0= (((1<<16) - frac_x)*s00 + frac_x*s01)>>8; - int s1= (((1<<16) - frac_x)*s10 + frac_x*s11)>>8; - - return (((1<<16) - frac_y)*s0 + frac_y*s1)>>24; +static int ipol(uint8_t *src, int x, int y) +{ + int int_x = x >> 16; + int int_y = y >> 16; + int frac_x = x & 0xFFFF; + int frac_y = y & 0xFFFF; + int s00 = src[( int_x & 255) + 256 * ( int_y & 255)]; + int s01 = src[((int_x + 1) & 255) + 256 * ( int_y & 255)]; + int s10 = src[( int_x & 255) + 256 * ((int_y + 1) & 255)]; + int s11 = src[((int_x + 1) & 255) + 256 * ((int_y + 1) & 255)]; + int s0 = (((1 << 16) - frac_x) * s00 + frac_x * s01) >> 8; + int s1 = (((1 << 16) - frac_x) * s10 + frac_x * s11) >> 8; + + return (((1 << 16) - frac_y) * s0 + frac_y * s1) >> 24; } static void gen_image(int num, int w, int h) { - const int c = h_cos [num % 360]; - const int s = h_sin [num % 360]; + const int c = h_cos [num % 360]; + const int s = h_sin [num % 360]; - const int xi = -(w/2) * c; - const int yi = (w/2) * s; + const int xi = -(w / 2) * c; + const int yi = (w / 2) * s; - const int xj = -(h/2) * s; - const int yj = -(h/2) * c; - int i,j; + const int xj = -(h / 2) * s; + const int yj = -(h / 2) * c; + int i, j; - int x,y; - int xprime = xj; - int yprime = yj; + int x, y; + int xprime = xj; + int yprime = yj; + for (j = 0; j < h; j++) { + x = xprime + xi + FIXP * w / 2; + xprime += s; - for (j=0;j>16)&255) + (((y>>16)&255)<<8); - put_pixel(i, j, tab_r[dep], tab_g[dep], tab_b[dep]); - } -#endif + for (i = 0; i < w; i++ ) { + x += c; + y -= s; + put_pixel(i, j, 
ipol(tab_r, x, y), ipol(tab_g, x, y), ipol(tab_b, x, y)); + } } - } } #define W 256 #define H 256 -static void init_demo(const char *filename) { - int i,j; - int h; - int radian; - char line[3 * W]; - - FILE *fichier; - - fichier = fopen(filename,"rb"); - if (!fichier) { - perror(filename); - exit(1); - } - - fread(line, 1, 15, fichier); - for (i=0;i