From: Michael Niedermayer <michaelni@gmx.at>
Date: Wed, 9 Jul 2014 22:56:05 +0000 (+0200)
Subject: Merge commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e'
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=2d5e9451de3c7ab00cac6ec4aff290e12a2f190d;p=ffmpeg

Merge commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e'

* commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e':
  dsputil: Split off pixel block routines into their own context

Conflicts:
	configure
	libavcodec/dsputil.c
	libavcodec/mpegvideo_enc.c
	libavcodec/pixblockdsp_template.c
	libavcodec/x86/dsputilenc.asm
	libavcodec/x86/dsputilenc_mmx.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
---

2d5e9451de3c7ab00cac6ec4aff290e12a2f190d
diff --cc configure
index 4691c280a1d,7a29e82adf6..632ba44c7e2
--- a/configure
+++ b/configure
@@@ -1997,17 -1707,16 +1998,17 @@@ threads_if_any="$THREADS_LIST
  
  # subsystems
  dct_select="rdft"
- dsputil_select="fdctdsp idctdsp"
+ dsputil_select="fdctdsp idctdsp pixblockdsp"
  error_resilience_select="dsputil"
 +frame_thread_encoder_deps="encoders threads"
  intrax8_select="error_resilience"
  mdct_select="fft"
  rdft_select="fft"
  mpeg_er_select="error_resilience"
  mpegaudio_select="mpegaudiodsp"
  mpegaudiodsp_select="dct"
 -mpegvideo_select="blockdsp dsputil hpeldsp idctdsp videodsp"
 +mpegvideo_select="blockdsp dsputil h264chroma hpeldsp idctdsp videodsp"
- mpegvideoenc_select="dsputil mpegvideo qpeldsp"
+ mpegvideoenc_select="dsputil mpegvideo pixblockdsp qpeldsp"
  
  # decoders / encoders
  aac_decoder_select="mdct sinewin"
@@@ -2022,13 -1730,12 +2023,13 @@@ alac_encoder_select="lpc
  als_decoder_select="bswapdsp"
  amrnb_decoder_select="lsp"
  amrwb_decoder_select="lsp"
 -amv_decoder_select="sp5x_decoder"
 -ape_decoder_select="bswapdsp"
 +amv_decoder_select="sp5x_decoder exif"
 +amv_encoder_select="aandcttables mpegvideoenc"
 +ape_decoder_select="bswapdsp llauddsp"
  asv1_decoder_select="blockdsp bswapdsp idctdsp"
- asv1_encoder_select="bswapdsp dsputil fdctdsp"
+ asv1_encoder_select="bswapdsp fdctdsp pixblockdsp"
  asv2_decoder_select="blockdsp bswapdsp idctdsp"
- asv2_encoder_select="bswapdsp dsputil fdctdsp"
+ asv2_encoder_select="bswapdsp fdctdsp pixblockdsp"
  atrac1_decoder_select="mdct sinewin"
  atrac3_decoder_select="mdct"
  atrac3p_decoder_select="mdct sinewin"
@@@ -2043,12 -1749,11 +2044,12 @@@ cook_decoder_select="audiodsp mdct sine
  cscd_decoder_select="lzo"
  cscd_decoder_suggest="zlib"
  dca_decoder_select="mdct"
 +dirac_decoder_select="dsputil dwt golomb videodsp"
  dnxhd_decoder_select="blockdsp idctdsp"
- dnxhd_encoder_select="aandcttables blockdsp dsputil fdctdsp idctdsp mpegvideoenc"
+ dnxhd_encoder_select="aandcttables blockdsp fdctdsp idctdsp mpegvideoenc pixblockdsp"
  dvvideo_decoder_select="dvprofile idctdsp"
- dvvideo_encoder_select="dsputil dvprofile fdctdsp"
+ dvvideo_encoder_select="dsputil dvprofile fdctdsp pixblockdsp"
 -dxa_decoder_deps="zlib"
 +dxa_decoder_select="zlib"
  eac3_decoder_select="ac3_decoder"
  eac3_encoder_select="ac3_encoder"
  eamad_decoder_select="aandcttables blockdsp bswapdsp idctdsp mpegvideo"
diff --cc libavcodec/arm/Makefile
index fbbd0696b71,9ba6c2010ad..6b80de8a2bc
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@@ -63,10 -63,9 +64,11 @@@ ARMV6-OBJS-$(CONFIG_IDCTDSP)           
                                            arm/simple_idct_armv6.o
  ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
  ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP)       += arm/pixblockdsp_armv6.o
  
  ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
 +ARMV6-OBJS-$(CONFIG_VC1_DECODER)       += arm/startcode_armv6.o
 +ARMV6-OBJS-$(CONFIG_VC1_PARSER)        += arm/startcode_armv6.o
  ARMV6-OBJS-$(CONFIG_VP7_DECODER)       += arm/vp8_armv6.o               \
                                            arm/vp8dsp_init_armv6.o       \
                                            arm/vp8dsp_armv6.o
diff --cc libavcodec/arm/pixblockdsp_armv6.S
index 00000000000,4c925a4daa7..b10ea78e882
mode 000000,100644..100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@@ -1,0 -1,76 +1,76 @@@
+ /*
+  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+  *
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #include "libavutil/arm/asm.S"
+ 
+ function ff_get_pixels_armv6, export=1 /* widen 8 rows of 8 bytes to 16-bit; per the C prototype r0 = block, r1 = pixels, r2 = stride (presumably AAPCS order) */
+         pld             [r1, r2] /* preload hint for address r1 + r2 */
+         push            {r4-r8, lr} /* save registers clobbered below; lr is reused as the loop counter */
+         mov             lr,  #8 /* eight iterations, each storing four words */
+ 1:
+         ldrd_post       r4,  r5,  r1,  r2 /* macro from asm.S; presumably loads r4/r5 from [r1], then adds r2 to r1 -- confirm */
+         subs            lr,  lr,  #1 /* sets the flags consumed by bgt; nothing in between updates them */
+         uxtb16          r6,  r4 /* r6 = bytes 0 and 2 of r4, each zero-extended to 16 bits */
+         uxtb16          r4,  r4,  ror #8 /* r4 = bytes 1 and 3 of r4, zero-extended */
+         uxtb16          r12, r5 /* r12 = bytes 0 and 2 of r5, zero-extended */
+         uxtb16          r8,  r5,  ror #8 /* r8 = bytes 1 and 3 of r5, zero-extended */
+         pld             [r1, r2] /* preload hint one stride past the already advanced r1 */
+         pkhbt           r5,  r6,  r4,  lsl #16 /* r5 = byte 0 (low half), byte 1 (high half) of the first word */
+         pkhtb           r6,  r4,  r6,  asr #16 /* r6 = byte 2 (low half), byte 3 (high half) of the first word */
+         pkhbt           r7,  r12, r8,  lsl #16 /* r7 = byte 0 (low half), byte 1 (high half) of the second word */
+         pkhtb           r12, r8,  r12, asr #16 /* r12 = byte 2 (low half), byte 3 (high half) of the second word */
+         stm             r0!, {r5,r6,r7,r12} /* store eight 16-bit values and advance r0 by 16 bytes */
+         bgt             1b /* repeat while the counter is still above zero */
+ 
+         pop             {r4-r8, pc} /* restore registers; loading pc returns to the caller */
+ endfunc
+ 
+ function ff_diff_pixels_armv6, export=1 /* 16-bit differences of 8 rows of 8 bytes; per the C prototype r0 = block, r1 = s1, r2 = s2, r3 = stride (presumably AAPCS order) */
+         pld             [r1, r3] /* preload hint for s1 + stride */
+         pld             [r2, r3] /* preload hint for s2 + stride */
+         push            {r4-r9, lr} /* save registers clobbered below; lr is reused as the loop counter */
+         mov             lr,  #8 /* eight iterations, each storing four words */
+ 1:
+         ldrd_post       r4,  r5,  r1,  r3 /* macro from asm.S; presumably loads r4/r5 from [r1], then adds r3 to r1 -- confirm */
+         ldrd_post       r6,  r7,  r2,  r3 /* same for the second source: r6/r7 from [r2], r2 advanced by r3 */
+         uxtb16          r8,  r4 /* r8 = s1 bytes 0 and 2, zero-extended to 16 bits */
+         uxtb16          r4,  r4,  ror #8 /* r4 = s1 bytes 1 and 3 */
+         uxtb16          r9,  r6 /* r9 = s2 bytes 0 and 2 */
+         uxtb16          r6,  r6,  ror #8 /* r6 = s2 bytes 1 and 3 */
+         pld             [r1, r3] /* preload hint one stride past the advanced s1 pointer */
+         ssub16          r9,  r8,  r9 /* r9 = s1 - s2 for bytes 0 and 2 (two 16-bit lanes) */
+         ssub16          r6,  r4,  r6 /* r6 = s1 - s2 for bytes 1 and 3 */
+         uxtb16          r8,  r5 /* r8 = s1 bytes 4 and 6 (bytes 0 and 2 of the second word) */
+         uxtb16          r5,  r5,  ror #8 /* r5 = s1 bytes 5 and 7 */
+         pld             [r2, r3] /* preload hint one stride past the advanced s2 pointer */
+         pkhbt           r4,  r9,  r6,  lsl #16 /* r4 = difference 0 (low half), difference 1 (high half) */
+         pkhtb           r6,  r6,  r9,  asr #16 /* r6 = difference 2 (low half), difference 3 (high half) */
+         uxtb16          r9,  r7 /* r9 = s2 bytes 4 and 6 */
+         uxtb16          r7,  r7,  ror #8 /* r7 = s2 bytes 5 and 7 */
+         ssub16          r9,  r8,  r9 /* r9 = s1 - s2 for bytes 4 and 6 */
+         ssub16          r5,  r5,  r7 /* r5 = s1 - s2 for bytes 5 and 7 */
+         subs            lr,  lr,  #1 /* sets the flags consumed by bgt; pkh and stm below leave them alone */
+         pkhbt           r8,  r9,  r5,  lsl #16 /* r8 = difference 4 (low half), difference 5 (high half) */
+         pkhtb           r9,  r5,  r9,  asr #16 /* r9 = difference 6 (low half), difference 7 (high half) */
+         stm             r0!, {r4,r6,r8,r9} /* store eight 16-bit differences and advance r0 by 16 bytes */
+         bgt             1b /* repeat while the counter is still above zero */
+ 
+         pop             {r4-r9, pc} /* restore registers; loading pc returns to the caller */
+ endfunc
diff --cc libavcodec/arm/pixblockdsp_init_arm.c
index 00000000000,f20769b3bc5..b77c523a6e6
mode 000000,100644..100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@@ -1,0 -1,42 +1,42 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #include <stdint.h>
+ 
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/arm/cpu.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/pixblockdsp.h"
+ 
+ void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
+ void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
+                           const uint8_t *s2, int stride);
+ 
+ av_cold void ff_pixblockdsp_init_arm(PixblockDSPContext *c, /* context whose function pointers may be overridden */
+                                      AVCodecContext *avctx, /* not read by this function */
+                                      unsigned high_bit_depth) /* nonzero disables the get_pixels override */
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (have_armv6(cpu_flags)) { /* install the ARMv6 routines only when the CPU flags report ARMv6 */
+         if (!high_bit_depth)
+             c->get_pixels = ff_get_pixels_armv6;
+         c->diff_pixels = ff_diff_pixels_armv6; /* assigned regardless of high_bit_depth */
+     }
+ }
diff --cc libavcodec/asvenc.c
index ae81953f308,9944ffaa7c4..02cf2db9913
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@@ -281,11 -241,17 +281,11 @@@ static av_cold int encode_init(AVCodecC
      int i;
      const int scale= avctx->codec_id == AV_CODEC_ID_ASV1 ? 1 : 2;
  
 -    avctx->coded_frame = av_frame_alloc();
 -    if (!avctx->coded_frame)
 -        return AVERROR(ENOMEM);
 -    avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 -    avctx->coded_frame->key_frame = 1;
 -
      ff_asv_common_init(avctx);
-     ff_dsputil_init(&a->dsp, avctx);
      ff_fdctdsp_init(&a->fdsp, avctx);
+     ff_pixblockdsp_init(&a->pdsp, avctx);
  
 -    if(avctx->global_quality == 0) avctx->global_quality= 4*FF_QUALITY_SCALE;
 +    if(avctx->global_quality <= 0) avctx->global_quality= 4*FF_QUALITY_SCALE;
  
      a->inv_qscale= (32*scale*FF_QUALITY_SCALE +  avctx->global_quality/2) / avctx->global_quality;
  
diff --cc libavcodec/dnxhdenc.c
index f6f9af833ac,e656b6edba2..3ad625352a6
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@@ -33,10 -33,9 +33,10 @@@
  #include "fdctdsp.h"
  #include "internal.h"
  #include "mpegvideo.h"
+ #include "pixblockdsp.h"
  #include "dnxhdenc.h"
  
 +
  // The largest value that will not lead to overflow for 10bit samples.
  #define DNX10BIT_QMAT_SHIFT 18
  #define RC_VARIANCE 1 // use variance or ssd for fast rc
@@@ -326,9 -311,8 +326,10 @@@ static av_cold int dnxhd_encode_init(AV
      ff_fdctdsp_init(&ctx->m.fdsp, avctx);
      ff_idctdsp_init(&ctx->m.idsp, avctx);
      ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx);
+     ff_pixblockdsp_init(&ctx->m.pdsp, avctx);
      ff_dct_common_init(&ctx->m);
 +    ff_dct_encode_init(&ctx->m);
 +
      if (!ctx->m.dct_quantize)
          ctx->m.dct_quantize = ff_dct_quantize_c;
  
diff --cc libavcodec/dsputil.c
index c68a70a79e0,8d0cef2e828..1cd9658ba69
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@@ -584,9 -547,9 +556,9 @@@ static int dct_sad8x8_c(MpegEncContext 
  {
      LOCAL_ALIGNED_16(int16_t, temp, [64]);
  
 -    assert(h == 8);
 +    av_assert2(h == 8);
  
-     s->dsp.diff_pixels(temp, src1, src2, stride);
+     s->pdsp.diff_pixels(temp, src1, src2, stride);
      s->fdsp.fdct(temp);
      return s->dsp.sum_abs_dctelem(temp);
  }
@@@ -651,9 -614,9 +623,9 @@@ static int dct_max8x8_c(MpegEncContext 
      LOCAL_ALIGNED_16(int16_t, temp, [64]);
      int sum = 0, i;
  
 -    assert(h == 8);
 +    av_assert2(h == 8);
  
-     s->dsp.diff_pixels(temp, src1, src2, stride);
+     s->pdsp.diff_pixels(temp, src1, src2, stride);
      s->fdsp.fdct(temp);
  
      for (i = 0; i < 64; i++)
@@@ -669,10 -632,10 +641,10 @@@ static int quant_psnr8x8_c(MpegEncConte
      int16_t *const bak = temp + 64;
      int sum = 0, i;
  
 -    assert(h == 8);
 +    av_assert2(h == 8);
      s->mb_intra = 0;
  
-     s->dsp.diff_pixels(temp, src1, src2, stride);
+     s->pdsp.diff_pixels(temp, src1, src2, stride);
  
      memcpy(bak, temp, 64 * sizeof(int16_t));
  
@@@ -773,9 -736,9 +745,9 @@@ static int bit8x8_c(MpegEncContext *s, 
      const int esc_length = s->ac_esc_length;
      uint8_t *length, *last_length;
  
 -    assert(h == 8);
 +    av_assert2(h == 8);
  
-     s->dsp.diff_pixels(temp, src1, src2, stride);
+     s->pdsp.diff_pixels(temp, src1, src2, stride);
  
      s->block_last_index[0 /* FIXME */] =
      last                               =
@@@ -969,10 -904,6 +941,8 @@@ av_cold void ff_dsputil_init(DSPContex
  {
      const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
  
 +    ff_check_alignment();
 +
-     c->diff_pixels = diff_pixels_c;
- 
      c->sum_abs_dctelem = sum_abs_dctelem_c;
  
      /* TODO [0] 16  [1] 8 */
@@@ -1015,27 -944,7 +985,12 @@@
      c->vsse[5] = vsse_intra8_c;
      c->nsse[0] = nsse16_c;
      c->nsse[1] = nsse8_c;
 +#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
 +    ff_dsputil_init_dwt(c);
 +#endif
  
-     switch (avctx->bits_per_raw_sample) {
-     case 9:
-     case 10:
-     case 12:
-     case 14:
-         c->get_pixels = get_pixels_16_c;
-         break;
-     default:
-         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
-             c->get_pixels = get_pixels_8_c;
-         }
-         break;
-     }
- 
- 
 +    if (ARCH_ALPHA)
 +        ff_dsputil_init_alpha(c, avctx);
      if (ARCH_ARM)
          ff_dsputil_init_arm(c, avctx, high_bit_depth);
      if (ARCH_PPC)
diff --cc libavcodec/dvenc.c
index a60b834dfe2,9f458e3e47c..aeb4a332596
--- a/libavcodec/dvenc.c
+++ b/libavcodec/dvenc.c
@@@ -67,12 -65,12 +69,13 @@@ static av_cold int dvvideo_encode_init(
  
      dv_vlc_map_tableinit();
  
 +    memset(&dsp,0, sizeof(dsp));
      ff_dsputil_init(&dsp, avctx);
      ff_fdctdsp_init(&fdsp, avctx);
+     ff_pixblockdsp_init(&pdsp, avctx);
      ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp);
  
-     s->get_pixels = dsp.get_pixels;
+     s->get_pixels = pdsp.get_pixels;
      s->ildct_cmp  = dsp.ildct_cmp[5];
  
      s->fdct[0]    = fdsp.fdct;
diff --cc libavcodec/libavcodec.v
index 5909dce46b6,bf148075c71..5a8c005b97d
--- a/libavcodec/libavcodec.v
+++ b/libavcodec/libavcodec.v
@@@ -1,33 -1,4 +1,34 @@@
  LIBAVCODEC_$MAJOR {
          global: av*;
 +                #deprecated, remove after next bump
 +                audio_resample;
 +                audio_resample_close;
 +                dsputil_init;
 +                ff_dsputil_init;
 +                ff_find_pix_fmt;
 +                ff_framenum_to_drop_timecode;
 +                ff_framenum_to_smtpe_timecode;
 +                ff_raw_pix_fmt_tags;
 +                ff_init_smtpe_timecode;
 +                ff_fft*;
 +                ff_mdct*;
 +                ff_dct*;
 +                ff_rdft*;
 +                ff_prores_idct_put_10_sse2;
 +                ff_simple_idct*;
 +                ff_aanscales;
 +                ff_faan*;
 +                ff_mmx_idct;
 +                ff_fdct*;
 +                fdct_ifast;
 +                j_rev_dct;
 +                ff_mmxext_idct;
 +                ff_idct_xvid*;
 +                ff_jpeg_fdct*;
 +                ff_dnxhd_get_cid_table;
 +                ff_dnxhd_cid_table;
 +                ff_idctdsp_init;
 +                ff_fdctdsp_init;
++                ff_pixblockdsp_init;
          local:  *;
  };
diff --cc libavcodec/mpegvideo_enc.c
index 826f061eeaf,e2504c7b721..56867ccb85e
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@@ -818,8 -698,12 +818,9 @@@ av_cold int ff_MPV_encode_init(AVCodecC
      if (ff_MPV_common_init(s) < 0)
          return -1;
  
 -    if (ARCH_X86)
 -        ff_MPV_encode_init_x86(s);
 -
      ff_fdctdsp_init(&s->fdsp, avctx);
      ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
+     ff_pixblockdsp_init(&s->pdsp, avctx);
      ff_qpeldsp_init(&s->qdsp);
  
      s->avctx->coded_frame = s->current_picture.f;
@@@ -2102,18 -1953,13 +2103,18 @@@ static av_always_inline void encode_mb_
              skip_dct[4] = 1;
              skip_dct[5] = 1;
          } else {
-             s->dsp.get_pixels(s->block[4], ptr_cb, wrap_c);
-             s->dsp.get_pixels(s->block[5], ptr_cr, wrap_c);
+             s->pdsp.get_pixels(s->block[4], ptr_cb, wrap_c);
+             s->pdsp.get_pixels(s->block[5], ptr_cr, wrap_c);
 -            if (!s->chroma_y_shift) { /* 422 */
 -                s->pdsp.get_pixels(s->block[6],
 -                                   ptr_cb + (dct_offset >> 1), wrap_c);
 -                s->pdsp.get_pixels(s->block[7],
 -                                   ptr_cr + (dct_offset >> 1), wrap_c);
 +            if (!s->chroma_y_shift && s->chroma_x_shift) { /* 422 */
-                 s->dsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
-                 s->dsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
++                s->pdsp.get_pixels(s->block[6], ptr_cb + uv_dct_offset, wrap_c);
++                s->pdsp.get_pixels(s->block[7], ptr_cr + uv_dct_offset, wrap_c);
 +            } else if (!s->chroma_y_shift && !s->chroma_x_shift) { /* 444 */
-                 s->dsp.get_pixels(s->block[6], ptr_cb + 8, wrap_c);
-                 s->dsp.get_pixels(s->block[7], ptr_cr + 8, wrap_c);
-                 s->dsp.get_pixels(s->block[8], ptr_cb + uv_dct_offset, wrap_c);
-                 s->dsp.get_pixels(s->block[9], ptr_cr + uv_dct_offset, wrap_c);
-                 s->dsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
-                 s->dsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
++                s->pdsp.get_pixels(s->block[ 6], ptr_cb + 8, wrap_c);
++                s->pdsp.get_pixels(s->block[ 7], ptr_cr + 8, wrap_c);
++                s->pdsp.get_pixels(s->block[ 8], ptr_cb + uv_dct_offset, wrap_c);
++                s->pdsp.get_pixels(s->block[ 9], ptr_cr + uv_dct_offset, wrap_c);
++                s->pdsp.get_pixels(s->block[10], ptr_cb + uv_dct_offset + 8, wrap_c);
++                s->pdsp.get_pixels(s->block[11], ptr_cr + uv_dct_offset + 8, wrap_c);
              }
          }
      } else {
@@@ -2191,13 -2036,13 +2192,13 @@@
              skip_dct[4] = 1;
              skip_dct[5] = 1;
          } else {
-             s->dsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
-             s->dsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
+             s->pdsp.diff_pixels(s->block[4], ptr_cb, dest_cb, wrap_c);
+             s->pdsp.diff_pixels(s->block[5], ptr_cr, dest_cr, wrap_c);
              if (!s->chroma_y_shift) { /* 422 */
-                 s->dsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
-                                    dest_cb + uv_dct_offset, wrap_c);
-                 s->dsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
-                                    dest_cr + uv_dct_offset, wrap_c);
 -                s->pdsp.diff_pixels(s->block[6], ptr_cb + (dct_offset >> 1),
 -                                    dest_cb + (dct_offset >> 1), wrap_c);
 -                s->pdsp.diff_pixels(s->block[7], ptr_cr + (dct_offset >> 1),
 -                                    dest_cr + (dct_offset >> 1), wrap_c);
++                s->pdsp.diff_pixels(s->block[6], ptr_cb + uv_dct_offset,
++                                    dest_cb + uv_dct_offset, wrap_c);
++                s->pdsp.diff_pixels(s->block[7], ptr_cr + uv_dct_offset,
++                                    dest_cr + uv_dct_offset, wrap_c);
              }
          }
          /* pre quantization */
diff --cc libavcodec/pixblockdsp.c
index 00000000000,71423f9cfc9..a69948e43ef
mode 000000,100644..100644
--- a/libavcodec/pixblockdsp.c
+++ b/libavcodec/pixblockdsp.c
@@@ -1,0 -1,76 +1,80 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #include <stdint.h>
+ 
+ #include "config.h"
+ #include "libavutil/attributes.h"
+ #include "avcodec.h"
+ #include "pixblockdsp.h"
+ 
+ #define BIT_DEPTH 16
+ #include "pixblockdsp_template.c"
+ #undef BIT_DEPTH
+ 
+ #define BIT_DEPTH 8
+ #include "pixblockdsp_template.c"
+ 
 -static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
++static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
+                           const uint8_t *s2, int stride) /* writes 64 values: block = s1 - s2 over 8 rows of 8 pixels */
+ {
+     int i;
+ 
+     /* store the per-pixel difference s1 - s2, one row of 8 per iteration */
+     for (i = 0; i < 8; i++) {
+         block[0] = s1[0] - s2[0];
+         block[1] = s1[1] - s2[1];
+         block[2] = s1[2] - s2[2];
+         block[3] = s1[3] - s2[3];
+         block[4] = s1[4] - s2[4];
+         block[5] = s1[5] - s2[5];
+         block[6] = s1[6] - s2[6];
+         block[7] = s1[7] - s2[7];
+         s1      += stride; /* both sources step by the same stride */
+         s2      += stride;
+         block   += 8; /* output rows are packed back to back */
+     }
+ }
+ 
+ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx) /* fill c with the C routines, then let per-architecture init override them */
+ {
+     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8; /* forwarded to the arch-specific init functions */
+ 
+     c->diff_pixels = diff_pixels_c;
+ 
+     switch (avctx->bits_per_raw_sample) { /* choose the get_pixels variant from the raw sample depth */
+     case 9:
+     case 10:
++    case 12:
++    case 14:
+         c->get_pixels = get_pixels_16_c;
+         break;
+     default:
 -        c->get_pixels = get_pixels_8_c;
++        if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) { /* any other video depth above 8 leaves get_pixels unassigned here */
++            c->get_pixels = get_pixels_8_c;
++        }
+         break;
+     }
+ 
+     if (ARCH_ARM) /* ARCH_* are presumably build-time constants from config.h -- confirm */
+         ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
+     if (ARCH_PPC)
+         ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
+     if (ARCH_X86)
+         ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
+ }
diff --cc libavcodec/pixblockdsp.h
index 00000000000,8094d14b68e..a724ffbef0a
mode 000000,100644..100644
--- a/libavcodec/pixblockdsp.h
+++ b/libavcodec/pixblockdsp.h
@@@ -1,0 -1,44 +1,44 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #ifndef AVCODEC_PIXBLOCKDSP_H
+ #define AVCODEC_PIXBLOCKDSP_H
+ 
+ #include <stdint.h>
+ 
+ #include "avcodec.h"
+ 
+ typedef struct PixblockDSPContext { /* function pointers for the pixel block routines, filled in by ff_pixblockdsp_init() */
+     void (*get_pixels)(int16_t *block /* align 16 */,
+                        const uint8_t *pixels /* align 8 */,
+                        int line_size); /* step applied to pixels between rows */
+     void (*diff_pixels)(int16_t *block /* align 16 */,
+                         const uint8_t *s1 /* align 8 */,
+                         const uint8_t *s2 /* align 8 */,
+                         int stride); /* block receives s1 - s2; stride steps both sources between rows */
+ } PixblockDSPContext;
+ 
+ void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx);
+ void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
+ void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
+ void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
+                              unsigned high_bit_depth);
+ 
+ #endif /* AVCODEC_PIXBLOCKDSP_H */
diff --cc libavcodec/pixblockdsp_template.c
index 711c404a972,71d3cf150d6..3aeddf526c7
--- a/libavcodec/pixblockdsp_template.c
+++ b/libavcodec/pixblockdsp_template.c
@@@ -1,13 -1,7 +1,7 @@@
  /*
-  * DSP utils
-  * Copyright (c) 2000, 2001 Fabrice Bellard
-  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
-  *
-  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
-  *
 - * This file is part of Libav.
 + * This file is part of FFmpeg.
   *
 - * Libav is free software; you can redistribute it and/or
 + * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
@@@ -22,14 -16,9 +16,9 @@@
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  
- /**
-  * @file
-  * DSP utils
-  */
- 
  #include "bit_depth_template.c"
  
 -static void FUNCC(get_pixels)(int16_t *restrict block, const uint8_t *_pixels,
 +static void FUNCC(get_pixels)(int16_t *av_restrict block, const uint8_t *_pixels,
                                int line_size)
  {
      const pixel *pixels = (const pixel *) _pixels;
diff --cc libavcodec/ppc/pixblockdsp.c
index 00000000000,698d655fc6a..42c5be842ea
mode 000000,100644..100644
--- a/libavcodec/ppc/pixblockdsp.c
+++ b/libavcodec/ppc/pixblockdsp.c
@@@ -1,0 -1,153 +1,153 @@@
+ /*
+  * Copyright (c) 2002 Brian Foley
+  * Copyright (c) 2002 Dieter Shirley
+  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
+  *
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #include "config.h"
+ #if HAVE_ALTIVEC_H
+ #include <altivec.h>
+ #endif
+ 
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/ppc/cpu.h"
+ #include "libavutil/ppc/types_altivec.h"
+ #include "libavutil/ppc/util_altivec.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/pixblockdsp.h"
+ 
+ #if HAVE_ALTIVEC
+ 
+ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
+                                int line_size) /* widen 8 rows of 8 bytes into 64 shorts stored in block */
+ {
+     int i;
+     vector unsigned char perm = vec_lvsl(0, pixels); /* NOTE(review): computed once from the first row; appears to assume every row has the same 16-byte misalignment -- confirm */
+     const vector unsigned char zero =
+         (const vector unsigned char) vec_splat_u8(0);
+ 
+     for (i = 0; i < 8; i++) {
+         /* Read potentially unaligned pixels.
+          * We're reading 16 pixels, and actually only want 8,
+          * but we simply ignore the extras. */
+         vector unsigned char pixl = vec_ld(0, pixels); /* aligned vector containing pixels[0] */
+         vector unsigned char pixr = vec_ld(7, pixels); /* aligned vector containing pixels[7], the last byte needed */
+         vector unsigned char bytes = vec_perm(pixl, pixr, perm); /* shift the wanted bytes to the front of the vector */
+ 
+         // Convert the bytes into shorts.
+         vector signed short shorts = (vector signed short) vec_mergeh(zero,
+                                                                       bytes); /* interleave zero bytes with the first 8 data bytes; presumably relies on big-endian lane order -- confirm */
+ 
+         // Save the data to the block, we assume the block is 16-byte aligned.
+         vec_st(shorts, i * 16, (vector signed short *) block); /* row i lands at byte offset i * 16 */
+ 
+         pixels += line_size;
+     }
+ }
+ 
+ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
+                                 const uint8_t *s2, int stride) /* block = s1 - s2 as shorts over 8 rows of 8 bytes */
+ {
+     int i;
+     vector unsigned char perm1 = vec_lvsl(0, s1); /* NOTE(review): both permute vectors are computed once; appears to assume a constant row misalignment -- confirm */
+     vector unsigned char perm2 = vec_lvsl(0, s2);
+     const vector unsigned char zero =
+         (const vector unsigned char) vec_splat_u8(0);
+     vector signed short shorts1, shorts2;
+ 
+     for (i = 0; i < 4; i++) { /* four iterations, each handling two rows (manual unroll) */
+         /* Read potentially unaligned pixels.
+          * We're reading 16 pixels, and actually only want 8,
+          * but we simply ignore the extras. */
+         vector unsigned char pixl  = vec_ld(0,  s1); /* aligned vector containing s1[0] */
+         vector unsigned char pixr  = vec_ld(15, s1); /* aligned vector containing s1[15] */
+         vector unsigned char bytes = vec_perm(pixl, pixr, perm1); /* shift the wanted bytes to the front of the vector */
+ 
+         // Convert the bytes into shorts.
+         shorts1 = (vector signed short) vec_mergeh(zero, bytes); /* zero bytes interleaved with the first 8 data bytes */
+ 
+         // Do the same for the second block of pixels.
+         pixl  = vec_ld(0,  s2);
+         pixr  = vec_ld(15, s2);
+         bytes = vec_perm(pixl, pixr, perm2);
+ 
+         // Convert the bytes into shorts.
+         shorts2 = (vector signed short) vec_mergeh(zero, bytes);
+ 
+         // Do the subtraction.
+         shorts1 = vec_sub(shorts1, shorts2); /* eight 16-bit differences s1 - s2 */
+ 
+         // Save the data to the block, we assume the block is 16-byte aligned.
+         vec_st(shorts1, 0, (vector signed short *) block);
+ 
+         s1    += stride;
+         s2    += stride;
+         block += 8; /* advance one output row of eight shorts */
+ 
+         /* The code below is a copy of the code above...
+          * This is a manual unroll. */
+ 
+         /* Read potentially unaligned pixels.
+          * We're reading 16 pixels, and actually only want 8,
+          * but we simply ignore the extras. */
+         pixl  = vec_ld(0,  s1);
+         pixr  = vec_ld(15, s1);
+         bytes = vec_perm(pixl, pixr, perm1);
+ 
+         // Convert the bytes into shorts.
+         shorts1 = (vector signed short) vec_mergeh(zero, bytes);
+ 
+         // Do the same for the second block of pixels.
+         pixl  = vec_ld(0,  s2);
+         pixr  = vec_ld(15, s2);
+         bytes = vec_perm(pixl, pixr, perm2);
+ 
+         // Convert the bytes into shorts.
+         shorts2 = (vector signed short) vec_mergeh(zero, bytes);
+ 
+         // Do the subtraction.
+         shorts1 = vec_sub(shorts1, shorts2);
+ 
+         // Save the data to the block, we assume the block is 16-byte aligned.
+         vec_st(shorts1, 0, (vector signed short *) block);
+ 
+         s1    += stride;
+         s2    += stride;
+         block += 8;
+     }
+ }
+ 
+ #endif /* HAVE_ALTIVEC */
+ 
+ av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, /* context whose function pointers may be overridden */
+                                      AVCodecContext *avctx, /* not read by this function */
+                                      unsigned high_bit_depth) /* nonzero disables the get_pixels override */
+ {
+ #if HAVE_ALTIVEC /* without AltiVec support at build time the function body is empty */
+     if (!PPC_ALTIVEC(av_get_cpu_flags())) /* runtime check: leave c untouched when AltiVec is not reported */
+         return;
+ 
+     c->diff_pixels = diff_pixels_altivec; /* assigned regardless of high_bit_depth */
+ 
+     if (!high_bit_depth) {
+         c->get_pixels = get_pixels_altivec;
+     }
+ #endif /* HAVE_ALTIVEC */
+ }
diff --cc libavcodec/x86/Makefile
index 0843dcc7749,7c5ac3decf6..44ccb2040f8
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@@ -105,11 -92,9 +106,12 @@@ YASM-OBJS-$(CONFIG_HEVC_DECODER)       
  YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                            x86/hpeldsp.o
  YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
 +YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
 +YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
 +YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
  YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
  YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
+ YASM-OBJS-$(CONFIG_PIXBLOCKDSP)        += x86/pixblockdsp.o
  YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                            x86/fpel.o                    \
                                            x86/qpel.o
diff --cc libavcodec/x86/dsputilenc.asm
index 13682ba5d46,8d989c26f89..023f512edd8
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@@ -328,249 -323,14 +328,140 @@@ cglobal sse%1, 5,5,8, v, pix1, pix2, ls
      paddd     m7, m1
      paddd     m7, m3
  
 -    dec       r4
 +%if %1 == mmsize
 +    lea    pix1q, [pix1q + 2*lsizeq]
 +    lea    pix2q, [pix2q + 2*lsizeq]
 +%else
 +    add    pix1q, lsizeq
 +    add    pix2q, lsizeq
 +%endif
 +    dec       hd
      jnz .next2lines
  
 -    mova      m1, m7
 -    psrldq    m7, 8          ; shift hi qword to lo
 -    paddd     m7, m1
 -    mova      m1, m7
 -    psrldq    m7, 4          ; shift hi dword to lo
 -    paddd     m7, m1
 +    HADDD     m7, m1
      movd     eax, m7         ; return value
      RET
 +%endmacro
 +
 +INIT_MMX mmx
 +SUM_SQUARED_ERRORS 8
 +
 +INIT_MMX mmx
 +SUM_SQUARED_ERRORS 16
 +
 +INIT_XMM sse2
 +SUM_SQUARED_ERRORS 16
 +
- INIT_MMX mmx
- ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
- cglobal get_pixels, 3,4
-     movsxdifnidn r2, r2d
-     add          r0, 128
-     mov          r3, -128
-     pxor         m7, m7
- .loop:
-     mova         m0, [r1]
-     mova         m2, [r1+r2]
-     mova         m1, m0
-     mova         m3, m2
-     punpcklbw    m0, m7
-     punpckhbw    m1, m7
-     punpcklbw    m2, m7
-     punpckhbw    m3, m7
-     mova [r0+r3+ 0], m0
-     mova [r0+r3+ 8], m1
-     mova [r0+r3+16], m2
-     mova [r0+r3+24], m3
-     lea          r1, [r1+r2*2]
-     add          r3, 32
-     js .loop
-     REP_RET
- 
- INIT_XMM sse2
- cglobal get_pixels, 3, 4, 5
-     movsxdifnidn r2, r2d
-     lea          r3, [r2*3]
-     pxor         m4, m4
-     movh         m0, [r1]
-     movh         m1, [r1+r2]
-     movh         m2, [r1+r2*2]
-     movh         m3, [r1+r3]
-     lea          r1, [r1+r2*4]
-     punpcklbw    m0, m4
-     punpcklbw    m1, m4
-     punpcklbw    m2, m4
-     punpcklbw    m3, m4
-     mova       [r0], m0
-     mova  [r0+0x10], m1
-     mova  [r0+0x20], m2
-     mova  [r0+0x30], m3
-     movh         m0, [r1]
-     movh         m1, [r1+r2*1]
-     movh         m2, [r1+r2*2]
-     movh         m3, [r1+r3]
-     punpcklbw    m0, m4
-     punpcklbw    m1, m4
-     punpcklbw    m2, m4
-     punpcklbw    m3, m4
-     mova  [r0+0x40], m0
-     mova  [r0+0x50], m1
-     mova  [r0+0x60], m2
-     mova  [r0+0x70], m3
-     RET
- 
- INIT_MMX mmx
- ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
- ;                         int stride);
- cglobal diff_pixels, 4,5
-     movsxdifnidn r3, r3d
-     pxor         m7, m7
-     add          r0,  128
-     mov          r4, -128
- .loop:
-     mova         m0, [r1]
-     mova         m2, [r2]
-     mova         m1, m0
-     mova         m3, m2
-     punpcklbw    m0, m7
-     punpckhbw    m1, m7
-     punpcklbw    m2, m7
-     punpckhbw    m3, m7
-     psubw        m0, m2
-     psubw        m1, m3
-     mova  [r0+r4+0], m0
-     mova  [r0+r4+8], m1
-     add          r1, r3
-     add          r2, r3
-     add          r4, 16
-     jne .loop
-     REP_RET
- 
- INIT_XMM sse2
- cglobal diff_pixels, 4, 5, 5
-     movsxdifnidn r3, r3d
-     pxor         m4, m4
-     add          r0,  128
-     mov          r4, -128
- .loop:
-     movh         m0, [r1]
-     movh         m2, [r2]
-     movh         m1, [r1+r3]
-     movh         m3, [r2+r3]
-     punpcklbw    m0, m4
-     punpcklbw    m1, m4
-     punpcklbw    m2, m4
-     punpcklbw    m3, m4
-     psubw        m0, m2
-     psubw        m1, m3
-     mova [r0+r4+0 ], m0
-     mova [r0+r4+16], m1
-     lea          r1, [r1+r3*2]
-     lea          r2, [r2+r3*2]
-     add          r4, 32
-     jne .loop
-     RET
- 
 +;-----------------------------------------------
 +;int ff_sum_abs_dctelem(int16_t *block)
 +;-----------------------------------------------
 +; %1 = number of xmm registers used
 +; %2 = number of inline loops
 +
 +%macro SUM_ABS_DCTELEM 2
 +cglobal sum_abs_dctelem, 1, 1, %1, block
 +    pxor    m0, m0
 +    pxor    m1, m1
 +%assign %%i 0
 +%rep %2
 +    mova      m2, [blockq+mmsize*(0+%%i)]
 +    mova      m3, [blockq+mmsize*(1+%%i)]
 +    mova      m4, [blockq+mmsize*(2+%%i)]
 +    mova      m5, [blockq+mmsize*(3+%%i)]
 +    ABS1_SUM  m2, m6, m0
 +    ABS1_SUM  m3, m6, m1
 +    ABS1_SUM  m4, m6, m0
 +    ABS1_SUM  m5, m6, m1
 +%assign %%i %%i+4
 +%endrep
 +    paddusw m0, m1
 +    HSUM    m0, m1, eax
 +    and     eax, 0xFFFF
 +    RET
 +%endmacro
 +
 +INIT_MMX mmx
 +SUM_ABS_DCTELEM 0, 4
 +INIT_MMX mmxext
 +SUM_ABS_DCTELEM 0, 4
 +INIT_XMM sse2
 +SUM_ABS_DCTELEM 7, 2
 +INIT_XMM ssse3
 +SUM_ABS_DCTELEM 6, 2
 +
 +;------------------------------------------------------------------------------
 +; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
 +;------------------------------------------------------------------------------
 +; %1 = 8/16. %2-5=m#
 +%macro HF_NOISE_PART1 5
 +    mova      m%2, [pix1q]
 +%if %1 == 8
 +    mova      m%3, m%2
 +    psllq     m%2, 8
 +    psrlq     m%3, 8
 +    psrlq     m%2, 8
 +%else
 +    mova      m%3, [pix1q+1]
 +%endif
 +    mova      m%4, m%2
 +    mova      m%5, m%3
 +    punpcklbw m%2, m7
 +    punpcklbw m%3, m7
 +    punpckhbw m%4, m7
 +    punpckhbw m%5, m7
 +    psubw     m%2, m%3
 +    psubw     m%4, m%5
 +%endmacro
 +
 +; %1-4 = m#
 +%macro HF_NOISE_PART2 4
 +    psubw     m%1, m%3
 +    psubw     m%2, m%4
 +    pxor       m3, m3
 +    pxor       m1, m1
 +    pcmpgtw    m3, m%1
 +    pcmpgtw    m1, m%2
 +    pxor      m%1, m3
 +    pxor      m%2, m1
 +    psubw     m%1, m3
 +    psubw     m%2, m1
 +    paddw     m%2, m%1
 +    paddw      m6, m%2
 +%endmacro
 +
 +; %1 = 8/16
 +%macro HF_NOISE 1
 +cglobal hf_noise%1, 3,3,0, pix1, lsize, h
 +    movsxdifnidn lsizeq, lsized
 +    sub        hd, 2
 +    pxor       m7, m7
 +    pxor       m6, m6
 +    HF_NOISE_PART1 %1, 0, 1, 2, 3
 +    add     pix1q, lsizeq
 +    HF_NOISE_PART1 %1, 4, 1, 5, 3
 +    HF_NOISE_PART2     0, 2, 4, 5
 +    add     pix1q, lsizeq
 +.loop:
 +    HF_NOISE_PART1 %1, 0, 1, 2, 3
 +    HF_NOISE_PART2     4, 5, 0, 2
 +    add     pix1q, lsizeq
 +    HF_NOISE_PART1 %1, 4, 1, 5, 3
 +    HF_NOISE_PART2     0, 2, 4, 5
 +    add     pix1q, lsizeq
 +    sub        hd, 2
 +        jne .loop
 +
 +    mova       m0, m6
 +    punpcklwd  m0, m7
 +    punpckhwd  m6, m7
 +    paddd      m6, m0
 +    mova       m0, m6
 +    psrlq      m6, 32
 +    paddd      m0, m6
 +    movd      eax, m0   ; eax = result of hf_noise8;
 +    REP_RET                 ; return eax;
 +%endmacro
 +
 +INIT_MMX mmx
 +HF_NOISE 8
 +HF_NOISE 16
diff --cc libavcodec/x86/dsputilenc_mmx.c
index d0936595d0b,5a7d911ca86..5d48a78daa0
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@@ -30,37 -30,381 +30,31 @@@
  #include "libavcodec/mpegvideo.h"
  #include "dsputil_x86.h"
  
- void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
- void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
- void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
-                         int stride);
- void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
-                          int stride);
 -#if HAVE_INLINE_ASM
 -
 -static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 -                    int line_size, int h)
 -{
 -    int tmp;
 -
 -    __asm__ volatile (
 -        "movl         %4, %%ecx          \n"
 -        "shr          $1, %%ecx          \n"
 -        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
 -        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
 -        "1:                              \n"
 -        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
 -        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
 -        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
 -        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
 -
 -        /* todo: mm1-mm2, mm3-mm4 */
 -        /* algo: subtract mm1 from mm2 with saturation and vice versa */
 -        /*       OR the results to get absolute difference */
 -        "movq      %%mm1, %%mm5          \n"
 -        "movq      %%mm3, %%mm6          \n"
 -        "psubusb   %%mm2, %%mm1          \n"
 -        "psubusb   %%mm4, %%mm3          \n"
 -        "psubusb   %%mm5, %%mm2          \n"
 -        "psubusb   %%mm6, %%mm4          \n"
 -
 -        "por       %%mm1, %%mm2          \n"
 -        "por       %%mm3, %%mm4          \n"
 -
 -        /* now convert to 16-bit vectors so we can square them */
 -        "movq      %%mm2, %%mm1          \n"
 -        "movq      %%mm4, %%mm3          \n"
 -
 -        "punpckhbw %%mm0, %%mm2          \n"
 -        "punpckhbw %%mm0, %%mm4          \n"
 -        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
 -        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
 -
 -        "pmaddwd   %%mm2, %%mm2          \n"
 -        "pmaddwd   %%mm4, %%mm4          \n"
 -        "pmaddwd   %%mm1, %%mm1          \n"
 -        "pmaddwd   %%mm3, %%mm3          \n"
 -
 -        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * line_size */
 -        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * line_size */
 -
 -        "paddd     %%mm2, %%mm1          \n"
 -        "paddd     %%mm4, %%mm3          \n"
 -        "paddd     %%mm1, %%mm7          \n"
 -        "paddd     %%mm3, %%mm7          \n"
 -
 -        "decl      %%ecx                 \n"
 -        "jnz       1b                    \n"
 -
 -        "movq      %%mm7, %%mm1          \n"
 -        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
 -        "paddd     %%mm7, %%mm1          \n"
 -        "movd      %%mm1, %2             \n"
 -        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 -        : "r" ((x86_reg) line_size), "m" (h)
 -        : "%ecx");
 -
 -    return tmp;
 -}
 -
 -static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 -                     int line_size, int h)
 -{
 -    int tmp;
 -
 -    __asm__ volatile (
 -        "movl %4, %%ecx\n"
 -        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
 -        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
 -        "1:\n"
 -        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
 -        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
 -        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
 -        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
 -
 -        /* todo: mm1-mm2, mm3-mm4 */
 -        /* algo: subtract mm1 from mm2 with saturation and vice versa */
 -        /*       OR the results to get absolute difference */
 -        "movq %%mm1, %%mm5\n"
 -        "movq %%mm3, %%mm6\n"
 -        "psubusb %%mm2, %%mm1\n"
 -        "psubusb %%mm4, %%mm3\n"
 -        "psubusb %%mm5, %%mm2\n"
 -        "psubusb %%mm6, %%mm4\n"
 -
 -        "por %%mm1, %%mm2\n"
 -        "por %%mm3, %%mm4\n"
 -
 -        /* now convert to 16-bit vectors so we can square them */
 -        "movq %%mm2, %%mm1\n"
 -        "movq %%mm4, %%mm3\n"
 -
 -        "punpckhbw %%mm0, %%mm2\n"
 -        "punpckhbw %%mm0, %%mm4\n"
 -        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
 -        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
 -
 -        "pmaddwd %%mm2, %%mm2\n"
 -        "pmaddwd %%mm4, %%mm4\n"
 -        "pmaddwd %%mm1, %%mm1\n"
 -        "pmaddwd %%mm3, %%mm3\n"
 -
 -        "add %3, %0\n"
 -        "add %3, %1\n"
 -
 -        "paddd %%mm2, %%mm1\n"
 -        "paddd %%mm4, %%mm3\n"
 -        "paddd %%mm1, %%mm7\n"
 -        "paddd %%mm3, %%mm7\n"
 -
 -        "decl %%ecx\n"
 -        "jnz 1b\n"
 -
 -        "movq %%mm7, %%mm1\n"
 -        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
 -        "paddd %%mm7, %%mm1\n"
 -        "movd %%mm1, %2\n"
 -        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
 -        : "r" ((x86_reg) line_size), "m" (h)
 -        : "%ecx");
 -
 -    return tmp;
 -}
 -
 -static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
 -{
 -    int tmp;
 -
 -    __asm__ volatile (
 -        "movl %3, %%ecx\n"
 -        "pxor %%mm7, %%mm7\n"
 -        "pxor %%mm6, %%mm6\n"
 -
 -        "movq (%0), %%mm0\n"
 -        "movq %%mm0, %%mm1\n"
 -        "psllq $8, %%mm0\n"
 -        "psrlq $8, %%mm1\n"
 -        "psrlq $8, %%mm0\n"
 -        "movq %%mm0, %%mm2\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm0\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm2\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm0\n"
 -        "psubw %%mm3, %%mm2\n"
 -
 -        "add %2, %0\n"
 -
 -        "movq (%0), %%mm4\n"
 -        "movq %%mm4, %%mm1\n"
 -        "psllq $8, %%mm4\n"
 -        "psrlq $8, %%mm1\n"
 -        "psrlq $8, %%mm4\n"
 -        "movq %%mm4, %%mm5\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm4\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm5\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm4\n"
 -        "psubw %%mm3, %%mm5\n"
 -        "psubw %%mm4, %%mm0\n"
 -        "psubw %%mm5, %%mm2\n"
 -        "pxor %%mm3, %%mm3\n"
 -        "pxor %%mm1, %%mm1\n"
 -        "pcmpgtw %%mm0, %%mm3\n\t"
 -        "pcmpgtw %%mm2, %%mm1\n\t"
 -        "pxor %%mm3, %%mm0\n"
 -        "pxor %%mm1, %%mm2\n"
 -        "psubw %%mm3, %%mm0\n"
 -        "psubw %%mm1, %%mm2\n"
 -        "paddw %%mm0, %%mm2\n"
 -        "paddw %%mm2, %%mm6\n"
 -
 -        "add %2, %0\n"
 -        "1:\n"
 -
 -        "movq (%0), %%mm0\n"
 -        "movq %%mm0, %%mm1\n"
 -        "psllq $8, %%mm0\n"
 -        "psrlq $8, %%mm1\n"
 -        "psrlq $8, %%mm0\n"
 -        "movq %%mm0, %%mm2\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm0\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm2\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm0\n"
 -        "psubw %%mm3, %%mm2\n"
 -        "psubw %%mm0, %%mm4\n"
 -        "psubw %%mm2, %%mm5\n"
 -        "pxor  %%mm3, %%mm3\n"
 -        "pxor  %%mm1, %%mm1\n"
 -        "pcmpgtw %%mm4, %%mm3\n\t"
 -        "pcmpgtw %%mm5, %%mm1\n\t"
 -        "pxor  %%mm3, %%mm4\n"
 -        "pxor  %%mm1, %%mm5\n"
 -        "psubw %%mm3, %%mm4\n"
 -        "psubw %%mm1, %%mm5\n"
 -        "paddw %%mm4, %%mm5\n"
 -        "paddw %%mm5, %%mm6\n"
 -
 -        "add %2, %0\n"
 -
 -        "movq (%0), %%mm4\n"
 -        "movq      %%mm4, %%mm1\n"
 -        "psllq $8, %%mm4\n"
 -        "psrlq $8, %%mm1\n"
 -        "psrlq $8, %%mm4\n"
 -        "movq      %%mm4, %%mm5\n"
 -        "movq      %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm4\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm5\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw     %%mm1, %%mm4\n"
 -        "psubw     %%mm3, %%mm5\n"
 -        "psubw     %%mm4, %%mm0\n"
 -        "psubw     %%mm5, %%mm2\n"
 -        "pxor      %%mm3, %%mm3\n"
 -        "pxor      %%mm1, %%mm1\n"
 -        "pcmpgtw   %%mm0, %%mm3\n\t"
 -        "pcmpgtw   %%mm2, %%mm1\n\t"
 -        "pxor      %%mm3, %%mm0\n"
 -        "pxor      %%mm1, %%mm2\n"
 -        "psubw     %%mm3, %%mm0\n"
 -        "psubw     %%mm1, %%mm2\n"
 -        "paddw     %%mm0, %%mm2\n"
 -        "paddw     %%mm2, %%mm6\n"
 -
 -        "add  %2, %0\n"
 -        "subl $2, %%ecx\n"
 -        " jnz 1b\n"
 -
 -        "movq      %%mm6, %%mm0\n"
 -        "punpcklwd %%mm7, %%mm0\n"
 -        "punpckhwd %%mm7, %%mm6\n"
 -        "paddd     %%mm0, %%mm6\n"
 -
 -        "movq  %%mm6, %%mm0\n"
 -        "psrlq $32,   %%mm6\n"
 -        "paddd %%mm6, %%mm0\n"
 -        "movd  %%mm0, %1\n"
 -        : "+r" (pix1), "=r" (tmp)
 -        : "r" ((x86_reg) line_size), "g" (h - 2)
 -        : "%ecx");
 -
 -    return tmp;
 -}
 -
 -static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
 -{
 -    int tmp;
 -    uint8_t *pix = pix1;
 -
 -    __asm__ volatile (
 -        "movl %3, %%ecx\n"
 -        "pxor %%mm7, %%mm7\n"
 -        "pxor %%mm6, %%mm6\n"
 -
 -        "movq (%0), %%mm0\n"
 -        "movq 1(%0), %%mm1\n"
 -        "movq %%mm0, %%mm2\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm0\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm2\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm0\n"
 -        "psubw %%mm3, %%mm2\n"
 -
 -        "add %2, %0\n"
 -
 -        "movq (%0), %%mm4\n"
 -        "movq 1(%0), %%mm1\n"
 -        "movq %%mm4, %%mm5\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm4\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm5\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm4\n"
 -        "psubw %%mm3, %%mm5\n"
 -        "psubw %%mm4, %%mm0\n"
 -        "psubw %%mm5, %%mm2\n"
 -        "pxor %%mm3, %%mm3\n"
 -        "pxor %%mm1, %%mm1\n"
 -        "pcmpgtw %%mm0, %%mm3\n\t"
 -        "pcmpgtw %%mm2, %%mm1\n\t"
 -        "pxor %%mm3, %%mm0\n"
 -        "pxor %%mm1, %%mm2\n"
 -        "psubw %%mm3, %%mm0\n"
 -        "psubw %%mm1, %%mm2\n"
 -        "paddw %%mm0, %%mm2\n"
 -        "paddw %%mm2, %%mm6\n"
 -
 -        "add %2, %0\n"
 -        "1:\n"
 -
 -        "movq (%0), %%mm0\n"
 -        "movq 1(%0), %%mm1\n"
 -        "movq %%mm0, %%mm2\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm0\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm2\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm0\n"
 -        "psubw %%mm3, %%mm2\n"
 -        "psubw %%mm0, %%mm4\n"
 -        "psubw %%mm2, %%mm5\n"
 -        "pxor %%mm3, %%mm3\n"
 -        "pxor %%mm1, %%mm1\n"
 -        "pcmpgtw %%mm4, %%mm3\n\t"
 -        "pcmpgtw %%mm5, %%mm1\n\t"
 -        "pxor %%mm3, %%mm4\n"
 -        "pxor %%mm1, %%mm5\n"
 -        "psubw %%mm3, %%mm4\n"
 -        "psubw %%mm1, %%mm5\n"
 -        "paddw %%mm4, %%mm5\n"
 -        "paddw %%mm5, %%mm6\n"
 -
 -        "add %2, %0\n"
 -
 -        "movq (%0), %%mm4\n"
 -        "movq 1(%0), %%mm1\n"
 -        "movq %%mm4, %%mm5\n"
 -        "movq %%mm1, %%mm3\n"
 -        "punpcklbw %%mm7, %%mm4\n"
 -        "punpcklbw %%mm7, %%mm1\n"
 -        "punpckhbw %%mm7, %%mm5\n"
 -        "punpckhbw %%mm7, %%mm3\n"
 -        "psubw %%mm1, %%mm4\n"
 -        "psubw %%mm3, %%mm5\n"
 -        "psubw %%mm4, %%mm0\n"
 -        "psubw %%mm5, %%mm2\n"
 -        "pxor %%mm3, %%mm3\n"
 -        "pxor %%mm1, %%mm1\n"
 -        "pcmpgtw %%mm0, %%mm3\n\t"
 -        "pcmpgtw %%mm2, %%mm1\n\t"
 -        "pxor %%mm3, %%mm0\n"
 -        "pxor %%mm1, %%mm2\n"
 -        "psubw %%mm3, %%mm0\n"
 -        "psubw %%mm1, %%mm2\n"
 -        "paddw %%mm0, %%mm2\n"
 -        "paddw %%mm2, %%mm6\n"
 -
 -        "add %2, %0\n"
 -        "subl $2, %%ecx\n"
 -        " jnz 1b\n"
 +int ff_sum_abs_dctelem_mmx(int16_t *block);
 +int ff_sum_abs_dctelem_mmxext(int16_t *block);
 +int ff_sum_abs_dctelem_sse2(int16_t *block);
 +int ff_sum_abs_dctelem_ssse3(int16_t *block);
 +int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 +                int line_size, int h);
 +int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 +                 int line_size, int h);
 +int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 +                  int line_size, int h);
 +int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
 +int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
  
 -        "movq %%mm6, %%mm0\n"
 -        "punpcklwd %%mm7, %%mm0\n"
 -        "punpckhwd %%mm7, %%mm6\n"
 -        "paddd %%mm0, %%mm6\n"
 +#define hadamard_func(cpu)                                              \
 +    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
 +                                  uint8_t *src2, int stride, int h);    \
 +    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
 +                                    uint8_t *src2, int stride, int h);
  
 -        "movq %%mm6, %%mm0\n"
 -        "psrlq $32, %%mm6\n"
 -        "paddd %%mm6, %%mm0\n"
 -        "movd %%mm0, %1\n"
 -        : "+r" (pix1), "=r" (tmp)
 -        : "r" ((x86_reg) line_size), "g" (h - 2)
 -        : "%ecx");
 +hadamard_func(mmx)
 +hadamard_func(mmxext)
 +hadamard_func(sse2)
 +hadamard_func(ssse3)
  
 -    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
 -}
 +#if HAVE_YASM
  
  static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                        int line_size, int h)
@@@ -353,20 -818,16 +347,10 @@@ av_cold void ff_dsputilenc_init_mmx(DSP
  {
      int cpu_flags = av_get_cpu_flags();
  
-     if (EXTERNAL_MMX(cpu_flags)) {
-         if (!high_bit_depth)
-             c->get_pixels = ff_get_pixels_mmx;
-         c->diff_pixels = ff_diff_pixels_mmx;
-     }
- 
-     if (EXTERNAL_SSE2(cpu_flags))
-         if (!high_bit_depth)
-             c->get_pixels = ff_get_pixels_sse2;
- 
  #if HAVE_INLINE_ASM
      if (INLINE_MMX(cpu_flags)) {
 -        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
 -
 -        c->sse[0]  = sse16_mmx;
 -        c->sse[1]  = sse8_mmx;
          c->vsad[4] = vsad_intra16_mmx;
  
 -        c->nsse[0] = nsse16_mmx;
 -        c->nsse[1] = nsse8_mmx;
          if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
              c->vsad[0]      = vsad16_mmx;
          }
@@@ -409,8 -865,6 +393,7 @@@
  
      if (EXTERNAL_SSE2(cpu_flags)) {
          c->sse[0] = ff_sse16_sse2;
 +        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
-         c->diff_pixels = ff_diff_pixels_sse2;
  
  #if HAVE_ALIGNED_STACK
          c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
diff --cc libavcodec/x86/pixblockdsp.asm
index 00000000000,c8fd1b24a13..00ee9b4ac23
mode 000000,100644..100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@@ -1,0 -1,110 +1,135 @@@
+ ;*****************************************************************************
+ ;* SIMD-optimized pixel operations
+ ;*****************************************************************************
+ ;* Copyright (c) 2000, 2001 Fabrice Bellard
+ ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ ;*
 -;* This file is part of Libav.
++;* This file is part of FFmpeg.
+ ;*
 -;* Libav is free software; you can redistribute it and/or
++;* FFmpeg is free software; you can redistribute it and/or
+ ;* modify it under the terms of the GNU Lesser General Public
+ ;* License as published by the Free Software Foundation; either
+ ;* version 2.1 of the License, or (at your option) any later version.
+ ;*
 -;* Libav is distributed in the hope that it will be useful,
++;* FFmpeg is distributed in the hope that it will be useful,
+ ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ ;* Lesser General Public License for more details.
+ ;*
+ ;* You should have received a copy of the GNU Lesser General Public
 -;* License along with Libav; if not, write to the Free Software
++;* License along with FFmpeg; if not, write to the Free Software
+ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ;*****************************************************************************
+ 
+ %include "libavutil/x86/x86util.asm"
+ 
+ SECTION .text
+ 
+ INIT_MMX mmx
+ ; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
+ cglobal get_pixels, 3,4
+     movsxdifnidn r2, r2d
+     add          r0, 128
+     mov          r3, -128
+     pxor         m7, m7
+ .loop:
+     mova         m0, [r1]
+     mova         m2, [r1+r2]
+     mova         m1, m0
+     mova         m3, m2
+     punpcklbw    m0, m7
+     punpckhbw    m1, m7
+     punpcklbw    m2, m7
+     punpckhbw    m3, m7
+     mova [r0+r3+ 0], m0
+     mova [r0+r3+ 8], m1
+     mova [r0+r3+16], m2
+     mova [r0+r3+24], m3
+     lea          r1, [r1+r2*2]
+     add          r3, 32
+     js .loop
+     REP_RET
+ 
+ INIT_XMM sse2
 -cglobal get_pixels, 3, 4
++cglobal get_pixels, 3, 4, 5
+     movsxdifnidn r2, r2d
+     lea          r3, [r2*3]
+     pxor         m4, m4
+     movh         m0, [r1]
+     movh         m1, [r1+r2]
+     movh         m2, [r1+r2*2]
+     movh         m3, [r1+r3]
+     lea          r1, [r1+r2*4]
+     punpcklbw    m0, m4
+     punpcklbw    m1, m4
+     punpcklbw    m2, m4
+     punpcklbw    m3, m4
+     mova       [r0], m0
+     mova  [r0+0x10], m1
+     mova  [r0+0x20], m2
+     mova  [r0+0x30], m3
+     movh         m0, [r1]
+     movh         m1, [r1+r2*1]
+     movh         m2, [r1+r2*2]
+     movh         m3, [r1+r3]
+     punpcklbw    m0, m4
+     punpcklbw    m1, m4
+     punpcklbw    m2, m4
+     punpcklbw    m3, m4
+     mova  [r0+0x40], m0
+     mova  [r0+0x50], m1
+     mova  [r0+0x60], m2
+     mova  [r0+0x70], m3
+     RET
+ 
+ INIT_MMX mmx
+ ; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+ ;                         int stride);
+ cglobal diff_pixels, 4,5
+     movsxdifnidn r3, r3d
+     pxor         m7, m7
+     add          r0,  128
+     mov          r4, -128
+ .loop:
+     mova         m0, [r1]
+     mova         m2, [r2]
+     mova         m1, m0
+     mova         m3, m2
+     punpcklbw    m0, m7
+     punpckhbw    m1, m7
+     punpcklbw    m2, m7
+     punpckhbw    m3, m7
+     psubw        m0, m2
+     psubw        m1, m3
+     mova  [r0+r4+0], m0
+     mova  [r0+r4+8], m1
+     add          r1, r3
+     add          r2, r3
+     add          r4, 16
+     jne .loop
+     REP_RET
++
++INIT_XMM sse2
++cglobal diff_pixels, 4, 5, 5
++    movsxdifnidn r3, r3d
++    pxor         m4, m4
++    add          r0,  128
++    mov          r4, -128
++.loop:
++    movh         m0, [r1]
++    movh         m2, [r2]
++    movh         m1, [r1+r3]
++    movh         m3, [r2+r3]
++    punpcklbw    m0, m4
++    punpcklbw    m1, m4
++    punpcklbw    m2, m4
++    punpcklbw    m3, m4
++    psubw        m0, m2
++    psubw        m1, m3
++    mova [r0+r4+0 ], m0
++    mova [r0+r4+16], m1
++    lea          r1, [r1+r3*2]
++    lea          r2, [r2+r3*2]
++    add          r4, 32
++    jne .loop
++    RET
diff --cc libavcodec/x86/pixblockdsp_init.c
index 00000000000,9582e0b5c28..4c31b802ff1
mode 000000,100644..100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@@ -1,0 -1,47 +1,50 @@@
+ /*
+  * SIMD-optimized pixel operations
+  *
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/x86/cpu.h"
+ #include "libavcodec/pixblockdsp.h"
+ 
+ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
+ void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
+ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         int stride);
++void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
++                         int stride);
+ 
+ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
+                                      AVCodecContext *avctx,
+                                      unsigned high_bit_depth)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (EXTERNAL_MMX(cpu_flags)) {
+         if (!high_bit_depth)
+             c->get_pixels = ff_get_pixels_mmx;
+         c->diff_pixels = ff_diff_pixels_mmx;
+     }
+ 
+     if (EXTERNAL_SSE2(cpu_flags)) {
+         if (!high_bit_depth)
+             c->get_pixels = ff_get_pixels_sse2;
++        c->diff_pixels = ff_diff_pixels_sse2;
+     }
+ }
diff --cc libavfilter/vf_mpdecimate.c
index 099622a0295,00000000000..c667a9f4ccd
mode 100644,000000..100644
--- a/libavfilter/vf_mpdecimate.c
+++ b/libavfilter/vf_mpdecimate.c
@@@ -1,257 -1,0 +1,261 @@@
 +/*
 + * Copyright (c) 2003 Rich Felker
 + * Copyright (c) 2012 Stefano Sabatini
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License along
 + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 + */
 +
 +/**
 + * @file mpdecimate filter, ported from libmpcodecs/vf_decimate.c by
 + * Rich Felker.
 + */
 +
 +#include "libavutil/opt.h"
 +#include "libavutil/pixdesc.h"
 +#include "libavutil/timestamp.h"
 +#include "libavcodec/dsputil.h"
++#include "libavcodec/pixblockdsp.h"
 +#include "avfilter.h"
 +#include "internal.h"
 +#include "formats.h"
 +#include "video.h"
 +
 +/** Private context of the mpdecimate filter. */
 +typedef struct {
 +    const AVClass *class;
 +    int lo, hi;                    ///< lower and higher threshold number of differences
 +                                   ///< values for 8x8 blocks
 +
 +    float frac;                    ///< threshold of changed pixels over the total fraction
 +
 +    int max_drop_count;            ///< if positive: maximum number of sequential frames to drop
 +                                   ///< if negative: minimum number of frames between two drops
 +
 +    int drop_count;                ///< if positive: number of frames sequentially dropped
 +                                   ///< if negative: number of sequential frames which were not dropped
 +
 +    int hsub, vsub;                ///< chroma subsampling values
 +    AVFrame *ref;                  ///< reference picture
 +    DSPContext dspctx;             ///< context providing sum_abs_dctelem()
++    PixblockDSPContext pdsp;       ///< context providing diff_pixels()
 +    AVCodecContext *avctx;         ///< codec context passed to the DSP init functions
 +} DecimateContext;
 +
 +#define OFFSET(x) offsetof(DecimateContext, x)
 +#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
 +
 +static const AVOption mpdecimate_options[] = {
 +    { "max",  "set the maximum number of consecutive dropped frames (positive), or the minimum interval between dropped frames (negative)",
 +      OFFSET(max_drop_count), AV_OPT_TYPE_INT, {.i64=0}, INT_MIN, INT_MAX, FLAGS },
 +    { "hi",   "set high dropping threshold", OFFSET(hi), AV_OPT_TYPE_INT, {.i64=64*12}, INT_MIN, INT_MAX, FLAGS },
 +    { "lo",   "set low dropping threshold", OFFSET(lo), AV_OPT_TYPE_INT, {.i64=64*5}, INT_MIN, INT_MAX, FLAGS },
 +    { "frac", "set fraction dropping threshold",  OFFSET(frac), AV_OPT_TYPE_FLOAT, {.dbl=0.33}, 0, 1, FLAGS },
 +    { NULL }
 +};
 +
 +AVFILTER_DEFINE_CLASS(mpdecimate);
 +
 +/**
 + * Return 1 if the two planes are different, 0 otherwise.
 + *
 + * 8x8 blocks are sampled every 4 pixels in both directions (y starts at 0,
 + * x starts at 8). For each block the difference between cur and ref is taken
 + * with diff_pixels() and reduced to a single score d with sum_abs_dctelem().
 + * The planes count as different as soon as one block scores above the "hi"
 + * threshold, or as soon as more than t blocks have scored above the "lo"
 + * threshold, where t = (w/16)*(h/16)*frac.
 + */
 +static int diff_planes(AVFilterContext *ctx,
 +                       uint8_t *cur, uint8_t *ref, int linesize,
 +                       int w, int h)
 +{
 +    DecimateContext *decimate = ctx->priv;
 +    DSPContext *dspctx = &decimate->dspctx;
++    PixblockDSPContext *pdsp = &decimate->pdsp;
 +
 +    int x, y;
 +    int d, c = 0;                  /* c counts the blocks scoring above lo */
 +    int t = (w/16)*(h/16)*decimate->frac;
 +    int16_t block[8*8];
 +
 +    /* compute difference for blocks of 8x8 bytes */
 +    for (y = 0; y < h-7; y += 4) {
 +        for (x = 8; x < w-7; x += 4) {
-             dspctx->diff_pixels(block,
++            pdsp->diff_pixels(block,
 +                                cur+x+y*linesize,
 +                                ref+x+y*linesize, linesize);
 +            d = dspctx->sum_abs_dctelem(block);
 +            if (d > decimate->hi)
 +                return 1;
 +            if (d > decimate->lo) {
 +                c++;
 +                if (c > t)
 +                    return 1;
 +            }
 +        }
 +    }
 +    return 0;
 +}
 +
 +/**
 + * Tell if the frame should be decimated, for example if it is not much
 + * different with respect to the reference frame ref.
 + *
 + * @return 1 if cur may be dropped, 0 if it must be kept
 + */
 +static int decimate_frame(AVFilterContext *ctx,
 +                          AVFrame *cur, AVFrame *ref)
 +{
 +    DecimateContext *decimate = ctx->priv;
 +    int plane;
 +
 +    /* positive max_drop_count: keep the frame once that many frames were
 +     * dropped in a row */
 +    if (decimate->max_drop_count > 0 &&
 +        decimate->drop_count >= decimate->max_drop_count)
 +        return 0;
 +    /* negative max_drop_count: keep the frame while too few frames have
 +     * been kept since the last drop */
 +    if (decimate->max_drop_count < 0 &&
 +        (decimate->drop_count-1) > decimate->max_drop_count)
 +        return 0;
 +
 +    /* planes 1 and 2 are compared at the chroma-subsampled size; the frame
 +     * is kept as soon as one plane differs */
 +    for (plane = 0; ref->data[plane] && ref->linesize[plane]; plane++) {
 +        int vsub = plane == 1 || plane == 2 ? decimate->vsub : 0;
 +        int hsub = plane == 1 || plane == 2 ? decimate->hsub : 0;
 +        if (diff_planes(ctx,
 +                        cur->data[plane], ref->data[plane], ref->linesize[plane],
 +                        FF_CEIL_RSHIFT(ref->width,  hsub),
 +                        FF_CEIL_RSHIFT(ref->height, vsub)))
 +            return 0;
 +    }
 +
 +    return 1;
 +}
 +
 +/**
 + * Filter init callback: log the configured options, allocate the codec
 + * context and initialize the two DSP contexts with it.
 + *
 + * @return 0 on success, AVERROR(ENOMEM) if the codec context cannot be
 + *         allocated
 + */
 +static av_cold int init(AVFilterContext *ctx)
 +{
 +    DecimateContext *decimate = ctx->priv;
 +
 +    av_log(ctx, AV_LOG_VERBOSE, "max_drop_count:%d hi:%d lo:%d frac:%f\n",
 +           decimate->max_drop_count, decimate->hi, decimate->lo, decimate->frac);
 +
 +    decimate->avctx = avcodec_alloc_context3(NULL);
 +    if (!decimate->avctx)
 +        return AVERROR(ENOMEM);
 +    avpriv_dsputil_init(&decimate->dspctx, decimate->avctx);
++    ff_pixblockdsp_init(&decimate->pdsp, decimate->avctx);
 +
 +    return 0;
 +}
 +
 +/**
 + * Filter uninit callback: release the reference frame and, if it was
 + * allocated, close and free the codec context.
 + */
 +static av_cold void uninit(AVFilterContext *ctx)
 +{
 +    DecimateContext *decimate = ctx->priv;
 +    av_frame_free(&decimate->ref);
 +    if (decimate->avctx) {
 +        avcodec_close(decimate->avctx);
 +        av_freep(&decimate->avctx);
 +    }
 +}
 +
 +/**
 + * Declare the pixel formats accepted by the filter (the planar YUV formats
 + * listed below). Always returns 0.
 + */
 +static int query_formats(AVFilterContext *ctx)
 +{
 +    static const enum AVPixelFormat pix_fmts[] = {
 +        AV_PIX_FMT_YUV444P,      AV_PIX_FMT_YUV422P,
 +        AV_PIX_FMT_YUV420P,      AV_PIX_FMT_YUV411P,
 +        AV_PIX_FMT_YUV410P,      AV_PIX_FMT_YUV440P,
 +        AV_PIX_FMT_YUVJ444P,     AV_PIX_FMT_YUVJ422P,
 +        AV_PIX_FMT_YUVJ420P,     AV_PIX_FMT_YUVJ440P,
 +        AV_PIX_FMT_YUVA420P,
 +        AV_PIX_FMT_NONE
 +    };
 +
 +    ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 +
 +    return 0;
 +}
 +
 +/**
 + * Input link configuration: store the log2 chroma subsampling factors of
 + * the negotiated pixel format in the filter context. Always returns 0.
 + */
 +static int config_input(AVFilterLink *inlink)
 +{
 +    AVFilterContext *ctx = inlink->dst;
 +    DecimateContext *decimate = ctx->priv;
 +    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
 +    decimate->hsub = pix_desc->log2_chroma_w;
 +    decimate->vsub = pix_desc->log2_chroma_h;
 +
 +    return 0;
 +}
 +
 +/**
 + * Input pad callback: decide whether the incoming frame is kept or dropped.
 + *
 + * If a reference frame exists and decimate_frame() reports cur as droppable,
 + * drop_count is raised to at least 1 and cur is freed before returning.
 + * Otherwise cur replaces the reference frame, drop_count is lowered to at
 + * most -1 and a clone of cur is sent to the output link.
 + *
 + * @return 0 on success, or the negative error code returned by
 + *         ff_filter_frame()
 + */
 +static int filter_frame(AVFilterLink *inlink, AVFrame *cur)
 +{
 +    DecimateContext *decimate = inlink->dst->priv;
 +    AVFilterLink *outlink = inlink->dst->outputs[0];
 +    int ret;
 +
 +    if (decimate->ref && decimate_frame(inlink->dst, cur, decimate->ref)) {
 +        decimate->drop_count = FFMAX(1, decimate->drop_count+1);
 +    } else {
 +        av_frame_free(&decimate->ref);
 +        decimate->ref = cur;
 +        decimate->drop_count = FFMIN(-1, decimate->drop_count-1);
 +
 +        /* The assignment has to be parenthesised: "<" binds tighter than
 +         * "=", so without the parentheses ret would receive the 0/1 result
 +         * of the comparison and a failure would be returned as 1 instead of
 +         * the negative error code. */
 +        if ((ret = ff_filter_frame(outlink, av_frame_clone(cur))) < 0)
 +            return ret;
 +    }
 +
 +    av_log(inlink->dst, AV_LOG_DEBUG,
 +           "%s pts:%s pts_time:%s drop_count:%d\n",
 +           decimate->drop_count > 0 ? "drop" : "keep",
 +           av_ts2str(cur->pts), av_ts2timestr(cur->pts, &inlink->time_base),
 +           decimate->drop_count);
 +
 +    /* a kept frame is owned by decimate->ref; only a dropped one is freed */
 +    if (decimate->drop_count > 0)
 +        av_frame_free(&cur);
 +
 +    return 0;
 +}
 +
 +/**
 + * Output pad callback: request frames from the input until a request fails
 + * or drop_count is no longer positive, i.e. until the last processed frame
 + * was not dropped.
 + *
 + * @return the result of the last ff_request_frame() call
 + */
 +static int request_frame(AVFilterLink *outlink)
 +{
 +    DecimateContext *decimate = outlink->src->priv;
 +    AVFilterLink *inlink = outlink->src->inputs[0];
 +    int ret;
 +
 +    do {
 +        ret = ff_request_frame(inlink);
 +    } while (decimate->drop_count > 0 && ret >= 0);
 +
 +    return ret;
 +}
 +
 +static const AVFilterPad mpdecimate_inputs[] = {
 +    {
 +        .name         = "default",
 +        .type         = AVMEDIA_TYPE_VIDEO,
 +        .config_props = config_input,
 +        .filter_frame = filter_frame,
 +    },
 +    { NULL }
 +};
 +
 +static const AVFilterPad mpdecimate_outputs[] = {
 +    {
 +        .name          = "default",
 +        .type          = AVMEDIA_TYPE_VIDEO,
 +        .request_frame = request_frame,
 +    },
 +    { NULL }
 +};
 +
 +AVFilter ff_vf_mpdecimate = {
 +    .name          = "mpdecimate",
 +    .description   = NULL_IF_CONFIG_SMALL("Remove near-duplicate frames."),
 +    .init          = init,
 +    .uninit        = uninit,
 +    .priv_size     = sizeof(DecimateContext),
 +    .priv_class    = &mpdecimate_class,
 +    .query_formats = query_formats,
 +    .inputs        = mpdecimate_inputs,
 +    .outputs       = mpdecimate_outputs,
 +};
diff --cc libavfilter/vf_spp.c
index 9df87ff3f26,00000000000..4e4a5795f47
mode 100644,000000..100644
--- a/libavfilter/vf_spp.c
+++ b/libavfilter/vf_spp.c
@@@ -1,439 -1,0 +1,439 @@@
 +/*
 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 + * Copyright (c) 2013 Clément Bœsch <u pkh me>
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License along
 + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 + */
 +
 +/**
 + * @file
 + * Simple post processing filter
 + *
 + * This implementation is based on an algorithm described in
 + * "Aria Nosratinia Embedded Post-Processing for
 + * Enhancement of Compressed Images (1999)"
 + *
 + * Originally written by Michael Niedermayer for the MPlayer project, and
 + * ported by Clément Bœsch for FFmpeg.
 + */
 +
 +#include "libavcodec/dsputil.h"
 +#include "libavutil/avassert.h"
 +#include "libavutil/imgutils.h"
 +#include "libavutil/opt.h"
 +#include "libavutil/pixdesc.h"
 +#include "internal.h"
 +#include "vf_spp.h"
 +
 +enum mode {
 +    MODE_HARD,
 +    MODE_SOFT,
 +    NB_MODES
 +};
 +
 +#define OFFSET(x) offsetof(SPPContext, x)
 +#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 +static const AVOption spp_options[] = {
 +    { "quality", "set quality", OFFSET(log2_count), AV_OPT_TYPE_INT, {.i64 = 3}, 0, MAX_LEVEL, FLAGS },
 +    { "qp", "force a constant quantizer parameter", OFFSET(qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 63, FLAGS },
 +    { "mode", "set thresholding mode", OFFSET(mode), AV_OPT_TYPE_INT, {.i64 = MODE_HARD}, 0, NB_MODES - 1, FLAGS, "mode" },
 +        { "hard", "hard thresholding", 0, AV_OPT_TYPE_CONST, {.i64 = MODE_HARD}, INT_MIN, INT_MAX, FLAGS, "mode" },
 +        { "soft", "soft thresholding", 0, AV_OPT_TYPE_CONST, {.i64 = MODE_SOFT}, INT_MIN, INT_MAX, FLAGS, "mode" },
 +    { "use_bframe_qp", "use B-frames' QP", OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0}, 0, 1, FLAGS },
 +    { NULL }
 +};
 +
 +AVFILTER_DEFINE_CLASS(spp);
 +
 +// XXX: share between filters?
 +DECLARE_ALIGNED(8, static const uint8_t, ldither)[8][8] = {
 +    {  0,  48,  12,  60,   3,  51,  15,  63 },
 +    { 32,  16,  44,  28,  35,  19,  47,  31 },
 +    {  8,  56,   4,  52,  11,  59,   7,  55 },
 +    { 40,  24,  36,  20,  43,  27,  39,  23 },
 +    {  2,  50,  14,  62,   1,  49,  13,  61 },
 +    { 34,  18,  46,  30,  33,  17,  45,  29 },
 +    { 10,  58,   6,  54,   9,  57,   5,  53 },
 +    { 42,  26,  38,  22,  41,  25,  37,  21 },
 +};
 +
 +/**
 + * Block shifts applied by filter() for each quality level; filter() adds
 + * element [0] to x and element [1] to y.
 + *
 + * filter() reads the entries offset[count - 1] .. offset[2*count - 2] with
 + * count = 1 << log2_count, so the group for quality q starts at index
 + * (1 << q) - 1 and holds 1 << q entries (1+2+4+8+16+32+64 = 127 in total).
 + */
 +static const uint8_t offset[127][2] = {
 +    {0,0},
 +    {0,0}, {4,4},                                           // quality = 1
 +    {0,0}, {2,2}, {6,4}, {4,6},                             // quality = 2
 +    {0,0}, {5,1}, {2,2}, {7,3}, {4,4}, {1,5}, {6,6}, {3,7}, // quality = 3
 +
 +    {0,0}, {4,0}, {1,1}, {5,1}, {3,2}, {7,2}, {2,3}, {6,3}, // quality = 4
 +    {0,4}, {4,4}, {1,5}, {5,5}, {3,6}, {7,6}, {2,7}, {6,7},
 +
 +    {0,0}, {0,2}, {0,4}, {0,6}, {1,1}, {1,3}, {1,5}, {1,7}, // quality = 5
 +    {2,0}, {2,2}, {2,4}, {2,6}, {3,1}, {3,3}, {3,5}, {3,7},
 +    {4,0}, {4,2}, {4,4}, {4,6}, {5,1}, {5,3}, {5,5}, {5,7},
 +    {6,0}, {6,2}, {6,4}, {6,6}, {7,1}, {7,3}, {7,5}, {7,7},
 +
 +    {0,0}, {4,4}, {0,4}, {4,0}, {2,2}, {6,6}, {2,6}, {6,2}, // quality = 6
 +    {0,2}, {4,6}, {0,6}, {4,2}, {2,0}, {6,4}, {2,4}, {6,0},
 +    {1,1}, {5,5}, {1,5}, {5,1}, {3,3}, {7,7}, {3,7}, {7,3},
 +    {1,3}, {5,7}, {1,7}, {5,3}, {3,1}, {7,5}, {3,5}, {7,1},
 +    {0,1}, {4,5}, {0,5}, {4,1}, {2,3}, {6,7}, {2,7}, {6,3},
 +    {0,3}, {4,7}, {0,7}, {4,3}, {2,1}, {6,5}, {2,5}, {6,1},
 +    {1,0}, {5,4}, {1,4}, {5,0}, {3,2}, {7,6}, {3,6}, {7,2},
 +    {1,2}, {5,6}, {1,6}, {5,2}, {3,0}, {7,4}, {3,4}, {7,0},
 +};
 +
 +/**
 + * Hard-threshold the 64 coefficients of src into dst.
 + *
 + * dst is zeroed first. Coefficient 0 is always kept. Every other coefficient
 + * is kept only if it lies outside [-threshold1, threshold1], with
 + * threshold1 = qp * 16 - 1 while bias is 0. Kept values are rounded and
 + * shifted right by 3; those with i >= 1 are written at permutation[i].
 + */
 +static void hardthresh_c(int16_t dst[64], const int16_t src[64],
 +                         int qp, const uint8_t *permutation)
 +{
 +    int i;
 +    int bias = 0; // FIXME
 +
 +    unsigned threshold1 = qp * ((1<<4) - bias) - 1;
 +    unsigned threshold2 = threshold1 << 1;
 +
 +    memset(dst, 0, 64 * sizeof(dst[0]));
 +    dst[0] = (src[0] + 4) >> 3;
 +
 +    for (i = 1; i < 64; i++) {
 +        int level = src[i];
 +        /* one unsigned compare covers both signs: a level below -threshold1
 +         * wraps around to a large unsigned value */
 +        if (((unsigned)(level + threshold1)) > threshold2) {
 +            const int j = permutation[i];
 +            dst[j] = (level + 4) >> 3;
 +        }
 +    }
 +}
 +
 +/**
 + * Soft-threshold the 64 coefficients of src into dst.
 + *
 + * Uses the same selection test as hardthresh_c(), but a kept coefficient
 + * with i >= 1 additionally has threshold1 subtracted (positive level) or
 + * added (otherwise) before it is rounded and shifted right by 3.
 + * Coefficient 0 is only rounded and shifted.
 + */
 +static void softthresh_c(int16_t dst[64], const int16_t src[64],
 +                         int qp, const uint8_t *permutation)
 +{
 +    int i;
 +    int bias = 0; //FIXME
 +
 +    unsigned threshold1 = qp * ((1<<4) - bias) - 1;
 +    unsigned threshold2 = threshold1 << 1;
 +
 +    memset(dst, 0, 64 * sizeof(dst[0]));
 +    dst[0] = (src[0] + 4) >> 3;
 +
 +    for (i = 1; i < 64; i++) {
 +        int level = src[i];
 +        if (((unsigned)(level + threshold1)) > threshold2) {
 +            const int j = permutation[i];
 +            /* NOTE(review): these expressions are evaluated as unsigned
 +             * because threshold1 is unsigned; the result is then narrowed
 +             * to int16_t on assignment */
 +            if (level > 0) dst[j] = (level - threshold1 + 4) >> 3;
 +            else           dst[j] = (level + threshold1 + 4) >> 3;
 +        }
 +    }
 +}
 +
 +/**
 + * Convert a slice of accumulated 16-bit samples to 8-bit pixels.
 + *
 + * Each sample is shifted left by log2_scale, offset by the dither value for
 + * its row and column position within the group of 8, and shifted right by 6.
 + * Pixels are written in groups of 8, so the number of columns written per
 + * row is width rounded up to a multiple of 8.
 + */
 +static void store_slice_c(uint8_t *dst, const int16_t *src,
 +                          int dst_linesize, int src_linesize,
 +                          int width, int height, int log2_scale,
 +                          const uint8_t dither[8][8])
 +{
 +    int y, x;
 +
 +/* When bit 8 of the result is set, the value is replaced by ~(temp >> 31)
 + * before it is stored. NOTE(review): this presumably saturates to 0 for
 + * negative and 255 for too large values, assuming an arithmetic right shift
 + * of negative ints -- confirm. */
 +#define STORE(pos) do {                                                     \
 +    temp = ((src[x + y*src_linesize + pos] << log2_scale) + d[pos]) >> 6;   \
 +    if (temp & 0x100)                                                       \
 +        temp = ~(temp >> 31);                                               \
 +    dst[x + y*dst_linesize + pos] = temp;                                   \
 +} while (0)
 +
 +    for (y = 0; y < height; y++) {
 +        const uint8_t *d = dither[y];
 +        for (x = 0; x < width; x += 8) {
 +            int temp;
 +            STORE(0);
 +            STORE(1);
 +            STORE(2);
 +            STORE(3);
 +            STORE(4);
 +            STORE(5);
 +            STORE(6);
 +            STORE(7);
 +        }
 +    }
 +}
 +
 +/**
 + * Add an 8x8 block of int16_t values to dst, whose rows are linesize
 + * elements apart. Each row is processed with four 32-bit additions, every
 + * one of them covering two adjacent int16_t elements.
 + */
 +static inline void add_block(int16_t *dst, int linesize, const int16_t block[64])
 +{
 +    int y;
 +
 +    for (y = 0; y < 8; y++) {
 +        *(uint32_t *)&dst[0 + y*linesize] += *(uint32_t *)&block[0 + y*8];
 +        *(uint32_t *)&dst[2 + y*linesize] += *(uint32_t *)&block[2 + y*8];
 +        *(uint32_t *)&dst[4 + y*linesize] += *(uint32_t *)&block[4 + y*8];
 +        *(uint32_t *)&dst[6 + y*linesize] += *(uint32_t *)&block[6 + y*8];
 +    }
 +}
 +
 +// XXX: export the function?
 +/**
 + * Map a qscale value according to its FF_QSCALE_TYPE_*: MPEG1 values are
 + * returned unchanged, MPEG2 values are shifted right by 1, H264 values by 2,
 + * and VP56 values become (63 - qscale + 2) >> 2. Any other type returns
 + * qscale unchanged.
 + */
 +static inline int norm_qscale(int qscale, int type)
 +{
 +    switch (type) {
 +    case FF_QSCALE_TYPE_MPEG1: return qscale;
 +    case FF_QSCALE_TYPE_MPEG2: return qscale >> 1;
 +    case FF_QSCALE_TYPE_H264:  return qscale >> 2;
 +    case FF_QSCALE_TYPE_VP56:  return (63 - qscale + 2) >> 2;
 +    }
 +    return qscale;
 +}
 +
 +/**
 + * Filter one plane from src into dst.
 + *
 + * The plane is first copied into p->src with an 8-pixel mirrored border on
 + * every side. Then, for each 8x8 position and each of the
 + * count = 1 << log2_count shifts taken from offset[], a block is read with
 + * get_pixels(), transformed with fdct(), thresholded with p->requantize(),
 + * transformed back with idct() and accumulated into p->temp. Once a band of
 + * 8 rows is complete, the previous band is written to dst through
 + * p->store_slice().
 + *
 + * qp_table is only read when p->qp is 0; it is indexed at a granularity of
 + * 16 pixels for luma and 8 pixels for the other planes.
 + */
 +static void filter(SPPContext *p, uint8_t *dst, uint8_t *src,
 +                   int dst_linesize, int src_linesize, int width, int height,
 +                   const uint8_t *qp_table, int qp_stride, int is_luma)
 +{
 +    int x, y, i;
 +    const int count = 1 << p->log2_count;
 +    const int linesize = is_luma ? p->temp_linesize : FFALIGN(width+16, 16);
 +    DECLARE_ALIGNED(16, uint64_t, block_align)[32];
 +    int16_t *block  = (int16_t *)block_align;
 +    int16_t *block2 = (int16_t *)(block_align + 16);
 +
 +    /* copy the rows and mirror 8 pixels to the left and right of each */
 +    for (y = 0; y < height; y++) {
 +        int index = 8 + 8*linesize + y*linesize;
 +        memcpy(p->src + index, src + y*src_linesize, width);
 +        for (x = 0; x < 8; x++) {
 +            p->src[index         - x - 1] = p->src[index +         x    ];
 +            p->src[index + width + x    ] = p->src[index + width - x - 1];
 +        }
 +    }
 +    /* mirror 8 rows above and below the picture */
 +    for (y = 0; y < 8; y++) {
 +        memcpy(p->src + (       7-y)*linesize, p->src + (       y+8)*linesize, linesize);
 +        memcpy(p->src + (height+8+y)*linesize, p->src + (height-y+7)*linesize, linesize);
 +    }
 +
 +    for (y = 0; y < height + 8; y += 8) {
 +        memset(p->temp + (8 + y) * linesize, 0, 8 * linesize * sizeof(*p->temp));
 +        for (x = 0; x < width + 8; x += 8) {
 +            int qp;
 +
 +            if (p->qp) {
 +                qp = p->qp;
 +            } else{
 +                const int qps = 3 + is_luma;
 +                qp = qp_table[(FFMIN(x, width - 1) >> qps) + (FFMIN(y, height - 1) >> qps) * qp_stride];
 +                qp = FFMAX(1, norm_qscale(qp, p->qscale_type));
 +            }
 +            for (i = 0; i < count; i++) {
 +                const int x1 = x + offset[i + count - 1][0];
 +                const int y1 = y + offset[i + count - 1][1];
 +                const int index = x1 + y1*linesize;
-                 p->dsp.get_pixels(block, p->src + index, linesize);
++                p->pdsp.get_pixels(block, p->src + index, linesize);
 +                p->fdsp.fdct(block);
 +                p->requantize(block2, block, qp, p->idsp.idct_permutation);
 +                p->idsp.idct(block2);
 +                add_block(p->temp + index, linesize, block2);
 +            }
 +        }
 +        /* the band above the current one is now complete: output it */
 +        if (y)
 +            p->store_slice(dst + (y - 8) * dst_linesize, p->temp + 8 + y*linesize,
 +                           dst_linesize, linesize, width,
 +                           FFMIN(8, height + 8 - y), MAX_LEVEL - p->log2_count,
 +                           ldither);
 +    }
 +}
 +
 +/**
 + * Declare the pixel formats accepted by the filter (the planar YUV formats
 + * listed below). Always returns 0.
 + */
 +static int query_formats(AVFilterContext *ctx)
 +{
 +    /* enum AVPixelFormat is the current name of the type; the old
 +     * "enum PixelFormat" spelling is a deprecated alias */
 +    static const enum AVPixelFormat pix_fmts[] = {
 +        AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 +        AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 +        AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 +        AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 +        AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 +        AV_PIX_FMT_NONE
 +    };
 +    ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 +    return 0;
 +}
 +
 +/**
 + * Input link configuration: store the chroma subsampling shifts and
 + * allocate the work buffers.
 + *
 + * temp and src hold temp_linesize * h elements each, where both dimensions
 + * are the input size plus 16, aligned up to 16. Unless use_bframe_qp is set,
 + * a zeroed QP table with one entry per 16x16 block is allocated as well.
 + *
 + * @return 0 on success, AVERROR(ENOMEM) if an allocation fails
 + */
 +static int config_input(AVFilterLink *inlink)
 +{
 +    SPPContext *spp = inlink->dst->priv;
 +    const int h = FFALIGN(inlink->h + 16, 16);
 +    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 +
 +    spp->hsub = desc->log2_chroma_w;
 +    spp->vsub = desc->log2_chroma_h;
 +    spp->temp_linesize = FFALIGN(inlink->w + 16, 16);
 +    spp->temp = av_malloc_array(spp->temp_linesize, h * sizeof(*spp->temp));
 +    spp->src  = av_malloc_array(spp->temp_linesize, h * sizeof(*spp->src));
 +    if (!spp->use_bframe_qp) {
 +        /* we are assuming here the qp blocks will not be smaller than 16x16 */
 +        spp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
 +        spp->non_b_qp_table = av_calloc(spp->non_b_qp_alloc_size, sizeof(*spp->non_b_qp_table));
 +        if (!spp->non_b_qp_table)
 +            return AVERROR(ENOMEM);
 +    }
 +    if (!spp->temp || !spp->src)
 +        return AVERROR(ENOMEM);
 +    return 0;
 +}
 +
 +/**
 + * Input pad callback: post-process one frame and send it downstream.
 + *
 + * The three planes 0..2 are filtered when log2_count is non-zero, the filter
 + * is not disabled and either a QP table or a forced qp is available. The
 + * filtering is done in place unless the input is not writable or its
 + * dimensions are not multiples of 8, in which case a new output frame is
 + * requested. When a new frame is used and the input has a plane 3, that
 + * plane is copied unchanged.
 + *
 + * @return the result of ff_filter_frame(), or AVERROR(ENOMEM) if the output
 + *         frame cannot be allocated
 + */
 +static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 +{
 +    AVFilterContext *ctx = inlink->dst;
 +    SPPContext *spp = ctx->priv;
 +    AVFilterLink *outlink = ctx->outputs[0];
 +    AVFrame *out = in;
 +    int qp_stride = 0;
 +    const int8_t *qp_table = NULL;
 +
 +    /* if we are not in a constant user quantizer mode and we don't want to use
 +     * the quantizers from the B-frames (B-frames often have a higher QP), we
 +     * need to save the qp table from the last non B-frame; this is what the
 +     * following code block does */
 +    if (!spp->qp) {
 +        qp_table = av_frame_get_qp_table(in, &qp_stride, &spp->qscale_type);
 +
 +        if (qp_table && !spp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 +            int w, h;
 +
 +            /* if the qp stride is not set, it means the QP are only defined on
 +             * a line basis */
 +            if (!qp_stride) {
 +                w = FF_CEIL_RSHIFT(inlink->w, 4);
 +                h = 1;
 +            } else {
 +                w = FF_CEIL_RSHIFT(qp_stride, 4);
 +                h = FF_CEIL_RSHIFT(inlink->h, 4);
 +            }
 +            av_assert0(w * h <= spp->non_b_qp_alloc_size);
 +            memcpy(spp->non_b_qp_table, qp_table, w * h);
 +        }
 +    }
 +
 +    if (spp->log2_count && !ctx->is_disabled) {
 +        /* prefer the table saved from the last non B-frame */
 +        if (!spp->use_bframe_qp && spp->non_b_qp_table)
 +            qp_table = spp->non_b_qp_table;
 +
 +        if (qp_table || spp->qp) {
 +            const int cw = FF_CEIL_RSHIFT(inlink->w, spp->hsub);
 +            const int ch = FF_CEIL_RSHIFT(inlink->h, spp->vsub);
 +
 +            /* get a new frame if in-place is not possible or if the dimensions
 +             * are not multiple of 8 */
 +            if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 +                const int aligned_w = FFALIGN(inlink->w, 8);
 +                const int aligned_h = FFALIGN(inlink->h, 8);
 +
 +                out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 +                if (!out) {
 +                    av_frame_free(&in);
 +                    return AVERROR(ENOMEM);
 +                }
 +                av_frame_copy_props(out, in);
 +                out->width  = in->width;
 +                out->height = in->height;
 +            }
 +
 +            filter(spp, out->data[0], in->data[0], out->linesize[0], in->linesize[0], inlink->w, inlink->h, qp_table, qp_stride, 1);
 +            filter(spp, out->data[1], in->data[1], out->linesize[1], in->linesize[1], cw,        ch,        qp_table, qp_stride, 0);
 +            filter(spp, out->data[2], in->data[2], out->linesize[2], in->linesize[2], cw,        ch,        qp_table, qp_stride, 0);
 +            emms_c();
 +        }
 +    }
 +
 +    if (in != out) {
 +        if (in->data[3])
 +            av_image_copy_plane(out->data[3], out->linesize[3],
 +                                in ->data[3], in ->linesize[3],
 +                                inlink->w, inlink->h);
 +        av_frame_free(&in);
 +    }
 +    return ff_filter_frame(outlink, out);
 +}
 +
 +/**
 + * Runtime command handler. Only the "level" command is handled: the
 + * argument "max" selects MAX_LEVEL, any other argument is parsed as a
 + * base-10 integer and clipped to [0, MAX_LEVEL].
 + *
 + * @return 0 for "level", AVERROR(ENOSYS) for any other command
 + */
 +static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
 +                           char *res, int res_len, int flags)
 +{
 +    SPPContext *spp = ctx->priv;
 +
 +    if (!strcmp(cmd, "level")) {
 +        if (!strcmp(args, "max"))
 +            spp->log2_count = MAX_LEVEL;
 +        else
 +            spp->log2_count = av_clip(strtol(args, NULL, 10), 0, MAX_LEVEL);
 +        return 0;
 +    }
 +    return AVERROR(ENOSYS);
 +}
 +
 +/**
 + * Filter init callback: allocate the codec context, initialize the IDCT,
 + * FDCT and pixel-block DSP contexts with it, and select the C store_slice
 + * and requantize functions according to the mode. On x86 builds
 + * ff_spp_init_x86() is called afterwards.
 + *
 + * @return 0 on success, AVERROR(ENOMEM) if the codec context cannot be
 + *         allocated
 + */
 +static av_cold int init(AVFilterContext *ctx)
 +{
 +    SPPContext *spp = ctx->priv;
 +
 +    spp->avctx = avcodec_alloc_context3(NULL);
 +    if (!spp->avctx)
 +        return AVERROR(ENOMEM);
-     avpriv_dsputil_init(&spp->dsp, spp->avctx);
 +    ff_idctdsp_init(&spp->idsp, spp->avctx);
 +    ff_fdctdsp_init(&spp->fdsp, spp->avctx);
++    ff_pixblockdsp_init(&spp->pdsp, spp->avctx);
 +    spp->store_slice = store_slice_c;
 +    switch (spp->mode) {
 +    case MODE_HARD: spp->requantize = hardthresh_c; break;
 +    case MODE_SOFT: spp->requantize = softthresh_c; break;
 +    }
 +    if (ARCH_X86)
 +        ff_spp_init_x86(spp);
 +    return 0;
 +}
 +
 +/**
 + * Filter uninit callback: free the work buffers, the codec context (if it
 + * was allocated) and the saved QP table.
 + */
 +static av_cold void uninit(AVFilterContext *ctx)
 +{
 +    SPPContext *spp = ctx->priv;
 +
 +    av_freep(&spp->temp);
 +    av_freep(&spp->src);
 +    if (spp->avctx) {
 +        avcodec_close(spp->avctx);
 +        av_freep(&spp->avctx);
 +    }
 +    av_freep(&spp->non_b_qp_table);
 +}
 +
 +static const AVFilterPad spp_inputs[] = {
 +    {
 +        .name         = "default",
 +        .type         = AVMEDIA_TYPE_VIDEO,
 +        .config_props = config_input,
 +        .filter_frame = filter_frame,
 +    },
 +    { NULL }
 +};
 +
 +static const AVFilterPad spp_outputs[] = {
 +    {
 +        .name = "default",
 +        .type = AVMEDIA_TYPE_VIDEO,
 +    },
 +    { NULL }
 +};
 +
 +AVFilter ff_vf_spp = {
 +    .name            = "spp",
 +    .description     = NULL_IF_CONFIG_SMALL("Apply a simple post processing filter."),
 +    .priv_size       = sizeof(SPPContext),
 +    .init            = init,
 +    .uninit          = uninit,
 +    .query_formats   = query_formats,
 +    .inputs          = spp_inputs,
 +    .outputs         = spp_outputs,
 +    .process_command = process_command,
 +    .priv_class      = &spp_class,
 +    .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
 +};
diff --cc libavfilter/vf_spp.h
index 909d4de8122,00000000000..c8eac3caf29
mode 100644,000000..100644
--- a/libavfilter/vf_spp.h
+++ b/libavfilter/vf_spp.h
@@@ -1,63 -1,0 +1,63 @@@
 +/*
 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 + * Copyright (c) 2013 Clément Bœsch
 + *
 + * This file is part of FFmpeg.
 + *
 + * FFmpeg is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License along
 + * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 + */
 +
 +#ifndef AVFILTER_SPP_H
 +#define AVFILTER_SPP_H
 +
 +#include "libavcodec/avcodec.h"
- #include "libavcodec/dsputil.h"
++#include "libavcodec/pixblockdsp.h"
 +#include "libavcodec/idctdsp.h"
 +#include "libavcodec/fdctdsp.h"
 +#include "avfilter.h"
 +
 +#define MAX_LEVEL 6 /* quality levels */
 +
 +/** Private context of the spp filter. */
 +typedef struct {
 +    const AVClass *av_class;
 +
 +    int log2_count;             ///< "quality" option; 1 << log2_count shifts are filtered per block
 +    int qp;                     ///< "qp" option; when non-zero it is used instead of the QP table
 +    int mode;                   ///< "mode" option, selects the requantize function
 +    int qscale_type;            ///< type of the values in the frame QP table
 +    int temp_linesize;          ///< line size, in elements, of src and temp for the luma plane
 +    uint8_t *src;               ///< copy of the plane being filtered, with a mirrored border
 +    int16_t *temp;              ///< accumulator for the filtered blocks
 +    AVCodecContext *avctx;      ///< codec context passed to the DSP init functions
-     DSPContext dsp;
 +    IDCTDSPContext idsp;
 +    FDCTDSPContext fdsp;
++    PixblockDSPContext pdsp;    ///< provides get_pixels()
 +    int8_t *non_b_qp_table;     ///< QP table saved from the last non B-frame
 +    int non_b_qp_alloc_size;    ///< number of entries allocated in non_b_qp_table
 +    int use_bframe_qp;          ///< "use_bframe_qp" option
 +    int hsub, vsub;             ///< log2 chroma subsampling of the input format
 +
 +    void (*store_slice)(uint8_t *dst, const int16_t *src,
 +                        int dst_stride, int src_stride,
 +                        int width, int height, int log2_scale,
 +                        const uint8_t dither[8][8]);
 +
 +    void (*requantize)(int16_t dst[64], const int16_t src[64],
 +                       int qp, const uint8_t *permutation);
 +} SPPContext;
 +
 +void ff_spp_init_x86(SPPContext *s);
 +
 +#endif /* AVFILTER_SPP_H */