Solaris: use sysconf to get processor count

[x264] / common / x86 / mc-a2.asm
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 90f31e90c97ef7f568210471836e4572047096dc..e5eab350ac8db17ae28f91236bb86833ac76a506 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1,7 +1,7 @@
  ;*****************************************************************************
  ;* mc-a2.asm: x86 motion compensation
  ;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
+;* Copyright (C) 2005-2012 x264 project
  ;*
  ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  ;*          Fiona Glaser <fiona@x264.com>
@@ -37,6 +37,13 @@ filt_mul15: times 8 db 1, -5
  filt_mul51: times 8 db -5, 1
  hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
  deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+%if HIGH_BIT_DEPTH
+deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
+deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
+%else
+deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
+deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
+%endif
  
  pd_16: times 4 dd 16
  pd_0f: times 4 dd 0xffff
@@ -146,16 +153,13 @@ cextern pd_ffff
  ;%define movntps movaps
  ;%define sfence
  
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
  ;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
  ;-----------------------------------------------------------------------------
  %macro HPEL_FILTER 0
  cglobal hpel_filter_v, 5,6,11
-    FIX_STRIDES r3d, r4d
-%ifdef WIN64
-    movsxd     r4, r4d
-%endif
+    FIX_STRIDES r3, r4
      lea        r5, [r1+r3]
      sub        r1, r3
      sub        r1, r3
@@ -172,7 +176,7 @@ cglobal hpel_filter_v, 5,6,11
      %define s30 [pad30]
  %endif
      add        r0, r4
-    lea        r2, [r2+r4]
+    add        r2, r4
      neg        r4
      mova       m7, [pw_pixel_max]
      pxor       m0, m0
@@ -209,12 +213,12 @@ cglobal hpel_filter_v, 5,6,11
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_c, 3,3,10
      add        r2, r2
      add        r0, r2
-    lea        r1, [r1+r2]
+    add        r1, r2
      neg        r2
      mova       m0, [tap1]
      mova       m7, [tap3]
@@ -258,7 +262,7 @@ cglobal hpel_filter_c, 3,3,10
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
+; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_h, 3,4,8
      %define src r1+r2
@@ -307,15 +311,12 @@ INIT_XMM sse2
  HPEL_FILTER
  %endif ; HIGH_BIT_DEPTH
  
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
  %macro HPEL_V 1
  ;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_v, 5,6,%1
-%ifdef WIN64
-    movsxd   r4, r4d
-%endif
      lea r5, [r1+r3]
      sub r1, r3
      sub r1, r3
@@ -368,7 +369,7 @@ cglobal hpel_filter_v, 5,6,%1
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
  ;-----------------------------------------------------------------------------
  INIT_MMX
  cglobal hpel_filter_c_mmx2, 3,3
@@ -398,7 +399,7 @@ cglobal hpel_filter_c_mmx2, 3,3
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_h_mmx2, 3,3
      add r0, r2
@@ -445,7 +446,7 @@ INIT_XMM
  
  %macro HPEL_C 0
  ;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_c, 3,3,9
      add r0, r2
@@ -455,12 +456,13 @@ cglobal hpel_filter_c, 3,3,9
  %ifnidn cpuname, sse2
      mova    m7, [pw_32]
      %define tpw_32 m7
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
      mova    m8, [pw_32]
      %define tpw_32 m8
  %else
      %define tpw_32 [pw_32]
  %endif
+; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
  %if cpuflag(misalign)
  .loop:
      movu    m4, [src-4]
@@ -512,7 +514,7 @@ cglobal hpel_filter_c, 3,3,9
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  ;-----------------------------------------------------------------------------
  cglobal hpel_filter_h_sse2, 3,3,8
      add r0, r2
@@ -559,11 +561,11 @@ cglobal hpel_filter_h_sse2, 3,3,8
      jl .loop
      REP_RET
  
-%ifndef ARCH_X86_64
  ;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  ;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_ssse3, 3,3
+%macro HPEL_H 0
+cglobal hpel_filter_h, 3,3
      add r0, r2
      add r1, r2
      neg r2
@@ -573,6 +575,9 @@ cglobal hpel_filter_h_ssse3, 3,3
      mova      m7, [pw_16]
  .loop:
      mova      m2, [src+16]
+    ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
+    ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
+    ; the repeated loads of constants for pmaddubsw.
      palignr   m3, m1, m0, 14
      palignr   m4, m1, m0, 15
      palignr   m0, m2, m1, 2
@@ -596,7 +601,7 @@ cglobal hpel_filter_h_ssse3, 3,3
      add r2, 16
      jl .loop
      REP_RET
-%endif ; !ARCH_X86_64
+%endmacro
  
  INIT_MMX mmx2
  HPEL_V 0
@@ -604,18 +609,20 @@ INIT_XMM sse2
  HPEL_V 8
  INIT_XMM sse2, misalign
  HPEL_C
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
  INIT_XMM sse2
  HPEL_C
  INIT_XMM ssse3
  HPEL_C
  HPEL_V 0
+HPEL_H
  INIT_XMM avx
  HPEL_C
  HPEL_V 0
+HPEL_H
  %endif
  
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
  %macro DO_FILT_V 5
      ;The optimum prefetch distance is difficult to determine in checkasm:
      ;any prefetch seems slower than not prefetching.
@@ -660,7 +667,7 @@ HPEL_V 0
      mova      %1, m1
      mova      %2, m4
      FILT_PACK m1, m4, 5, m15
-    movntps  [r11+r4+%5], m1
+    movntps  [r8+r4+%5], m1
  %endmacro
  
  %macro FILT_C 4
@@ -726,28 +733,24 @@ HPEL_V 0
  %macro HPEL 0
  ;-----------------------------------------------------------------------------
  ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                   uint8_t *src, int stride, int width, int height)
+;                   uint8_t *src, intptr_t stride, int width, int height )
  ;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,7,16
-%ifdef WIN64
-    movsxd   r4, r4d
-    movsxd   r5, r5d
-%endif
-    mov      r10, r3
-    sub       r5, 16
-    mov      r11, r1
-    and      r10, 15
-    sub       r3, r10
+cglobal hpel_filter, 7,9,16
+    mov       r7, r3
+    sub      r5d, 16
+    mov       r8, r1
+    and       r7, 15
+    sub       r3, r7
      add       r0, r5
-    add      r11, r5
-    add      r10, r5
+    add       r8, r5
+    add       r7, r5
      add       r5, r2
      mov       r2, r4
-    neg      r10
+    neg       r7
      lea       r1, [r3+r2]
      sub       r3, r2
      sub       r3, r2
-    mov       r4, r10
+    mov       r4, r7
      mova     m15, [pw_16]
  %if cpuflag(ssse3)
      mova      m0, [filt_mul51]
@@ -774,14 +777,14 @@ cglobal hpel_filter, 7,7,16
      cmp      r4, 16
      jl .lastx
  ; setup regs for next y
-    sub      r4, r10
+    sub      r4, r7
      sub      r4, r2
      sub      r1, r4
      sub      r3, r4
      add      r0, r2
-    add     r11, r2
+    add      r8, r2
      add      r5, r2
-    mov      r4, r10
+    mov      r4, r7
      sub     r6d, 1
      jg .loopy
      sfence
@@ -802,21 +805,20 @@ HPEL
  %endif ; !HIGH_BIT_DEPTH
  
  ;-----------------------------------------------------------------------------
-; void plane_copy_core( pixel *dst, int i_dst,
-;                       pixel *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, intptr_t i_dst,
+;                       pixel *src, intptr_t i_src, int w, int h )
  ;-----------------------------------------------------------------------------
  ; assumes i_dst and w are multiples of 16, and i_dst>w
  INIT_MMX
  cglobal plane_copy_core_mmx2, 6,7
-    FIX_STRIDES r1d, r3d, r4d
-    movsxdifnidn r1, r1d
-    movsxdifnidn r3, r3d
+    FIX_STRIDES r1, r3, r4d
+%if HIGH_BIT_DEPTH == 0
      movsxdifnidn r4, r4d
+%endif
      sub    r1,  r4
      sub    r3,  r4
  .loopy:
-    mov    r6d, r4d
-    sub    r6d, 63
+    lea   r6d, [r4-63]
  .loopx:
      prefetchnta [r2+256]
      movq   m0, [r2   ]
@@ -862,7 +864,7 @@ cglobal plane_copy_core_mmx2, 6,7
  
  
  %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
  %assign x 0
  %rep 16/mmsize
      mov%4     m0, [%2+(x/2)*mmsize]
@@ -894,7 +896,7 @@ cglobal plane_copy_core_mmx2, 6,7
  %endmacro
  
  %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
  %assign n 0
  %rep 16/mmsize
      mova     m0, [%3+(n+0)*mmsize]
@@ -945,27 +947,24 @@ cglobal plane_copy_core_mmx2, 6,7
  
  %macro PLANE_INTERLEAVE 0
  ;-----------------------------------------------------------------------------
-; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
-;                                  uint8_t *srcu, int i_srcu,
-;                                  uint8_t *srcv, int i_srcv, int w, int h )
+; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
+;                                  uint8_t *srcu, intptr_t i_srcu,
+;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
  ;-----------------------------------------------------------------------------
  ; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,7
-    FIX_STRIDES r1d, r3d, r5d, r6d
-%ifdef HIGH_BIT_DEPTH
-    mov   r1m, r1d
-    mov   r3m, r3d
-    mov   r6m, r6d
+cglobal plane_copy_interleave_core, 6,9
+    mov   r6d, r6m
+%if HIGH_BIT_DEPTH
+    FIX_STRIDES r1, r3, r5, r6d
+    movifnidn r1mp, r1
+    movifnidn r3mp, r3
+    mov  r6m, r6d
  %endif
-    movsxdifnidn r1, r1d
-    movsxdifnidn r3, r3d
-    movsxdifnidn r5, r5d
-    movsxdifnidn r6, r6d
      lea    r0, [r0+r6*2]
      add    r2,  r6
      add    r4,  r6
-%ifdef ARCH_X86_64
-    DECLARE_REG_TMP 10,11
+%if ARCH_X86_64
+    DECLARE_REG_TMP 7,8
  %else
      DECLARE_REG_TMP 1,3
  %endif
@@ -1015,10 +1014,10 @@ cglobal plane_copy_interleave_core, 7,7
      RET
  
  ;-----------------------------------------------------------------------------
-; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
+; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
  ;-----------------------------------------------------------------------------
  cglobal store_interleave_chroma, 5,5
-    FIX_STRIDES r1d
+    FIX_STRIDES r1
  .loop:
      INTERLEAVE r0+ 0, r2+           0, r3+           0, a
      INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
@@ -1031,7 +1030,7 @@ cglobal store_interleave_chroma, 5,5
  %endmacro ; PLANE_INTERLEAVE
  
  %macro DEINTERLEAVE_START 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
      mova   m4, [pd_ffff]
  %elif cpuflag(ssse3)
      mova   m4, [deinterleave_shuf]
@@ -1042,20 +1041,17 @@ cglobal store_interleave_chroma, 5,5
  
  %macro PLANE_DEINTERLEAVE 0
  ;-----------------------------------------------------------------------------
-; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
-;                               pixel *dstv, int i_dstv,
-;                               pixel *src, int i_src, int w, int h )
+; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
+;                               pixel *dstv, intptr_t i_dstv,
+;                               pixel *src,  intptr_t i_src, int w, int h )
  ;-----------------------------------------------------------------------------
  cglobal plane_copy_deinterleave, 6,7
      DEINTERLEAVE_START
      mov    r6d, r6m
-    FIX_STRIDES r1d, r3d, r5d, r6d
-%ifdef HIGH_BIT_DEPTH
+    FIX_STRIDES r1, r3, r5, r6d
+%if HIGH_BIT_DEPTH
      mov    r6m, r6d
  %endif
-    movsxdifnidn r1, r1d
-    movsxdifnidn r3, r3d
-    movsxdifnidn r5, r5d
      add    r0,  r6
      add    r2,  r6
      lea    r4, [r4+r6*2]
@@ -1075,11 +1071,11 @@ cglobal plane_copy_deinterleave, 6,7
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
  ;-----------------------------------------------------------------------------
  cglobal load_deinterleave_chroma_fenc, 4,4
      DEINTERLEAVE_START
-    FIX_STRIDES r2d
+    FIX_STRIDES r2
  .loop:
      DEINTERLEAVE r0+           0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
      DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
@@ -1090,11 +1086,11 @@ cglobal load_deinterleave_chroma_fenc, 4,4
      REP_RET
  
  ;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
  ;-----------------------------------------------------------------------------
  cglobal load_deinterleave_chroma_fdec, 4,4
      DEINTERLEAVE_START
-    FIX_STRIDES r2d
+    FIX_STRIDES r2
  .loop:
      DEINTERLEAVE r0+           0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
      DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
@@ -1105,7 +1101,7 @@ cglobal load_deinterleave_chroma_fdec, 4,4
      REP_RET
  %endmacro ; PLANE_DEINTERLEAVE
  
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
  INIT_MMX mmx2
  PLANE_INTERLEAVE
  INIT_MMX mmx
@@ -1221,9 +1217,9 @@ MEMZERO
  
  
  
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
  ;-----------------------------------------------------------------------------
-; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
  ;-----------------------------------------------------------------------------
  INIT_XMM
  cglobal integral_init4h_sse4, 3,4
@@ -1278,7 +1274,7 @@ INTEGRAL_INIT8H
  
  %macro INTEGRAL_INIT_8V 0
  ;-----------------------------------------------------------------------------
-; void integral_init8v( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, intptr_t stride )
  ;-----------------------------------------------------------------------------
  cglobal integral_init8v, 3,3
      shl   r1, 1
@@ -1303,7 +1299,7 @@ INIT_XMM sse2
  INTEGRAL_INIT_8V
  
  ;-----------------------------------------------------------------------------
-; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
  ;-----------------------------------------------------------------------------
  INIT_MMX
  cglobal integral_init4v_mmx, 3,5
@@ -1385,12 +1381,17 @@ cglobal integral_init4v_ssse3, 3,5
      pavgb     %4, [r0+r5*2+%7]
      PALIGNR   %1, %3, 1, m6
      PALIGNR   %2, %4, 1, m6
+%if cpuflag(xop)
+    pavgb     %1, %3
+    pavgb     %2, %4
+%else
      pavgb     %1, %3
      pavgb     %2, %4
      psrlw     %5, %1, 8
      psrlw     %6, %2, 8
      pand      %1, m7
      pand      %2, m7
+%endif
  %endmacro
  
  %macro FILT16x2 4
@@ -1402,12 +1403,17 @@ cglobal integral_init4v_ssse3, 3,5
      pavgb     %1, m3
      PALIGNR   m3, m2, 1, m6
      pavgb     m3, m2
+%if cpuflag(xop)
+    vpperm    m5, m3, %1, m7
+    vpperm    m3, m3, %1, m6
+%else
      psrlw     m5, m3, 8
      psrlw     m4, %1, 8
      pand      m3, m7
      pand      %1, m7
      packuswb  m3, %1
      packuswb  m5, m4
+%endif
      mova    [%2], m3
      mova    [%3], m5
      mova      %1, m2
@@ -1464,12 +1470,17 @@ cglobal integral_init4v_ssse3, 3,5
      pavgw     %1, m3
      PALIGNR   m3, m2, 2, m6
      pavgw     m3, m2
+%if cpuflag(xop)
+    vpperm    m5, m3, %1, m7
+    vpperm    m3, m3, %1, m6
+%else
      psrld     m5, m3, 16
      psrld     m4, %1, 16
      pand      m3, m7
      pand      %1, m7
      packssdw  m3, %1
      packssdw  m5, m4
+%endif
      mova    [%2], m3
      mova    [%3], m5
      mova      %1, m2
@@ -1477,17 +1488,14 @@ cglobal integral_init4v_ssse3, 3,5
  
  ;-----------------------------------------------------------------------------
  ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-;                              int src_stride, int dst_stride, int width, int height )
+;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
  ;-----------------------------------------------------------------------------
  %macro FRAME_INIT_LOWRES 0
  cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
      shl   dword r6m, 1
-    FIX_STRIDES r5d
+    FIX_STRIDES r5
      shl   dword r7m, 1
-%endif
-%ifdef WIN64
-    movsxd    r5, r5d
  %endif
      ; src += 2*(height-1)*stride + 2*width
      mov      r6d, r8m
@@ -1514,9 +1522,14 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
      shl      r6d, 1
      PUSH      r6
      %define src_gap [rsp]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+    mova      m6, [deinterleave_shuf32a]
+    mova      m7, [deinterleave_shuf32b]
+%else
      pcmpeqw   m7, m7
      psrld     m7, 16
+%endif
  .vloop:
      mov      r6d, r7m
  %ifnidn cpuname, mmx2
@@ -1551,8 +1564,13 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
      sub       r4, r6
      add  dst_gap, r6d
  %endif ; mmsize
+%if cpuflag(xop)
+    mova      m6, [deinterleave_shuf32a]
+    mova      m7, [deinterleave_shuf32b]
+%else
      pcmpeqb   m7, m7
      psrlw     m7, 8
+%endif
  .vloop:
      mov      r6d, r7m
  %ifnidn cpuname, mmx2
@@ -1566,12 +1584,22 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
      jz .hloop
      sub       r0, 16
      FILT8x4   m0, m1, m2, m3, m4, m5, 0
+%if cpuflag(xop)
+    mova      m4, m0
+    vpperm    m0, m4, m1, m6
+    vpperm    m1, m4, m1, m7
+    movq    [r1], m0
+    movq    [r2], m1
+    movhps  [r3], m0
+    movhps  [r4], m1
+%else
      packuswb  m0, m4
      packuswb  m1, m5
      movq    [r1], m0
      movhps  [r2], m0
      movq    [r3], m1
      movhps  [r4], m1
+%endif
      mova      m0, m2
      mova      m1, m3
      sub      r6d, 8
@@ -1588,10 +1616,17 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
      mova      m8, m0
      mova      m9, m1
      FILT8x4   m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
+    vpperm    m4, m2, m8, m7
+    vpperm    m2, m2, m8, m6
+    vpperm    m5, m3, m9, m7
+    vpperm    m3, m3, m9, m6
+%else
      packuswb  m2, m8
      packuswb  m3, m9
      packuswb  m4, m10
      packuswb  m5, m11
+%endif
      mova    [r1], m2
      mova    [r2], m4
      mova    [r3], m3
@@ -1622,7 +1657,7 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
  
  INIT_MMX mmx2
  FRAME_INIT_LOWRES
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
  INIT_MMX cache32, mmx2
  FRAME_INIT_LOWRES
  %endif
@@ -1630,6 +1665,10 @@ INIT_XMM sse2
  FRAME_INIT_LOWRES
  INIT_XMM ssse3
  FRAME_INIT_LOWRES
+INIT_XMM avx
+FRAME_INIT_LOWRES
+INIT_XMM xop
+FRAME_INIT_LOWRES
  
  ;-----------------------------------------------------------------------------
  ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
@@ -1747,5 +1786,4 @@ cglobal mbtree_propagate_cost, 7,7,8
      vmovdqu [r0+r6*2], ymm1
      add            r6, 16
      jl .loop
-    vzeroupper
-    RET
+    REP_RET