x86: Add asm for mbtree fixed point conversion

[x264] / common / x86 / mc-a.asm
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm

index 689999eeb207494bbfce7581f0706a517ad6e7ac..608efcd97b40d70c9223ab23719dcd95baf5de07 100644 (file)
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1,7 +1,7 @@
  ;*****************************************************************************
  ;* mc-a.asm: x86 motion compensation
  ;*****************************************************************************
-;* Copyright (C) 2003-2012 x264 project
+;* Copyright (C) 2003-2016 x264 project
  ;*
  ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  ;*          Fiona Glaser <fiona@x264.com>
@@ -34,7 +34,7 @@
  
  SECTION_RODATA 32
  
-ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
+ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
  ch_shuf_adj: times 8 db 0
               times 8 db 2
               times 8 db 4
@@ -49,10 +49,12 @@ cextern pw_4
  cextern pw_8
  cextern pw_32
  cextern pw_64
+cextern pw_512
  cextern pw_00ff
  cextern pw_pixel_max
  cextern sw_64
  cextern pd_32
+cextern deinterleave_shufd
  
  ;=============================================================================
  ; implicit weighted biprediction
@@ -61,8 +63,7 @@ cextern pd_32
  %if WIN64
      DECLARE_REG_TMP 0,1,2,3,4,5,4,5
      %macro AVG_START 0-1 0
-        PROLOGUE 5,7,%1
-        movsxd r5, dword r5m
+        PROLOGUE 6,7,%1
      %endmacro
  %elif UNIX64
      DECLARE_REG_TMP 0,1,2,3,4,5,7,8
@@ -88,7 +89,7 @@ cextern pd_32
      lea  t0, [t0+t1*2*SIZEOF_PIXEL]
      sub eax, 2
      jg .height_loop
-    REP_RET
+    RET
  %endmacro
  
  %if HIGH_BIT_DEPTH
@@ -142,8 +143,7 @@ cextern pd_32
      movh      m1, %2
      punpcklbw m0, m1
      pmaddubsw m0, m3
-    paddw     m0, m4
-    psraw     m0, 6
+    pmulhrsw  m0, m4
  %endmacro
  
  %macro BIWEIGHT_START_SSSE3 0
@@ -152,9 +152,13 @@ cextern pd_32
      sub    t7d, t6d
      shl    t7d, 8
      add    t6d, t7d
-    movd    m3, t6d
-    mova    m4, [pw_32]
+    mova    m4, [pw_512]
+    movd   xm3, t6d
+%if cpuflag(avx2)
+    vpbroadcastw m3, xm3
+%else
      SPLATW  m3, m3   ; weight_dst,src
+%endif
  %endmacro
  
  %if HIGH_BIT_DEPTH
@@ -190,7 +194,7 @@ cextern pd_32
  %endif ;HIGH_BIT_DEPTH
  
  ;-----------------------------------------------------------------------------
-; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
+; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
  ;-----------------------------------------------------------------------------
  %macro AVG_WEIGHT 1-2 0
  cglobal pixel_avg_weight_w%1
@@ -245,6 +249,25 @@ AVG_WEIGHT 4
  INIT_XMM ssse3
  AVG_WEIGHT 8,  7
  AVG_WEIGHT 16, 7
+
+INIT_YMM avx2
+cglobal pixel_avg_weight_w16
+    BIWEIGHT_START
+    AVG_START 5
+.height_loop:
+    movu     xm0, [t2]
+    movu     xm1, [t4]
+    vinserti128 m0, m0, [t2+t3], 1
+    vinserti128 m1, m1, [t4+t5], 1
+    SBUTTERFLY bw, 0, 1, 2
+    pmaddubsw m0, m3
+    pmaddubsw m1, m3
+    pmulhrsw  m0, m4
+    pmulhrsw  m1, m4
+    packuswb  m0, m1
+    mova    [t0], xm0
+    vextracti128 [t0+t1], m0, 1
+    AVG_END
  %endif ;HIGH_BIT_DEPTH
  
  ;=============================================================================
@@ -275,7 +298,7 @@ AVG_WEIGHT 16, 7
  %endmacro
  
  ; src, dst, width
-%macro WEIGHT_TWO_ROW 3
+%macro WEIGHT_TWO_ROW 4
      %assign x 0
  %rep (%3+mmsize/2-1)/(mmsize/2)
  %if %3-x/2 <= 4 && mmsize == 16
@@ -299,16 +322,21 @@ AVG_WEIGHT 16, 7
  %else ; !HIGH_BIT_DEPTH
  
  %macro WEIGHT_START 1
+%if cpuflag(avx2)
+    vbroadcasti128 m3, [r4]
+    vbroadcasti128 m4, [r4+16]
+%else
      mova     m3, [r4]
      mova     m4, [r4+16]
  %if notcpuflag(ssse3)
      movd     m5, [r4+32]
+%endif
  %endif
      pxor     m2, m2
  %endmacro
  
-; src1, src2, dst1, dst2
-%macro WEIGHT_ROWx2 4
+; src1, src2, dst1, dst2, fast
+%macro WEIGHT_ROWx2 5
      movh      m0, [%1         ]
      movh      m1, [%1+mmsize/2]
      movh      m6, [%2         ]
@@ -318,10 +346,12 @@ AVG_WEIGHT 16, 7
      punpcklbw m6, m2
      punpcklbw m7, m2
  %if cpuflag(ssse3)
+%if %5==0
      psllw     m0, 7
      psllw     m1, 7
      psllw     m6, 7
      psllw     m7, 7
+%endif
      pmulhrsw  m0, m3
      pmulhrsw  m1, m3
      pmulhrsw  m6, m3
@@ -350,15 +380,54 @@ AVG_WEIGHT 16, 7
      mova    [%4], m6
  %endmacro
  
-; src1, src2, dst1, dst2, width
-%macro WEIGHT_COL 5
+; src1, src2, dst1, dst2, width, fast
+%macro WEIGHT_COL 6
+%if cpuflag(avx2)
+%if %5==16
+    movu     xm0, [%1]
+    vinserti128 m0, m0, [%2], 1
+    punpckhbw m1, m0, m2
+    punpcklbw m0, m0, m2
+%if %6==0
+    psllw     m0, 7
+    psllw     m1, 7
+%endif
+    pmulhrsw  m0, m3
+    pmulhrsw  m1, m3
+    paddw     m0, m4
+    paddw     m1, m4
+    packuswb  m0, m1
+    mova    [%3], xm0
+    vextracti128 [%4], m0, 1
+%else
+    movq     xm0, [%1]
+    vinserti128 m0, m0, [%2], 1
+    punpcklbw m0, m2
+%if %6==0
+    psllw     m0, 7
+%endif
+    pmulhrsw  m0, m3
+    paddw     m0, m4
+    packuswb  m0, m0
+    vextracti128 xm1, m0, 1
+%if %5 == 8
+    movq    [%3], xm0
+    movq    [%4], xm1
+%else
+    movd    [%3], xm0
+    movd    [%4], xm1
+%endif
+%endif
+%else
      movh      m0, [%1]
      movh      m1, [%2]
      punpcklbw m0, m2
      punpcklbw m1, m2
  %if cpuflag(ssse3)
+%if %6==0
      psllw     m0, 7
      psllw     m1, 7
+%endif
      pmulhrsw  m0, m3
      pmulhrsw  m1, m3
      paddw     m0, m4
@@ -381,18 +450,22 @@ AVG_WEIGHT 16, 7
      movd    [%3], m0    ; width 2 can write garbage for the last 2 bytes
      movd    [%4], m1
  %endif
+%endif
  %endmacro
-
  ; src, dst, width
-%macro WEIGHT_TWO_ROW 3
+%macro WEIGHT_TWO_ROW 4
  %assign x 0
  %rep %3
  %if (%3-x) >= mmsize
-    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
+    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
      %assign x (x+mmsize)
  %else
-    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
-    %exitrep
+    %assign w %3-x
+%if w == 20
+    %assign w 16
+%endif
+    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
+    %assign x (x+w)
  %endif
  %if x >= %3
      %exitrep
@@ -403,20 +476,37 @@ AVG_WEIGHT 16, 7
  %endif ; HIGH_BIT_DEPTH
  
  ;-----------------------------------------------------------------------------
-;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
+;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
  ;-----------------------------------------------------------------------------
  
  %macro WEIGHTER 1
  cglobal mc_weight_w%1, 6,6,8
      FIX_STRIDES r1, r3
      WEIGHT_START %1
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+    ; we can merge the shift step into the scale factor
+    ; if (m3<<7) doesn't overflow an int16_t
+    cmp byte [r4+1], 0
+    jz .fast
+%endif
  .loop:
-    WEIGHT_TWO_ROW r2, r0, %1
+    WEIGHT_TWO_ROW r2, r0, %1, 0
      lea  r0, [r0+r1*2]
      lea  r2, [r2+r3*2]
      sub r5d, 2
      jg .loop
-    REP_RET
+    RET
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+.fast:
+    psllw m3, 7
+.fastloop:
+    WEIGHT_TWO_ROW r2, r0, %1, 1
+    lea  r0, [r0+r1*2]
+    lea  r2, [r2+r3*2]
+    sub r5d, 2
+    jg .fastloop
+    RET
+%endif
  %endmacro
  
  INIT_MMX mmx2
@@ -438,6 +528,10 @@ INIT_XMM ssse3
  WEIGHTER  8
  WEIGHTER 16
  WEIGHTER 20
+INIT_YMM avx2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
  %endif
  
  %macro OFFSET_OP 7
@@ -479,7 +573,7 @@ WEIGHTER 20
  %endmacro
  
  ;-----------------------------------------------------------------------------
-;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
+;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
  ;-----------------------------------------------------------------------------
  %macro OFFSET 2
  cglobal mc_offset%2_w%1, 6,6
@@ -496,7 +590,7 @@ cglobal mc_offset%2_w%1, 6,6
      lea  r2, [r2+r3*2]
      sub r5d, 2
      jg .loop
-    REP_RET
+    RET
  %endmacro
  
  %macro OFFSETPN 1
@@ -524,25 +618,28 @@ OFFSETPN  8
  ;=============================================================================
  
  ;-----------------------------------------------------------------------------
-; void pixel_avg_4x4( pixel *dst, int dst_stride,
-;                     pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+;                     pixel *src2, intptr_t src2_stride, int weight );
  ;-----------------------------------------------------------------------------
  %macro AVGH 2
  cglobal pixel_avg_%1x%2
      mov eax, %2
      cmp dword r6m, 32
      jne pixel_avg_weight_w%1 %+ SUFFIX
+%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
+    jmp pixel_avg_w%1_avx2
+%else
  %if mmsize == 16 && %1 == 16
      test dword r4m, 15
      jz pixel_avg_w%1_sse2
  %endif
      jmp pixel_avg_w%1_mmx2
+%endif
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void pixel_avg_w4( pixel *dst, int dst_stride,
-;                    pixel *src1, int src1_stride, pixel *src2, int src2_stride,
-;                    int height, int weight );
+; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+;                    pixel *src2, intptr_t src2_stride, int height, int weight );
  ;-----------------------------------------------------------------------------
  
  %macro AVG_FUNC 3
@@ -637,6 +734,10 @@ AVGH  4, 16
  AVGH  4,  8
  AVGH  4,  4
  AVGH  4,  2
+INIT_XMM avx2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16,  8
  
  %endif ;HIGH_BIT_DEPTH
  
@@ -648,8 +749,8 @@ AVGH  4,  2
  
  %if HIGH_BIT_DEPTH
  ;-----------------------------------------------------------------------------
-; void pixel_avg2_wN( uint16_t *dst,  int dst_stride,
-;                     uint16_t *src1, int src_stride,
+; void pixel_avg2_wN( uint16_t *dst,  intptr_t dst_stride,
+;                     uint16_t *src1, intptr_t src_stride,
  ;                     uint16_t *src2, int height );
  ;-----------------------------------------------------------------------------
  %macro AVG2_W_ONE 1
@@ -659,7 +760,7 @@ cglobal pixel_avg2_w%1, 6,7,4
  .height_loop:
      movu    m0, [r2]
      movu    m1, [r2+r3*2]
-%if mmsize == 8
+%if cpuflag(avx) || mmsize == 8
      pavgw   m0, [r2+r4]
      pavgw   m1, [r2+r6]
  %else
@@ -674,7 +775,7 @@ cglobal pixel_avg2_w%1, 6,7,4
      lea     r0, [r0+r1*4]
      sub    r5d, 2
      jg .height_loop
-    REP_RET
+    RET
  %endmacro
  
  %macro AVG2_W_TWO 3
@@ -709,7 +810,7 @@ cglobal pixel_avg2_w%1, 6,7,8
      lea     r0, [r0+r1*4]
      sub    r5d, 2
      jg .height_loop
-    REP_RET
+    RET
  %endmacro
  
  INIT_MMX mmx2
@@ -719,6 +820,8 @@ INIT_XMM sse2
  AVG2_W_ONE  8
  AVG2_W_TWO 10, movd, movd
  AVG2_W_TWO 16, movu, mova
+INIT_YMM avx2
+AVG2_W_ONE 16
  
  INIT_MMX
  cglobal pixel_avg2_w10_mmx2, 6,7
@@ -747,7 +850,7 @@ cglobal pixel_avg2_w10_mmx2, 6,7
      lea     r0, [r0+r1*2*2]
      sub    r5d, 2
      jg .height_loop
-    REP_RET
+    RET
  
  cglobal pixel_avg2_w16_mmx2, 6,7
      sub     r4, r2
@@ -781,7 +884,7 @@ cglobal pixel_avg2_w16_mmx2, 6,7
      lea     r0, [r0+r1*2*2]
      sub    r5d, 2
      jg .height_loop
-    REP_RET
+    RET
  
  cglobal pixel_avg2_w18_mmx2, 6,7
      sub     r4, r2
@@ -805,35 +908,48 @@ cglobal pixel_avg2_w18_mmx2, 6,7
      lea     r0, [r0+r1*2]
      dec    r5d
      jg .height_loop
-    REP_RET
+    RET
  
-INIT_XMM
-cglobal pixel_avg2_w18_sse2, 6,7,6
+%macro PIXEL_AVG_W18 0
+cglobal pixel_avg2_w18, 6,7
      sub     r4, r2
  .height_loop:
      movu    m0, [r2+ 0]
+    movd   xm2, [r2+32]
+%if mmsize == 32
+    pavgw   m0, [r2+r4+ 0]
+    movd   xm1, [r2+r4+32]
+    pavgw  xm2, xm1
+%else
      movu    m1, [r2+16]
-    movh    m2, [r2+32]
      movu    m3, [r2+r4+ 0]
      movu    m4, [r2+r4+16]
-    movh    m5, [r2+r4+32]
+    movd    m5, [r2+r4+32]
      pavgw   m0, m3
      pavgw   m1, m4
      pavgw   m2, m5
-    mova   [r0+ 0], m0
      mova   [r0+16], m1
-    movh   [r0+32], m2
+%endif
+    mova   [r0+ 0], m0
+    movd   [r0+32], xm2
      lea     r2, [r2+r3*2]
      lea     r0, [r0+r1*2]
      dec    r5d
      jg .height_loop
-    REP_RET
+    RET
+%endmacro
+
+INIT_XMM sse2
+PIXEL_AVG_W18
+INIT_YMM avx2
+PIXEL_AVG_W18
+
  %endif ; HIGH_BIT_DEPTH
  
  %if HIGH_BIT_DEPTH == 0
  ;-----------------------------------------------------------------------------
-; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
-;                     uint8_t *src1, int src_stride,
+; void pixel_avg2_w4( uint8_t *dst,  intptr_t dst_stride,
+;                     uint8_t *src1, intptr_t src_stride,
  ;                     uint8_t *src2, int height );
  ;-----------------------------------------------------------------------------
  %macro AVG2_W8 2
@@ -851,7 +967,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
      lea    r0, [r0+r1*2]
      sub    r5d, 2
      jg     .height_loop
-    REP_RET
+    RET
  %endmacro
  
  INIT_MMX
@@ -879,7 +995,7 @@ cglobal pixel_avg2_w%1_mmx2, 6,7
      lea    r0, [r0+r1*2]
      sub    r5d, 2
      jg     .height_loop
-    REP_RET
+    RET
  %endmacro
  
  AVG2_W16 12, movd
@@ -911,61 +1027,67 @@ cglobal pixel_avg2_w20_mmx2, 6,7
      lea    r0, [r0+r1*2]
      sub    r5d, 2
      jg     .height_loop
-    REP_RET
+    RET
  
+INIT_XMM
  cglobal pixel_avg2_w16_sse2, 6,7
      sub    r4, r2
      lea    r6, [r4+r3]
  .height_loop:
-    movdqu xmm0, [r2]
-    movdqu xmm2, [r2+r3]
-    movdqu xmm1, [r2+r4]
-    movdqu xmm3, [r2+r6]
+    movu   m0, [r2]
+    movu   m2, [r2+r3]
+    movu   m1, [r2+r4]
+    movu   m3, [r2+r6]
      lea    r2, [r2+r3*2]
-    pavgb  xmm0, xmm1
-    pavgb  xmm2, xmm3
-    movdqa [r0], xmm0
-    movdqa [r0+r1], xmm2
+    pavgb  m0, m1
+    pavgb  m2, m3
+    mova [r0], m0
+    mova [r0+r1], m2
      lea    r0, [r0+r1*2]
-    sub    r5d, 2
-    jg     .height_loop
-    REP_RET
+    sub   r5d, 2
+    jg .height_loop
+    RET
  
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
      sub    r2, r4
      lea    r6, [r2+r3]
  .height_loop:
-    movdqu xmm0, [r4]
-    movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
-    movd   mm4,  [r4+16]
-    movd   mm5,  [r4+r3+16]
-    pavgb  xmm0, [r4+r2]
-    pavgb  xmm2, [r4+r6]
-%else
-    movdqu xmm1, [r4+r2]
-    movdqu xmm3, [r4+r6]
-    movd   mm4,  [r4+16]
-    movd   mm5,  [r4+r3+16]
-    pavgb  xmm0, xmm1
-    pavgb  xmm2, xmm3
-%endif
-    pavgb  mm4,  [r4+r2+16]
-    pavgb  mm5,  [r4+r6+16]
+    movu   m0, [r4]
+    movu   m2, [r4+r3]
+    movu   m1, [r4+r2]
+    movu   m3, [r4+r6]
+    movd  mm4, [r4+16]
+    movd  mm5, [r4+r3+16]
+    pavgb  m0, m1
+    pavgb  m2, m3
+    pavgb mm4, [r4+r2+16]
+    pavgb mm5, [r4+r6+16]
      lea    r4, [r4+r3*2]
-    movdqa [r0], xmm0
-    movd   [r0+16], mm4
-    movdqa [r0+r1], xmm2
-    movd   [r0+r1+16], mm5
+    mova [r0], m0
+    mova [r0+r1], m2
+    movd [r0+16], mm4
+    movd [r0+r1+16], mm5
+    lea    r0, [r0+r1*2]
+    sub   r5d, 2
+    jg .height_loop
+    RET
+
+INIT_YMM avx2
+cglobal pixel_avg2_w20, 6,7
+    sub    r2, r4
+    lea    r6, [r2+r3]
+.height_loop:
+    movu   m0, [r4]
+    movu   m1, [r4+r3]
+    pavgb  m0, [r4+r2]
+    pavgb  m1, [r4+r6]
+    lea    r4, [r4+r3*2]
+    mova [r0], m0
+    mova [r0+r1], m1
      lea    r0, [r0+r1*2]
      sub    r5d, 2
      jg     .height_loop
-    REP_RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
+    RET
  
  ; Cacheline split code for processors with high latencies for loads
  ; split over cache lines.  See sad-a.asm for a more detailed explanation.
@@ -1024,7 +1146,7 @@ pixel_avg2_w%1_cache_mmx2:
      add    r0, r1
      dec    r5d
      jg .height_loop
-    REP_RET
+    RET
  %endmacro
  
  %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
@@ -1174,18 +1296,18 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
      movu  m1, [r2+%4*mmsize]
      movu  m2, [r2+r3+%3*mmsize]
      movu  m3, [r2+r3+%4*mmsize]
-    movu  m4, [r2+r3*2+%3*mmsize]
-    movu  m5, [r2+r3*2+%4*mmsize]
-    movu  m6, [r2+%2+%3*mmsize]
-    movu  m7, [r2+%2+%4*mmsize]
      mova  [r0+%3*mmsize],      m0
      mova  [r0+%4*mmsize],      m1
      mova  [r0+r1+%3*mmsize],   m2
      mova  [r0+r1+%4*mmsize],   m3
-    mova  [r0+r1*2+%3*mmsize], m4
-    mova  [r0+r1*2+%4*mmsize], m5
-    mova  [r0+%1+%3*mmsize],   m6
-    mova  [r0+%1+%4*mmsize],   m7
+    movu  m0, [r2+r3*2+%3*mmsize]
+    movu  m1, [r2+r3*2+%4*mmsize]
+    movu  m2, [r2+%2+%3*mmsize]
+    movu  m3, [r2+%2+%4*mmsize]
+    mova  [r0+r1*2+%3*mmsize], m0
+    mova  [r0+r1*2+%4*mmsize], m1
+    mova  [r0+%1+%3*mmsize],   m2
+    mova  [r0+%1+%4*mmsize],   m3
  %endmacro
  
  %macro COPY4 2
@@ -1194,8 +1316,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k
  %endmacro
  
  ;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
-;                  uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
+;                  uint8_t *src, intptr_t i_src_stride, int i_height )
  ;-----------------------------------------------------------------------------
  INIT_MMX
  cglobal mc_copy_w4_mmx, 4,6
@@ -1218,7 +1340,7 @@ cglobal mc_copy_w4_mmx, 4,6
  %macro MC_COPY 1
  %assign %%w %1*SIZEOF_PIXEL/mmsize
  %if %%w > 0
-cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+cglobal mc_copy_w%1, 5,7
      FIX_STRIDES r1, r3
      lea     r6, [r3*3]
      lea     r5, [r1*3]
@@ -1228,20 +1350,24 @@ cglobal mc_copy_w%1, 5,7,8*(%%w/2)
      lea     r0, [r0+r1*4]
      sub    r4d, 4
      jg .height_loop
-    REP_RET
+    RET
  %endif
  %endmacro
  
  INIT_MMX mmx
  MC_COPY  8
  MC_COPY 16
-INIT_XMM sse2
+INIT_XMM sse
  MC_COPY  8
  MC_COPY 16
-INIT_XMM aligned, sse2
+INIT_XMM aligned, sse
  MC_COPY 16
-
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx
+MC_COPY 16
+INIT_YMM aligned, avx
+MC_COPY 16
+%endif
  
  ;=============================================================================
  ; prefetch
@@ -1250,14 +1376,14 @@ MC_COPY 16
  ; FIXME doesn't cover all pixels in high depth and/or 4:4:4
  
  ;-----------------------------------------------------------------------------
-; void prefetch_fenc( pixel *pix_y, int stride_y,
-;                     pixel *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y,  intptr_t stride_y,
+;                     pixel *pix_uv, intptr_t stride_uv, int mb_x )
  ;-----------------------------------------------------------------------------
  
  %macro PREFETCH_FENC 1
  %if ARCH_X86_64
  cglobal prefetch_fenc_%1, 5,5
-    FIX_STRIDES r1d, r3d
+    FIX_STRIDES r1, r3
      and    r4d, 3
      mov    eax, r4d
      imul   r4d, r1d
@@ -1317,11 +1443,11 @@ PREFETCH_FENC 420
  PREFETCH_FENC 422
  
  ;-----------------------------------------------------------------------------
-; void prefetch_ref( pixel *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
  ;-----------------------------------------------------------------------------
  INIT_MMX mmx2
  cglobal prefetch_ref, 3,3
-    FIX_STRIDES r1d
+    FIX_STRIDES r1
      dec    r2d
      and    r2d, r1d
      lea    r0,  [r0+r2*8+64*SIZEOF_PIXEL]
@@ -1387,7 +1513,7 @@ cglobal prefetch_ref, 3,3
  %endmacro
  %else ; !HIGH_BIT_DEPTH
  %macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
      punpcklwd  %1, %3
  %else
      movh       %2, %3
@@ -1397,8 +1523,8 @@ cglobal prefetch_ref, 3,3
  %endif ; HIGH_BIT_DEPTH
  
  ;-----------------------------------------------------------------------------
-; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
-;                 uint8_t *src, int src_stride,
+; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
+;                 uint8_t *src, intptr_t src_stride,
  ;                 int dx, int dy,
  ;                 int width, int height )
  ;-----------------------------------------------------------------------------
@@ -1508,7 +1634,7 @@ ALIGN 4
      add        r1, r2
      dec       r5d
      jg .loop2
-    REP_RET
+    RET
  
  %if mmsize==8
  .width4:
@@ -1516,7 +1642,11 @@ ALIGN 4
      mov        t0, r0
      mov        t1, r1
      mov        t2, r3
+%if WIN64
+    %define multy0 r4m
+%else
      %define multy0 [rsp-8]
+%endif
      mova    multy0, m5
  %else
      mov       r3m, r3
@@ -1628,11 +1758,11 @@ ALIGN 4
      dec       r5d
      jg .loop4
  %if mmsize!=8
-    REP_RET
+    RET
  %else
      sub dword r7m, 4
      jg .width8
-    REP_RET
+    RET
  .width8:
  %if ARCH_X86_64
      lea        r3, [t2+8*SIZEOF_PIXEL]
@@ -1653,10 +1783,9 @@ ALIGN 4
  %if ARCH_X86_64 ; too many regs for x86_32
      RESET_MM_PERMUTATION
  %if WIN64
-%if xmm_regs_used > 6
-    %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
-    %assign xmm_regs_used 6
-%endif
+    %assign stack_offset stack_offset - stack_size_padded
+    %assign stack_size_padded 0
+    %assign xmm_regs_used 0
  %endif
  .mc1dy:
      and       t2d, 7
@@ -1768,7 +1897,7 @@ ALIGN 4
      add        r1, r2
      dec       r5d
      jg .loop1d_w4
-    REP_RET
+    RET
  .mc1d_w8:
      sub       r2, 4*SIZEOF_PIXEL
      sub       r4, 8*SIZEOF_PIXEL
@@ -1783,7 +1912,7 @@ ALIGN 4
  
  %macro MC_CHROMA_SSSE3 0
  cglobal mc_chroma
-    MC_CHROMA_START 9
+    MC_CHROMA_START 10-cpuflag(avx2)
      and       r5d, 7
      and       t2d, 7
      mov       t0d, r5d
@@ -1794,18 +1923,18 @@ cglobal mc_chroma
      sub       r5d, t2d
      imul      t2d, t0d ; (x*255+8)*y
      imul      r5d, t0d ; (x*255+8)*(8-y)
-    movd       m6, t2d
-    movd       m7, r5d
+    movd      xm6, t2d
+    movd      xm7, r5d
  %if cpuflag(cache64)
      mov       t0d, r3d
      and       t0d, 7
  %ifdef PIC
      lea        t1, [ch_shuf_adj]
-    movddup    m5, [t1 + t0*4]
+    movddup   xm5, [t1 + t0*4]
  %else
-    movddup    m5, [ch_shuf_adj + t0*4]
+    movddup   xm5, [ch_shuf_adj + t0*4]
  %endif
-    paddb      m5, [ch_shuf]
+    paddb     xm5, [ch_shuf]
      and        r3, ~7
  %else
      mova       m5, [ch_shuf]
@@ -1814,10 +1943,78 @@ cglobal mc_chroma
      movifnidn  r1, r1mp
      movifnidn r2d, r2m
      movifnidn r5d, r8m
+%if cpuflag(avx2)
+    vpbroadcastw m6, xm6
+    vpbroadcastw m7, xm7
+%else
      SPLATW     m6, m6
      SPLATW     m7, m7
+%endif
+%if ARCH_X86_64
+    %define shiftround m8
+    mova       m8, [pw_512]
+%else
+    %define shiftround [pw_512]
+%endif
      cmp dword r7m, 4
      jg .width8
+
+%if cpuflag(avx2)
+.loop4:
+    movu      xm0, [r3]
+    movu      xm1, [r3+r4]
+    vinserti128 m0, m0, [r3+r4], 1
+    vinserti128 m1, m1, [r3+r4*2], 1
+    pshufb     m0, m5
+    pshufb     m1, m5
+    pmaddubsw  m0, m7
+    pmaddubsw  m1, m6
+    paddw      m0, m1
+    pmulhrsw   m0, shiftround
+    packuswb   m0, m0
+    vextracti128 xm1, m0, 1
+    movd     [r0], xm0
+    movd  [r0+r2], xm1
+    psrldq    xm0, 4
+    psrldq    xm1, 4
+    movd     [r1], xm0
+    movd  [r1+r2], xm1
+    lea        r3, [r3+r4*2]
+    lea        r0, [r0+r2*2]
+    lea        r1, [r1+r2*2]
+    sub       r5d, 2
+    jg .loop4
+    RET
+.width8:
+    movu      xm0, [r3]
+    vinserti128 m0, m0, [r3+8], 1
+    pshufb     m0, m5
+.loop8:
+    movu      xm3, [r3+r4]
+    vinserti128 m3, m3, [r3+r4+8], 1
+    pshufb     m3, m5
+    pmaddubsw  m1, m0, m7
+    pmaddubsw  m2, m3, m6
+    pmaddubsw  m3, m3, m7
+
+    movu      xm0, [r3+r4*2]
+    vinserti128 m0, m0, [r3+r4*2+8], 1
+    pshufb     m0, m5
+    pmaddubsw  m4, m0, m6
+
+    paddw      m1, m2
+    paddw      m3, m4
+    pmulhrsw   m1, shiftround
+    pmulhrsw   m3, shiftround
+    packuswb   m1, m3
+    mova       m2, [deinterleave_shufd]
+    vpermd     m1, m2, m1
+    vextracti128 xm2, m1, 1
+    movq      [r0], xm1
+    movhps    [r1], xm1
+    movq   [r0+r2], xm2
+    movhps [r1+r2], xm2
+%else
      movu       m0, [r3]
      pshufb     m0, m5
  .loop4:
@@ -1830,36 +2027,39 @@ cglobal mc_chroma
      pmaddubsw  m2, m1, m7
      pmaddubsw  m1, m6
      pmaddubsw  m3, m6
-    paddw      m0, [pw_32]
-    paddw      m2, [pw_32]
      paddw      m1, m0
      paddw      m3, m2
+    pmulhrsw   m1, shiftround
+    pmulhrsw   m3, shiftround
      mova       m0, m4
-    psrlw      m1, 6
-    psrlw      m3, 6
      packuswb   m1, m3
-    movhlps    m3, m1
      movd     [r0], m1
+%if cpuflag(sse4)
+    pextrd    [r1], m1, 1
+    pextrd [r0+r2], m1, 2
+    pextrd [r1+r2], m1, 3
+%else
+    movhlps    m3, m1
      movd  [r0+r2], m3
      psrldq     m1, 4
      psrldq     m3, 4
      movd     [r1], m1
      movd  [r1+r2], m3
+%endif
      lea        r3, [r3+r4*2]
      lea        r0, [r0+r2*2]
      lea        r1, [r1+r2*2]
      sub       r5d, 2
      jg .loop4
-    REP_RET
-
+    RET
  .width8:
      movu       m0, [r3]
      pshufb     m0, m5
      movu       m1, [r3+8]
      pshufb     m1, m5
  %if ARCH_X86_64
-    SWAP        8, 6
-    %define  mult1 m8
+    SWAP        9, 6
+    %define  mult1 m9
  %else
      mova      r0m, m6
      %define  mult1 r0m
@@ -1875,12 +2075,10 @@ cglobal mc_chroma
      pmaddubsw  m1, m7
      pmaddubsw  m2, mult1
      pmaddubsw  m3, mult1
-    paddw      m0, [pw_32]
-    paddw      m1, [pw_32]
      paddw      m0, m2
      paddw      m1, m3
-    psrlw      m0, 6
-    psrlw      m1, 6
+    pmulhrsw   m0, shiftround ; x + 32 >> 6
+    pmulhrsw   m1, shiftround
      packuswb   m0, m1
      pshufd     m0, m0, q3120
      movq     [r0], m0
@@ -1896,22 +2094,21 @@ cglobal mc_chroma
      pmaddubsw  m6, m7
      pmaddubsw  m2, mult1
      pmaddubsw  m3, mult1
-    paddw      m4, [pw_32]
-    paddw      m6, [pw_32]
      paddw      m2, m4
      paddw      m3, m6
-    psrlw      m2, 6
-    psrlw      m3, 6
+    pmulhrsw   m2, shiftround
+    pmulhrsw   m3, shiftround
      packuswb   m2, m3
      pshufd     m2, m2, q3120
      movq   [r0+r2], m2
      movhps [r1+r2], m2
+%endif
      lea        r3, [r3+r4*2]
      lea        r0, [r0+r2*2]
      lea        r1, [r1+r2*2]
      sub       r5d, 2
      jg .loop8
-    REP_RET
+    RET
  %endmacro
  
  %if HIGH_BIT_DEPTH
@@ -1924,8 +2121,6 @@ MC_CHROMA
  %else ; !HIGH_BIT_DEPTH
  INIT_MMX mmx2
  MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
  INIT_XMM sse2
  MC_CHROMA
  INIT_XMM ssse3
@@ -1934,4 +2129,6 @@ INIT_XMM ssse3, cache64
  MC_CHROMA_SSSE3
  INIT_XMM avx
  MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
+INIT_YMM avx2
+MC_CHROMA_SSSE3
  %endif ; HIGH_BIT_DEPTH