Now RET checks whether it immediately follows a branch, so the programmer doesn't have to keep track of that condition.
REP_RET is still needed manually when it's a branch target, but that's much rarer.
The implementation involves lots of spurious labels, but that's ok because we strip them.
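For illustration, this is roughly what the change produces when a function is assembled without the ssse3 cpuflag (a sketch only; the .loop label and surrounding instructions are hypothetical, and the actual expansion is performed by the AUTO_REP_RET and BRANCH_INSTR macros added to x86inc.asm below):

    dec r5d
    jg .loop        ; the branch mnemonic now records the address just past itself
    RET             ; sits at that address, so it assembles to the 2-byte "rep ret"

A RET that does not directly follow a branch still assembles to a plain "ret".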
; can only be 0 or 1 and is zero over 99% of the time.
test dword [t0+cb.range], 0x100
je .renorm
- REP_RET
+ RET
.renorm:
shl dword [t0+cb.low], 1
shl dword [t0+cb.range], 1
inc dword [t0+cb.queue]
jge .putbyte
- REP_RET
+ RET
.putbyte:
PROLOGUE 0,7
mov t3d, [t0+cb.queue]
add r0, 4*FDEC_STRIDEB
dec r2
jg .loop
- REP_RET
+ RET
%endmacro ; ADD_IDCT_DC
INIT_XMM sse2
add r0, FDEC_STRIDE*4
dec r2
jg .loop
- REP_RET
+ RET
INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
add r4, 2
dec r3
jg .loop
- REP_RET
+ RET
cglobal deblock_h_luma, 5,7,15
add r1, r1
lea r5, [r5+r1*8]
dec r6
jg .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse2
add r4, mmsize
dec r6
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
- RET
+ REP_RET
INIT_MMX cpuname
%if ARCH_X86_64
add r4, mmsize/8
dec r6
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
add r4, mmsize/8
dec r5
jg .loop
- REP_RET
+ RET
cglobal deblock_intra_body
add r4, mmsize
dec r5
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
dec r4
jg .loop
%endif
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
dec r5
jg .loop
%endif
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
%endif
dec r5
jg .loop
- REP_RET
+ RET
%endmacro ; DEBLOCK_CHROMA
%if ARCH_X86_64 == 0
add r4, mmsize/8
dec cntr
jg .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea t5, [t5+r1*(mmsize/2)]
dec r6d
jg .loop
- REP_RET
+ RET
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
- REP_RET
+ RET
%endmacro
%macro OFFSETPN 1
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG2_W_TWO 3
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
- REP_RET
+ RET
INIT_XMM
cglobal pixel_avg2_w18_sse2, 6,7,6
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
- REP_RET
+ RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
AVG2_W16 12, movd
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
AVG2_W20 sse2
add r0, r1
dec r5d
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
- REP_RET
+ RET
%endif
%endmacro
add r1, r2
dec r5d
jg .loop2
- REP_RET
+ RET
%if mmsize==8
.width4:
dec r5d
jg .loop4
%if mmsize!=8
- REP_RET
+ RET
%else
sub dword r7m, 4
jg .width8
- REP_RET
+ RET
.width8:
%if ARCH_X86_64
lea r3, [t2+8*SIZEOF_PIXEL]
add r1, r2
dec r5d
jg .loop1d_w4
- REP_RET
+ RET
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop4
- REP_RET
+ RET
.width8:
movu m0, [r3]
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop8
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH
mova [r0+r4+mmsize], m4
add r4, 2*mmsize
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
mova [r0+r2], m1
add r2, mmsize
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
mova [r0+r2+mmsize], m4
add r2, mmsize*2
jl .loop
- REP_RET
+ RET
%endmacro ; HPEL_FILTER
INIT_MMX mmx2
add r5, mmsize
add r4, mmsize
jl .loop
- REP_RET
+ RET
%endmacro
;-----------------------------------------------------------------------------
movntq [r0+r2], m1
add r2, 8
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
movntq [r0+r2], m1
add r2, 8
jl .loop
- REP_RET
+ RET
INIT_XMM
movntps [r0+r2], m4
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
;-----------------------------------------------------------------------------
movntps [r0+r2], m1
add r2, 16
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
movntps [r0+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r0, [r0+r1*2]
sub r4d, 2
jg .loop
- REP_RET
+ RET
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
add r4, r5
dec dword r7m
jg .loopy
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
- REP_RET
+ RET
%endmacro ; PLANE_DEINTERLEAVE
%if HIGH_BIT_DEPTH
sub r2d, 32
jg .copy32
.ret:
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
%endrep
add r1, mmsize*8
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
- REP_RET
+ RET
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse4
mova [r0+r1+mmsize], m1
add r1, 2*mmsize
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx
mova [r1+r2-8], m3
sub r2, 8
jge .loop
- REP_RET
+ RET
INIT_XMM
cglobal integral_init4v_sse2, 3,5
mova [r1+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
cglobal integral_init4v_ssse3, 3,5
shl r2, 1
mova [r1+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
%macro FILT8x4 7
mova %3, [r0+%7]
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse2
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
- REP_RET
+ RET
psrlw m0, 4
SPLATW m0, m0
STORE8x8 m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
add r0, FDEC_STRIDE
dec r1d
jg .loop
- REP_RET
+ RET
%endmacro ; PREDICT_CHROMA_P_MMX
INIT_MMX mmx2
add r0, FDEC_STRIDEB
dec r1d
jg .loop
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
movd m0, r1m
add r0, FDEC_STRIDE
dec r1d
jg .loop
- REP_RET
+ RET
%endif ; !ARCH_X86_64
%macro PREDICT_16x16_P 0
dec r1d
jg .loop
%endif ; !HIGH_BIT_DEPTH
- REP_RET
+ RET
%endmacro ; PREDICT_16x16_P
INIT_XMM sse2
mova m2, [r0 - FDEC_STRIDEB+16]
mova m3, [r0 - FDEC_STRIDEB+24]
STORE16x16 m0, m1, m2, m3
- REP_RET
+ RET
INIT_XMM
cglobal predict_16x16_v_sse2, 2,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
mova m1, [r0 - FDEC_STRIDEB+16]
STORE16x16_SSE2 m0, m1
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
movq m0, [r0 - FDEC_STRIDE + 0]
movq m1, [r0 - FDEC_STRIDE + 8]
STORE16x16 m0, m1
- REP_RET
+ RET
INIT_XMM
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
%endif ; HIGH_BIT_DEPTH
sub r1, 4*FDEC_STRIDEB
jge .vloop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
%else
PRED16x16_DC r1m, 5
%endif
- REP_RET
+ RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
- REP_RET
+ RET
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
STORE16x16 m0, m0
- REP_RET
+ RET
%endif
;-----------------------------------------------------------------------------
cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
- REP_RET
+ RET
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
- REP_RET
+ RET
INIT_XMM sse2
%if HIGH_BIT_DEPTH
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
%1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
sub t0d, 16*%3
jge %%loop
- REP_RET
+ RET
%else
%1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
%1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
PSIGND m5, m2, m1
test t3d, t3d
jnz .outer_loop_0
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH == 0
mova [r1+r3*4-1*mmsize], m5
sub r3, mmsize/2
jg .loop
- REP_RET
+ RET
%endmacro
%if ARCH_X86_64 == 0
mova [r1+r3*4-1*mmsize], m1
sub r3, mmsize
jg .loop
- REP_RET
+ RET
%endmacro
%if ARCH_X86_64 == 0
jne .loop
%endif
.ret:
- RET
+ REP_RET
.ret9:
mov eax, 9
RET
.tryret:
xor r4, -1
jne .cont
- REP_RET
+ RET
.ret9:
mov eax, 9
RET
shr r3, cl
shr r3, 1
jne .loop
- REP_RET
+ RET
%endif ; ARCH
%endmacro
inc t6d
sub t4d, t3d
jge .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
; registers:
; rN and rNq are the native-size register holding function argument N
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%elif ARCH_X86_64 ; *nix x64 ;=============================================
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%else ; X86_32 ;==============================================================
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%endif ;======================================================================
%endmacro
%endif
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue
RET
%endif
%endmacro
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+ %ifndef cpuflags
+    times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr
+ %elif notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep
+ %endif
+ ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %%branch_instr:
+ %xdefine last_branch_adr %%branch_instr
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
%macro TAIL_CALL 2 ; callee, is_nonadjacent
%if has_epilogue
call %1
mov dword [r1], 0
mov eax, r3
.ok:
- RET
+ REP_RET
%endif ; ARCH_X86_64
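How the automatic detection works, in outline (an explanatory sketch, not part of the patch): BRANCH_INSTR redefines every conditional-jump mnemonic as a macro that emits the jump and then records the current address in last_branch_adr through a spurious local label, so

    jg .loop

effectively becomes

    jg .loop
    %%branch_instr:
    %xdefine last_branch_adr %%branch_instr

AUTO_REP_RET then compares that address with $. In "times ((last_branch_adr-$)>>31)+1 rep", a return placed immediately after a branch sees a difference of 0, so times evaluates to 1 and a rep prefix is emitted (the 2-byte ret); any other return sees a negative difference, the expression evaluates to 0, and only a plain ret is emitted. What can't be detected this way is a return that is itself a branch target, which is why REP_RET must still be written by hand at labels such as the .end:, .ret: and .ok: cases above.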