x86inc: Remove .rodata kludges

[x264] / common / x86 / dct-64.asm
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm

index 78a2484161bff53c5377df67053a3595434c3ee2..c1aff843101f0081686f07ebe04908c7aa803e4a 100644 (file)
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -1,7 +1,7 @@
  ;*****************************************************************************
  ;* dct-64.asm: x86_64 transform and zigzag
  ;*****************************************************************************
-;* Copyright (C) 2003-2012 x264 project
+;* Copyright (C) 2003-2013 x264 project
  ;*
  ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  ;*          Holger Lubitz <holger@lubitz.org>
@@ -89,7 +89,7 @@ cextern hsub_mul
  
  %macro IDCT8_1D 11
      SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
-    
+
      psra%1   m%10, m%3, 1
      padd%1   m%10, m%3
      padd%1   m%10, m%5
@@ -141,10 +141,7 @@ cextern hsub_mul
  
  %macro SUB8x8_DCT8 0
  cglobal sub8x8_dct8, 3,3,14
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
@@ -194,10 +191,7 @@ SUB8x8_DCT8
  %macro ADD8x8_IDCT8 0
  cglobal add8x8_idct8, 2,2,16
      add r1, 128
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      mova     m0, [r1-128]
@@ -260,10 +254,7 @@ cglobal sub8x8_dct, 3,3,10
  %if cpuflag(ssse3)
      mova m7, [hsub_mul]
  %endif
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      SWAP 7, 9
@@ -287,10 +278,7 @@ cglobal sub8x8_dct8, 3,3,11
  %if cpuflag(ssse3)
      mova m7, [hsub_mul]
  %endif
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      SWAP 7, 10
@@ -323,6 +311,42 @@ DCT_SUB8
  INIT_XMM xop
  DCT_SUB8
  
+INIT_YMM avx2  ; x86inc: code below is built as the ymm-register / avx2 variant
+cglobal sub16x16_dct8, 3,3,10  ; uses r0-r2 and vector regs m0-m9; presumably r0=dct out, r1=fenc pixels, r2=fdec pixels -- confirm against the C prototype
+    add  r0, 128  ; bias r0 so the helper can reach both 128-byte halves as [r0-0x80+x] and [r0+x]
+    add  r2, 4*FDEC_STRIDE  ; bias r2 by 4 rows; presumably the load macro addresses rows relative to this -- TODO confirm
+    call .sub16x8_dct8  ; first pass of the helper: writes 256 bytes of coefficients through r0
+    add  r0, 256  ; step past the 256 bytes (16 stores x 16 bytes) just written
+    add  r1, FENC_STRIDE*8  ; advance r1 by 8 rows of FENC_STRIDE
+    add  r2, FDEC_STRIDE*8  ; advance r2 by 8 rows of FDEC_STRIDE
+    call .sub16x8_dct8  ; second pass of the helper on the advanced pointers
+    RET  ; x86inc epilogue/return macro for the cglobal function
+.sub16x8_dct8:  ; local helper, reached only via the two calls above
+    LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1  ; macro defined elsewhere; presumably leaves row differences in m0/m1 (m2,m3 as temps) -- verify
+    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3  ; same macro, results presumably in m2/m3
+    LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5  ; same macro, results presumably in m4/m5
+    LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7  ; same macro, results presumably in m6/m7
+    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9  ; first 1-D pass over m0-m7 on word elements; m8,m9 presumably temporaries
+    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8  ; transpose m0-m7 between the two passes; m8 presumably scratch
+    DCT8_1D    w, 0,1,2,3,4,5,6,7,8,9  ; second 1-D pass, same register layout as the first
+    mova    [r0-0x80+0x00], xm0  ; low 128-bit lane of m0 -> first 128-byte half, offset 0x00
+    vextracti128 [r0+0x00], m0, 1  ; high 128-bit lane of m0 -> second 128-byte half, same offset
+    mova    [r0-0x80+0x10], xm1  ; low lane of m1 -> first half, offset 0x10
+    vextracti128 [r0+0x10], m1, 1  ; high lane of m1 -> second half, offset 0x10
+    mova    [r0-0x80+0x20], xm2  ; low lane of m2 -> first half, offset 0x20
+    vextracti128 [r0+0x20], m2, 1  ; high lane of m2 -> second half, offset 0x20
+    mova    [r0-0x80+0x30], xm3  ; low lane of m3 -> first half, offset 0x30
+    vextracti128 [r0+0x30], m3, 1  ; high lane of m3 -> second half, offset 0x30
+    mova    [r0-0x80+0x40], xm4  ; low lane of m4 -> first half, offset 0x40
+    vextracti128 [r0+0x40], m4, 1  ; high lane of m4 -> second half, offset 0x40
+    mova    [r0-0x80+0x50], xm5  ; low lane of m5 -> first half, offset 0x50
+    vextracti128 [r0+0x50], m5, 1  ; high lane of m5 -> second half, offset 0x50
+    mova    [r0-0x80+0x60], xm6  ; low lane of m6 -> first half, offset 0x60
+    vextracti128 [r0+0x60], m6, 1  ; high lane of m6 -> second half, offset 0x60
+    mova    [r0-0x80+0x70], xm7  ; low lane of m7 -> first half, offset 0x70
+    vextracti128 [r0+0x70], m7, 1  ; high lane of m7 -> second half, offset 0x70
+    ret  ; plain return to the call site in sub16x16_dct8 (not the RET macro)
+
  ;-----------------------------------------------------------------------------
  ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
  ;-----------------------------------------------------------------------------
@@ -330,10 +354,7 @@ DCT_SUB8
  cglobal add8x8_idct8, 2,2,11
      add r0, 4*FDEC_STRIDE
      pxor m7, m7
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      SWAP 7, 9
@@ -369,10 +390,7 @@ ADD8x8_IDCT8
  cglobal add8x8_idct, 2,2,11
      add  r0, 4*FDEC_STRIDE
      pxor m7, m7
-%if WIN64
-    call .skip_prologue
-    RET
-%endif
+    TAIL_CALL .skip_prologue, 0
  global current_function %+ .skip_prologue
  .skip_prologue:
      SWAP 7, 9
@@ -408,4 +426,5 @@ INIT_XMM sse2
  ADD8x8
  INIT_XMM avx
  ADD8x8
+
  %endif ; !HIGH_BIT_DEPTH