git.sesse.net Git - x264/blob - common/x86/dct-a.asm

   1 ;*****************************************************************************
   2 ;* dct-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2003-2008 x264 project
   5 ;*
   6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7 ;*          Loren Merritt <lorenm@u.washington.edu>
   8 ;*          Min Chen <chenm001.163.com>
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23 ;*****************************************************************************
  24
  25 %include "x86inc.asm"
  26 %include "x86util.asm"
  27
  28 SECTION_RODATA
     ; pw_1: eight 16-bit words of 1. x264_dct4x4dc_mmx adds it to every lane
     ; before its arithmetic >>1, giving (x+1)>>1.
  29 pw_1:  times 8 dw 1
     ; pw_32: eight 16-bit words of 32. ADD_IDCT4 adds it to m0 between its two
     ; IDCT4_1D passes.
     ; NOTE(review): presumably the rounding bias for a later >>6 performed by
     ; STORE_DIFF -- confirm in x86util.asm.
  30 pw_32: times 8 dw 32
     ; pb_zigzag4: pshufb control bytes used by x264_zigzag_sub_4x4_frame_ssse3.
     ; Output byte i is taken from packed-pixel byte pb_zigzag4[i], where the
     ; 4x4 pixels are packed row-major (byte index = 4*row + column).
     ; It is read with movdqa, which faults on a non-16-byte-aligned address;
     ; the two 16-byte tables above put it at offset 32 of this section.
     ; NOTE(review): assumes SECTION_RODATA aligns the section to 16 -- confirm.
  31 pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
  32
  33 SECTION .text
  34
     ; HADAMARD4_1D a, b, c, d
     ; Takes four register indices and runs two SUMSUB_BADC stages over
     ; m<a>..m<d>, then renames registers with SWAP. No extra scratch register
     ; is named by this macro.
     ; NOTE(review): presumably a 4-point Hadamard butterfly whose results end
     ; up, after the SWAP, back in m<a>..m<d> in natural order -- SUMSUB_BADC
     ; and SWAP come from the included files; confirm there.
  35 %macro HADAMARD4_1D 4
  36     SUMSUB_BADC m%2, m%1, m%4, m%3
  37     SUMSUB_BADC m%4, m%2, m%3, m%1
  38     SWAP %1, %4, %3
  39 %endmacro
  40
  41 ;-----------------------------------------------------------------------------
  42 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
     ;
     ; In-place transform of the 4x4 block of 16-bit values at d (r0):
     ; HADAMARD4_1D, TRANSPOSE4x4W, HADAMARD4_1D again, then every lane is
     ; replaced by (x + 1) >> 1 (arithmetic shift) and written back to d.
     ; Registers named here: m0-m3 hold the four rows, m4 is the extra
     ; TRANSPOSE4x4W argument, m6 holds the pw_1 constant.
  43 ;-----------------------------------------------------------------------------
  44 cglobal x264_dct4x4dc_mmx, 1,1,1      ; NOTE(review): 3rd arg presumably requests PIC setup for the GLOBAL load below -- confirm in x86inc.asm
  45     movq   m0, [r0+ 0]                ; row 0 (4 x int16 = 8 bytes)
  46     movq   m1, [r0+ 8]                ; row 1
  47     movq   m2, [r0+16]                ; row 2
  48     movq   m3, [r0+24]                ; row 3
  49     HADAMARD4_1D  0,1,2,3             ; first pass
  50     TRANSPOSE4x4W 0,1,2,3,4           ; m4 presumably scratch -- confirm in x86util.asm
  51     HADAMARD4_1D  0,1,2,3             ; second pass, on the transposed data
  52     movq   m6, [pw_1 GLOBAL]          ; rounding term: 1 in every 16-bit lane
  53     paddw  m0, m6
  54     paddw  m1, m6
  55     paddw  m2, m6
  56     paddw  m3, m6
  57     psraw  m0, 1                      ; (x+1)>>1, sign-preserving
  58     psraw  m1, 1
  59     psraw  m2, 1
  60     psraw  m3, 1
  61     movq  [r0+0], m0                  ; write the four rows back in place
  62     movq  [r0+8], m1
  63     movq [r0+16], m2
  64     movq [r0+24], m3
  65     RET
  66
  67 ;-----------------------------------------------------------------------------
  68 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
     ;
     ; In-place transform of the 4x4 block of 16-bit values at d (r0):
     ; HADAMARD4_1D, TRANSPOSE4x4W, HADAMARD4_1D. Same sequence as
     ; x264_dct4x4dc_mmx but with no rounding add and no shift afterwards.
     ; Registers named here: m0-m3 for the rows, m4 as the extra
     ; TRANSPOSE4x4W argument.
  69 ;-----------------------------------------------------------------------------
  70 cglobal x264_idct4x4dc_mmx, 1,1
  71     movq  m0, [r0+ 0]                 ; row 0 (4 x int16 = 8 bytes)
  72     movq  m1, [r0+ 8]                 ; row 1
  73     movq  m2, [r0+16]                 ; row 2
  74     movq  m3, [r0+24]                 ; row 3
  75     HADAMARD4_1D  0,1,2,3             ; first pass
  76     TRANSPOSE4x4W 0,1,2,3,4           ; m4 presumably scratch -- confirm in x86util.asm
  77     HADAMARD4_1D  0,1,2,3             ; second pass, on the transposed data
  78     movq  [r0+ 0], m0                 ; write the four rows back in place
  79     movq  [r0+ 8], m1
  80     movq  [r0+16], m2
  81     movq  [r0+24], m3
  82     RET
  83
     ; DCT4_1D a, b, c, d, t
     ; Takes five register indices. Runs SUMSUB_BADC, SUMSUB_BA and SUMSUB2_AB
     ; over m<a>..m<d>, passing m<t> as the last SUMSUB2_AB operand, then
     ; renames all five registers with SWAP.
     ; NOTE(review): presumably one 1-D pass of the 4-point forward transform
     ; with m<t> as scratch and results left in m<a>..m<d> after the SWAP --
     ; the helper macros are defined in the included files; confirm there.
  84 %macro DCT4_1D 5
  85     SUMSUB_BADC m%4, m%1, m%3, m%2
  86     SUMSUB_BA   m%3, m%4
  87     SUMSUB2_AB  m%1, m%2, m%5
  88     SWAP %1, %3, %4, %5, %2
  89 %endmacro
  90
     ; IDCT4_1D a, b, c, d, t1, t2
     ; Takes six register indices. Runs SUMSUB_BA, SUMSUBD2_AB and SUMSUB_BADC
     ; over m<a>..m<d>, passing m<t2> and m<t1> as the trailing SUMSUBD2_AB
     ; operands, then renames five of the registers with SWAP (the sixth index
     ; is not part of the SWAP).
     ; NOTE(review): presumably one 1-D pass of the 4-point inverse transform
     ; with m<t1>/m<t2> as scratch -- the helper macros are defined in the
     ; included files; confirm there.
  91 %macro IDCT4_1D 6
  92     SUMSUB_BA   m%3, m%1
  93     SUMSUBD2_AB m%2, m%4, m%6, m%5
  94     SUMSUB_BADC m%2, m%3, m%5, m%1
  95     SWAP %1, %2, %5, %4, %3
  96 %endmacro
  97
  98 ;-----------------------------------------------------------------------------
  99 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
     ;
     ; r0 = dct (output), r1 = pix1 (rows FENC_STRIDE apart),
     ; r2 = pix2 (rows FDEC_STRIDE apart).
     ; Builds four rows with LOAD_DIFF, runs DCT4_1D, a transpose and DCT4_1D
     ; again, and stores 8 bytes per row to dct+0/8/16/24.
     ; NOTE(review): LOAD_DIFF presumably yields pix1 - pix2 widened to 16-bit
     ; lanes, using m6/m7 as temporaries -- confirm in x86util.asm.
     ;
     ; The body is the SUB_DCT4 macro, defined here so that
     ; x264_sub8x8_dct_sse2 can reuse it with a different transpose.
     ; .skip_prologue is the entry point the SUB_NxN_DCT wrappers call.
 100 ;-----------------------------------------------------------------------------
 101 cglobal x264_sub4x4_dct_mmx, 3,3
 102 .skip_prologue:
     ; SUB_DCT4 <suffix>: <suffix> is pasted onto TRANSPOSE to pick the
     ; transpose macro (4x4W here, 2x4x4W in the SSE2 8x8 routine).
 103 %macro SUB_DCT4 1
 104     LOAD_DIFF  m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
 105     LOAD_DIFF  m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
 106     LOAD_DIFF  m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
 107     LOAD_DIFF  m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
 108     DCT4_1D 0,1,2,3,4                 ; first pass
 109     TRANSPOSE%1 0,1,2,3,4
 110     DCT4_1D 0,1,2,3,4                 ; second pass, on the transposed data
 111     movq  [r0+ 0], m0                 ; movq stores only the low 8 bytes of each register
 112     movq  [r0+ 8], m1
 113     movq  [r0+16], m2
 114     movq  [r0+24], m3
 115 %endmacro
 116     SUB_DCT4 4x4W
 117     RET
 118
 119 ;-----------------------------------------------------------------------------
 120 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
     ;
     ; r0 = p_dst (rows FDEC_STRIDE apart), r1 = dct (input coefficients).
     ; Loads the four coefficient rows, then runs ADD_IDCT4: IDCT4_1D,
     ; transpose, add pw_32 to m0, IDCT4_1D, and STORE_DIFF of m0-m3 to four
     ; rows of p_dst.
     ; NOTE(review): STORE_DIFF presumably shifts the residual, adds it to the
     ; existing pixels and stores with unsigned saturation, with m4 as scratch
     ; and m7 as zero -- confirm in x86util.asm.
     ;
     ; The loads stay outside the macro so x264_add8x8_idct_sse2 can do its own
     ; load before reusing ADD_IDCT4. .skip_prologue is the entry point the
     ; ADD_NxN_IDCT wrappers call.
 121 ;-----------------------------------------------------------------------------
 122 cglobal x264_add4x4_idct_mmx, 2,2,1   ; NOTE(review): 3rd arg presumably requests PIC setup for [pw_32 GLOBAL] -- confirm in x86inc.asm
 123 .skip_prologue:
 124     movq  m0, [r1+ 0]                 ; coefficient row 0 (4 x int16)
 125     movq  m1, [r1+ 8]                 ; row 1
 126     movq  m2, [r1+16]                 ; row 2
 127     movq  m3, [r1+24]                 ; row 3
     ; ADD_IDCT4 <suffix>: <suffix> is pasted onto TRANSPOSE to pick the
     ; transpose macro. Expects the coefficient rows already in m0-m3.
 128 %macro ADD_IDCT4 1
 129     IDCT4_1D 0,1,2,3,4,5              ; first pass
 130     TRANSPOSE%1 0,1,2,3,4
 131     paddw m0, [pw_32 GLOBAL]          ; +32 on m0 only; presumably spread to all outputs by the next pass -- confirm
 132     IDCT4_1D 0,1,2,3,4,5              ; second pass
 133     pxor  m7, m7                      ; m7 = 0, handed to STORE_DIFF
 134     STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
 135     STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
 136     STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
 137     STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
 138 %endmacro
 139     ADD_IDCT4 4x4W
 140     RET
 141
 142 INIT_XMM                              ; NOTE(review): presumably remaps m0.. to xmm registers from here on -- confirm in x86inc.asm
 143
     ; x264_sub8x8_dct_sse2: r0 = dct output, r1 = pix1 (FENC_STRIDE rows),
     ; r2 = pix2 (FDEC_STRIDE rows) -- same register roles as SUB_DCT4.
     ; Processes the 8x8 block as two 8x4 halves. Each .8x4 pass writes 64
     ; bytes: SUB_DCT4 stores the low 8 bytes of m0-m3 at +0..+24 and the
     ; movhps stores put the high 8 bytes at +32..+56.
     ; The first half runs via `call .8x4`; the pointers then step to the lower
     ; half and execution falls through into .8x4, whose `ret` returns to our
     ; caller. On return r0 has advanced by 64 and r1/r2 by four rows; the
     ; SUB_NxN_DCT instantiation for x264_sub16x16_dct_sse2 compensates.
 144 cglobal x264_sub8x8_dct_sse2, 3,3
 145 .skip_prologue:
 146     call .8x4                         ; upper four rows
 147     add  r0, 64                       ; next 64 bytes of coefficients
 148     add  r1, 4*FENC_STRIDE            ; down four rows in pix1
 149     add  r2, 4*FDEC_STRIDE            ; down four rows in pix2
 150 .8x4:
 151     SUB_DCT4 2x4x4W                   ; low halves stored to +0..+24 inside the macro
 152     movhps [r0+32], m0                ; high halves -> second 32-byte group
 153     movhps [r0+40], m1
 154     movhps [r0+48], m2
 155     movhps [r0+56], m3
 156     ret
 157
     ; x264_add8x8_idct_sse2: r0 = destination pixels (FDEC_STRIDE rows),
     ; r1 = coefficients -- same register roles as ADD_IDCT4.
     ; Processes the 8x8 block as two 8x4 halves. Each .8x4 pass reads 64
     ; bytes: the low 8 bytes of m0-m3 come from +0..+24 and the high 8 bytes
     ; from +32..+56, then ADD_IDCT4 runs with the 2x4x4W transpose.
     ; The first half runs via `call .8x4`; the pointers then step to the lower
     ; half and execution falls through into .8x4, whose `ret` returns to our
     ; caller. On return r1 has advanced by 64 and r0 by four rows; the
     ; ADD_NxN_IDCT instantiation for x264_add16x16_idct_sse2 compensates.
 158 cglobal x264_add8x8_idct_sse2, 2,2,1
 159 .skip_prologue:
 160     call .8x4                         ; upper four rows
 161     add  r1, 64                       ; next 64 bytes of coefficients
 162     add  r0, 4*FDEC_STRIDE            ; down four rows in the destination
 163 .8x4:
 164     movq   m0, [r1+ 0]                ; low 8 bytes from the first 32-byte group
 165     movq   m1, [r1+ 8]
 166     movq   m2, [r1+16]
 167     movq   m3, [r1+24]
 168     movhps m0, [r1+32]                ; high 8 bytes from the second 32-byte group
 169     movhps m1, [r1+40]
 170     movhps m2, [r1+48]
 171     movhps m3, [r1+56]
 172     ADD_IDCT4 2x4x4W
 173     ret
 174
 175 ;-----------------------------------------------------------------------------
 176 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
     ;
     ; SUB_NxN_DCT name, sub, coef_step, size, xoff, yoff
     ; Emits function <name>, which covers a 2x2 arrangement of sub-blocks by
     ; running <sub> four times (top-left, top-right, bottom-left,
     ; bottom-right). The prototype above is one example instantiation.
     ;   %1 name       symbol to define
     ;   %2 sub        routine to call for each sub-block
     ;   %3 coef_step  bytes added to r0 after each of the first three calls
     ;   %4 size       sub-block width/height in pixels
     ;   %5 xoff       horizontal pixel offset <sub> leaves in r1/r2 on return
     ;   %6 yoff       number of rows <sub> leaves r1/r2 advanced on return
     ; r1 steps use FENC_STRIDE and r2 steps use FDEC_STRIDE. The fourth
     ; sub-block is reached with jmp, so <sub>'s return goes straight back to
     ; our caller and the pointers are left wherever <sub> leaves them.
 177 ;-----------------------------------------------------------------------------
 178 %macro SUB_NxN_DCT 6
 179 cglobal %1, 3,3
 180 .skip_prologue:
 181     call %2                           ; top-left
 182     add  r0, %3
 183     add  r1, %4-%5-%6*FENC_STRIDE     ; right by size, undoing the callee's xoff / yoff rows
 184     add  r2, %4-%5-%6*FDEC_STRIDE
 185     call %2                           ; top-right
 186     add  r0, %3
 187     add  r1, (%4-%6)*FENC_STRIDE-%5-%4 ; down to the next block row and back to the left column
 188     add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
 189     call %2                           ; bottom-left
 190     add  r0, %3
 191     add  r1, %4-%5-%6*FENC_STRIDE     ; right by size again
 192     add  r2, %4-%5-%6*FDEC_STRIDE
 193     jmp  %2                           ; bottom-right, as a tail call
 194 %endmacro
 195
 196 ;-----------------------------------------------------------------------------
 197 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
     ;
     ; ADD_NxN_IDCT name, sub, coef_step, size, xoff, yoff
     ; Emits function <name>, which covers a 2x2 arrangement of sub-blocks by
     ; running <sub> four times (top-left, top-right, bottom-left,
     ; bottom-right). The prototype above is one example instantiation.
     ;   %1 name       symbol to define
     ;   %2 sub        routine to call for each sub-block
     ;   %3 coef_step  bytes added to r1 after each of the first three calls
     ;   %4 size       sub-block width/height in pixels
     ;   %5 xoff       horizontal pixel offset <sub> leaves in r0 on return
     ;   %6 yoff       number of rows <sub> leaves r0 advanced on return
     ; r0 (pixels) steps use FDEC_STRIDE. The fourth sub-block is reached with
     ; jmp, so <sub>'s return goes straight back to our caller and the pointers
     ; are left wherever <sub> leaves them.
 198 ;-----------------------------------------------------------------------------
 199 %macro ADD_NxN_IDCT 6
 200 cglobal %1, 2,2,1
 201 .skip_prologue:
 202     call %2                           ; top-left
 203     add  r0, %4-%5-%6*FDEC_STRIDE     ; right by size, undoing the callee's xoff / yoff rows
 204     add  r1, %3
 205     call %2                           ; top-right
 206     add  r0, (%4-%6)*FDEC_STRIDE-%5-%4 ; down to the next block row and back to the left column
 207     add  r1, %3
 208     call %2                           ; bottom-left
 209     add  r0, %4-%5-%6*FDEC_STRIDE     ; right by size again
 210     add  r1, %3
 211     jmp  %2                           ; bottom-right, as a tail call
 212 %endmacro
 213
     ; Instantiations of the 2x2 wrappers. Arguments after the callee are
     ; coef_step, size, xoff, yoff (see SUB_NxN_DCT / ADD_NxN_IDCT).
     ; The MMX variants are only assembled when ARCH_X86_64 is not defined.
 214 %ifndef ARCH_X86_64
     ; 8x8 from four 4x4 blocks: 32 bytes of coefficients per 4x4, and the 4x4
     ; routines leave the pixel pointers untouched (xoff = yoff = 0).
 215 SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
 216 ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
     ; 16x16 from four 8x8 blocks. The MMX 8x8 wrapper returns with the
     ; coefficient pointer advanced by 3*32 and the pixel pointers sitting on
     ; its last 4x4 block (+4 columns, +4 rows), hence coef_step 32 and
     ; xoff = yoff = 4.
 217 SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
 218 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
 219
     ; 16x16 via 8x8 transforms that are defined outside this file.
     ; NOTE(review): offsets 0,0 assume those routines return with all pointers
     ; unmodified -- confirm against their definitions.
 220 cextern x264_sub8x8_dct8_mmx.skip_prologue
 221 cextern x264_add8x8_idct8_mmx.skip_prologue
 222 SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
 223 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
     ; When ARCH_X86_64 is not defined, redirect the SSE2 dct8 names to their
     ; .skip_prologue entry points, so the cextern lines and wrapper
     ; instantiations further down refer to those labels; on x86_64 they refer
     ; to the plain symbols.
 224 %define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
 225 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
 226 %endif
 227
     ; 16x16 from four SSE2 8x8 blocks: each 8x8 call returns with the
     ; coefficient pointer advanced by 64 and the pixel pointers four rows
     ; down, hence coef_step 64, xoff 0, yoff 4.
 228 SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
 229 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
 230
     ; 16x16 via SSE2 8x8 transforms that are defined outside this file.
     ; NOTE(review): offsets 0,0 assume those routines return with all pointers
     ; unmodified -- confirm against their definitions.
 231 cextern x264_sub8x8_dct8_sse2
 232 cextern x264_add8x8_idct8_sse2
 233 SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
 234 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
 235
 236
 237
 238 ;-----------------------------------------------------------------------------
 239 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
     ;
     ; r0 = level (output), r1 = dct (input), r2 = scratch.
     ; Copies the 16 coefficients to level, reordering only words 2..5:
     ;   level[0..1]  = dct[0..1]
     ;   level[2..5]  = dct[4], dct[2], dct[3], dct[5]
     ;   level[6..15] = dct[6..15]
 240 ;-----------------------------------------------------------------------------
 241 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
 242 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
 243     pshufw     mm0, [r1+4], 0xd2      ; words 2..5 picked in source order 2,0,1,3 (0xd2 = 11 01 00 10b)
 244     movq       mm1, [r1+16]           ; words 8..11
 245     movq       mm2, [r1+24]           ; words 12..15
 246     movq    [r0+4], mm0               ; level[2..5]
 247     movq   [r0+16], mm1               ; level[8..11]
 248     movq   [r0+24], mm2               ; level[12..15]
 249     mov        r2d, [r1]              ; words 0..1 copied as one dword
 250     mov       [r0], r2d
 251     mov        r2d, [r1+12]           ; words 6..7 copied as one dword
 252     mov    [r0+12], r2d
 253     RET
 254
 255 ;-----------------------------------------------------------------------------
 256 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
     ;
     ; r0 = level (output), r1 = src (rows FENC_STRIDE apart),
     ; r2 = dst (rows FDEC_STRIDE apart).
     ; Reads the 4x4 byte blocks at src and dst, overwrites the dst block with
     ; the src pixels, and writes level[i] = src[z[i]] - old_dst[z[i]] as
     ; 16-bit values, where z is the pb_zigzag4 order over the row-major block.
     ; level is written with movdqa, so it must be 16-byte aligned.
     ; Uses xmm0-xmm7.
 257 ;-----------------------------------------------------------------------------
 258 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
 259     movd      xmm0, [r1+0*FENC_STRIDE] ; src rows 0..3, 4 bytes each
 260     movd      xmm1, [r1+1*FENC_STRIDE]
 261     movd      xmm2, [r1+2*FENC_STRIDE]
 262     movd      xmm3, [r1+3*FENC_STRIDE]
 263     movd      xmm4, [r2+0*FDEC_STRIDE] ; dst rows 0..3, read before they are overwritten
 264     movd      xmm5, [r2+1*FDEC_STRIDE]
 265     movd      xmm6, [r2+2*FDEC_STRIDE]
 266     movd      xmm7, [r2+3*FDEC_STRIDE]
 267     movd      [r2+0*FDEC_STRIDE], xmm0 ; dst block := src block
 268     movd      [r2+1*FDEC_STRIDE], xmm1
 269     movd      [r2+2*FDEC_STRIDE], xmm2
 270     movd      [r2+3*FDEC_STRIDE], xmm3
 271     picgetgot r1                      ; r1 (src) is not read after this point. NOTE(review): presumably loads the GOT base for the GLOBAL access in PIC builds -- confirm in x86inc.asm
 272     punpckldq xmm0, xmm1              ; xmm0 low qword = src row0,row1
 273     punpckldq xmm2, xmm3              ; xmm2 low qword = src row2,row3
 274     punpckldq xmm4, xmm5              ; same packing for the old dst rows
 275     punpckldq xmm6, xmm7
 276     movlhps   xmm0, xmm2              ; xmm0 = 16 src pixels, row-major
 277     movlhps   xmm4, xmm6              ; xmm4 = 16 old dst pixels, row-major
 278     movdqa    xmm7, [pb_zigzag4 GLOBAL] ; shuffle control; aligned load
 279     pshufb    xmm0, xmm7              ; reorder both blocks into pb_zigzag4 order
 280     pshufb    xmm4, xmm7
 281     pxor      xmm6, xmm6              ; zero, for byte -> word widening
 282     movdqa    xmm1, xmm0
 283     movdqa    xmm5, xmm4
 284     punpcklbw xmm0, xmm6              ; src pixels 0..7 as words
 285     punpckhbw xmm1, xmm6              ; src pixels 8..15 as words
 286     punpcklbw xmm4, xmm6              ; old dst pixels 0..7 as words
 287     punpckhbw xmm5, xmm6              ; old dst pixels 8..15 as words
 288     psubw     xmm0, xmm4              ; residual = src - old dst
 289     psubw     xmm1, xmm5
 290     movdqa    [r0], xmm0              ; level[0..7]
 291     movdqa [r0+16], xmm1              ; level[8..15]
 292     RET