;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

;*****************************************************************************
;*  Revision history:                                                        *
;*                                                                           *
;*  2004.04.28  ported all 4x4 functions to nasm (CM)                        *
;*****************************************************************************

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"
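
; The transpose macros below rely on SBUTTERFLY, which is expected to come
; from amd64inc.asm.  It is an interleave step; a plausible shape (a sketch,
; not necessarily the exact definition used) is:
;
;   %macro SBUTTERFLY 5         ; mov-suffix, punpck-suffix, a, b, tmp
;       mov%1       %5, %3      ; tmp = a
;       punpckl%2   %3, %4      ; a   = interleave of the low  halves of a, b
;       punpckh%2   %5, %4      ; tmp = interleave of the high halves of a, b
;   %endmacro
;
; Two rounds of such interleaves (words, then dwords) transpose a 4x4 block
; of 16-bit values; three rounds (words, dwords, qwords) transpose 8x8.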

%macro MMX_LOAD_DIFF_4P 5
%macro MMX_LOAD_DIFF_8P 5
%macro MMX_SUMSUB_BA 2
%macro MMX_SUMSUB_BADC 4
%macro MMX_SUMSUB2_AB 3
%macro MMX_SUMSUBD2_AB 4
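
; The SUMSUB helpers are 16-bit butterflies.  As the register comments in the
; functions below show (e.g. "mm2=s03+s12  mm3=s03-s12"), MMX_SUMSUB_BA a, b
; leaves the sum a+b in a and the difference b-a in b, and MMX_SUMSUB_BADC is
; simply two such butterflies.  A minimal two-operand butterfly with that
; behaviour, under a hypothetical name (not necessarily the exact body used
; by the macros above):
%macro SUMSUB_BA_EXAMPLE 2
    paddw   %1, %2              ; %1 = a+b
    paddw   %2, %2              ; %2 = 2*b
    psubw   %2, %1              ; %2 = 2*b - (a+b) = b-a
%endmacro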

;-----------------------------------------------------------------------------
; input ABCD output ADTC
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro

;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro SSE2_TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro

%macro MMX_STORE_DIFF_4P 5
%macro MMX_STORE_DIFF_8P 4
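
; Judging from their use below, the LOAD_DIFF helpers read pixels from pix1
; and pix2, widen them to 16 bits and keep the difference pix1-pix2, while
; the STORE_DIFF helpers finish an inverse transform: scale the residual back
; down (the rounded >>6 that pw_32 feeds), add the prediction pixels read
; from the destination, saturate to 8 bits and write the result back.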

;=============================================================================
; Constants
;=============================================================================

SECTION .rodata align=16
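
; Rounding constants referenced below; widths and values are inferred from
; how they are used (pw_1 for the (x+1)>>1 rounding of the DC transform,
; pw_32 for the (x+32)>>6 rounding of the inverse transforms).
pw_1:  times 8 dw 1
pw_32: times 8 dw 32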

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_dct4x4dc_mmxext

;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmxext( int16_t d[4][4] )
;-----------------------------------------------------------------------------
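; Each 1D pass computes the four Hadamard combinations spelled out in the
; register comments below: with s01=x0+x1, d01=x0-x1, s23=x2+x3, d23=x2-x3
; the outputs are s01+s23, s01-s23, d01+d23 and d01-d23, i.e. the four +-1
; combinations of the inputs.  The pass is applied once, the 4x4 block is
; transposed, and the pass is applied again before the result is rounded and
; stored back.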
x264_dct4x4dc_mmxext:
    movq    mm0, [parm1q+ 0]
    movq    mm1, [parm1q+ 8]
    movq    mm2, [parm1q+16]
    movq    mm3, [parm1q+24]

    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    MMX_TRANSPOSE   mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3

    movq    mm6, [pw_1 GLOBAL]
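    ; The pw_1 constant just loaded presumably feeds a (x+1)>>1 rounding of
    ; every coefficient before it is written back, e.g. for the first row:
    ;     paddw   mm0, mm6            ; x+1
    ;     psraw   mm0, 1              ; (x+1)>>1
    ;     movq    [parm1q+ 0], mm0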

cglobal x264_idct4x4dc_mmxext

;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmxext( int16_t d[4][4] )
;-----------------------------------------------------------------------------
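; Same 4-point Hadamard butterfly as the forward DC transform above, applied
; in two passes with transposes in between; note that no rounding constant is
; involved here, the butterfly results are stored back directly.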
x264_idct4x4dc_mmxext:
    movq    mm0, [parm1q+ 0]
    movq    mm1, [parm1q+ 8]
    movq    mm2, [parm1q+16]
    movq    mm3, [parm1q+24]

    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2          ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0          ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4     ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4          ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3          ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    MMX_TRANSPOSE   mm0, mm2, mm3, mm4, mm1     ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3

    movq    [parm1q+ 0], mm0
    movq    [parm1q+ 8], mm4
    movq    [parm1q+16], mm1
    movq    [parm1q+24], mm3

cglobal x264_sub4x4_dct_mmxext

;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
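; Each 1D pass computes the H.264 4x4 core transform of the pix1-pix2
; difference.  With s03=x0+x3, d03=x0-x3, s12=x1+x2, d12=x1-x2 (see the
; register comments below) the four outputs are:
;   s03+s12   = x0+x1+x2+x3
;   2*d03+d12 = 2*x0+x1-x2-2*x3
;   s03-s12   = x0-x1-x2+x3
;   d03-2*d12 = x0-2*x1+2*x2-x3
; The pass is applied in both directions, with a transpose in between.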
x264_sub4x4_dct_mmxext:
    mov     r10, parm1q         ; dct
    mov     rax, parm2q         ; pix1
%ifdef WIN64
    mov     rcx, parm4q         ; pix2
    movsxd  rdx, dword [rsp+40+8] ; i_pix2
    movsxd  rbx, parm3d         ; i_pix1
%else
    movsxd  rbx, parm3d         ; i_pix1
    movsxd  rdx, parm5d         ; i_pix2 (pix2 is already in rcx)
%endif

    MMX_LOAD_DIFF_4P mm0, mm6, mm7, [rax      ], [rcx]
    MMX_LOAD_DIFF_4P mm1, mm6, mm7, [rax+rbx  ], [rcx+rdx]
    MMX_LOAD_DIFF_4P mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
    add     rax, rbx            ; pix1 += i_pix1
    add     rcx, rdx            ; pix2 += i_pix2, so the next load hits row 3
    MMX_LOAD_DIFF_4P mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]

    MMX_SUMSUB_BADC mm3, mm0, mm2, mm1          ; mm3=s03  mm0=d03  mm2=s12  mm1=d12

    MMX_SUMSUB_BA   mm2, mm3                    ; mm2=s03+s12  mm3=s03-s12
    MMX_SUMSUB2_AB  mm0, mm1, mm4               ; mm0=2.d03+d12  mm4=d03-2.d12

    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
    MMX_TRANSPOSE   mm2, mm0, mm3, mm4, mm1

    MMX_SUMSUB_BADC mm3, mm2, mm1, mm4          ; mm3=s03  mm2=d03  mm1=s12  mm4=d12

    MMX_SUMSUB_BA   mm1, mm3                    ; mm1=s03+s12  mm3=s03-s12
    MMX_SUMSUB2_AB  mm2, mm4, mm0               ; mm2=2.d03+d12  mm0=d03-2.d12

    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
    MMX_TRANSPOSE   mm1, mm2, mm3, mm0, mm4

    movq    [r10+ 0], mm1       ; dct

cglobal x264_add4x4_idct_mmxext

;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
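; Each 1D pass is the H.264 4x4 inverse transform.  With the names used in
; the register comments below, s02=x0+x2, d02=x0-x2, s13=x1+(x3>>1) and
; d13=(x1>>1)-x3, and the outputs are s02+s13, d02+d13, d02-d13 and s02-s13.
; The result of the second pass is rounded via pw_32 and added back onto the
; prediction in *p_dst by the store macro.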
x264_add4x4_idct_mmxext:
    movq    mm0, [parm3q+ 0]    ; dct
    movq    mm4, [parm3q+ 8]
    movq    mm3, [parm3q+16]
    movq    mm1, [parm3q+24]

    mov     rax, parm1q         ; p_dst
    movsxd  rcx, parm2d         ; i_dst
    lea     rdx, [rcx+rcx*2]    ; rdx = 3*i_dst, for the last-row store

    ; in: mm0, mm4, mm3, mm1  out: mm0, mm1, mm2, mm3
    MMX_TRANSPOSE   mm0, mm4, mm3, mm1, mm2

    MMX_SUMSUB_BA   mm2, mm0                    ; mm2=s02  mm0=d02
    MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4          ; mm1=s13  mm4=d13 ( i.e. 1 + 3>>1 and 1>>1 - 3 )

    MMX_SUMSUB_BADC mm1, mm2, mm4, mm0          ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13

    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
    MMX_TRANSPOSE   mm1, mm4, mm0, mm2, mm3

    MMX_SUMSUB_BA   mm3, mm1                    ; mm3=s02  mm1=d02
    MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4          ; mm2=s13  mm4=d13 ( i.e. 1 + 3>>1 and 1>>1 - 3 )

    MMX_SUMSUB_BADC mm2, mm3, mm4, mm1          ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13

    movq    mm6, [pw_32 GLOBAL]                 ; +32 rounding for the final >>6

    MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [rax]
    MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [rax+rcx]
    MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [rax+rcx*2]
    MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [rax+rdx]

;=============================================================================
; 8x8 Transform
;=============================================================================

%macro DCT8_1D 10
    MMX_SUMSUB_BA   %8, %1                      ; %8=s07, %1=d07
    MMX_SUMSUB_BA   %7, %2                      ; %7=s16, %2=d16
    MMX_SUMSUB_BA   %6, %3                      ; %6=s25, %3=d25
    MMX_SUMSUB_BA   %5, %4                      ; %5=s34, %4=d34

    MMX_SUMSUB_BA   %5, %8                      ; %5=a0, %8=a2
    MMX_SUMSUB_BA   %6, %7                      ; %6=a1, %7=a3

    psubw   %10, %3                             ; %10=a7

    MMX_SUMSUB_BA   %6, %5                      ; %6=b0, %5=b4

    psubw   %9, %10                             ; %9=b7

cglobal x264_sub8x8_dct8_sse2

;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
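; This routine addresses the SysV argument registers directly (rdi=dct,
; rsi=pix1, edx=i_pix1, rcx=pix2, r8d=i_pix2), which is why the
; register-to-itself moves below are commented out.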
x264_sub8x8_dct8_sse2:
;   mov     rsi, rsi            ; pix1
    movsxd  rdx, edx            ; i_pix1
;   mov     rcx, rcx            ; pix2
    movsxd  r8, r8d             ; i_pix2

    MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [rsi      ], [rcx]
    MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
    MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    lea     r9,  [rdx+rdx*2]    ; r9  = 3*i_pix1
    lea     r10, [r8+r8*2]      ; r10 = 3*i_pix2
    add     rsi, r9             ; pix1 += 3*i_pix1
    add     rcx, r10            ; pix2 += 3*i_pix2
    MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [rsi      ], [rcx]
    MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
    MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [rsi+r9   ], [rcx+r10]
    MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]

    SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    DCT8_1D           xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
    SSE2_TRANSPOSE8x8 xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
    DCT8_1D           xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9

    movdqa  [rdi+0x00], xmm8
    movdqa  [rdi+0x10], xmm3
    movdqa  [rdi+0x20], xmm6
    movdqa  [rdi+0x30], xmm7
    movdqa  [rdi+0x40], xmm0
    movdqa  [rdi+0x50], xmm2
    movdqa  [rdi+0x60], xmm5
    movdqa  [rdi+0x70], xmm1

%macro IDCT8_1D 10
    MMX_SUMSUB_BA   %5, %1                      ; %5=a0, %1=a2

    paddw   %7, %10                             ; %7=a6

    psubw   %10, %2                             ; %10=a5

    MMX_SUMSUB_BA   %7, %5                      ; %7=b0, %5=b6
    MMX_SUMSUB_BA   %3, %1                      ; %3=b2, %1=b4

    psubw   %2, %10                             ; %2=b5

    MMX_SUMSUB_BA   %9, %7                      ; %9=c0, %7=c7
    MMX_SUMSUB_BA   %2, %3                      ; %2=c1, %3=c6
    MMX_SUMSUB_BA   %8, %1                      ; %8=c2, %1=c5
    MMX_SUMSUB_BA   %4, %5                      ; %4=c3, %5=c4
%endmacro

cglobal x264_add8x8_idct8_sse2

;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
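; Flow: transpose, one 1D inverse pass, transpose back, add the +32 rounding
; term, second 1D pass; MMX_STORE_DIFF_8P then handles the final >>6, the add
; onto *p_dst and the saturation for each row.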
x264_add8x8_idct8_sse2:
    movsxd  rsi, esi            ; i_dst
    movdqa  xmm0, [rdx+0x00]    ; dct
    movdqa  xmm1, [rdx+0x10]
    movdqa  xmm2, [rdx+0x20]
    movdqa  xmm3, [rdx+0x30]
    movdqa  xmm4, [rdx+0x40]
    movdqa  xmm5, [rdx+0x50]
    movdqa  xmm6, [rdx+0x60]
    movdqa  xmm7, [rdx+0x70]

    SSE2_TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    IDCT8_1D          xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
    SSE2_TRANSPOSE8x8 xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
    paddw   xmm9, [pw_32 GLOBAL]                ; rounding for the >>6 at the end
    IDCT8_1D          xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7

    MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [rdi]
    MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [rdi+rsi]
    MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [rdi+rsi*2]
    lea     rax, [rsi+rsi*2]    ; rax = 3*i_dst
    lea     rdi, [rdi+rax]      ; dst += 3*i_dst, on to rows 3..7
    MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [rdi]
    MMX_STORE_DIFF_8P xmm4, xmm14, xmm15, [rdi+rsi]
    MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [rdi+rsi*2]
    MMX_STORE_DIFF_8P xmm2, xmm14, xmm15, [rdi+rax]
    MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [rdi+rsi*4]