1 ;*****************************************************************************
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 ;*****************************************************************************
21 ; FIXME: All of the 64-bit asm functions that take a stride as an argument
22 ; via register assume that the high dword of that register is filled with 0.
23 ; This is true in practice (since we never do any 64-bit arithmetic on strides,
24 ; and x264's strides are all positive), but is not guaranteed by the ABI.
26 ; Name of the .rodata section.
27 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
28 ; so use a different read-only section.
29 %macro SECTION_RODATA 0
30 %ifidn __OUTPUT_FORMAT__,macho64
31 SECTION .text align=16
32 %elifidn __OUTPUT_FORMAT__,macho
33 SECTION .text align=16
36 SECTION .rodata align=16
41 ; x86_64 can't fit 64bit address literals in most instruction types,
42 ; so shared objects (under the assumption that they might be anywhere
43 ; in memory) must use an address mode that does fit.
44 ; So all accesses to global variables must use this macro, e.g.
45 ; mov eax, [foo GLOBAL]
49 ; x86_32 doesn't require PIC.
50 ; Some distros prefer shared objects to be PIC, but nothing breaks if
51 ; the code contains a few textrels, so we'll skip that complexity.
57 %define GLOBAL wrt rip
62 ; Macros to eliminate most code duplication between x86_32 and x86_64:
63 ; Currently this works only for leaf functions which load all their arguments
64 ; into registers at the start, and make no other use of the stack. Luckily that
65 ; covers most of x264's asm.
68 ; %1 = number of arguments. Loads them from the stack if needed.
69 ; %2 = number of registers used. Pushes callee-saved regs if needed.
70 ; %3 = list of names to define as aliases for the registers.
71 ; PROLOGUE can also be invoked by adding the same options to cglobal, e.g.
74 ; cglobal foo, 2,3, dst, src, tmp
75 ; declares a function (foo) taking two args (dst and src) and one local variable (tmp).
77 ; TODO Some functions can use some args directly from the stack. If they're the
78 ; last args then you can just not declare them, but if they're in the middle
79 ; we need a more flexible macro.
82 ; Pops anything that was pushed by PROLOGUE
85 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
86 ; which are slow when a normal ret follows a branch.
97 %macro DECLARE_REG_SIZE 2
; Instantiate DECLARE_REG_SIZE once per legacy general-purpose register,
; passing the register's 16-bit name and its low-byte name.
; NOTE(review): the macro body is not visible in this view; presumably it
; derives the size-suffixed aliases for each register from these two names
; -- confirm against the definition.
111 DECLARE_REG_SIZE ax, al
112 DECLARE_REG_SIZE bx, bl
113 DECLARE_REG_SIZE cx, cl
114 DECLARE_REG_SIZE dx, dl
115 DECLARE_REG_SIZE si, sil
116 DECLARE_REG_SIZE di, dil
117 DECLARE_REG_SIZE bp, bpl
119 ; t# defines for when per-arch register allocation is more complex than just function arguments
121 %macro DECLARE_REG_TMP 1-*
124 CAT_XDEFINE t, %%i, r%1
130 %macro DECLARE_REG_TMP_SIZE 0-*
132 %define t%1q t%1 %+ q
133 %define t%1d t%1 %+ d
134 %define t%1w t%1 %+ w
135 %define t%1b t%1 %+ b
140 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
150 %assign stack_offset stack_offset+gprsize
155 %assign stack_offset stack_offset-gprsize
161 %assign stack_offset stack_offset+(%2)
168 %assign stack_offset stack_offset-(%2)
178 %macro movsxdifnidn 2
190 %macro DEFINE_ARGS 0-*
194 CAT_UNDEF arg_name %+ %%i, q
195 CAT_UNDEF arg_name %+ %%i, d
196 CAT_UNDEF arg_name %+ %%i, w
197 CAT_UNDEF arg_name %+ %%i, b
198 CAT_UNDEF arg_name, %%i
205 %xdefine %1q r %+ %%i %+ q
206 %xdefine %1d r %+ %%i %+ d
207 %xdefine %1w r %+ %%i %+ w
208 %xdefine %1b r %+ %%i %+ b
209 CAT_XDEFINE arg_name, %%i, %1
213 %assign n_arg_names %%i
216 %ifdef ARCH_X86_64 ;========================================================
; Register table for the ARCH_X86_64 branch.
; NOTE(review): DECLARE_REG is defined outside this view; the columns appear
; to be: index, 64-bit name, 32-bit name, 16-bit name, 8-bit name, and the
; operand through which the argument is reached on entry -- confirm against
; the macro definition.
; Indices 0-5 list rdi, rsi, rdx, rcx, r8, r9 (the System V AMD64
; integer-argument order) and give a register in the last column.
; Index 6 pairs rax with a stack slot, and r7m/r8m name the following two
; slots at 8-byte steps; all three are addressed relative to
; rsp + stack_offset.
218 DECLARE_REG 0, rdi, edi, di, dil, edi
219 DECLARE_REG 1, rsi, esi, si, sil, esi
220 DECLARE_REG 2, rdx, edx, dx, dl, edx
221 DECLARE_REG 3, rcx, ecx, cx, cl, ecx
222 DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
223 DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
224 DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
225 %define r7m [rsp + stack_offset + 16]
226 %define r8m [rsp + stack_offset + 24]
228 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
230 mov r%1, [rsp - 40 + %1*8]
234 %macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
237 %assign stack_offset 0
250 %else ; X86_32 ;==============================================================
; Register table for the X86_32 branch.
; NOTE(review): DECLARE_REG is defined outside this view; the columns appear
; to be: index, full-width name, 32-bit name, 16-bit name, 8-bit name, and
; the operand through which the argument is reached on entry -- confirm
; against the macro definition.
; Every index has a stack slot in the last column, starting at
; esp + stack_offset + 4 and advancing in 4-byte steps; r7m/r8m continue the
; same sequence (offsets 32 and 36).
; The 8-bit column is `null` for esi, edi and ebp, presumably because those
; registers have no byte-sized sub-register in 32-bit mode.
252 DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
253 DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
254 DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
255 DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
256 DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
257 DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
258 DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
259 %define r7m [esp + stack_offset + 32]
260 %define r8m [esp + stack_offset + 36]
263 %macro PUSH_IF_USED 1 ; reg_id
266 %assign stack_offset stack_offset+4
270 %macro POP_IF_USED 1 ; reg_id
276 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
278 mov r%1, [esp + stack_offset + 4 + %1*4]
282 %macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
284 %assign stack_offset 0
286 ASSERT regs_used <= 7
317 %endif ;======================================================================
321 ;=============================================================================
322 ; arch-independent part
323 ;=============================================================================
325 %assign function_align 16
327 ; Symbol prefix for C linkage
329 %ifidn __OUTPUT_FORMAT__,elf
331 global _%1:function hidden
334 global %1:function hidden
346 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
361 ; This is needed for ELF; otherwise the GNU linker assumes the stack is
362 ; executable by default.
363 %ifidn __OUTPUT_FORMAT__,elf
364 SECTION .note.GNU-stack noalloc noexec nowrite progbits
367 %assign FENC_STRIDE 16
368 %assign FDEC_STRIDE 32
381 %define RESET_MM_PERMUTATION INIT_MMX
390 CAT_XDEFINE m, %%i, mm %+ %%i
391 CAT_XDEFINE nmm, %%i, %%i
402 %define RESET_MM_PERMUTATION INIT_XMM
406 %define num_mmregs 16
411 %define movnt movntdq
414 CAT_XDEFINE m, %%i, xmm %+ %%i
415 CAT_XDEFINE nxmm, %%i, %%i
422 ; I often want to use macros that permute their arguments; e.g. there's no
423 ; efficient way to implement butterfly or transpose or dct without swapping some arguments.
426 ; I would like to not have to manually keep track of the permutations:
427 ; If I insert a permutation in the middle of a function, it should automatically
428 ; change everything that follows. For more complex macros I may also have multiple
429 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
431 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
432 ; permutes its arguments. It's equivalent to exchanging the contents of the
433 ; registers, except that this way you exchange the register names instead, so it
434 ; doesn't cost any cycles.
436 %macro PERMUTE 2-* ; takes a list of pairs to swap
451 %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
457 CAT_XDEFINE n, m%1, %1
458 CAT_XDEFINE n, m%2, %2
460 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1", infer the original numbers here.
461 ; Be careful using this mode in nested macros though, as in some cases there may be
462 ; other copies of m# that have already been dereferenced and don't get updated correctly.
463 %xdefine %%n1 n %+ %1
464 %xdefine %%n2 n %+ %2
465 %xdefine tmp m %+ %%n1
466 CAT_XDEFINE m, %%n1, m %+ %%n2
467 CAT_XDEFINE m, %%n2, tmp
468 CAT_XDEFINE n, m %+ %%n1, %%n1
469 CAT_XDEFINE n, m %+ %%n2, %%n2
476 %macro SAVE_MM_PERMUTATION 1
479 CAT_XDEFINE %1_m, %%i, m %+ %%i
484 %macro LOAD_MM_PERMUTATION 1
487 CAT_XDEFINE m, %%i, %1_m %+ %%i
488 CAT_XDEFINE n, m %+ %%i, %%i
496 LOAD_MM_PERMUTATION %1