1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
19 ;*****************************************************************************
23 ;=============================================================================
24 ; Macros and other preprocessor constants
25 ;=============================================================================
27 %include "amd64inc.asm"
29 ;=============================================================================
31 ;=============================================================================
33 SECTION .rodata align=16
40 ;=============================================================================
42 ;=============================================================================
59 psubw mm1, mm2 ; a-5*b+4*c
63 paddw mm1, mm3 ; a-5*b+20*c
70 psraw mm1, 2 ; (a-b)/4
72 psubw mm1, mm2 ; (a-b)/4-b
74 paddw mm1, mm3 ; (a-b)/4-b+c
76 psraw mm1, 2 ; ((a-b)/4-b+c)/4
78 paddw mm1, mm3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
91 ;=============================================================================
93 ;=============================================================================
; Declarations for the two routines whose labels appear later in this file.
; NOTE(review): cglobal is not defined in this listing (presumably provided by
; amd64inc.asm) -- confirm what it expands to before relying on symbol naming.
97 cglobal x264_hpel_filter_mmxext
98 cglobal x264_plane_copy_mmxext
100 ;-----------------------------------------------------------------------------
101 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
102 ; int i_stride, int i_width, int i_height );
103 ;-----------------------------------------------------------------------------
106 x264_hpel_filter_mmxext :
; NOTE(review): the prologue that establishes rbp, and any conditional-assembly
; directives around the stack loads below, are not present in this listing.
; The rbp-relative offsets are only meaningful relative to that prologue.
127 movsxd r8, dword [rbp+72] ; r8 = sign-extended 32-bit stack slot (presumably i_stride -- confirm)
128 movsxd r9, dword [rbp+80] ; r9 = sign-extended 32-bit stack slot (presumably i_width -- confirm)
129 mov ebx, dword [rbp+88] ; ebx = 32-bit stack slot (presumably i_height -- confirm)
131 mov ebx, dword [rbp+24] ; NOTE(review): overwrites the ebx load just above unless the two loads sit in alternative %if/%else branches -- confirm
; tbuffer is a textual alias for the address expression rsp + 8. It is used
; below as the base of a temporary buffer indexed by x*2, and the slot at
; tbuffer - 8 (i.e. rsp) is written as left padding.
143 %define tbuffer rsp + 8
145 lea stride3, [stride*3] ; stride3 = 3 * stride
146 lea stride5, [stride*5] ; stride5 = 5 * stride
150 lea rax, [stride*2 + 24] ; rax = 2*stride + 24 (NOTE(review): the consumer of rax is not shown here)
161 prefetcht0 [src + stride5 + 32] ; prefetch at byte 32 of the row 5 strides below src
; LOAD_ADD is defined outside this listing. Each use names one destination
; register and two row addresses mirrored about the middle of a six-row
; window (rows 0/5, 1/4, 2/3). The second group repeats this 4 bytes further on.
163 LOAD_ADD mm1, [src ], [src + stride5 ] ; a0: rows 0 and 5
164 LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0: rows 1 and 4
165 LOAD_ADD mm3, [src + stride*2 ], [src + stride3 ] ; c0: rows 2 and 3
166 LOAD_ADD mm4, [src + 4], [src + stride5 + 4] ; a1: rows 0 and 5, byte offset +4
167 LOAD_ADD mm5, [src + stride + 4], [src + stride*4 + 4] ; b1: rows 1 and 4, byte offset +4
168 LOAD_ADD mm6, [src + stride*2 + 4], [src + stride3 + 4] ; c1: rows 2 and 3, byte offset +4
172 movq mm7, [pw_16 GLOBAL] ; mm7 = pw_16 constant (defined outside this listing)
173 movq [tbuffer + x*2], mm1 ; store mm1 (8 bytes) into the temporary buffer at byte offset x*2
174 movq [tbuffer + x*2 + 8], mm4 ; store mm4 into the following 8 bytes of the temporary buffer
180 movntq [dstv + x], mm1 ; non-temporal 8-byte store of mm1 to dstv + x
187 pshufw mm2, [tbuffer], 0 ; replicate the first 16-bit word at tbuffer into all four words of mm2
188 movq [tbuffer - 8], mm2 ; pad left
189 ; no need to pad right, since vertical_filter already did 4 extra pixels
193 movq mm7, [pw_32 GLOBAL] ; mm7 = pw_32 constant (defined outside this listing)
; The loads below read 8-byte groups from the temporary buffer at 16-bit word
; offsets -2, -1, 0, +2, +3 and +4 relative to tbuffer + x*2. The paddw lines
; add the groups at word offsets +1, +7, +6 and +5, pairing them as
; c0 = (0,+1), a1 = (+2,+7), b1 = (+3,+6), c1 = (+4,+5).
196 movq mm1, [tbuffer + x*2 - 4 ] ; group starting 2 words before x
197 movq mm2, [tbuffer + x*2 - 2 ] ; group starting 1 word before x
198 movq mm3, [tbuffer + x*2 ] ; group starting at x
199 movq mm4, [tbuffer + x*2 + 4 ] ; group starting 2 words after x
200 movq mm5, [tbuffer + x*2 + 6 ] ; group starting 3 words after x
201 paddw mm3, [tbuffer + x*2 + 2 ] ; c0: word offsets 0 and +1
204 movq mm6, [tbuffer + x*2 + 8 ] ; group starting 4 words after x
205 paddw mm4, [tbuffer + x*2 + 14] ; a1: word offsets +2 and +7
206 paddw mm5, [tbuffer + x*2 + 12] ; b1: word offsets +3 and +6
207 paddw mm6, [tbuffer + x*2 + 10] ; c1: word offsets +4 and +5
211 movntq [dstc + x], mm1 ; non-temporal 8-byte store of mm1 to dstc + x
217 lea src, [src + stride*2] ; advance src by two rows (2 * stride bytes)
; The movd loads below each fetch 4 bytes from the row at src, at byte
; offsets -2, -1, +1, +2, +3 and then +7, +6, +5, +4 relative to src + x.
; The instructions that combine them are not present in this listing.
221 movd mm1, [src + x - 2] ; 4 bytes starting 2 bytes before src + x
222 movd mm2, [src + x - 1] ; 4 bytes starting 1 byte before src + x
224 movd mm6, [src + x + 1] ; 4 bytes starting at byte offset +1
225 movd mm4, [src + x + 2] ; 4 bytes starting at byte offset +2
226 movd mm5, [src + x + 3] ; 4 bytes starting at byte offset +3
236 movd mm7, [src + x + 7] ; 4 bytes starting at byte offset +7
237 movd mm6, [src + x + 6] ; 4 bytes starting at byte offset +6
242 movd mm7, [src + x + 5] ; 4 bytes starting at byte offset +5
243 movd mm6, [src + x + 4] ; 4 bytes starting at byte offset +4
248 movq mm7, [pw_1 GLOBAL] ; mm7 = pw_1 constant (defined outside this listing)
251 movntq [dsth + x], mm1 ; non-temporal 8-byte store of mm1 to dsth + x
255 jl .horizontal_filter ; branch back on signed less-than (the flag-setting compare and the label are not shown here)
275 ;-----------------------------------------------------------------------------
276 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
277 ; uint8_t *src, int i_src, int w, int h)
278 ;-----------------------------------------------------------------------------
280 x264_plane_copy_mmxext:
; NOTE(review): parm2q/parm2d and parm4q/parm4d are not defined in this
; listing (presumably the 64-/32-bit views of the 2nd and 4th argument
; registers from amd64inc.asm) -- confirm.
281 movsxd parm2q, parm2d ; sign-extend the 32-bit parm2 value to 64 bits (i_dst, if parm2 is the 2nd argument)
282 movsxd parm4q, parm4d ; sign-extend the 32-bit parm4 value to 64 bits (i_src, if parm4 is the 4th argument)
287 ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
295 prefetchnta [rsi+256] ; non-temporal prefetch 256 bytes ahead of rsi (the source pointer per the comment above)
317 prefetchnta [rsi+256] ; same prefetch distance, at a second site (the surrounding loops are not shown here)