1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
19 ;*****************************************************************************
23 ;=============================================================================
24 ; Macros and other preprocessor constants
25 ;=============================================================================
27 %include "amd64inc.asm"
29 ;=============================================================================
31 ;=============================================================================
39 ;=============================================================================
41 ;=============================================================================
; NOTE(review): the lines below are fragments of the 6-tap filter arithmetic.
; The enclosing macro header(s) and the intervening instructions are not
; visible in this view, so register roles are taken from the trailing
; comments only (possibly two separate macros -- confirm against full file).
;
; Fragment 1: accumulates the unscaled sum a - 5*b + 20*c in mm1.
58 psubw mm1, mm2 ; a-5*b+4*c
; presumably mm3 holds 16*c at this point (scaled by lines not shown) -- confirm
62 paddw mm1, mm3 ; a-5*b+20*c
;
; Fragment 2: reaches (a - 5*b + 20*c)/16 by interleaving two arithmetic
; right shifts by 2 with the subtract/adds rather than shifting the full sum.
; Identity (ignoring the truncation of each shift):
;   ((a-b)/4 - b + c)/4 + c = (a - 5*b + 4*c)/16 + c = (a - 5*b + 20*c)/16
; Per the comments, mm1 starts as a-b, mm2 is b and mm3 is c.
69 psraw mm1, 2 ; (a-b)/4
71 psubw mm1, mm2 ; (a-b)/4-b
73 paddw mm1, mm3 ; (a-b)/4-b+c
75 psraw mm1, 2 ; ((a-b)/4-b+c)/4
77 paddw mm1, mm3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
90 ;=============================================================================
92 ;=============================================================================
96 ;-----------------------------------------------------------------------------
97 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
98 ; int i_stride, int i_width, int i_height );
99 ;-----------------------------------------------------------------------------
100 cglobal x264_hpel_filter_mmxext
; NOTE(review): many lines of this function (prologue, labels, loop counters,
; conditional-assembly directives, filter macro invocations, epilogue) are not
; visible in this view. Comments below describe only the visible instructions;
; anything marked "presumably" must be confirmed against the full file.
;
; Writes three outputs with non-temporal stores: dstv (L174), dstc (L205) and
; dsth (L245), matching the three destination pointers in the prototype.
;
; Stack-argument loads relative to rbp. Offsets 72/80/88 are consecutive
; 8-byte slots; presumably the 5th-7th arguments (i_stride, i_width, i_height)
; on an ABI that passes only four arguments in registers -- confirm.
121 movsxd r8, dword [rbp+72]
122 movsxd r9, dword [rbp+80]
123 mov ebx, dword [rbp+88]
; Second load of ebx from a different slot. As shown it would overwrite the
; value loaded just above; presumably the two loads sit in alternative
; %ifdef/%else ABI branches whose directives are not visible here -- confirm.
125 mov ebx, dword [rbp+24]
; tbuffer: scratch area addressed relative to rsp (holds the 16-bit
; intermediate values stored and re-read below).
137 %define tbuffer rsp + 8
; Precompute row offsets 3*stride and 5*stride for the six-row vertical taps.
139 lea stride3, [stride*3]
140 lea stride5, [stride*5]
; rax = 2*stride + 24; its use is in lines not visible here.
144 lea rax, [stride*2 + 24]
; Prefetch 32 bytes ahead on the furthest row read by the vertical taps.
155 prefetcht0 [src + stride5 + 32]
; Vertical taps: rows are paired symmetrically about the centre
; (0 with 5 -> a, 1 with 4 -> b, 2 with 3 -> c). LOAD_ADD is defined outside
; this view; presumably it loads both operands and sums them into the
; destination register -- confirm. The "0" set starts at byte offset 0 and
; the "1" set at byte offset 4.
157 LOAD_ADD mm1, [src ], [src + stride5 ] ; a0
158 LOAD_ADD mm2, [src + stride ], [src + stride*4 ] ; b0
159 LOAD_ADD mm3, [src + stride*2 ], [src + stride3 ] ; c0
160 LOAD_ADD mm4, [src + 4], [src + stride5 + 4] ; a1
161 LOAD_ADD mm5, [src + stride + 4], [src + stride*4 + 4] ; b1
162 LOAD_ADD mm6, [src + stride*2 + 4], [src + stride3 + 4] ; c1
; mm7 = packed-word constant pw_16 (defined elsewhere; presumably the rounding
; term for the vertical result -- confirm).
166 movq mm7, [pw_16 GLOBAL]
; Save both 4-word halves to tbuffer. The index is x*2 because each entry is
; a 16-bit word, whereas dstv is indexed by x in bytes.
167 movq [tbuffer + x*2], mm1
168 movq [tbuffer + x*2 + 8], mm4
; Non-temporal 8-byte store of the vertical result.
174 movntq [dstv + x], mm1
; Replicate word 0 of tbuffer into all four words of mm2 (pshufw imm 0) and
; write it to the 8 bytes immediately before tbuffer, so the reads at
; negative offsets below see defined data.
181 pshufw mm2, [tbuffer], 0
182 movq [tbuffer - 8], mm2 ; pad left
183 ; no need to pad right, since vertical_filter already did 4 extra pixels
; mm7 = packed-word constant pw_32 (defined elsewhere; presumably the rounding
; term for the centre result -- confirm).
187 movq mm7, [pw_32 GLOBAL]
; Centre taps over the 16-bit intermediates in tbuffer. Byte offsets
; -4,-2,0,+2,+4,+6 are word positions -2..+3 around x.
190 movq mm1, [tbuffer + x*2 - 4 ]
191 movq mm2, [tbuffer + x*2 - 2 ]
192 movq mm3, [tbuffer + x*2 ]
193 movq mm4, [tbuffer + x*2 + 4 ]
194 movq mm5, [tbuffer + x*2 + 6 ]
195 paddw mm3, [tbuffer + x*2 + 2 ] ; c0
; Second group of four outputs, 8 bytes (4 words) further on. mm4 and mm5
; still hold the +4 and +6 loads from above and are reused as the first
; operand of a1 (+4 with +14) and b1 (+6 with +12); c1 pairs +8 with +10.
198 movq mm6, [tbuffer + x*2 + 8 ]
199 paddw mm4, [tbuffer + x*2 + 14] ; a1
200 paddw mm5, [tbuffer + x*2 + 12] ; b1
201 paddw mm6, [tbuffer + x*2 + 10] ; c1
; Non-temporal 8-byte store of the centre result.
205 movntq [dstc + x], mm1
; Advance src by two rows before the horizontal pass (presumably to the
; centre row "c" of the vertical window -- confirm).
211 lea src, [src + stride*2]
; Horizontal taps: 4-byte loads at byte offsets -2..+3 around x for the first
; four outputs (the load at offset 0 is not visible in this view).
215 movd mm1, [src + x - 2]
216 movd mm2, [src + x - 1]
218 movd mm6, [src + x + 1]
219 movd mm4, [src + x + 2]
220 movd mm5, [src + x + 3]
; Further loads at offsets +4..+7 for the next four outputs; mm6/mm7 are
; reused as temporaries between the (not visible) combining steps.
230 movd mm7, [src + x + 7]
231 movd mm6, [src + x + 6]
236 movd mm7, [src + x + 5]
237 movd mm6, [src + x + 4]
; mm7 = packed-word constant pw_1 (defined elsewhere; presumably a rounding
; term for the horizontal result -- confirm).
242 movq mm7, [pw_1 GLOBAL]
; Non-temporal 8-byte store of the horizontal result.
245 movntq [dsth + x], mm1
; Repeat the horizontal loop while the preceding comparison (not visible
; here) is signed less-than.
249 jl .horizontal_filter
269 ;-----------------------------------------------------------------------------
270 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
271 ; uint8_t *src, int i_src, int w, int h)
272 ;-----------------------------------------------------------------------------
273 cglobal x264_plane_copy_mmxext
; NOTE(review): only a few lines of this function are visible in this view;
; the register shuffle, copy loops and return are not shown.
;
; Sign-extend the 2nd and 4th arguments from 32 to 64 bits. Per the prototype
; these are the int parameters i_dst and i_src, so they can be used in 64-bit
; address arithmetic. The parmNq/parmNd names are defined outside this view.
274 movsxd parm2q, parm2d
275 movsxd parm4q, parm4d
280 ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
; Non-temporal prefetch 256 bytes ahead of rsi (the source pointer, per the
; comment above). It appears twice; presumably once in each of two copy
; loops whose bodies are not visible here -- confirm.
288 prefetchnta [rsi+256]
310 prefetchnta [rsi+256]