1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005 x264 project
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
19 ;*****************************************************************************
23 ;=============================================================================
24 ; Macros and other preprocessor constants
25 ;=============================================================================
; GLOBAL is appended to symbol references inside memory operands in this file
; (e.g. `[mmx_dw_one GLOBAL]`), which therefore expand to `[mmx_dw_one wrt rip]`.
; NOTE(review): in Yasm, `wrt rip` requests RIP-relative addressing -- confirm
; against the assembler actually used to build this file.
28 %define GLOBAL wrt rip
42 ;=============================================================================
44 ;=============================================================================
60 ;=============================================================================
62 ;=============================================================================
; Fragment of a macro body (its header is not visible here).
; LOAD_4 is handed four memory operands based on the macro's first parameter,
; at byte offsets 0, rcx, 2 * rcx and rbx, with mm0 as its last operand.
; NOTE(review): x264_center_filter_mmxext sets rcx = src_stride,
; rbx = 3 * src_stride and clears mm0; presumably users of this macro rely on
; that setup, making these four consecutive source rows -- confirm.
99 LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
101 movd mm5, [%1 + 4 * rcx] ; mm5 = 32 bits loaded from byte offset 4 * rcx
115 ;=============================================================================
117 ;=============================================================================
; The three filter entry points of this file are passed to cglobal.
; NOTE(review): cglobal is not defined in the visible part of this file;
; presumably it is a macro that declares each symbol as global -- confirm.
121 cglobal x264_vertical_filter_mmxext
122 cglobal x264_horizontal_filter_mmxext
123 cglobal x264_center_filter_mmxext
125 ;-----------------------------------------------------------------------------
127 ; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
128 ; uint8_t *dst2, int i_dst2_stride,
129 ; uint8_t *src, int i_src_stride,
130 ; int i_width, int i_height );
132 ;-----------------------------------------------------------------------------
; x264_center_filter_mmxext
;
; Register roles, as established by the visible instructions and comments:
;   r13 = src_stride   (sign-extended from r9d)
;   r11 = dst2_stride  (sign-extended from ecx)
;   r9  = dst1_stride  (sign-extended from esi)
;   r14 = width, r15 = height (read from [rbp + 56] and [rbp + 64])
;   rcx = src_stride, rbx = 3 * src_stride, rdx = 5 * src_stride
;   r12 = tsrc (source row pointer), r8 = dst1 row, r10 = dst2 row
;   rax = column index, compared against width
;   mm0 = 0
;   [rsp + tbuffer + 2 * rax] = scratch row holding 2 bytes per column
;
; esi / ecx / r9d are the 2nd / 4th / 6th integer arguments of the System V
; AMD64 convention, which matches the position of the three stride
; parameters in the C prototype.
; NOTE(review): the values of mmx_dw_one, mmx_dd_one, mmx_dw_20 and mmx_dw_5
; are defined outside this view -- do not infer them from the names.
135 x264_center_filter_mmxext :
145 movsxd r13, r9d ; src_stride
148 sub r12, r13 ; tsrc = src - 2 * src_stride
150 ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
151 lea rax, [r13 + r13 + 24 + tbuffer] ; rax = 2 * src_stride + 24 + tbuffer
155 movsxd r11, ecx ; dst2_stride
157 movsxd r9, esi ; dst1_stride (r9 is free: src_stride already lives in r13)
158 movsxd r14, dword [rbp + 56] ; width
159 movsxd r15, dword [rbp + 64] ; height
161 mov rcx, r13 ; src_stride (rcx is free: dst2_stride already lives in r11)
162 lea rbx, [r13 + r13 * 2] ; 3 * src_stride
163 lea rdx, [r13 + r13 * 4] ; 5 * src_stride
165 pxor mm0, mm0 ; 0 ---> mm0
166 movq mm7, [mmx_dd_one GLOBAL] ; for rounding
176 movq [rsp + tbuffer], mm2 ; first 8 bytes of the scratch row = mm2
177 movq [rsp + tbuffer + 8], mm1 ; next 8 bytes of the scratch row = mm1
178 paddw mm1, [mmx_dw_one GLOBAL] ; add the mmx_dw_one constant to each 16-bit lane
182 movd [r8], mm1 ; dst1[0] = mm1
186 lea rdi, [r8 - 4] ; rdi = dst1 - 4
192 movq [rsp + tbuffer + 2 * rax], mm1 ; scratch words for column rax = mm1
193 paddw mm1, [mmx_dw_one GLOBAL] ; add the mmx_dw_one constant to each 16-bit lane
196 movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
200 cmp rax, r14 ; cmp rax, width
206 movq [rsp + tbuffer + 2 * rax], mm1 ; scratch words for column rax = mm1
207 movq [rsp + tbuffer + 2 * rax + 8], mm2 ; following 8 scratch bytes = mm2
208 paddw mm1, [mmx_dw_one GLOBAL] ; add the mmx_dw_one constant to each 16-bit lane
211 movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
213 add r12, r13 ; tsrc = tsrc + src_stride
215 add r8, r9 ; dst1 = dst1 + dst1_stride
; Six overlapping 4-word loads from the scratch row. With
; base = rsp + 2 * rax + 4 + tbuffer, consecutive taps are one word (2 bytes) apart.
221 movq mm2, [rsp + 2 * rax + 2 + 4 + tbuffer] ; mm2 = words at base + 2
222 movq mm3, [rsp + 2 * rax + 4 + 4 + tbuffer] ; mm3 = words at base + 4
223 movq mm4, [rsp + 2 * rax + 6 + 4 + tbuffer] ; mm4 = words at base + 6
224 movq mm5, [rsp + 2 * rax + 8 + 4 + tbuffer] ; mm5 = words at base + 8
225 movq mm1, [rsp + 2 * rax + 4 + tbuffer] ; mm1 = words at base + 0
226 movq mm6, [rsp + 2 * rax + 10 + 4 + tbuffer] ; mm6 = words at base + 10
231 movq mm5, [mmx_dw_20 GLOBAL] ; mm5 = mmx_dw_20 constant
232 movq mm4, [mmx_dw_5 GLOBAL] ; mm4 = mmx_dw_5 constant
238 punpcklwd mm2, [mmx_dw_20 GLOBAL] ; interleave low words of mm2 with mmx_dw_20
239 punpckhwd mm3, [mmx_dw_5 GLOBAL] ; interleave high words of mm3 with mmx_dw_5
252 paddd mm2, [mmx_dd_one GLOBAL] ; add mmx_dd_one to each 32-bit lane (rounding constant, see mm7 load)
253 paddd mm3, [mmx_dd_one GLOBAL] ; add mmx_dd_one to each 32-bit lane
261 movd [r10 + rax], mm2 ; dst2[rax] = mm2
264 cmp rax, r14 ; cmp rax, width
267 add r10, r11 ; dst2 += dst2_stride
284 ;-----------------------------------------------------------------------------
286 ; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
287 ; uint8_t *src, int i_src_stride,
288 ; int i_width, int i_height );
290 ;-----------------------------------------------------------------------------
293 x264_horizontal_filter_mmxext :
294 movsxd r10, esi ; dst_stride
295 movsxd r11, ecx ; src_stride
296 movsxd r8, r8d ; width
302 movq mm7, [mmx_dw_one GLOBAL]
304 movsxd rcx, r9d ; height
315 prefetchnta [rsi + rax + 48]
317 LOAD_4 mm1, mm2, mm3, mm4, [rsi + rax], [rsi + rax + 1], [rsi + rax + 2], [rsi + rax + 3], mm0
319 movd mm5, [rsi + rax + 4]
320 movd mm6, [rsi + rax + 5]
322 movd mm2, [rsi + rax + 4]
323 movd mm3, [rsi + rax + 6]
326 FILT_6 mm1, mm5, mm6, mm7
327 movd mm4, [rsi + rax + 7]
328 movd mm5, [rsi + rax + 8]
330 punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready
332 movd mm6, [rsi + rax + 9]
334 punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
337 FILT_6 mm2, mm5, mm6, mm7
340 movq [rdi + rax], mm1
343 cmp rax, r8 ; cmp rax, width
346 add rsi, r11 ; src_pitch
347 add rdi, r10 ; dst_pitch