1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Mathieu Monnier <manao@melix.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
; 6-tap weighting, unscaled form: per the inline notes the accumulator ends up
; holding a - 5*b + 20*c.
; NOTE(review): the instructions that build the intermediate terms are not
; shown next to these lines - confirm against the full macro.
49 psubw mm1, mm2 ; a-5*b+4*c
53 paddw mm1, mm3 ; a-5*b+20*c
; Same weighting, but already divided by 16: two arithmetic right shifts by 2
; are interleaved with the subtract/adds instead of a single final shift by 4.
; NOTE(review): presumably done to keep the intermediates inside the signed
; 16-bit word range - confirm.
60 psraw mm1, 2 ; (a-b)/4
62 psubw mm1, mm2 ; (a-b)/4-b
64 paddw mm1, mm3 ; (a-b)/4-b+c
66 psraw mm1, 2 ; ((a-b)/4-b+c)/4
68 paddw mm1, mm3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
80 ;-----------------------------------------------------------------------------
81 ; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
82 ; int i_stride, int i_width, int i_height );
83 ;-----------------------------------------------------------------------------
; Writes three filtered planes from src: dstv (vertical pass), dstc (centre
; pass, computed from the word intermediates kept in tbuffer) and dsth
; (horizontal pass). Each pass stores 8 bytes at a time with movntq.
; NOTE(review): cglobal is a macro defined elsewhere; "0,7" presumably means
; no auto-loaded arguments and 7 general-purpose registers - confirm.
84 cglobal x264_hpel_filter_mmxext, 0,7
; t* names give the location of the saved pointers / height.
; NOTE(review): tdsth is defined twice below (a register, then a frame slot);
; presumably the two definitions belong to different conditional-assembly
; branches whose directives are not shown - confirm.
101 %define tdsth r10 ; FIXME r8,9
; rbp-relative slots for the dsth/dstv/dstc/src pointers and the height.
112 %define tdsth [rbp + 20]
113 %define tdstv [rbp + 24]
114 %define tdstc [rbp + 28]
115 %define tsrc [rbp + 32]
116 %define theight [rbp + 44]
; rax = stride*2 + 24.
; NOTE(review): looks like the byte size of the tbuffer scratch row (two bytes
; per pixel plus padding) - confirm.
122 lea rax, [stride*2 + 24]
; Word constants referenced directly through the GLOBAL addressing suffix.
126 %define tpw_1 [pw_1 GLOBAL]
127 %define tpw_16 [pw_16 GLOBAL]
128 %define tpw_32 [pw_32 GLOBAL]
; Alternative definitions of the same three names: stack copies of the
; constants, addressed relative to ebp.
; NOTE(review): presumably selected only for a 32-bit PIC build - confirm.
130 ; mov globals onto the stack, to free up PIC pointer
131 %define tpw_1 [ebp - 24]
132 %define tpw_16 [ebp - 16]
133 %define tpw_32 [ebp - 8]
; Load the three constants into MMX registers.
; NOTE(review): the stores into the stack slots above are not shown here.
136 movq mm1, [pw_1 GLOBAL]
137 movq mm2, [pw_16 GLOBAL]
138 movq mm3, [pw_32 GLOBAL]
; src3 = src + stride at this point.
148 lea src3, [src + stride]
; Prefetch into all cache levels, two rows past src3 and 32 bytes ahead.
155 prefetcht0 [src3 + stride*2 + 32]
; Vertical pass. Each LOAD_ADD takes a destination register and two row
; addresses; the a/b/c tags match the taps of the a-5*b+20*c weighting.
; NOTE(review): LOAD_ADD is defined elsewhere; presumably it sums the two rows
; as words - confirm.
; The "+ 4" variants (a1/b1/c1) handle the next four pixels of the same rows.
157 LOAD_ADD mm1, [src ], [src3 + stride*2 ] ; a0
158 LOAD_ADD mm2, [src + stride ], [src3 + stride ] ; b0
159 LOAD_ADD mm3, [src + stride*2 ], [src3 ] ; c0
160 LOAD_ADD mm4, [src + 4], [src3 + stride*2 + 4] ; a1
161 LOAD_ADD mm5, [src + stride + 4], [src3 + stride + 4] ; b1
162 LOAD_ADD mm6, [src + stride*2 + 4], [src3 + 4] ; c1
; Save both 8-byte results to tbuffer at byte offset x*2 (two bytes per x) so
; the centre pass below can read them back.
167 movq [tbuffer + x*2], mm1
168 movq [tbuffer + x*2 + 8], mm4
; Non-temporal 8-byte store of the vertical result.
174 movntq [dstv + x], mm1
; Centre pass setup: broadcast the first word of tbuffer into all four words
; of mm2 and store it in the 8 bytes immediately before tbuffer.
182 pshufw mm2, [tbuffer], 0
183 movq [tbuffer - 8], mm2 ; pad left
184 ; no need to pad right, since vertical_filter already did 4 extra pixels
; Centre pass: horizontal taps over the word intermediates. Byte offsets
; -4/-2/0/+2/+4/+6 are word positions x-2 .. x+3.
; NOTE(review): the adds completing a0 and b0 are not shown here.
191 movq mm1, [tbuffer + x*2 - 4 ]
192 movq mm2, [tbuffer + x*2 - 2 ]
193 movq mm3, [tbuffer + x*2 ]
194 movq mm4, [tbuffer + x*2 + 4 ]
195 movq mm5, [tbuffer + x*2 + 6 ]
196 paddw mm3, [tbuffer + x*2 + 2 ] ; c0
; Second group of four outputs (word positions x+4 .. x+7): mm4 and mm5 still
; hold the words at +4 and +6, which pair with +14 and +12 respectively.
199 movq mm6, [tbuffer + x*2 + 8 ]
200 paddw mm4, [tbuffer + x*2 + 14] ; a1
201 paddw mm5, [tbuffer + x*2 + 12] ; b1
202 paddw mm6, [tbuffer + x*2 + 10] ; c1
; Non-temporal 8-byte store of the centre result.
206 movntq [dstc + x], mm1
; Horizontal pass: 4-byte loads straight from src at byte offsets -2 .. +3
; around x.
; NOTE(review): the byte-to-word unpacking and the tap sums are not shown here.
217 movd mm1, [src + x - 2]
218 movd mm2, [src + x - 1]
220 movd mm6, [src + x + 1]
221 movd mm4, [src + x + 2]
222 movd mm5, [src + x + 3]
; Further loads at +4 .. +7 for the second group of four pixels; mm6 and mm7
; are each loaded twice.
232 movd mm7, [src + x + 7]
233 movd mm6, [src + x + 6]
238 movd mm7, [src + x + 5]
239 movd mm6, [src + x + 4]
; Non-temporal 8-byte store of the horizontal result.
247 movntq [dsth + x], mm1
; Loop back while the preceding signed comparison reports "less".
; NOTE(review): the comparison and the .horizontal_filter label are not shown.
251 jl .horizontal_filter
271 ;-----------------------------------------------------------------------------
272 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
273 ; uint8_t *src, int i_src, int w, int h)
274 ;-----------------------------------------------------------------------------
; NOTE(review): going by the prototype, i_dst / i_src are presumably the row
; strides of dst / src, and w / h the plane size in pixels - confirm.
; NOTE(review): "6,7" is presumably 6 auto-loaded arguments and 7
; general-purpose registers in the cglobal macro - confirm.
275 cglobal x264_plane_copy_mmxext, 6,7
339 ;-----------------------------------------------------------------------------
340 ; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
341 ;-----------------------------------------------------------------------------
; Copies with 8-byte MMX loads/stores. Both pointers are indexed by r2, so
; source and destination advance together.
; NOTE(review): r0/r1/r2 are presumably dst/src/n as set up by the
; "3,3" cglobal operands - confirm.
; NOTE(review): the r2 updates, size tests and branches between the groups
; below are not shown here.
342 cglobal x264_memcpy_aligned_mmx, 3,3
; 16-byte group: two quadwords loaded, then both stored.
346 movq mm0, [r1 + r2 + 0]
347 movq mm1, [r1 + r2 + 8]
348 movq [r0 + r2 + 0], mm0
349 movq [r0 + r2 + 8], mm1
; 32-byte group: four quadwords loaded first, then all four stored.
352 movq mm0, [r1 + r2 + 0]
353 movq mm1, [r1 + r2 + 8]
354 movq mm2, [r1 + r2 + 16]
355 movq mm3, [r1 + r2 + 24]
356 movq [r0 + r2 + 0], mm0
357 movq [r0 + r2 + 8], mm1
358 movq [r0 + r2 + 16], mm2
359 movq [r0 + r2 + 24], mm3
363 ;-----------------------------------------------------------------------------
364 ; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
365 ;-----------------------------------------------------------------------------
; Copies with 16-byte SSE2 loads/stores. movdqa faults on a memory operand
; that is not 16-byte aligned, so r0 + r2 and r1 + r2 must both be 16-byte
; aligned at every access.
; NOTE(review): r0/r1/r2 are presumably dst/src/n as set up by the
; "3,3" cglobal operands - confirm.
; NOTE(review): the r2 updates, size tests and branches between the groups
; below are not shown here.
366 cglobal x264_memcpy_aligned_sse2, 3,3
; 16-byte group.
370 movdqa xmm0, [r1 + r2]
371 movdqa [r0 + r2], xmm0
; 32-byte group: each load is followed immediately by its store.
376 movdqa xmm0, [r1 + r2 + 0]
377 movdqa [r0 + r2 + 0], xmm0
378 movdqa xmm1, [r1 + r2 + 16]
379 movdqa [r0 + r2 + 16], xmm1
; 64-byte group, using a separate xmm register for each 16-byte piece.
382 movdqa xmm0, [r1 + r2 + 0]
383 movdqa [r0 + r2 + 0], xmm0
384 movdqa xmm1, [r1 + r2 + 16]
385 movdqa [r0 + r2 + 16], xmm1
386 movdqa xmm2, [r1 + r2 + 32]
387 movdqa [r0 + r2 + 32], xmm2
388 movdqa xmm3, [r1 + r2 + 48]
389 movdqa [r0 + r2 + 48], xmm3