1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Mathieu Monnier <manao@melix.net>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 ;*****************************************************************************
; --- six-tap hpel filter arithmetic (fragments; enclosing %macro headers not
;     visible in this excerpt). The H.264 half-pel filter kernel is
;     (a - 5*b + 20*c + 20*c' - 5*b' + a') with rounding, computed here in
;     16-bit words. m2 presumably holds 4*b+b and m3 holds 16*c at this
;     point -- TODO confirm against the elided macro body.
psubw m1, m2 ; a-5*b+4*c
paddw m1, m3 ; a-5*b+20*c
; FILT_H-style refinement: given %1=(a-b)/4, %2=b, %3=c (per the existing
; comments), produce (a-5*b+20*c)/16 using only adds/subs and one shift.
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
98 %macro PALIGNR_SSSE3 4
105 ;-----------------------------------------------------------------------------
106 ; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
107 ;-----------------------------------------------------------------------------
108 cglobal x264_hpel_filter_v_%1, 5,6,1
117 prefetcht0 [r5+r3*2+64]
118 LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0
119 LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0
120 LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0
121 LOAD_ADD m4, [r1 +regsize/2], [r5+r3*2+regsize/2] ; a1
122 LOAD_ADD m5, [r1+r3 +regsize/2], [r5+r3 +regsize/2] ; b1
123 LOAD_ADD m6, [r1+r3*2+regsize/2], [r5 +regsize/2] ; c1
125 mova m7, [pw_16 GLOBAL]
127 mova [r2+r4*2+regsize], m4
142 ;-----------------------------------------------------------------------------
143 ; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
144 ;-----------------------------------------------------------------------------
145 cglobal x264_hpel_filter_c_mmxext, 3,3,1
150 movq m7, [pw_32 GLOBAL]
157 paddw m3, [src+2] ; c0
161 paddw m4, [src+14] ; a1
162 paddw m5, [src+12] ; b1
163 paddw m6, [src+10] ; c1
171 ;-----------------------------------------------------------------------------
172 ; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
173 ;-----------------------------------------------------------------------------
174 cglobal x264_hpel_filter_h_mmxext, 3,3,1
207 movq m7, [pw_1 GLOBAL]
218 ;-----------------------------------------------------------------------------
219 ; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
220 ;-----------------------------------------------------------------------------
221 cglobal x264_hpel_filter_c_%1, 3,3,1
227 mova m7, [pw_32 GLOBAL]
230 mova m8, [pw_32 GLOBAL]
233 %define tpw_32 [pw_32 GLOBAL]
243 PALIGNR m3, m2, 2, m7
244 PALIGNR m4, m2, 4, m7
245 PALIGNR m5, m2, 6, m7
246 PALIGNR m0, m6, 12, m7
247 PALIGNR m1, m6, 14, m7
261 ;-----------------------------------------------------------------------------
262 ; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
263 ;-----------------------------------------------------------------------------
264 cglobal x264_hpel_filter_h_sse2, 3,3,1
301 mova m7, [pw_1 GLOBAL] ; FIXME xmm8
309 %define PALIGNR PALIGNR_SSE2
312 %define PALIGNR PALIGNR_SSSE3
321 ;-----------------------------------------------------------------------------
322 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
323 ; uint8_t *src, int i_src, int w, int h)
324 ;-----------------------------------------------------------------------------
325 cglobal x264_plane_copy_mmxext, 6,7
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
; r0=dst, r1=src, r2=n. The loop labels/branches between the two unrolled
; bodies are elided in this excerpt; presumably r2 counts down (or indexes up)
; with a 16-byte tail loop and a 32-byte main loop -- TODO confirm.
cglobal x264_memcpy_aligned_mmx, 3,3
; 16 bytes per iteration (tail/alignment handling)
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
; 32 bytes per iteration (main unrolled loop)
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq mm2, [r1 + r2 + 16]
movq mm3, [r1 + r2 + 24]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
420 ;-----------------------------------------------------------------------------
421 ; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
422 ;-----------------------------------------------------------------------------
423 cglobal x264_memcpy_aligned_sse2, 3,3
427 movdqa xmm0, [r1 + r2]
428 movdqa [r0 + r2], xmm0
433 movdqa xmm0, [r1 + r2 + 0]
434 movdqa [r0 + r2 + 0], xmm0
435 movdqa xmm1, [r1 + r2 + 16]
436 movdqa [r0 + r2 + 16], xmm1
439 movdqa xmm0, [r1 + r2 + 0]
440 movdqa [r0 + r2 + 0], xmm0
441 movdqa xmm1, [r1 + r2 + 16]
442 movdqa [r0 + r2 + 16], xmm1
443 movdqa xmm2, [r1 + r2 + 32]
444 movdqa [r0 + r2 + 32], xmm2
445 movdqa xmm3, [r1 + r2 + 48]
446 movdqa [r0 + r2 + 48], xmm3
450 ;-----------------------------------------------------------------------------
451 ; void *x264_memzero_aligned( void *dst, size_t n );
452 ;-----------------------------------------------------------------------------
454 cglobal x264_memzero_aligned_%1, 2,2
460 mova [r0 + r1 + i], m0