1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Mathieu Monnier <manao@melix.net>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
50 psubw m1, m2 ; a-5*b+4*c
54 paddw m1, m3 ; a-5*b+20*c
; NOTE(review): fragment of the 6-tap hpel filter core (coefficients 1,-5,20,20,-5,1);
; enclosing %macro header/footer not visible in this excerpt. On entry %1 holds (a-b)/4
; (per the running comments below), %2 = b, %3 = c; on exit %1 = (a-5*b+20*c)/16.
; psraw is an arithmetic shift, so the /4 steps round toward -inf on negative values.
61 psubw %1, %2 ; (a-b)/4-b
62 paddw %1, %3 ; (a-b)/4-b+c
63 psraw %1, 2 ; ((a-b)/4-b+c)/4
64 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
104 %macro PALIGNR_SSSE3 4
111 ;-----------------------------------------------------------------------------
112 ; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
113 ;-----------------------------------------------------------------------------
114 cglobal x264_hpel_filter_v_%1, 5,6,1
123 prefetcht0 [r5+r3*2+64]
124 LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0
125 LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0
126 LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0
127 LOAD_ADD m4, [r1 +regsize/2], [r5+r3*2+regsize/2] ; a1
128 LOAD_ADD m5, [r1+r3 +regsize/2], [r5+r3 +regsize/2] ; b1
129 LOAD_ADD m6, [r1+r3*2+regsize/2], [r5 +regsize/2] ; c1
131 mova m7, [pw_16 GLOBAL]
133 mova [r2+r4*2+regsize], m4
148 ;-----------------------------------------------------------------------------
149 ; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
150 ;-----------------------------------------------------------------------------
151 cglobal x264_hpel_filter_c_mmxext, 3,3,1
156 movq m7, [pw_32 GLOBAL]
163 paddw m3, [src+2] ; c0
167 paddw m4, [src+14] ; a1
168 paddw m5, [src+12] ; b1
169 paddw m6, [src+10] ; c1
177 ;-----------------------------------------------------------------------------
178 ; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
179 ;-----------------------------------------------------------------------------
180 cglobal x264_hpel_filter_h_mmxext, 3,3,1
213 movq m7, [pw_1 GLOBAL]
224 ;-----------------------------------------------------------------------------
225 ; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
226 ;-----------------------------------------------------------------------------
227 cglobal x264_hpel_filter_c_%1, 3,3,1
233 mova m7, [pw_32 GLOBAL]
236 mova m8, [pw_32 GLOBAL]
239 %define tpw_32 [pw_32 GLOBAL]
249 PALIGNR m3, m2, 2, m7
250 PALIGNR m4, m2, 4, m7
251 PALIGNR m5, m2, 6, m7
252 PALIGNR m0, m6, 12, m7
253 PALIGNR m1, m6, 14, m7
267 ;-----------------------------------------------------------------------------
268 ; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
269 ;-----------------------------------------------------------------------------
270 cglobal x264_hpel_filter_h_sse2, 3,3,1
307 mova m7, [pw_1 GLOBAL] ; FIXME xmm8
315 %define PALIGNR PALIGNR_MMX
318 %define PALIGNR PALIGNR_SSSE3
327 ;-----------------------------------------------------------------------------
328 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
329 ; uint8_t *src, int i_src, int w, int h)
330 ;-----------------------------------------------------------------------------
331 cglobal x264_plane_copy_mmxext, 6,7
398 ; These functions are not general-use; not only do the SSE ones require aligned input,
399 ; but they also will fail if given a non-mod16 size or a size less than 64.
400 ; memzero SSE will fail for non-mod128.
402 ;-----------------------------------------------------------------------------
403 ; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
404 ;-----------------------------------------------------------------------------
405 cglobal x264_memcpy_aligned_mmx, 3,3
409 movq mm0, [r1 + r2 + 0]
410 movq mm1, [r1 + r2 + 8]
411 movq [r0 + r2 + 0], mm0
412 movq [r0 + r2 + 8], mm1
415 movq mm0, [r1 + r2 + 0]
416 movq mm1, [r1 + r2 + 8]
417 movq mm2, [r1 + r2 + 16]
418 movq mm3, [r1 + r2 + 24]
419 movq [r0 + r2 + 0], mm0
420 movq [r0 + r2 + 8], mm1
421 movq [r0 + r2 + 16], mm2
422 movq [r0 + r2 + 24], mm3
426 ;-----------------------------------------------------------------------------
427 ; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
428 ;-----------------------------------------------------------------------------
429 cglobal x264_memcpy_aligned_sse2, 3,3
433 movdqa xmm0, [r1 + r2]
434 movdqa [r0 + r2], xmm0
439 movdqa xmm0, [r1 + r2 + 0]
440 movdqa [r0 + r2 + 0], xmm0
441 movdqa xmm1, [r1 + r2 + 16]
442 movdqa [r0 + r2 + 16], xmm1
445 movdqa xmm0, [r1 + r2 + 0]
446 movdqa [r0 + r2 + 0], xmm0
447 movdqa xmm1, [r1 + r2 + 16]
448 movdqa [r0 + r2 + 16], xmm1
449 movdqa xmm2, [r1 + r2 + 32]
450 movdqa [r0 + r2 + 32], xmm2
451 movdqa xmm3, [r1 + r2 + 48]
452 movdqa [r0 + r2 + 48], xmm3
456 ;-----------------------------------------------------------------------------
457 ; void *x264_memzero_aligned( void *dst, size_t n );
458 ;-----------------------------------------------------------------------------
460 cglobal x264_memzero_aligned_%1, 2,2
466 mova [r0 + r1 + i], m0
484 pavgb %4, [r0+r5*2+%7]
485 PALIGNR %1, %3, 1, m6
486 PALIGNR %2, %4, 1, m6
498 mova m3, [r0+%4+regsize]
500 pavgb m3, [r0+%4+r5+regsize]
502 PALIGNR %1, m3, 1, m6
504 PALIGNR m3, m2, 1, m6
522 pavgb m3, [r0+%3+r5+8]
526 pavgb m1, [r0+%3+r5+9]
527 pavgb m0, [r0+%3+r5+1]
542 ;-----------------------------------------------------------------------------
543 ; void x264_frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
544 ; int src_stride, int dst_stride, int width, int height )
545 ;-----------------------------------------------------------------------------
546 %macro FRAME_INIT_LOWRES 1 ; FIXME
547 cglobal x264_frame_init_lowres_core_%1, 6,7
548 ; src += 2*(height-1)*stride + 2*width
554 ; dst += (height-1)*stride + width
563 ; gap = stride - width
567 %define dst_gap [rsp+push_size]
572 %define src_gap [rsp]
574 ; adjust for the odd end case
597 FILT8x4 m0, m1, m2, m3, m4, m5, 0
615 FILT8x4 m0, m1, m2, m3, m10, m11, regsize
618 FILT8x4 m2, m3, m0, m1, m4, m5, 0
631 FILT16x2 m0, r1, r2, 0
632 FILT16x2 m1, r3, r4, r5
647 %endmacro ; FRAME_INIT_LOWRES
650 %define PALIGNR PALIGNR_MMX
651 FRAME_INIT_LOWRES mmxext
653 FRAME_INIT_LOWRES cache32_mmxext
656 FRAME_INIT_LOWRES sse2
657 %define PALIGNR PALIGNR_SSSE3
658 FRAME_INIT_LOWRES ssse3