1 ;*****************************************************************************
2 ;* mc-a2.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Holger Lubitz <hal@duncan.ol.sub.de>
9 ;* Mathieu Monnier <manao@melix.net>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 ;*****************************************************************************
64 psubw m1, m2 ; a-5*b+4*c
68 paddw m1, m3 ; a-5*b+20*c
75 psubw %1, %2 ; (a-b)/4-b
76 paddw %1, %3 ; (a-b)/4-b+c
77 psraw %1, 2 ; ((a-b)/4-b+c)/4
78 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
118 %macro PALIGNR_SSSE3 4
125 ;-----------------------------------------------------------------------------
126 ; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
127 ;-----------------------------------------------------------------------------
128 cglobal x264_hpel_filter_v_%1, 5,6
137 LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
138 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
139 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
140 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
142 mova m7, [pw_16 GLOBAL]
144 mova [r2+r4*2+mmsize], m4
159 ;-----------------------------------------------------------------------------
160 ; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
161 ;-----------------------------------------------------------------------------
162 cglobal x264_hpel_filter_c_mmxext, 3,3
167 movq m7, [pw_32 GLOBAL]
174 paddw m3, [src+2] ; c0
178 paddw m4, [src+14] ; a1
179 paddw m5, [src+12] ; b1
180 paddw m6, [src+10] ; c1
181 FILT_H2 m1, m2, m3, m4, m5, m6
188 ;-----------------------------------------------------------------------------
189 ; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
190 ;-----------------------------------------------------------------------------
191 cglobal x264_hpel_filter_h_mmxext, 3,3
224 movq m7, [pw_1 GLOBAL]
225 FILT_H2 m1, m2, m3, m4, m5, m6
235 ;-----------------------------------------------------------------------------
236 ; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
237 ;-----------------------------------------------------------------------------
238 cglobal x264_hpel_filter_c_%1, 3,3
244 mova m7, [pw_32 GLOBAL]
247 mova m8, [pw_32 GLOBAL]
250 %define tpw_32 [pw_32 GLOBAL]
253 %ifidn %1,sse2_misalign
268 PALIGNR m3, m2, 2, m7
269 PALIGNR m4, m2, 4, m7
270 PALIGNR m5, m2, 6, m7
271 PALIGNR m0, m6, 12, m7
272 PALIGNR m1, m6, 14, m7
287 ;-----------------------------------------------------------------------------
288 ; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
289 ;-----------------------------------------------------------------------------
290 cglobal x264_hpel_filter_h_sse2, 3,3
327 mova m7, [pw_1 GLOBAL] ; FIXME xmm8
328 FILT_H2 m1, m2, m3, m4, m5, m6
336 ;-----------------------------------------------------------------------------
337 ; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
338 ;-----------------------------------------------------------------------------
339 cglobal x264_hpel_filter_h_ssse3, 3,3
346 punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
349 mova m7, [pw_1 GLOBAL]
403 %define PALIGNR PALIGNR_MMX
409 %define PALIGNR PALIGNR_SSSE3
415 LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
416 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
417 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
428 movntps [r11+r4+%5], m1
433 PALIGNR m1, %1, 12, m4
435 PALIGNR m2, %1, 14, m4
437 PALIGNR %3, %2, 6, m4
439 PALIGNR m3, %2, 4, m4
442 PALIGNR m4, %2, 2, m1
451 DO_FILT_H %1, %2, %3, 6
452 DO_FILT_H %2, %1, %4, 6
458 DO_FILT_H %1, %2, %3, 1
459 DO_FILT_H %2, %1, %4, 1
465 DO_FILT_H %1, %2, %3, 6
467 DO_FILT_H %4, %5, %6, 1
472 ;-----------------------------------------------------------------------------
473 ; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
474 ; uint8_t *src, int stride, int width, int height)
475 ;-----------------------------------------------------------------------------
476 cglobal x264_hpel_filter_%1, 7,7
499 ; prefetching does not help here! lots of variants tested, all slower
500 DO_FILT_V m8, m7, m13, m12, 0
503 DO_FILT_V m6, m5, m11, m10, 16
506 DO_FILT_CC m9, m8, m7, m6
507 movdqa m7, m12 ; not really necessary, but seems free and
508 movdqa m6, m11 ; gives far shorter code
510 DO_FILT_HH m14, m13, m7, m6
518 ; setup regs for next y
533 %define PALIGNR PALIGNR_MMX
535 %define PALIGNR PALIGNR_SSSE3
544 ;-----------------------------------------------------------------------------
545 ; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
546 ; uint8_t *src, int i_src, int w, int h)
547 ;-----------------------------------------------------------------------------
548 cglobal x264_plane_copy_mmxext, 6,7
615 ; These functions are not general-use; not only do the SSE ones require aligned input,
616 ; but they also will fail if given a non-mod16 size or a size less than 64.
617 ; memzero SSE will fail for non-mod128.
619 ;-----------------------------------------------------------------------------
620 ; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
621 ;-----------------------------------------------------------------------------
622 cglobal x264_memcpy_aligned_mmx, 3,3
626 movq mm0, [r1 + r2 + 0]
627 movq mm1, [r1 + r2 + 8]
628 movq [r0 + r2 + 0], mm0
629 movq [r0 + r2 + 8], mm1
632 movq mm0, [r1 + r2 + 0]
633 movq mm1, [r1 + r2 + 8]
634 movq mm2, [r1 + r2 + 16]
635 movq mm3, [r1 + r2 + 24]
636 movq [r0 + r2 + 0], mm0
637 movq [r0 + r2 + 8], mm1
638 movq [r0 + r2 + 16], mm2
639 movq [r0 + r2 + 24], mm3
643 ;-----------------------------------------------------------------------------
644 ; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
645 ;-----------------------------------------------------------------------------
646 cglobal x264_memcpy_aligned_sse2, 3,3
650 movdqa xmm0, [r1 + r2]
651 movdqa [r0 + r2], xmm0
656 movdqa xmm0, [r1 + r2 + 0]
657 movdqa [r0 + r2 + 0], xmm0
658 movdqa xmm1, [r1 + r2 + 16]
659 movdqa [r0 + r2 + 16], xmm1
662 movdqa xmm0, [r1 + r2 + 0]
663 movdqa [r0 + r2 + 0], xmm0
664 movdqa xmm1, [r1 + r2 + 16]
665 movdqa [r0 + r2 + 16], xmm1
666 movdqa xmm2, [r1 + r2 + 32]
667 movdqa [r0 + r2 + 32], xmm2
668 movdqa xmm3, [r1 + r2 + 48]
669 movdqa [r0 + r2 + 48], xmm3
673 ;-----------------------------------------------------------------------------
674 ; void *x264_memzero_aligned( void *dst, size_t n );
675 ;-----------------------------------------------------------------------------
677 cglobal x264_memzero_aligned_%1, 2,2
683 mova [r0 + r1 + i], m0
701 pavgb %4, [r0+r5*2+%7]
702 PALIGNR %1, %3, 1, m6
703 PALIGNR %2, %4, 1, m6
715 mova m3, [r0+%4+mmsize]
717 pavgb m3, [r0+%4+r5+mmsize]
719 PALIGNR %1, m3, 1, m6
721 PALIGNR m3, m2, 1, m6
739 pavgb m3, [r0+%3+r5+8]
743 pavgb m1, [r0+%3+r5+9]
744 pavgb m0, [r0+%3+r5+1]
759 ;-----------------------------------------------------------------------------
760 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
761 ; int src_stride, int dst_stride, int width, int height )
762 ;-----------------------------------------------------------------------------
763 %macro FRAME_INIT_LOWRES 1 ; FIXME
764 cglobal x264_frame_init_lowres_core_%1, 6,7
765 ; src += 2*(height-1)*stride + 2*width
771 ; dst += (height-1)*stride + width
780 ; gap = stride - width
784 %define dst_gap [rsp+gprsize]
789 %define src_gap [rsp]
791 ; adjust for the odd end case
814 FILT8x4 m0, m1, m2, m3, m4, m5, 0
832 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
835 FILT8x4 m2, m3, m0, m1, m4, m5, 0
848 FILT16x2 m0, r1, r2, 0
849 FILT16x2 m1, r3, r4, r5
865 %endmacro ; FRAME_INIT_LOWRES
868 %define PALIGNR PALIGNR_MMX
869 FRAME_INIT_LOWRES mmxext
871 FRAME_INIT_LOWRES cache32_mmxext
874 FRAME_INIT_LOWRES sse2
875 %define PALIGNR PALIGNR_SSSE3
876 FRAME_INIT_LOWRES ssse3