;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Mathieu Monnier <manao@melix.net>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"
filt_mul20: times 16 db 20
filt_mul51: times 8 db 1, -5
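; filt_mul20/filt_mul51 are multiplier vectors for pmaddubsw, which builds the
; 6-tap hpel kernel (1,-5,20,20,-5,1) without pmullw: with two source rows
; byte-interleaved, each word lane sums its (unsigned pixel * signed coeff)
; pair, so a lane multiplied by filt_mul51 computes a - 5*b, and one multiplied
; by filt_mul20 computes 20*(b + c). Worked lane (hedged example values):
; a=3, b=7 against (1,-5) gives 3*1 + 7*(-5) = -32, one signed 16-bit word.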
    psubw  m1, m2  ; a-5*b+4*c
    paddw  m1, m3  ; a-5*b+20*c
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
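; Note: despite the truncating shifts, this sequence is exact. With
; S = a-5*b+20*c, the two arithmetic >>2 steps discard a total of exactly
; S mod 16, so the result is floor(S/16); the rounding add and shift applied
; when packing (e.g. +pw_32 then >>6 in the c filter) then yield
; floor((S+512)/1024), the full-precision value, while keeping every
; intermediate inside 16-bit words.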
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
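; Hedged scalar model of the vertical pass (clip() clamps to [0,255] and is an
; assumed helper; boundary handling omitted). buf keeps the full-precision
; sums for the centre pass, dst gets the rounded pixels:
;   for( int x = 0; x < width; x++ )
;   {
;       int v = src[x-2*stride] - 5*src[x-stride] + 20*src[x]
;             + 20*src[x+stride] - 5*src[x+2*stride] + src[x+3*stride];
;       buf[x] = v;
;       dst[x] = clip( (v + 16) >> 5 );
;   }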
cglobal x264_hpel_filter_v_%1, 5,6,%2
    mova       m0, [filt_mul51 GLOBAL]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20 GLOBAL]
    pmaddubsw  m6, [filt_mul20 GLOBAL]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7          ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7          ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7              ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    mova       m7, [pw_16 GLOBAL]
    mova       [r2+r4*2+mmsize], m4
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
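; Hedged scalar model: buf holds the vertical sums from the v pass, so the
; cascaded DC gain is 32*32 = 1024 (clip() as above, boundaries omitted):
;   for( int x = 0; x < width; x++ )
;       dst[x] = clip( ( buf[x-2] - 5*buf[x-1] + 20*buf[x]
;                      + 20*buf[x+1] - 5*buf[x+2] + buf[x+3] + 512 ) >> 10 );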
cglobal x264_hpel_filter_c_mmxext, 3,3
    movq    m7, [pw_32 GLOBAL]
    paddw   m3, [src+2]  ; c0
    paddw   m4, [src+14] ; a1
    paddw   m5, [src+12] ; b1
    paddw   m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
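; Hedged scalar model (clip() as above, boundaries omitted); this is the
; standard H.264 6-tap half-pel interpolation on 8-bit input:
;   for( int x = 0; x < width; x++ )
;       dst[x] = clip( ( src[x-2] - 5*src[x-1] + 20*src[x]
;                      + 20*src[x+1] - 5*src[x+2] + src[x+3] + 16 ) >> 5 );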
cglobal x264_hpel_filter_h_mmxext, 3,3
    movq    m7, [pw_1 GLOBAL]
    FILT_H2 m1, m2, m3, m4, m5, m6
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_%1, 3,3,9
    mova m7, [pw_32 GLOBAL]
    mova m8, [pw_32 GLOBAL]
%define tpw_32 [pw_32 GLOBAL]
%ifidn %1,sse2_misalign
    PALIGNR m3, m2, 2, m7
    PALIGNR m4, m2, 4, m7
    PALIGNR m5, m2, 6, m7
    PALIGNR m0, m6, 12, m7
    PALIGNR m1, m6, 14, m7
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_sse2, 3,3,8
    mova    m7, [pw_1 GLOBAL] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_ssse3, 3,3
    punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
    mova      m7, [pw_1 GLOBAL]
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
    mova       %2, [filt_mul51 GLOBAL]
    pmaddubsw  m3, [filt_mul20 GLOBAL]
    pmaddubsw  %1, [filt_mul20 GLOBAL]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    movntps [r11+r4+%5], m1
    PALIGNR m1, %1, 12, m4
    PALIGNR m2, %1, 14, m4
    PALIGNR %3, %2, 6, m4
    PALIGNR m3, %2, 4, m4
    PALIGNR m4, %2, 2, m1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %2, %1, %4, 6
    DO_FILT_H %1, %2, %3, 1
    DO_FILT_H %2, %1, %4, 1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %4, %5, %6, 1
;-----------------------------------------------------------------------------
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                             uint8_t *src, int stride, int width, int height );
;-----------------------------------------------------------------------------
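; This fused version computes all three half-pel planes in one pass over each
; row group: DO_FILT_V produces the vertical sums, DO_FILT_CC reuses them for
; the centre plane, and DO_FILT_HH handles the horizontal plane; the idea is
; that the intermediates can stay in xmm registers instead of round-tripping
; through buf between separate passes.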
cglobal x264_hpel_filter_%1, 7,7,16
    ; prefetching does not help here! lots of variants tested, all slower
    DO_FILT_V  m8, m7, m13, m12, 0, %1
    DO_FILT_V  m6, m5, m11, m10, 16, %1
    DO_FILT_CC m9, m8, m7, m6
    movdqa     m7, m12 ; not really necessary, but seems free and
    movdqa     m6, m11 ; gives far shorter code
    DO_FILT_HH m14, m13, m7, m6
    ; set up regs for next y
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h );
;-----------------------------------------------------------------------------
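; Equivalent scalar behavior (hedged sketch, ignoring the asm's blocking and
; alignment details): a strided row-by-row byte copy.
;   while( h-- )
;   {
;       memcpy( dst, src, w );
;       dst += i_dst;
;       src += i_src;
;   }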
cglobal x264_plane_copy_mmxext, 6,7
; These functions are not general-use; not only do the SSE versions require
; aligned input, they will also fail if given a non-mod16 size or a size less
; than 64. The SSE memzero will fail for non-mod128 sizes.
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
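; Semantics with the restrictions above made explicit (hedged sketch; the
; asserts paraphrase the comment and are not actually checked in asm):
;   void *memcpy_aligned( void *dst, const void *src, size_t n )
;   {
;       assert( n % 16 == 0 && n >= 64 );    // all versions
;       assert( ((intptr_t)dst & 15) == 0 ); // SSE versions additionally need
;       assert( ((intptr_t)src & 15) == 0 ); // 16-byte-aligned pointers
;       return memcpy( dst, src, n );
;   }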
cglobal x264_memcpy_aligned_mmx, 3,3
    movq   mm0, [r1 + r2 +  0]
    movq   mm1, [r1 + r2 +  8]
    movq   [r0 + r2 +  0], mm0
    movq   [r0 + r2 +  8], mm1
    movq   mm0, [r1 + r2 +  0]
    movq   mm1, [r1 + r2 +  8]
    movq   mm2, [r1 + r2 + 16]
    movq   mm3, [r1 + r2 + 24]
    movq   [r0 + r2 +  0], mm0
    movq   [r0 + r2 +  8], mm1
    movq   [r0 + r2 + 16], mm2
    movq   [r0 + r2 + 24], mm3
;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_sse2, 3,3
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0
    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3
;-----------------------------------------------------------------------------
; void *x264_memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
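; Hedged scalar model (n must additionally be a multiple of 128 for the SSE
; version, per the note above):
;   void *memzero_aligned( void *dst, size_t n )
;   {
;       return memset( dst, 0, n );
;   }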
cglobal x264_memzero_aligned_%1, 2,2
    mova [r0 + r1 + i], m0
;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
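; Hedged sketch of the row pass: each output is a sliding 4-pixel horizontal
; sum added to the row above (loop bound and edge handling assumed):
;   int v = pix[0] + pix[1] + pix[2] + pix[3];
;   for( int x = 0; x < stride-4; x++ )
;   {
;       sum[x] = v + sum[x-stride];
;       v += pix[x+4] - pix[x];
;   }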
cglobal x264_integral_init4h_sse4, 3,4
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1
cglobal x264_integral_init8h_sse4, 3,4
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1
%macro INTEGRAL_INIT 1
;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
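; Hedged sketch of the core operation (details assumed from the prototype):
; differencing rows of horizontal sums 4 rows apart turns them into 4x4 box
; sums.
;   for( int x = 0; x < stride; x++ )
;       sum4[x] = sum8[x+4*stride] - sum8[x];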
cglobal x264_integral_init4v_%1, 3,5
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
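; Hedged sketch: the same differencing trick with rows 8 apart, done in place
; (this matches the psubw of two rows in the loop below):
;   for( int x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];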
cglobal x264_integral_init8v_%1, 3,3
    mova    m1, [r2+r1+mmsize]
    psubw   m1, [r0+r1+mmsize]
    mova    [r0+r1+mmsize], m1
    pavgb   %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    mova    m3, [r0+%4+mmsize]
    pavgb   m3, [r0+%4+r5+mmsize]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    pavgb   m3, [r0+%3+r5+8]
    pavgb   m1, [r0+%3+r5+9]
    pavgb   m0, [r0+%3+r5+1]
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
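; Hedged scalar model: each output plane is a half-resolution average-of-
; averages at one of four half-pel phases (src1/src2 are the next two source
; rows; x runs over the destination width; details assumed):
;   #define FILT(a,b,c,d) ((((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1)
;   dst0[x] = FILT( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;   dsth[x] = FILT( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;   dstv[x] = FILT( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;   dstc[x] = FILT( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
; pavgb computes exactly (a+b+1)>>1, hence the pairs of pavgb per output in
; FILT8x4/FILT16x2.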
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES
%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12