;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul51: times 8 db -5, 1
pd_128: times 4 dd 128
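
; The hpel filters below implement H.264's 6-tap [1 -5 20 20 -5 1] kernel.
; filt_mul20/filt_mul51 hold the taps as interleaved byte pairs so the SSSE3
; paths can apply two taps per input with a single pmaddubsw after punpck'ing
; two rows together.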

    psubw  m1, m2  ; a-5*b+4*c
    paddw  m1, m3  ; a-5*b+20*c

    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
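; Sanity check of the decomposition (psraw is an arithmetic shift, i.e. floor
; division, and adding an integer commutes with floor):
;   ((a-b)>>2) - b + c          == (a-5*b+4*c) >> 2
;   (((a-b)>>2) - b + c) >> 2   == (a-5*b+4*c) >> 4
;   (...) + c                   == (a-5*b+20*c) >> 4
; so the result is exactly (a-5*b+20*c)>>4 while every intermediate stays
; within 16-bit range.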

;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_v_%1, 5,6,%2
    mova      m0, [filt_mul51]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m3, [filt_mul20]
    pmaddubsw m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    mova [r2+r4*2+mmsize], m4
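
; For orientation, a scalar sketch of what the v/c/h filters jointly compute,
; modelled on x264's C fallback (x264_clip_uint8 clamps to [0,255]); the asm
; is organized differently but targets the same results:
;     #define TAPFILTER( pix, d ) ( (pix)[-2*(d)] - 5*(pix)[-(d)] + 20*(pix)[0] \
;                                 + 20*(pix)[d] - 5*(pix)[2*(d)] + (pix)[3*(d)] )
;     for( int y = 0; y < height; y++ )
;     {
;         for( int x = -2; x < width+3; x++ )
;         {
;             int v = TAPFILTER( &src[x], stride );   // vertical pass
;             dstv[x] = x264_clip_uint8( (v + 16) >> 5 );
;             buf[x+2] = v;                           // full precision, for dstc
;         }
;         for( int x = 0; x < width; x++ )            // centre pass, over buf
;             dstc[x] = x264_clip_uint8( (TAPFILTER( &buf[x+2], 1 ) + 512) >> 10 );
;         for( int x = 0; x < width; x++ )            // horizontal pass
;             dsth[x] = x264_clip_uint8( (TAPFILTER( &src[x], 1 ) + 16) >> 5 );
;         dsth += stride; dstv += stride; dstc += stride; src += stride;
;     }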

;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_mmxext, 3,3
    paddw m3, [src+2]  ; c0
    paddw m4, [src+14] ; a1
    paddw m5, [src+12] ; b1
    paddw m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_mmxext, 3,3
    FILT_H2 m1, m2, m3, m4, m5, m6

;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_%1, 3,3,9
%define tpw_32 [pw_32]
%ifidn %1, sse2_misalign
    PALIGNR m3, m2, 2, m7
    PALIGNR m4, m2, 4, m7
    PALIGNR m5, m2, 6, m7
    PALIGNR m0, m6, 12, m7
    PALIGNR m1, m6, 14, m7

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_sse2, 3,3,8
    mova m7, [pw_1] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_ssse3, 3,3
    punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8

%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

    mova      %2, [filt_mul51]
    pmaddubsw m3, [filt_mul20]
    pmaddubsw %1, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    movntps [r11+r4+%5], m1
    PALIGNR m1, %1, 12, m4
    PALIGNR m2, %1, 14, m4
    PALIGNR %3, %2, 6, m4
    PALIGNR m3, %2, 4, m4
    PALIGNR m4, %2, 2, m1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %2, %1, %4, 6
    DO_FILT_H %1, %2, %3, 1
    DO_FILT_H %2, %1, %4, 1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %4, %5, %6, 1

;-----------------------------------------------------------------------------
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                             uint8_t *src, int stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_%1, 7,7,16
    ; prefetching does not help here! lots of variants tested, all slower
    DO_FILT_V m8, m7, m13, m12, 0, %1
    DO_FILT_V m6, m5, m11, m10, 16, %1
    DO_FILT_CC m9, m8, m7, m6
    movdqa    m7, m12 ; not really necessary, but seems free and
    movdqa    m6, m11 ; gives far shorter code
    DO_FILT_HH m14, m13, m7, m6
    ; setup regs for next y
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

;-----------------------------------------------------------------------------
; void x264_plane_copy_core_mmxext( uint8_t *dst, int i_dst,
;                                   uint8_t *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
cglobal x264_plane_copy_core_mmxext, 6,7
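
; In scalar terms the core is a row-wise memcpy (hypothetical reference, not
; the actual fallback); mod-16 w and i_dst>w let it copy whole aligned chunks,
; harmlessly overrunning into the stride padding:
;     static void plane_copy_ref( uint8_t *dst, int i_dst,
;                                 uint8_t *src, int i_src, int w, int h )
;     {
;         for( ; h > 0; h--, dst += i_dst, src += i_src )
;             memcpy( dst, src, w );
;     }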

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they will also fail if given a non-mod16 size or a size less than 64.
; memzero SSE will fail for non-mod128.

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_mmx, 3,3
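    ; The copy walks downward from the end of both buffers: the first block
    ; below handles an odd 16-byte chunk when n is not a multiple of 32, the
    ; second is the unrolled 32-byte main loop.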
    movq  mm0, [r1 + r2 + 0]
    movq  mm1, [r1 + r2 + 8]
    movq  [r0 + r2 + 0], mm0
    movq  [r0 + r2 + 8], mm1

    movq  mm0, [r1 + r2 +  0]
    movq  mm1, [r1 + r2 +  8]
    movq  mm2, [r1 + r2 + 16]
    movq  mm3, [r1 + r2 + 24]
    movq  [r0 + r2 +  0], mm0
    movq  [r0 + r2 +  8], mm1
    movq  [r0 + r2 + 16], mm2
    movq  [r0 + r2 + 24], mm3

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_sse2, 3,3
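    ; Same downward scheme as the MMX version: 16- and 32-byte blocks peel n
    ; down to a multiple of 64 before the unrolled 64-byte main loop.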
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0

    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1

    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3

;-----------------------------------------------------------------------------
; void *x264_memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memzero_aligned_%1, 2,2
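    ; The store below sits in an unrolled block that clears mmsize*8 bytes
    ; per iteration, hence the mod-128 restriction on the SSE variant noted
    ; above.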
    mova [r0 + r1 + i], m0

;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

cglobal x264_integral_init8h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1
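
; Scalar sketch of the horizontal step (modelled on x264's C fallback): each
; output element is a sliding 4- (or, for init8h, 8-) pixel sum of the current
; pixel row, accumulated onto the integral row above it.
;     static void integral_init4h_ref( uint16_t *sum, uint8_t *pix, int stride )
;     {
;         int v = pix[0] + pix[1] + pix[2] + pix[3];
;         for( int x = 0; x < stride-4; x++ )
;         {
;             sum[x] = v + sum[x-stride];   // accumulate onto the row above
;             v += pix[x+4] - pix[x];       // slide the 4-wide window right
;         }
;     }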

%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init8v_%1, 3,3
    mova  m1, [r2+r1+mmsize]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1+mmsize], m1
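; Scalar sketch: the vertical step differences integral rows 8 apart, turning
; the running horizontal sums into sums over 8 rows:
;     for( int x = 0; x < stride; x++ )
;         sum8[x] = sum8[x+8*stride] - sum8[x];
; init4v below does the analogous 4-row difference.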

INTEGRAL_INIT_8V sse2

;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
cglobal x264_integral_init4v_mmx, 3,5
cglobal x264_integral_init4v_sse2, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1
cglobal x264_integral_init4v_ssse3, 3,5

    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    mova  m3, [r0+%4+mmsize]
    pavgb m3, [r0+%4+r5+mmsize]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    pavgb m3, [r0+%3+r5+8]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
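; Scalar sketch (after x264's C fallback): each lowres plane is a 2x2 rounding
; average of the source at one of the four half-pel phases, built from two
; pavgb-style averages.
;     #define FILTER( a, b, c, d ) ( (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1 )
;     for( int y = 0; y < height; y++ )
;     {
;         uint8_t *src1 = src0 + src_stride, *src2 = src1 + src_stride;
;         for( int x = 0; x < width; x++ )
;         {
;             dst0[x] = FILTER( src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1] );
;             dsth[x] = FILTER( src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2] );
;             dstv[x] = FILTER( src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1] );
;             dstc[x] = FILTER( src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2] );
;         }
;         src0 += src_stride*2;
;         dst0 += dst_stride; dsth += dst_stride;
;         dstv += dst_stride; dstc += dst_stride;
;     }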
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES

%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
cglobal x264_mbtree_propagate_cost_sse2, 6,6
    movdqa xmm4, [pd_128]
    movq   xmm2, [r2+r5] ; intra
    movq   xmm0, [r4+r5] ; invq
    punpcklwd xmm2, xmm5
    punpcklwd xmm0, xmm5
    psrld  xmm0, 8       ; intra*invq>>8
    movq   xmm1, [r1+r5] ; prop
    movq   xmm3, [r3+r5] ; inter
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    paddd  xmm0, xmm1    ; prop + (intra*invq>>8)
    cvtdq2ps xmm1, xmm2  ; intra
    psubd  xmm2, xmm3    ; intra - inter
    mulps  xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
    divps  xmm0, xmm1    ; / intra
    cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
    movdqa [r0+r5*2], xmm0
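
; Per element, the kernel above computes (scalar sketch; pd_128 supplies the
; +128 rounding, and cvttps2dq truncates toward zero like a C float->int cast):
;     for( int i = 0; i < len; i++ )
;     {
;         int propagate_amount = propagate_in[i]
;                              + ((intra_costs[i] * inv_qscales[i] + 128) >> 8);
;         dst[i] = (int)( (float)propagate_amount
;                       * (intra_costs[i] - inter_costs[i]) / intra_costs[i] );
;     }
; done four lanes at a time, with the multiply and divide performed in single
; precision (mulps/divps) rather than integer arithmetic.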