;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul51: times 8 db 1, -5

pd_128: times 4 dd 128
    psubw  m1, m2       ; a-5*b+4*c
    paddw  m1, m3       ; a-5*b+20*c

    psubw  %1, %2       ; (a-b)/4-b
    paddw  %1, %3       ; (a-b)/4-b+c
    psraw  %1, 2        ; ((a-b)/4-b+c)/4
    paddw  %1, %3       ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
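; The last identity is just the strength-reduced form of the 6-tap weights:
; with a = p[-2]+p[3], b = p[-1]+p[2], c = p[0]+p[1],
;   ((a-b)/4 - b + c)/4 + c  =  a/16 - 5*b/16 + 20*c/16  =  (a - 5*b + 20*c)/16
; For reference, a rough scalar sketch of the same tap filter in C
; (illustrative only; the helper name is hypothetical):
;   static inline int tapfilter( const uint8_t *p, int d )
;   {   /* d = element stride: 1 for horizontal, line stride for vertical */
;       return p[-2*d] - 5*p[-d] + 20*p[0] + 20*p[d] - 5*p[2*d] + p[3*d];
;   }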
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
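; Vertical pass: runs the 6-tap filter down each column, storing the clipped
; 8-bit result to dst and the unclipped 16-bit intermediate to buf for the
; center filter to reuse. A hedged scalar sketch (tapfilter as above, clip to
; [0,255] assumed):
;   for( x = 0; x < width; x++ )
;   {
;       int v = tapfilter( src+x, stride );
;       dst[x] = clip( (v+16) >> 5 );
;       buf[x] = v;
;   }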
cglobal x264_hpel_filter_v_%1, 5,6,%2
    mova       m0, [filt_mul51 GLOBAL]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20 GLOBAL]
    pmaddubsw  m6, [filt_mul20 GLOBAL]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    mova       m7, [pw_16 GLOBAL]
    mova       [r2+r4*2+mmsize], m4
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
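; Center pass: the same 6-tap filter applied horizontally to the 16-bit
; intermediates from the vertical pass, so the combined scale is 1/1024.
; Hedged scalar sketch (tapfilter16 = tapfilter widened to int16_t input):
;   for( x = 0; x < width; x++ )
;       dst[x] = clip( (tapfilter16( buf+x, 1 ) + 512) >> 10 );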
cglobal x264_hpel_filter_c_mmxext, 3,3
    movq   m7, [pw_32 GLOBAL]
    paddw  m3, [src+2]  ; c0
    paddw  m4, [src+14] ; a1
    paddw  m5, [src+12] ; b1
    paddw  m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_mmxext( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
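; Horizontal pass: the 6-tap filter run directly over the 8-bit source row.
; Hedged scalar sketch:
;   for( x = 0; x < width; x++ )
;       dst[x] = clip( (tapfilter( src+x, 1 ) + 16) >> 5 );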
cglobal x264_hpel_filter_h_mmxext, 3,3
    movq   m7, [pw_1 GLOBAL]
    FILT_H2 m1, m2, m3, m4, m5, m6

;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_c_%1, 3,3,9
    mova   m7, [pw_32 GLOBAL]
    mova   m8, [pw_32 GLOBAL]
%define tpw_32 [pw_32 GLOBAL]
%ifidn %1,sse2_misalign
    PALIGNR m3, m2, 2, m7
    PALIGNR m4, m2, 4, m7
    PALIGNR m5, m2, 6, m7
    PALIGNR m0, m6, 12, m7
    PALIGNR m1, m6, 14, m7

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_sse2, 3,3,8
    mova   m7, [pw_1 GLOBAL] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6

;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_h_ssse3, 3,3
    punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
    mova   m7, [pw_1 GLOBAL]

%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

    mova       %2, [filt_mul51 GLOBAL]
    pmaddubsw  m3, [filt_mul20 GLOBAL]
    pmaddubsw  %1, [filt_mul20 GLOBAL]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    movntps [r11+r4+%5], m1
    PALIGNR m1, %1, 12, m4
    PALIGNR m2, %1, 14, m4
    PALIGNR %3, %2, 6, m4
    PALIGNR m3, %2, 4, m4
    PALIGNR m4, %2, 2, m1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %2, %1, %4, 6
    DO_FILT_H %1, %2, %3, 1
    DO_FILT_H %2, %1, %4, 1
    DO_FILT_H %1, %2, %3, 6
    DO_FILT_H %4, %5, %6, 1

;-----------------------------------------------------------------------------
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                             uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
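; Fused version: for each row, DO_FILT_V produces the vertical result and the
; 16-bit intermediates, then DO_FILT_CC derives the center plane from those
; intermediates and DO_FILT_HH the horizontal plane, all in one pass over the
; frame instead of three.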
cglobal x264_hpel_filter_%1, 7,7,16
    ; prefetching does not help here! lots of variants tested, all slower
    DO_FILT_V  m8, m7, m13, m12, 0, %1
    DO_FILT_V  m6, m5, m11, m10, 16, %1
    DO_FILT_CC m9, m8, m7, m6
    movdqa     m7, m12 ; not really necessary, but seems free and
    movdqa     m6, m11 ; gives far shorter code
    DO_FILT_HH m14, m13, m7, m6
    ; setup regs for next y
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
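; Strided 2D copy: one row of w bytes per iteration, each pointer advancing by
; its own stride. Hedged scalar equivalent:
;   for( y = 0; y < h; y++, dst += i_dst, src += i_src )
;       memcpy( dst, src, w );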
cglobal x264_plane_copy_mmxext, 6,7

; These functions are not general-use; not only do the SSE ones require aligned
; input, but they will also fail if given a non-mod16 size or a size less than 64.
; memzero SSE will fail for non-mod128 sizes.

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
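; Under the constraints above (aligned pointers, size a multiple of 16 and at
; least 64), this behaves like memcpy(dst, src, n). The MMX body below moves
; 16 bytes in the head fixup and 32 bytes per main-loop iteration.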
cglobal x264_memcpy_aligned_mmx, 3,3
    movq  mm0, [r1 + r2 + 0]
    movq  mm1, [r1 + r2 + 8]
    movq  [r0 + r2 + 0], mm0
    movq  [r0 + r2 + 8], mm1

    movq  mm0, [r1 + r2 + 0]
    movq  mm1, [r1 + r2 + 8]
    movq  mm2, [r1 + r2 + 16]
    movq  mm3, [r1 + r2 + 24]
    movq  [r0 + r2 + 0], mm0
    movq  [r0 + r2 + 8], mm1
    movq  [r0 + r2 + 16], mm2
    movq  [r0 + r2 + 24], mm3

;-----------------------------------------------------------------------------
; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal x264_memcpy_aligned_sse2, 3,3
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0

    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1

    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3

;-----------------------------------------------------------------------------
; void *x264_memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
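; Sketch of the contract: equivalent to memset(dst, 0, n) with dst aligned
; and, for the SSE variant, n a multiple of 128 (the unrolled loop stores
; several zeroed registers per iteration).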
cglobal x264_memzero_aligned_%1, 2,2
    mova  [r0 + r1 + i], m0

;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
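; Horizontal pass of the integral image: each entry is the entry one row up
; plus a 4-wide box sum of pixels. Hedged C sketch:
;   for( x = 0; x < stride; x++ )
;       sum[x] = sum[x-stride] + pix[x] + pix[x+1] + pix[x+2] + pix[x+3];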
cglobal x264_integral_init4h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

cglobal x264_integral_init8h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
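; Vertical pass: turn the per-row horizontal sums into 8-row box sums in
; place by subtracting each entry from the one 8 rows below. Hedged C sketch:
;   for( x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];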
cglobal x264_integral_init8v_%1, 3,3
    mova   m1, [r2+r1+mmsize]
    psubw  m1, [r0+r1+mmsize]
    mova   [r0+r1+mmsize], m1

INTEGRAL_INIT_8V sse2

;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
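; Same idea over a 4-row span, writing into the separate sum4 plane (the
; shufpd/PALIGNR variants below also combine horizontally shifted sums).
; Hedged C sketch of the 4-row difference only, assuming the same layout:
;   for( x = 0; x < stride; x++ )
;       sum4[x] = sum8[x+4*stride] - sum8[x];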
cglobal x264_integral_init4v_mmx, 3,5

cglobal x264_integral_init4v_sse2, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1

cglobal x264_integral_init4v_ssse3, 3,5

    pavgb   %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6

    mova    m3, [r0+%4+mmsize]
    pavgb   m3, [r0+%4+r5+mmsize]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6

    pavgb   m3, [r0+%3+r5+8]
    pavgb   m1, [r0+%3+r5+9]
    pavgb   m0, [r0+%3+r5+1]

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
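; Builds the four half-resolution planes used by lookahead: each output pixel
; is a rounded average of a 2x2 source neighborhood, with dsth/dstv/dstc
; offset by a half pel in x, y, or both. Hedged C sketch, with src1/src2 the
; next two source rows and avg(a,b) = (a+b+1)>>1 (matching pavgb):
;   for( x = 0; x < width; x++ )
;   {
;       dst0[x] = avg( avg(src0[2*x  ], src1[2*x  ]), avg(src0[2*x+1], src1[2*x+1]) );
;       dsth[x] = avg( avg(src0[2*x+1], src1[2*x+1]), avg(src0[2*x+2], src1[2*x+2]) );
;       dstv[x] = avg( avg(src1[2*x  ], src2[2*x  ]), avg(src1[2*x+1], src2[2*x+1]) );
;       dstc[x] = avg( avg(src1[2*x+1], src2[2*x+1]), avg(src1[2*x+2], src2[2*x+2]) );
;   }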
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES

%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
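; Per-macroblock cost propagation for macroblock-tree ratecontrol. Hedged C
; equivalent of the vector loop below (the asm does the multiply/divide in
; single-precision float and truncates, as its final comment notes):
;   for( i = 0; i < len; i++ )
;   {
;       int propagate = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128) >> 8);
;       dst[i] = (int)( (float)propagate * (intra_costs[i] - inter_costs[i]) / intra_costs[i] );
;   }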
cglobal x264_mbtree_propagate_cost_sse2, 6,6
    movdqa    xmm4, [pd_128 GLOBAL]
    movq      xmm2, [r2+r5] ; intra
    movq      xmm0, [r4+r5] ; invq
    punpcklwd xmm2, xmm5
    punpcklwd xmm0, xmm5
    psrld     xmm0, 8       ; intra*invq>>8
    movq      xmm1, [r1+r5] ; prop
    movq      xmm3, [r3+r5] ; inter
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
    cvtdq2ps  xmm1, xmm2    ; intra
    psubd     xmm2, xmm3    ; intra - inter
    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
    divps     xmm0, xmm1    ; / intra
    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
    movdqa    [r0+r5*2], xmm0