;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
hpel_shuf:  db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
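
; The filt_mul* constants pack the H.264 6-tap kernel (1,-5,20,20,-5,1) in the
; interleaved byte order that pmaddubsw expects: for bytes interleaved as
; (a0,b0,a1,b1,...), multiplying by filt_mul15 = (1,-5,1,-5,...) yields the
; words a-5*b. A hedged C model of one such output word (illustration only,
; not code from this project):
;
;   #include <stdint.h>
;   // one output word of pmaddubsw x, filt_mul15, where x interleaves a and b
;   static int16_t filt15_word( uint8_t a, uint8_t b )
;   {
;       return (int16_t)( 1*a - 5*b ); // range -1275..255, never saturates
;   }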
    psubw  %1, %2  ; a-5*b+4*c
    paddw  %1, %3  ; a-5*b+20*c

    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
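; Sanity check of the shift-only evaluation above:
;   ((a-b)/4 - b + c)/4 + c = a/16 - 5*b/16 + 20*c/16 = (a - 5*b + 20*c)/16,
; i.e. the 6-tap kernel is applied with shifts and adds only, no multiplies.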
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Using temporal stores for hpel_filter instead may be a win if the last-level
;cache is big enough (preliminary benchmarks suggest on the order of 4x the
;frame size).
;%define movntps movaps
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
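; Hedged C sketch of what this routine computes (the clip helper name and the
; exact buf offsets are assumptions, not taken from this file):
;
;   for( int x = 0; x < width; x++ ) {
;       int v = src[x-2*stride] + src[x+3*stride]
;             - 5*(src[x-stride] + src[x+2*stride])
;             + 20*(src[x] + src[x+stride]);
;       buf[x] = v;                            // 16-bit intermediate, reused by hpel_filter_c
;       dst[x] = x264_clip_uint8( (v+16)>>5 ); // rounded 8-bit vertical halfpel
;   }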
cglobal hpel_filter_v_%1, 5,6,%2
    mova      m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m3, [filt_mul20]
    pmaddubsw m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3, [r1+r3*2], [r5        ], m7      ; c0
    LOAD_ADD   m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2   m1, m2, m3, m4, m5, m6
    mova      [r2+r4*2+mmsize], m4
    FILT_PACK m1, m4, 5, m7
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
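; Hedged C sketch (names assumed, not taken from this file): buf holds the
; 16-bit vertical intermediates, which are filtered again horizontally at
; 10-bit precision:
;
;   for( int x = 0; x < width; x++ )
;       dst[x] = x264_clip_uint8( (buf[x-2] + buf[x+3]
;                                  - 5*(buf[x-1] + buf[x+2])
;                                  + 20*(buf[x] + buf[x+1]) + 512) >> 10 );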
cglobal hpel_filter_c_mmxext, 3,3
    paddw m3, [src+2]  ; c0
    paddw m4, [src+14] ; a1
    paddw m5, [src+12] ; b1
    paddw m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6, m7
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
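; Hedged C sketch: the plain horizontal halfpel filter on 8-bit input
; (clip helper name assumed):
;
;   for( int x = 0; x < width; x++ )
;       dst[x] = x264_clip_uint8( (src[x-2] + src[x+3]
;                                  - 5*(src[x-1] + src[x+2])
;                                  + 20*(src[x] + src[x+1]) + 16) >> 5 );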
cglobal hpel_filter_h_mmxext, 3,3
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_%1, 3,3,9
%define tpw_32 [pw_32]
%ifidn %1,sse2_misalign
    FILT_H2 m4, m5, m6, m3, m2, m1
    PALIGNR m4, m0, 12, m7
    PALIGNR m5, m0, 14, m0
    PALIGNR m0, m1, 6, m7
    PALIGNR m0, m1, 4, m7
    PALIGNR m6, m1, 2, m7
    PALIGNR m2, m1, 12, m7
    PALIGNR m5, m1, 14, m1
    PALIGNR m3, m0, 6, m7
    PALIGNR m6, m0, 4, m7
    PALIGNR m6, m0, 2, m7
    FILT_PACK m4, m3, 6, tpw_32
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
    mova      m7, [pw_1] ; FIXME xmm8
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_ssse3, 3,3
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, 5, m7
    pshufb    m3, [hpel_shuf]
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
;The optimal prefetch distance is difficult to determine in checkasm:
;there, any prefetch seems slower than not prefetching. In real use, the
;prefetch seems to be a slight win. +16 is picked somewhat arbitrarily here,
;on the grounds that even one loop iteration will take longer than the
;prefetch does to complete.
    prefetcht0 [r1+r2*2+16]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 5, m15
    movntps [r11+r4+%5], m1
    PALIGNR m1, %1, 12, m2
    PALIGNR m2, %1, 14, %1
    PALIGNR m3, %2, 4, %1
    PALIGNR m4, %2, 2, %1
    PALIGNR %3, %2, 6, m2
    FILT_PACK %3, %4, 6, m15
    PALIGNR m1, %1, 14, m3
    PALIGNR m2, %1, 15, m3
    PALIGNR m4, %2, 1, m3
    PALIGNR m5, %2, 2, m3
    PALIGNR m6, %2, 3, m3
    ADD8TO16 m1, m6, m12, m3, m0 ; a
    ADD8TO16 m2, m5, m12, m3, m0 ; b
    ADD8TO16 %2, m4, m12, m3, m0 ; c
    FILT_V2 m1, m2, %2, m6, m5, m4
    FILT_PACK m1, m6, 5, m15
    FILT_PACK m1, m2, 5, m15
    pshufb  m1, [hpel_shuf]
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, int stride, int width, int height );
;-----------------------------------------------------------------------------
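; Hedged sketch of the per-row flow below (simplified): each iteration runs
; the vertical filter (DO_FILT_V), producing dstv plus 16-bit intermediates;
; then the center filter (DO_FILT_C) on those intermediates, producing dstc;
; then the horizontal filter (DO_FILT_H) on src, producing dsth.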
cglobal hpel_filter_%1, 7,7,16
    mova      m0, [filt_mul51]
    mova      m12, [filt_mul15]
    mova      m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0, %1
    DO_FILT_V m6, m5, m11, m12, 16, %1
    paddw     m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    DO_FILT_H m10, m13, m11, %1
    ; setup regs for next y
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
;-----------------------------------------------------------------------------
; void plane_copy_core( uint8_t *dst, int i_dst,
;                       uint8_t *src, int i_src, int w, int h );
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
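; Hedged C model of the contract (not the implementation): rows are copied in
; aligned 16-byte chunks, so stores may run a little past w; i_dst > w keeps
; that overshoot inside the destination plane's padding.
;
;   for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
;       memcpy( dst, src, w );   // w is mod16; the asm may overshoot w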
cglobal plane_copy_core_mmxext, 6,7

%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant
%macro PLANE_INTERLEAVE 1
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
;                                  uint8_t *srcu, int i_srcu,
;                                  uint8_t *srcv, int i_srcv, int w, int h );
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
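; Hedged C model of the operation (not this file's code):
;
;   for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
;       for( int x = 0; x < w; x++ ) {
;           dst[2*x]   = srcu[x]; // U and V bytes interleaved into one plane
;           dst[2*x+1] = srcv[x];
;       }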
cglobal plane_copy_interleave_core_%1, 6,7
DECLARE_REG_TMP 10,11
    INTERLEAVE r0+r6*2,    r2+r6,   r4+r6,   0, nt
    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
    movntq  [r0+r6*2+8],  m0
    movntq  [r0+r6*2+16], m0
    movntq  [r0+r6*2+24], m0
    movntdq [r0+r6*2],    m0
    movntdq [r0+r6*2+16], m0
;-----------------------------------------------------------------------------
; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
;-----------------------------------------------------------------------------
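; Hedged note: srcu/srcv are 8x8 blocks laid out at FDEC_STRIDE; each output
; row receives 16 bytes of interleave(srcu_row, srcv_row), i.e.
; dst[2*x] = u, dst[2*x+1] = v.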
cglobal store_interleave_8x8x2_%1, 4,5
    INTERLEAVE r0,    r2,             r3,             1
    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
    add r2, FDEC_STRIDE*2
    add r3, FDEC_STRIDE*2
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 1
    mova m4, [deinterleave_shuf]
%macro PLANE_DEINTERLEAVE 1
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu,
;                               uint8_t *dstv, int i_dstv,
;                               uint8_t *src, int i_src, int w, int h );
;-----------------------------------------------------------------------------
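; Hedged C model: split an interleaved chroma plane back into U and V:
;
;   for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
;       for( int x = 0; x < w; x++ ) {
;           dstu[x] = src[2*x];
;           dstv[x] = src[2*x+1];
;       }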
cglobal plane_copy_deinterleave_%1, 6,7
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0+r6,   r2+r6,   r4+r6*2,    0, %1, m4
    DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4

;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src );
;-----------------------------------------------------------------------------
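; Hedged note: dst receives both planes of the 8x8x2 block, U rows at dst and
; V rows at dst+FENC_STRIDE/2, two source rows per iteration.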
cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0,             r0+FENC_STRIDE/2,   r1,    1, %1, m4
    DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4
    add r0, FENC_STRIDE*2

;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src );
;-----------------------------------------------------------------------------
cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0,             r0+FDEC_STRIDE/2,   r1,    0, %1, m4
    DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4
    add r0, FDEC_STRIDE*2
%endmacro ; PLANE_DEINTERLEAVE
PLANE_INTERLEAVE mmxext
PLANE_DEINTERLEAVE mmx
PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE sse2
PLANE_DEINTERLEAVE ssse3
; These functions are not general-use; not only do the SSE ones require
; aligned input, they will also fail if given a non-mod16 size or a size
; less than 64. The SSE memzero will fail for non-mod128 sizes.
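; Hedged illustration of those constraints (hypothetical, suitably aligned
; buffers):
;
;   memcpy_aligned( dst, src, 256 );  // OK: mod16 and >= 64
;   memcpy_aligned( dst, src, 40 );   // NOT supported: < 64 (and not mod16)
;   memzero_aligned( dst, 384 );      // OK for SSE: mod128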
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
    movq mm0, [r1 + r2 + 0]
    movq mm1, [r1 + r2 + 8]
    movq [r0 + r2 + 0], mm0
    movq [r0 + r2 + 8], mm1
    movq mm0, [r1 + r2 + 0]
    movq mm1, [r1 + r2 + 8]
    movq mm2, [r1 + r2 + 16]
    movq mm3, [r1 + r2 + 24]
    movq [r0 + r2 + 0], mm0
    movq [r0 + r2 + 8], mm1
    movq [r0 + r2 + 16], mm2
    movq [r0 + r2 + 24], mm3
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned_%1, 2,2
    mova [r0 + r1 + i], m0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride );
;-----------------------------------------------------------------------------
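; Hedged C model (x264 carries a plain-C equivalent; this is a paraphrase and
; details such as loop bounds may differ): each sum[x] is the previous row's
; sum plus a sliding 4-pixel horizontal sum.
;
;   int v = pix[0]+pix[1]+pix[2]+pix[3];
;   for( int x = 0; x < stride-4; x++ ) {
;       sum[x] = v + sum[x-stride];
;       v += pix[x+4] - pix[x];
;   }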
cglobal integral_init4h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

cglobal integral_init8h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1
%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride );
;-----------------------------------------------------------------------------
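; Hedged C model (paraphrased, not copied from the C reference): subtract the
; integral row 8 lines up, leaving sums of 8-row columns:
;
;   for( int x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];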
cglobal integral_init8v_%1, 3,3
    mova  m1, [r2+r1+mmsize]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1+mmsize], m1

INTEGRAL_INIT_8V mmx
INTEGRAL_INIT_8V sse2
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride );
;-----------------------------------------------------------------------------
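; Hedged C model (paraphrased; x264 also carries a plain-C version whose
; exact bounds and offsets may differ from this sketch):
;
;   for( int x = 0; x < stride; x++ )
;       sum4[x] = sum8[x+4*stride] - sum8[x];  // 4-row column sums
;   // sum8 is likewise collapsed in place into 8-row sums.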
cglobal integral_init4v_mmx, 3,5

cglobal integral_init4v_sse2, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1

cglobal integral_init4v_ssse3, 3,5

    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    mova  m3, [r0+%4+mmsize]
    pavgb m3, [r0+%4+r5+mmsize]
    pavgb m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    pavgb m3, [r0+%3+r5+8]
    pavgb m2, [r0+%3+r5]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height );
;-----------------------------------------------------------------------------
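; Hedged C model of the downsampling (a paraphrase of x264's C fallback, not
; copied from it): each quarter-res plane averages a 2x2 neighborhood at a
; different half-pel offset, using chained pavgb-style rounding:
;
;   #define FILT(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
;   for( int y = 0; y < height; y++ ) {
;       uint8_t *src1 = src0+src_stride, *src2 = src1+src_stride;
;       for( int x = 0; x < width; x++ ) {
;           dst0[x] = FILT(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
;           dsth[x] = FILT(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
;           dstv[x] = FILT(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
;           dstc[x] = FILT(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
;       }
;       src0 += src_stride*2; dst0 += dst_stride; dsth += dst_stride;
;       dstv += dst_stride;   dstc += dst_stride;
;   }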
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES

%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len );
;-----------------------------------------------------------------------------
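; Hedged summary of the arithmetic below, one element per SIMD lane
; (paraphrased from the per-instruction comments, not from the C reference):
;
;   dst[i] = (int)( (prop[i] + ((intra[i]*invq[i] + 128) >> 8))
;                   * (intra[i] - (inter[i] & 0x3fff)) / (float)intra[i] );
;
; The division is performed with rcpps plus one Newton-Raphson refinement, so
; the result can differ from an exact integer division in the low bit.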
cglobal mbtree_propagate_cost_sse2, 6,6,7
    movdqa xmm6, [pw_3fff]
    movdqa xmm4, [pd_128]
    movq   xmm2, [r2+r5] ; intra
    movq   xmm0, [r4+r5] ; invq
    movq   xmm3, [r3+r5] ; inter
    movq   xmm1, [r1+r5] ; prop
    punpcklwd xmm2, xmm5
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    psrld     xmm0, 8    ; intra*invq>>8
    paddd     xmm0, xmm1 ; prop + (intra*invq>>8)
    cvtdq2ps  xmm1, xmm2 ; intra
    psubd     xmm2, xmm3 ; intra - inter
    rcpps     xmm3, xmm1 ; 1/intra 1st approximation
    mulps     xmm1, xmm3 ; intra * (1/intra 1st approx)
    mulps     xmm1, xmm3 ; intra * (1/intra 1st approx)^2
    mulps     xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
    addps     xmm3, xmm3 ; 2 * (1/intra 1st approx)
    subps     xmm3, xmm1 ; 2nd approximation for 1/intra
    mulps     xmm0, xmm3 ; / intra
    cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
    movdqa [r0+r5*2], xmm0