;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
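; (filt_mul* are operand tables for pmaddubsw, which multiplies unsigned
; source bytes by these signed bytes and sums adjacent pairs, so each issue
; computes two taps of the 6-tap 1,-5,20,20,-5,1 half-pel filter.
; hpel_shuf re-interleaves the low/high halves after packing;
; deinterleave_shuf gathers the even bytes, then the odd bytes.)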
    psubw   %1, %2  ; a-5*b+4*c
    paddw   %1, %3  ; a-5*b+20*c

    psubw   %1, %2  ; (a-b)/4-b
    paddw   %1, %3  ; (a-b)/4-b+c
    psraw   %1, 2   ; ((a-b)/4-b+c)/4
    paddw   %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
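; For reference: the H.264 luma half-pel filter is
;   out = clip( (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5 )
; and each of a/b/c above is already the sum of one symmetric tap pair
; (a = A+F, b = B+E, c = C+D), so a-5*b+20*c is the full filter sum.
; The interleaved psraw/paddw sequence evaluates it as
;   (a-5*b+20*c)/16 = ((a-b)/4 - b + c)/4 + c
; to keep the intermediates within 16-bit range.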
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Running hpel_filter with temporal writes may be a win if the last-level
;cache is big enough (preliminary benching suggests on the order of
;4x framesize).

;%define movntps movaps
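;(Uncommenting the define above maps the non-temporal store back onto an
;ordinary aligned store, so both variants can be benchmarked on the same
;code path.)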
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
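; (Each iteration produces two outputs: the raw 16-bit a-5*b+20*c sums go to
; buf for reuse by the center filter, and the rounded, packed bytes go to dst.)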
cglobal hpel_filter_v_%1, 5,6,%2
    mova      m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw m3, [filt_mul20]
    pmaddubsw m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2 m1, m2, m3, m4, m5, m6
    mova      [r2+r4*2+mmsize], m4
    FILT_PACK m1, m4, 5, m7

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_mmxext, 3,3
    paddw m3, [src+2]  ; c0
    paddw m4, [src+14] ; a1
    paddw m5, [src+12] ; b1
    paddw m6, [src+10] ; c1
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6, m7
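; (buf holds the unshifted vertical sums, so the center-filter result needs a
; total division by 1024: FILT_H2's shifts give /16 and the pack shift of 6
; the remaining /64. The plain horizontal filter below works on raw pixels
; and packs with a shift of 1: /16 from FILT_H2, /2 from the pack, /32 in all.)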
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmxext, 3,3
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_%1, 3,3,9
%define tpw_32 [pw_32]
%ifidn %1,sse2_misalign
    FILT_H2 m4, m5, m6, m3, m2, m1

    PALIGNR m4, m0, 12, m7
    PALIGNR m5, m0, 14, m0
    PALIGNR m0, m1, 6, m7
    PALIGNR m0, m1, 4, m7
    PALIGNR m6, m1, 2, m7

    PALIGNR m2, m1, 12, m7
    PALIGNR m5, m1, 14, m1
    PALIGNR m3, m0, 6, m7
    PALIGNR m6, m0, 4, m7
    PALIGNR m6, m0, 2, m7
    FILT_PACK m4, m3, 6, tpw_32

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
    mova m7, [pw_1] ; FIXME xmm8
    FILT_H2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_ssse3, 3,3
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, 5, m7
    pshufb    m3, [hpel_shuf]
%define PALIGNR PALIGNR_MMX

%define PALIGNR PALIGNR_SSSE3

;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
    prefetcht0 [r1+r2*2+16]
    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2 m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 5, m15
    movntps [r11+r4+%5], m1

    PALIGNR m1, %1, 12, m2
    PALIGNR m2, %1, 14, %1
    PALIGNR m3, %2, 4, %1
    PALIGNR m4, %2, 2, %1
    PALIGNR %3, %2, 6, m2
    FILT_PACK %3, %4, 6, m15

    PALIGNR m1, %1, 14, m3
    PALIGNR m2, %1, 15, m3
    PALIGNR m4, %2, 1 , m3
    PALIGNR m5, %2, 2 , m3
    PALIGNR m6, %2, 3 , m3
    ADD8TO16 m1, m6, m12, m3, m0 ; a
    ADD8TO16 m2, m5, m12, m3, m0 ; b
    ADD8TO16 %2, m4, m12, m3, m0 ; c
    FILT_V2 m1, m2, %2, m6, m5, m4
    FILT_PACK m1, m6, 5, m15
    FILT_PACK m1, m2, 5, m15
    pshufb  m1, [hpel_shuf]
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, int stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter_%1, 7,7,16
    mova      m0, [filt_mul51]
    mova     m12, [filt_mul15]
    mova     m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0, %1
    DO_FILT_V m6, m5, m11, m12, 16, %1
    paddw    m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    DO_FILT_H m10, m13, m11, %1
    ; setup regs for next y

%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

;-----------------------------------------------------------------------------
; void plane_copy_core( uint8_t *dst, int i_dst,
;                       uint8_t *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
cglobal plane_copy_core_mmxext, 6,7
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant
%macro PLANE_INTERLEAVE 1
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  int i_dst,
;                                  uint8_t *srcu, int i_srcu,
;                                  uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
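; A rough C equivalent of one row (illustrative only):
;   for( int x = 0; x < w; x++ )
;   {
;       dst[2*x  ] = srcu[x];
;       dst[2*x+1] = srcv[x];
;   }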
cglobal plane_copy_interleave_core_%1, 6,7
DECLARE_REG_TMP 10,11
    INTERLEAVE r0+r6*2,    r2+r6,   r4+r6,   0, nt
    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
    movntq  [r0+r6*2+8],  m0
    movntq  [r0+r6*2+16], m0
    movntq  [r0+r6*2+24], m0
    movntdq [r0+r6*2],    m0
    movntdq [r0+r6*2+16], m0

;-----------------------------------------------------------------------------
; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
;-----------------------------------------------------------------------------
cglobal store_interleave_8x8x2_%1, 4,5
    INTERLEAVE r0,    r2,             r3,             1
    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
    add r2, FDEC_STRIDE*2
    add r3, FDEC_STRIDE*2
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 1
    mova m4, [deinterleave_shuf]
%macro PLANE_DEINTERLEAVE 1
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu,
;                               uint8_t *dstv, int i_dstv,
;                               uint8_t *src,  int i_src, int w, int h )
;-----------------------------------------------------------------------------
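; A rough C equivalent of one row (illustrative only):
;   for( int x = 0; x < w; x++ )
;   {
;       dstu[x] = src[2*x  ];
;       dstv[x] = src[2*x+1];
;   }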
cglobal plane_copy_deinterleave_%1, 6,7
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0+r6,   r2+r6,   r4+r6*2,    0, %1, m4
    DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4

;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0,             r0+FENC_STRIDE/2,   r1,    1, %1, m4
    DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4
    add r0, FENC_STRIDE*2

;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
    DEINTERLEAVE_START %1
    DEINTERLEAVE r0,             r0+FDEC_STRIDE/2,   r1,    0, %1, m4
    DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4
    add r0, FDEC_STRIDE*2
%endmacro ; PLANE_DEINTERLEAVE

PLANE_INTERLEAVE mmxext
PLANE_DEINTERLEAVE mmx
PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE sse2
PLANE_DEINTERLEAVE ssse3
; These functions are not general-use; not only do the SSE ones require aligned
; input, but they will also fail if given a non-mod16 size or a size less than 64.
; memzero SSE will additionally fail for a non-mod128 size.
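; The resulting caller contract, as C pseudo-asserts (illustrative only):
;   assert( n >= 64 && n % 16 == 0 );    // all versions
;   assert( ((intptr_t)dst & 15) == 0 ); // SSE versions (and src, for memcpy)
;   assert( n % 128 == 0 );              // memzero SSE only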
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
    movq mm0, [r1 + r2 +  0]
    movq mm1, [r1 + r2 +  8]
    movq [r0 + r2 +  0], mm0
    movq [r0 + r2 +  8], mm1

    movq mm0, [r1 + r2 +  0]
    movq mm1, [r1 + r2 +  8]
    movq mm2, [r1 + r2 + 16]
    movq mm3, [r1 + r2 + 24]
    movq [r0 + r2 +  0], mm0
    movq [r0 + r2 +  8], mm1
    movq [r0 + r2 + 16], mm2
    movq [r0 + r2 + 24], mm3

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0

    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1

    movdqa xmm0, [r1 + r2 +  0]
    movdqa [r0 + r2 +  0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned_%1, 2,2
    mova [r0 + r1 + i], m0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
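; C reference (roughly): each output is a 4-wide horizontal pixel sum
; accumulated onto the previous row of the integral image:
;   for( int x = 0; x < stride-4; x++ )
;       sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x-stride];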
cglobal integral_init4h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

;-----------------------------------------------------------------------------
; void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
cglobal integral_init8h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1
%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
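; C reference (roughly): subtract rows of horizontal sums 8 apart, in place,
; to get the 8-tall vertical sums:
;   for( int x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];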
cglobal integral_init8v_%1, 3,3
    mova  m1, [r2+r1+mmsize]
    psubw m1, [r0+r1+mmsize]
    mova  [r0+r1+mmsize], m1

INTEGRAL_INIT_8V mmx
INTEGRAL_INIT_8V sse2
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
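; C reference (roughly): rows of 4-wide sums 4 apart give the 4x4 sums, and
; pairs of 4-wide sums (x and x+4, hence the shufpd below) 8 apart rebuild
; the 8x8 sums in place:
;   for( int x = 0; x < stride-8; x++ )
;       sum4[x] = sum8[x+4*stride] - sum8[x];
;   for( int x = 0; x < stride-8; x++ )
;       sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];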
cglobal integral_init4v_mmx, 3,5

cglobal integral_init4v_sse2, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1

cglobal integral_init4v_ssse3, 3,5

    pavgb   %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6

    mova    m3, [r0+%4+mmsize]
    pavgb   m3, [r0+%4+r5+mmsize]
    pavgb   m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6

    pavgb   m3, [r0+%3+r5+8]
    pavgb   m2, [r0+%3+r5]
    pavgb   m1, [r0+%3+r5+9]
    pavgb   m0, [r0+%3+r5+1]
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
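; C reference for one output pixel (roughly): each lowres plane is a 2x2
; average of the source, offset by a (0,0)/(1,0)/(0,1)/(1,1) half-pel phase;
; FILTER matches the asm's chained pavgb exactly:
;   #define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
;   dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
;   dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
;   dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
;   dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
; (src1/src2 being the next two source rows)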
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4 m0, m1, m2, m3, m4, m5, 0
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES
%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
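; Per element this computes (pieced together from the per-instruction comments
; below; pd_128 and pw_3fff appear to round the >>8 and mask the inter costs):
;   dst[i] = (int)( (prop[i] + ((intra[i]*invq[i] + 128) >> 8))
;                 * (intra[i] - inter[i]) / (float)intra[i] );
; The float division is done with rcpps plus one Newton-Raphson step.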
cglobal mbtree_propagate_cost_sse2, 6,6,7
    movdqa xmm6, [pw_3fff]
    movdqa xmm4, [pd_128]
    movq   xmm2, [r2+r5] ; intra
    movq   xmm0, [r4+r5] ; invq
    movq   xmm3, [r3+r5] ; inter
    movq   xmm1, [r1+r5] ; prop
    punpcklwd xmm2, xmm5
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    psrld  xmm0, 8       ; intra*invq>>8
    paddd  xmm0, xmm1    ; prop + (intra*invq>>8)
    cvtdq2ps xmm1, xmm2  ; intra
    psubd  xmm2, xmm3    ; intra - inter
    rcpps  xmm3, xmm1    ; 1 / intra 1st approximation
    mulps  xmm1, xmm3    ; intra * (1/intra 1st approx)
    mulps  xmm1, xmm3    ; intra * (1/intra 1st approx)^2
    mulps  xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
    addps  xmm3, xmm3    ; 2 * (1/intra 1st approx)
    subps  xmm3, xmm1    ; 2nd approximation for 1/intra
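    ; (The mulps/addps/subps sequence above is one Newton-Raphson iteration,
    ; x1 = x0*(2 - intra*x0), refining rcpps's ~12-bit reciprocal estimate:
    ; xmm1 = intra*x0^2 and xmm3 = 2*x0, so xmm3-xmm1 = x0*(2 - intra*x0).)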
    mulps  xmm0, xmm3    ; / intra
    cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
    movdqa [r0+r5*2], xmm0