;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"

filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
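; The filt_mul* constants are pmaddubsw coefficient vectors: each pair of
; adjacent source bytes gets multiplied by a (1,-5), (-5,1) or (20,20) pair
; and summed, so three pmaddubsws cover the whole 6-tap kernel two taps at
; a time. hpel_shuf re-interleaves the halves of a register after packing
; (see hpel_filter_h_ssse3 below).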
    psubw   %1, %2  ; a-5*b+4*c
    paddw   %1, %3  ; a-5*b+20*c

    psubw   %1, %2  ; (a-b)/4-b
    paddw   %1, %3  ; (a-b)/4-b+c
    psraw   %1, 2   ; ((a-b)/4-b+c)/4
    paddw   %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
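; Ignoring the truncation in the two psraws, the algebra checks out:
;   ((a-b)/4 - b + c)/4 + c = (a-b)/16 - b/4 + c/4 + c
;                           = (a - b - 4*b + 4*c + 16*c)/16
;                           = (a - 5*b + 20*c)/16
; i.e. the shift/add sequence evaluates half of the symmetric 6-tap kernel
; without any multiplies.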
;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing hpel_filter with temporal writes may be a win if the last-level
;cache is big enough (preliminary benchmarks suggest on the order of
;4x the frame size).
;%define movntps movaps
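; For reference, every hpel_filter* routine below implements the H.264
; 6-tap luma half-pel filter; the scalar equivalent of one output pixel
; (a documentation sketch, not part of the build) is:
;   dst[x] = clip_uint8( (src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                         - 5*src[x+2] + src[x+3] + 16) >> 5 );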
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v_%1, 5,6,%2
    mova       m0, [filt_mul15]
    SBUTTERFLY bw, 1, 4, 7
    SBUTTERFLY bw, 2, 5, 7
    SBUTTERFLY bw, 3, 6, 7
    pmaddubsw  m3, [filt_mul20]
    pmaddubsw  m6, [filt_mul20]
    LOAD_ADD_2 m1, m4, [r1     ], [r5+r3*2], m6, m7 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r1+r3  ], [r5+r3  ], m6, m7 ; b0 / b1
    LOAD_ADD   m3,     [r1+r3*2], [r5     ], m7     ; c0
    LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    mova       [r2+r4*2+mmsize], m4
    FILT_PACK  m1, m4, 5, m7
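; FILT_PACK rounds and narrows: it adds the bias held in its register
; argument (assumed to be pw_16 here), shifts both halves right by the
; immediate (5, i.e. the (x+16)>>5 of the reference filter) and packs the
; words to unsigned bytes with saturation.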
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_mmxext, 3,3
    paddw     m3, [src+2]  ; c0
    paddw     m4, [src+14] ; a1
    paddw     m5, [src+12] ; b1
    paddw     m6, [src+10] ; c1
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 6, m7
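; buf holds the unscaled vertical sums, so the /16 from FILT_H2 combined
; with the (x+32)>>6 here gives the /1024 = (1/32)^2 required for the
; doubly-filtered centre-pixel plane.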
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmxext, 3,3
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c_%1, 3,3,9
%define tpw_32 [pw_32]
%ifidn %1,sse2_misalign
    FILT_H2   m4, m5, m6, m3, m2, m1
    PALIGNR   m4, m0, 12, m7
    PALIGNR   m5, m0, 14, m0
    PALIGNR   m0, m1, 6, m7
    PALIGNR   m0, m1, 4, m7
    PALIGNR   m6, m1, 2, m7
    PALIGNR   m2, m1, 12, m7
    PALIGNR   m5, m1, 14, m1
    PALIGNR   m3, m0, 6, m7
    PALIGNR   m6, m0, 4, m7
    PALIGNR   m6, m0, 2, m7
    FILT_PACK m4, m3, 6, tpw_32

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
    mova      m7, [pw_1] ; FIXME xmm8
    FILT_H2   m1, m2, m3, m4, m5, m6
    FILT_PACK m1, m4, 1, m7

;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_ssse3, 3,3
    pmaddubsw m3, [filt_mul15]
    pmaddubsw m4, [filt_mul15]
    pmaddubsw m0, [filt_mul51]
    pmaddubsw m1, [filt_mul20]
    pmaddubsw m5, [filt_mul20]
    pmaddubsw m6, [filt_mul51]
    FILT_PACK m3, m4, 5, m7
    pshufb    m3, [hpel_shuf]
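; pmaddubsw sums adjacent byte pairs, so the even and odd output pixels
; accumulate in separate registers; after FILT_PACK, the pshufb with
; hpel_shuf (0,8,1,9,...) interleaves the two packed halves back into
; pixel order.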
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
;The optimal prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here, on the basis that even one
;loop iteration will take longer than the prefetch.
    prefetcht0 [r1+r2*2+16]

    LOAD_ADD_2 m1, m4, [r3     ], [r1+r2*2], m2, m5 ; a0 / a1
    LOAD_ADD_2 m2, m5, [r3+r2  ], [r1+r2  ], m3, m6 ; b0 / b1
    LOAD_ADD_2 m3, m6, [r3+r2*2], [r1     ], %3, %4 ; c0 / c1
    FILT_V2    m1, m2, m3, m4, m5, m6
    FILT_PACK  m1, m4, 5, m15
    movntps    [r11+r4+%5], m1

    PALIGNR    m1, %1, 12, m2
    PALIGNR    m2, %1, 14, %1
    PALIGNR    m3, %2, 4, %1
    PALIGNR    m4, %2, 2, %1
    PALIGNR    %3, %2, 6, m2
    FILT_PACK  %3, %4, 6, m15

    PALIGNR    m1, %1, 14, m3
    PALIGNR    m2, %1, 15, m3
    PALIGNR    m4, %2, 1, m3
    PALIGNR    m5, %2, 2, m3
    PALIGNR    m6, %2, 3, m3
    ADD8TO16   m1, m6, m12, m3, m0 ; a
    ADD8TO16   m2, m5, m12, m3, m0 ; b
    ADD8TO16   %2, m4, m12, m3, m0 ; c
    FILT_V2    m1, m2, %2, m6, m5, m4
    FILT_PACK  m1, m6, 5, m15
    FILT_PACK  m1, m2, 5, m15
    pshufb     m1, [hpel_shuf]

;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                   uint8_t *src, int stride, int width, int height )
;-----------------------------------------------------------------------------
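; Unlike the split _v/_c/_h kernels above, this computes all three half-pel
; planes in one pass over each row: DO_FILT_V's 16-bit intermediates stay
; in registers and feed DO_FILT_C and DO_FILT_H directly, saving the loads
; and stores of the separate implementation.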
cglobal hpel_filter_%1, 7,7,16
    mova      m0, [filt_mul51]
    mova      m12, [filt_mul15]
    mova      m14, [filt_mul20]
    DO_FILT_V m8, m7, m13, m12, 0, %1
    DO_FILT_V m6, m5, m11, m12, 16, %1
    paddw     m15, m15 ; pw_32
    DO_FILT_C m9, m8, m7, m6
    DO_FILT_H m10, m13, m11, %1
    ; setup regs for next y

%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3

;-----------------------------------------------------------------------------
; void plane_copy_core( uint8_t *dst, int i_dst,
;                       uint8_t *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
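; (presumably the caller rounds w up to mod16, so a row may be written up
; to 15 bytes past the true width; i_dst>w keeps that spill inside the
; row's padding)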
cglobal plane_copy_core_mmxext, 6,7
; These functions are not general-use; not only do the SSE ones require aligned input,
; they will also fail if given a non-mod16 size or a size less than 64.
; memzero SSE will fail for non-mod128 sizes.
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
    movq   mm0, [r1 + r2 + 0]
    movq   mm1, [r1 + r2 + 8]
    movq   [r0 + r2 + 0], mm0
    movq   [r0 + r2 + 8], mm1
    movq   mm0, [r1 + r2 + 0]
    movq   mm1, [r1 + r2 + 8]
    movq   mm2, [r1 + r2 + 16]
    movq   mm3, [r1 + r2 + 24]
    movq   [r0 + r2 + 0], mm0
    movq   [r0 + r2 + 8], mm1
    movq   [r0 + r2 + 16], mm2
    movq   [r0 + r2 + 24], mm3

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
    movdqa xmm0, [r1 + r2]
    movdqa [r0 + r2], xmm0
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm0, [r1 + r2 + 0]
    movdqa [r0 + r2 + 0], xmm0
    movdqa xmm1, [r1 + r2 + 16]
    movdqa [r0 + r2 + 16], xmm1
    movdqa xmm2, [r1 + r2 + 32]
    movdqa [r0 + r2 + 32], xmm2
    movdqa xmm3, [r1 + r2 + 48]
    movdqa [r0 + r2 + 48], xmm3

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
cglobal memzero_aligned_%1, 2,2
    mova   [r0 + r1 + i], m0

;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
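; Roughly the scalar equivalent (a sketch; r0 points at the previous row of
; sums, r3 at the row being written):
;   for( x = 0; x < stride; x++ )
;       sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x-stride];
; The 4-wide horizontal sums come from mpsadbw, hence the SSE4 requirement.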
cglobal integral_init4h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

cglobal integral_init8h_sse4, 3,4
    movdqa m1, [r1+r2+16]
    paddw  m1, [r0+r2*2+16]
    movdqa [r3+r2*2   ], m0
    movdqa [r3+r2*2+16], m1

%macro INTEGRAL_INIT_8V 1
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
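; Scalar sketch: the rows produced by integral_init8h are cumulative down
; the image, so the 8x8 box sums are just differences of rows 8 apart,
; computed in place (r2 = r0 + 8*stride here, by the look of the loads):
;   for( x = 0; x < stride; x++ )
;       sum8[x] = sum8[x+8*stride] - sum8[x];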
cglobal integral_init8v_%1, 3,3
    mova   m1, [r2+r1+mmsize]
    psubw  m1, [r0+r1+mmsize]
    mova   [r0+r1+mmsize], m1

INTEGRAL_INIT_8V sse2

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
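; Same idea for the 4-wide sums: rows 4 apart give the 4x4 boxes (written
; to sum4), while rows 8 apart, with a second term offset by 4 columns to
; double the width, rewrite sum8 in place; the shufpd in the sse2 version
; appears to supply that 4-column offset. A sketch:
;   sum4[x] = sum8[x+4*stride] - sum8[x];
;   sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];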
cglobal integral_init4v_mmx, 3,5

cglobal integral_init4v_sse2, 3,5
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1

cglobal integral_init4v_ssse3, 3,5

    pavgb   %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
    mova    m3, [r0+%4+mmsize]
    pavgb   m3, [r0+%4+r5+mmsize]
    PALIGNR %1, m3, 1, m6
    PALIGNR m3, m2, 1, m6
    pavgb   m3, [r0+%3+r5+8]
    pavgb   m1, [r0+%3+r5+9]
    pavgb   m0, [r0+%3+r5+1]

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
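; Each of the four output planes is a 2x-downsampled copy of src at one of
; the half-pel offsets (full, h, v, hv). A pixel is formed by two rounds of
; pavgb; e.g. for dst0, roughly (matching the double-pavgb rounding):
;   dst0[x] = (((src[2*x] + src[2*x+stride] + 1) >> 1)
;            + ((src[2*x+1] + src[2*x+1+stride] + 1) >> 1) + 1) >> 1;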
%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
cglobal frame_init_lowres_core_%1, 6,7,%2
    ; src += 2*(height-1)*stride + 2*width
    ; dst += (height-1)*stride + width
    ; gap = stride - width
%define dst_gap [rsp+gprsize]
%define src_gap [rsp]
    ; adjust for the odd end case
    FILT8x4  m0, m1, m2, m3, m4, m5, 0
    FILT8x4  m0, m1, m2, m3, m10, m11, mmsize
    FILT8x4  m2, m3, m0, m1, m4, m5, 0
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endmacro ; FRAME_INIT_LOWRES

%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
FRAME_INIT_LOWRES cache32_mmxext
FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3, 12

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;-----------------------------------------------------------------------------
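; Per element, following the step comments below (the pd_128 bias suggests
; the >>8 is rounded):
;   dst[x] = (int)( (prop[x] + ((intra[x]*invq[x] + 128) >> 8))
;                 * (intra[x] - (inter[x] & 0x3fff)) / (float)intra[x] );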
cglobal mbtree_propagate_cost_sse2, 6,6
    movdqa    xmm4, [pd_128]
    movq      xmm2, [r2+r5] ; intra
    movq      xmm0, [r4+r5] ; invq
    punpcklwd xmm2, xmm5
    punpcklwd xmm0, xmm5
    psrld     xmm0, 8       ; intra*invq>>8
    movq      xmm3, [r3+r5] ; inter
    movq      xmm1, [r1+r5] ; prop
    pand      xmm3, [pw_3fff]
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
    cvtdq2ps  xmm1, xmm2    ; intra
    psubd     xmm2, xmm3    ; intra - inter
    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
    divps     xmm0, xmm1    ; / intra
    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
    movdqa    [r0+r5*2], xmm0