1 ;*****************************************************************************
2 ;* sad16-a.asm: x86 high depth sad functions
3 ;*****************************************************************************
4 ;* Copyright (C) 2010-2015 x264 project
6 ;* Authors: Oskar Arvidsson <oskar@irock.se>
7 ;* Henrik Gramner <henrik@gramner.com>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;* This program is also available under a commercial proprietary license.
24 ;* For more information, contact us at licensing@x264.com.
25 ;*****************************************************************************
28 %include "x86util.asm"
36 ;=============================================================================
38 ;=============================================================================
; SAD_INC_1x16P_MMX: one 16-pixel row of a 16-bit-pixel SAD inner loop (MMX).
; NOTE(review): this excerpt is gapped — loads/subs and %endmacro not visible.
40 %macro SAD_INC_1x16P_MMX 0
; Take absolute values of the four word-difference registers m1..m4;
; the trailing two operands of ABSW2 are scratch registers.
49 ABSW2 m1, m2, m1, m2, m5, m6
50 ABSW2 m3, m4, m3, m4, m7, m5
; SAD_INC_2x8P_MMX: two 8-pixel rows per iteration of the MMX SAD loop.
; NOTE(review): body partially visible in this excerpt.
59 %macro SAD_INC_2x8P_MMX 0
; Absolute-value the word differences; last two ABSW2 operands are scratch.
68 ABSW2 m1, m2, m1, m2, m5, m6
69 ABSW2 m3, m4, m3, m4, m7, m5
; SAD_INC_2x4P_MMX: two 4-pixel rows per iteration (narrow-width SAD, MMX).
; Only needs two difference registers, so m3/m4 double as ABSW2 scratch.
78 %macro SAD_INC_2x4P_MMX 0
83 ABSW2 m1, m2, m1, m2, m3, m4
90 ;-----------------------------------------------------------------------------
91 ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
92 ;-----------------------------------------------------------------------------
; cglobal: 4 args (r0=fenc, r1=fenc stride, r2=ref, r3=ref stride, per the
; prototype above; strides in pixels for 16-bit data — confirm vs. callers).
; GPR count 5-(%2&4/4): drops the 5th (loop-counter) register when the
; height %2 has bit 2 set — presumably those sizes unroll fully; TODO confirm
; against the full macro body, which is not visible in this excerpt.
94 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
127 ;=============================================================================
129 ;=============================================================================
; SAD_INC_2ROW: accumulate two rows of 16-bit-pixel SAD per iteration (XMM).
; %1 is presumably the row width in pixels — TODO confirm, macro head only.
131 %macro SAD_INC_2ROW 1
; Wide path: load 16 reference pixels of the second row (r2 = ref pointer,
; r3 = ref stride; 2* scales the stride to bytes for 16-bit pixels).
135 movu m3, [r2+2*r3+ 0]
136 movu m4, [r2+2*r3+16]
; Subtract the corresponding encoded pixels (r0 = fenc, r1 = fenc stride).
139 psubw m3, [r0+2*r1+ 0]
140 psubw m4, [r0+2*r1+16]
; abs() of all four word-difference registers; trailing operands are scratch.
141 ABSW2 m1, m2, m1, m2, m5, m6
144 ABSW2 m3, m4, m3, m4, m7, m5
; Narrow path (only m1/m2 live), so m3/m4 serve as scratch here.
154 ABSW2 m1, m2, m1, m2, m3, m4
162 ;-----------------------------------------------------------------------------
163 ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
164 ;-----------------------------------------------------------------------------
; SSE2/SSSE3 variant of the MMX SAD above: same 4-arg interface, same
; conditional 5th GPR; additionally declares 8*(%1/mmsize) XMM registers,
; i.e. 8 xmm regs only when the width needs the two-vector (wide) path.
166 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
; Re-instantiate the SAD templates for aligned-input variants under the
; sse2 and ssse3 ISA name suffixes. NOTE(review): the SAD %1x%2 invocation
; lists that these INIT_XMM lines govern are not visible in this excerpt.
189 INIT_XMM sse2, aligned
200 INIT_XMM ssse3, aligned
209 ;=============================================================================
211 ;=============================================================================
; SAD_X3 helpers: SAD of one fenc block against three reference candidates.
; SAD_X3_INC_P: advance all pointers by 4 rows (fenc uses the fixed
; FENC_STRIDE; the three ref-pointer increments are not visible here).
213 %macro SAD_X3_INC_P 0
214 add r0, 4*FENC_STRIDE
; SAD_X3_ONE_START: first iteration — initialize the three accumulators
; (no paddw into prior sums). Body only partially visible in this excerpt.
220 %macro SAD_X3_ONE_START 0
228 ABSW2 m0, m1, m0, m1, m4, m5
240 ABSW2 m3, m4, m3, m4, m7, m6
; 16x16 blocks with 8-byte (MMX) vectors need special handling — presumably
; to avoid word-accumulator overflow (16*16*1023 > 65535); TODO confirm.
248 %if mmsize == 8 && %1*%2 == 256
; SAD_X4 helpers: same as SAD_X3 but with four reference candidates.
270 %macro SAD_X4_INC_P 0
271 add r0, 4*FENC_STRIDE
; First-iteration accumulator setup for the 4-candidate case.
278 %macro SAD_X4_ONE_START 0
288 ABSW2 m0, m1, m0, m1, m5, m6
289 ABSW2 m2, m3, m2, m3, m4, m7
; x86-64 path: with 16 mm registers, m9/m10 are free scratch.
303 ABSW2 m5, m6, m5, m6, m9, m10
304 ABSW2 m7, m8, m7, m8, m9, m10
; 8-register (x86-32) non-ssse3 fallback: fewer scratch regs available.
324 %else ; num_mmregs == 8 && !ssse3
335 ABSW2 m5, m6, m5, m6, m7, m4
; Same word-accumulator overflow guard as in the X3 case — TODO confirm.
342 %if mmsize == 8 && %1*%2 == 256
; Per-column unrolled body: first row at offset x*mmsize, second row one
; fenc/ref stride down (2*%2 = ref stride in bytes here, presumably).
364 SAD_X%1_ONE x*mmsize, x*mmsize
365 SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
; pixel_vsad: vertical SAD — sum of |row[i] - row[i+1]| over a block;
; r0 = pixel base, r1 = stride (in pixels; 2* scales to bytes), r2 likely
; the row count — TODO confirm, the loop control is not in this excerpt.
371 cglobal pixel_vsad, 3,3,8
375 mova m3, [r0+2*r1+16]
379 ABSW2 m0, m1, m0, m1, m4, m5
387 mova m7, [r0+2*r1+16]
; 10-bit-depth path: word sums provably fit (62 diffs * 511 max = 31682).
407 HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
; 10+-bit path needs the unsigned horizontal add (62 * 1023 = 63426 > 2^15).
409 HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
; Second ISA variant of pixel_vsad (no xmm-count declared — presumably the
; MMX/scalar build); the %if separating the two variants is not visible.
422 cglobal pixel_vsad, 3,3
452 ;-----------------------------------------------------------------------------
453 ; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
454 ; uint16_t *pix2, intptr_t i_stride, int scores[3] )
455 ;-----------------------------------------------------------------------------
; %1 = number of candidates (3 or 4), %2x%3 = block size. 6 args per the
; prototype above; 7 GPRs; XMM_REGS presumably %define'd by ISA elsewhere.
457 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
; STRIDE aliases the register holding i_stride (token-pasted reg number).
459 %xdefine STRIDE r %+ regnum
; Process the block two rows at a time; second operand pairs are the
; per-row byte offsets into fenc and the reference pointers.
462 SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
463 SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
466 SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
543 ;-----------------------------------------------------------------------------
544 ; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
545 ;-----------------------------------------------------------------------------
; Costs the three 4x4 intra modes (V, H, DC) against fenc in one pass.
; r0 = fenc, r1 = fdec (reconstructed neighbors), r2 = res[3] output.
547 %macro INTRA_SAD_X3_4x4 0
548 cglobal intra_sad_x3_4x4, 3,3,7
; Load the 4 above-neighbor pixels (row -1 of fdec) duplicated into both
; qwords (SSE3 path); the non-SSE3 path below loads them once with movq.
550 movddup m0, [r1-1*FDEC_STRIDEB]
552 movq m0, [r1-1*FDEC_STRIDEB]
; Gather the 4x4 fenc block as two xmm regs: rows 0/1 in m1, rows 2/3 in m2.
555 movq m1, [r0+0*FENC_STRIDEB]
556 movq m2, [r0+2*FENC_STRIDEB]
; Horizontal-sum the top neighbors for DC: swap word pairs, add, swap again.
557 pshuflw m6, m0, q1032
559 pshuflw m5, m6, q2301
561 punpcklqdq m6, m6 ; A+B+C+D 8 times
562 movhps m1, [r0+1*FENC_STRIDEB]
563 movhps m2, [r0+3*FENC_STRIDEB]
566 ABSW2 m3, m0, m3, m0, m4, m5
; Left neighbors for H prediction: pixel at column -1 of each fdec row,
; then broadcast each row's pixel across 4 words via the shuffles below.
568 movd m3, [r1+0*FDEC_STRIDEB-4]
569 movd m4, [r1+2*FDEC_STRIDEB-4]
570 movhps m3, [r1+1*FDEC_STRIDEB-8]
571 movhps m4, [r1+3*FDEC_STRIDEB-8]
572 pshufhw m3, m3, q3333
573 pshufhw m4, m4, q3333
574 pshuflw m3, m3, q1111 ; FF FF EE EE
575 pshuflw m4, m4, q1111 ; HH HH GG GG
586 ABSW2 m5, m6, m5, m6, m3, m4
587 ABSW2 m1, m2, m1, m2, m3, m4
; Store the three mode costs in H.264 mode order: V=0, H=1, DC=2.
601 movd [r2], m0 ; V prediction cost
602 movd [r2+4], m1 ; H prediction cost
603 movd [r2+8], m5 ; DC prediction cost
615 ;-----------------------------------------------------------------------------
616 ; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
617 ;-----------------------------------------------------------------------------
; INTRA_SAD_HVDC_ITER: cost one fenc row (%1 = row index 0..7) against the
; H, V and DC predictions, accumulating into m1 (V), m2 (H), m3 (DC).
; ACCUM emits mova on the first row (%1==0) and paddw afterwards.
; %2 is the shuffle immediate that broadcasts this row's left-edge pixel.
628 %macro INTRA_SAD_HVDC_ITER 2
629 mova m4, [r0+(%1-4)*FENC_STRIDEB]
632 ACCUM paddw, 1, 4, %1
; Reload the fenc row (m4 was consumed by the V-cost computation above).
633 mova m4, [r0+(%1-4)*FENC_STRIDEB]
636 ACCUM paddw, 2, 4, %1
638 psubw m5, [r0+(%1-4)*FENC_STRIDEB]
640 ACCUM paddw, 3, 5, %1
; SSE2/SSSE3 8x8 intra-mode costing. r0=fenc, r1=edge array, r2=res[3].
643 %macro INTRA_SAD_X3_8x8 0
644 cglobal intra_sad_x3_8x8, 3,3,8
; Bias r0 by +4 rows so ITER's (%1-4) offsets stay within disp8 range —
; presumably; confirm against the full source.
645 add r0, 4*FENC_STRIDEB
; edge[7..14] = left neighbors, edge[16..23] = top row (V prediction).
646 movu m0, [r1+7*SIZEOF_PIXEL]
647 mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
; Rows 0-3: left pixels come from the high half of the edge load (q3333
; down to q0000 picks one word per row).
655 INTRA_SAD_HVDC_ITER 0, q3333
656 INTRA_SAD_HVDC_ITER 1, q2222
657 INTRA_SAD_HVDC_ITER 2, q1111
658 INTRA_SAD_HVDC_ITER 3, q0000
; Reload the low 4 left-edge pixels for rows 4-7.
659 movq m7, [r1+7*SIZEOF_PIXEL]
661 INTRA_SAD_HVDC_ITER 4, q3333
662 INTRA_SAD_HVDC_ITER 5, q2222
663 INTRA_SAD_HVDC_ITER 6, q1111
664 INTRA_SAD_HVDC_ITER 7, q0000
; Horizontal reduction of the three word accumulators into one dword trio.
666 phaddw m2, m3 ; 2 2 2 2 3 3 3 3
668 paddw m1, m3 ; 1 1 1 1 _ _ _ _
669 phaddw m2, m1 ; 2 2 3 3 1 1 _ _
670 pmaddwd m2, [pw_1] ; 2 3 1 _
; AVX2 variant of the 8x8 H/V/DC iteration: processes two fenc rows per
; call (%1 and %1+4) by packing them into one ymm register.
; Accumulators: m1 = V cost, m2 = H cost, m3 = DC cost (via ACCUM).
688 %macro INTRA_SAD_HVDC_ITER_YMM 2
689 mova xm4, [r0+(%1-4)*FENC_STRIDEB]
690 vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
694 ACCUM paddw, 2, 5, %1 ; H
699 ACCUM paddw, 1, 5, %1 ; V
700 ACCUM paddw, 3, 4, %1 ; DC
; AVX2 intra_sad_x3_8x8: r0=fenc, r1=edge[36], r2=res[3].
704 cglobal intra_sad_x3_8x8, 3,3,8
; Bias r0 so the (%1-4)/%1 row offsets in ITER_YMM stay small.
705 add r0, 4*FENC_STRIDEB
706 movu xm0, [r1+7*SIZEOF_PIXEL]
; Duplicate the top row into both ymm lanes for the 2-rows-at-once V cost.
707 vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
; Pre-bias for DC rounding: pw_1 dot itself sums to +8 after the reduction.
710 paddw xm0, [pw_1] ; equal to +8 after HADDW
; Only 4 iterations: each handles rows %1 and %1+4.
715 INTRA_SAD_HVDC_ITER_YMM 0, q3333
716 INTRA_SAD_HVDC_ITER_YMM 1, q2222
717 INTRA_SAD_HVDC_ITER_YMM 2, q1111
718 INTRA_SAD_HVDC_ITER_YMM 3, q0000
; Cross-lane reduction of V/H/DC word sums down to three dwords.
719 phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
720 punpckhqdq m2, m3, m3
721 paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
722 phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
723 vextracti128 xm2, m1, 1
724 paddw xm1, xm2 ; 1 1 2 2 3 3 _ _
725 pmaddwd xm1, [pw_1] ; 1 2 3 _