1 ;*****************************************************************************
2 ;* pixel.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Alex Izvorski <aizvorksi@gmail.com>
9 ;* Fiona Glaser <fiona@x264.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;*****************************************************************************
27 %include "x86util.asm"
31 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
32 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
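; Arithmetic check for the two rounded constants above (K1=.01, K2=.03, L=255
; from the usual SSIM definition):
;   .01*.01*255*255*64    = 416.16    -> 416
;   .03*.03*255*255*64*63 = 235962.72 -> 235963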
33 mask_ff: times 16 db 0xff
35 mask_ac4: dw 0,-1,-1,-1, 0,-1,-1,-1
36 mask_ac8: dw 0,-1,-1,-1,-1,-1,-1,-1
40 %macro HADDD 2 ; sum junk
54 pmaddwd %1, [pw_1 GLOBAL]
67 ;=============================================================================
69 ;=============================================================================
145 pinsrd m1, [r0+%1], 1
146 pinsrd m2, [r2+%2], 1
147 pinsrd m3, [r0+%3], 1
148 pinsrd m4, [r2+%4], 1
170 ;-----------------------------------------------------------------------------
171 ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
172 ;-----------------------------------------------------------------------------
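; Rough C reference for these ssd kernels (an illustrative sketch, not the
; exact C code in pixel.c; block size follows the %1x%2 macro parameters,
; 16x16 shown):
;
; static int pixel_ssd_16x16( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int ssd = 0;
;     for( int y = 0; y < 16; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < 16; x++ )
;         {
;             int d = pix1[x] - pix2[x];  /* per-pixel difference   */
;             ssd += d*d;                 /* accumulate its square  */
;         }
;     return ssd;
; }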
174 cglobal x264_pixel_ssd_%1x%2_%3, 4,4
181 SSD_FULL 0, 0, mmsize, mmsize, i, 0
182 SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
184 SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
186 SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
210 cglobal x264_pixel_ssd_4x8_sse4, 4,4
211 SSD_QUARTER 0, 0, r1, r3, 0, 1
212 SSD_QUARTER 0, 0, r1, r3, 1, 0
217 cglobal x264_pixel_ssd_4x4_sse4, 4,4
218 SSD_QUARTER 0, 0, r1, r3, 0, 0
224 ;=============================================================================
226 ;=============================================================================
230 pxor m6, m6 ; sum squared
246 movd [r2], m5 ; return sum
251 sub eax, r1d ; sqr - (sum * sum >> shift)
289 ;-----------------------------------------------------------------------------
290 ; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
291 ;-----------------------------------------------------------------------------
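; Rough C sketch of the semantics (assumed from the "sqr - (sum * sum >> shift)"
; comment in var_end above: the third argument receives the pixel sum, the
; return value is the block variance scaled by the pixel count):
;
; static int pixel_var_8x8( uint8_t *pix, int stride, int *sum_out )
; {
;     uint32_t sum = 0, sqr = 0;
;     for( int y = 0; y < 8; y++, pix += stride )
;         for( int x = 0; x < 8; x++ )
;         {
;             sum += pix[x];
;             sqr += pix[x]*pix[x];
;         }
;     *sum_out = sum;
;     return sqr - (sum*sum >> 6);  /* >>6 == / 64 pixels; 16x16 uses >>8 */
; }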
293 cglobal x264_pixel_var_16x16_mmxext, 2,3
298 cglobal x264_pixel_var_8x8_mmxext, 2,3
304 cglobal x264_pixel_var_16x16_sse2, 2,3
309 cglobal x264_pixel_var_8x8_sse2, 2,3
331 ;=============================================================================
333 ;=============================================================================
335 ; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
336 ; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
337 ; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
338 ; whereas phaddw-based transform doesn't care what order the coefs end up in.
347 %macro HADAMARD4_ROW_PHADD 5
355 %macro HADAMARD4_1D 4
356 SUMSUB_BADC %1, %2, %3, %4
357 SUMSUB_BADC %1, %3, %2, %4
360 %macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
362 HADAMARD4_1D m4, m5, m6, m7
363 TRANSPOSE4x4W 4, 5, 6, 7, %%n
364 HADAMARD4_1D m4, m5, m6, m7
365 ABS2 m4, m5, m3, m %+ %%n
366 ABS2 m6, m7, m3, m %+ %%n
373 ; in: r4=3*stride1, r5=3*stride2
374 ; in: %2 = horizontal offset
375 ; in: %3 = whether we need to increment pix1 and pix2
378 %macro SATD_4x4_MMX 3
379 LOAD_DIFF m4, m3, none, [r0+%2], [r2+%2]
380 LOAD_DIFF m5, m3, none, [r0+r1+%2], [r2+r3+%2]
381 LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
382 LOAD_DIFF m7, m3, none, [r0+r4+%2], [r2+r5+%2]
390 %macro SATD_8x4_SSE2 1
391 HADAMARD4_1D m0, m1, m2, m3
392 %ifidn %1, ssse3_phadd
393 HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
395 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
396 HADAMARD4_1D m0, m1, m2, m3
398 ABS4 m0, m1, m2, m3, m4, m5
405 %macro SATD_START_MMX 0
406 lea r4, [3*r1] ; 3*stride1
407 lea r5, [3*r3] ; 3*stride2
410 %macro SATD_END_MMX 0
411 pshufw m1, m0, 01001110b
413 pshufw m1, m0, 10110001b
420 ; FIXME avoid the spilling of regs to hold 3*stride.
421 ; for small blocks on x86_32, modify pixel pointer instead.
423 ;-----------------------------------------------------------------------------
424 ; int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int )
425 ;-----------------------------------------------------------------------------
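; Rough C reference of what the satd kernels compute (an illustrative sketch
; assuming <stdlib.h>'s abs(); the wxh variants tile this 4x4 primitive, and
; the final halving matches x264's SATD scaling):
;
; static int satd_4x4( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int d[4][4], tmp[4][4], satd = 0;
;     for( int i = 0; i < 4; i++ )
;         for( int j = 0; j < 4; j++ )
;             d[i][j] = pix1[i*stride1+j] - pix2[i*stride2+j];
;     for( int i = 0; i < 4; i++ )  /* horizontal 4-point Hadamard */
;     {
;         int s01 = d[i][0]+d[i][1], d01 = d[i][0]-d[i][1];
;         int s23 = d[i][2]+d[i][3], d23 = d[i][2]-d[i][3];
;         tmp[i][0] = s01+s23; tmp[i][2] = s01-s23;
;         tmp[i][1] = d01+d23; tmp[i][3] = d01-d23;
;     }
;     for( int j = 0; j < 4; j++ )  /* vertical pass + sum of absolute coefs */
;     {
;         int s01 = tmp[0][j]+tmp[1][j], d01 = tmp[0][j]-tmp[1][j];
;         int s23 = tmp[2][j]+tmp[3][j], d23 = tmp[2][j]-tmp[3][j];
;         satd += abs(s01+s23) + abs(s01-s23) + abs(d01+d23) + abs(d01-d23);
;     }
;     return satd >> 1;
; }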
427 cglobal x264_pixel_satd_16x4_internal_mmxext
428 SATD_4x4_MMX m2, 0, 0
429 SATD_4x4_MMX m1, 4, 0
431 SATD_4x4_MMX m2, 8, 0
433 SATD_4x4_MMX m1, 12, 0
438 cglobal x264_pixel_satd_8x8_internal_mmxext
439 SATD_4x4_MMX m2, 0, 0
440 SATD_4x4_MMX m1, 4, 1
443 x264_pixel_satd_8x4_internal_mmxext:
444 SATD_4x4_MMX m2, 0, 0
445 SATD_4x4_MMX m1, 4, 0
450 cglobal x264_pixel_satd_16x16_mmxext, 4,6
454 call x264_pixel_satd_16x4_internal_mmxext
458 call x264_pixel_satd_16x4_internal_mmxext
463 cglobal x264_pixel_satd_16x8_mmxext, 4,6
466 call x264_pixel_satd_16x4_internal_mmxext
469 call x264_pixel_satd_16x4_internal_mmxext
472 cglobal x264_pixel_satd_8x16_mmxext, 4,6
475 call x264_pixel_satd_8x8_internal_mmxext
478 call x264_pixel_satd_8x8_internal_mmxext
481 cglobal x264_pixel_satd_8x8_mmxext, 4,6
484 call x264_pixel_satd_8x8_internal_mmxext
487 cglobal x264_pixel_satd_8x4_mmxext, 4,6
490 call x264_pixel_satd_8x4_internal_mmxext
493 cglobal x264_pixel_satd_4x8_mmxext, 4,6
495 SATD_4x4_MMX m0, 0, 1
496 SATD_4x4_MMX m1, 0, 0
502 cglobal x264_pixel_satd_4x4_%1, 4,6
504 SATD_4x4_MMX m0, 0, 0
510 %macro SATD_START_SSE2 0
516 %macro SATD_END_SSE2 0
523 %macro BACKUP_POINTERS 0
530 %macro RESTORE_AND_INC_POINTERS 0
542 ;-----------------------------------------------------------------------------
543 ; int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int )
544 ;-----------------------------------------------------------------------------
547 cglobal x264_pixel_satd_8x8_internal_%1
548 LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
552 x264_pixel_satd_8x4_internal_%1:
553 LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
554 x264_pixel_satd_4x8_internal_%1:
555 SAVE_MM_PERMUTATION satd_4x8_internal
559 cglobal x264_pixel_satd_16x16_%1, 4,6
562 call x264_pixel_satd_8x8_internal_%1
565 call x264_pixel_satd_8x8_internal_%1
566 RESTORE_AND_INC_POINTERS
567 call x264_pixel_satd_8x8_internal_%1
570 call x264_pixel_satd_8x8_internal_%1
573 cglobal x264_pixel_satd_16x8_%1, 4,6
576 call x264_pixel_satd_8x8_internal_%1
577 RESTORE_AND_INC_POINTERS
578 call x264_pixel_satd_8x8_internal_%1
581 cglobal x264_pixel_satd_8x16_%1, 4,6
583 call x264_pixel_satd_8x8_internal_%1
586 call x264_pixel_satd_8x8_internal_%1
589 cglobal x264_pixel_satd_8x8_%1, 4,6
591 call x264_pixel_satd_8x8_internal_%1
594 cglobal x264_pixel_satd_8x4_%1, 4,6
596 call x264_pixel_satd_8x4_internal_%1
599 cglobal x264_pixel_satd_4x8_%1, 4,6
601 LOAD_MM_PERMUTATION satd_4x8_internal
604 LOAD_DIFF m0, m7, m6, [r0], [r2]
605 LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
606 LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
607 LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
610 LOAD_DIFF m4, m7, m6, [r0], [r2]
611 LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
614 LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
615 LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
619 call x264_pixel_satd_4x8_internal_%1
623 ;-----------------------------------------------------------------------------
624 ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
625 ;-----------------------------------------------------------------------------
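; Rough C sketch of the sa8d semantics (an 8x8 analogue of satd: 2-D 8-point
; Hadamard of the difference block, sum of absolute coefficients, scaled down
; by 2 bits; hadamard8() below is a plain in-place fast Walsh-Hadamard
; transform, and the coefficient ordering is irrelevant to the sum):
;
; static void hadamard8( int v[8] )
; {
;     for( int len = 1; len < 8; len <<= 1 )
;         for( int i = 0; i < 8; i += 2*len )
;             for( int j = i; j < i+len; j++ )
;             {
;                 int a = v[j], b = v[j+len];
;                 v[j] = a+b; v[j+len] = a-b;
;             }
; }
;
; static int sa8d_8x8( uint8_t *pix1, int stride1, uint8_t *pix2, int stride2 )
; {
;     int d[8][8], col[8], sum = 0;
;     for( int i = 0; i < 8; i++ )
;         for( int j = 0; j < 8; j++ )
;             d[i][j] = pix1[i*stride1+j] - pix2[i*stride2+j];
;     for( int i = 0; i < 8; i++ )
;         hadamard8( d[i] );                           /* transform rows    */
;     for( int j = 0; j < 8; j++ )
;     {
;         for( int i = 0; i < 8; i++ ) col[i] = d[i][j];
;         hadamard8( col );                            /* transform columns */
;         for( int i = 0; i < 8; i++ ) sum += abs( col[i] );
;     }
;     return (sum+2) >> 2;
; }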
626 cglobal x264_pixel_sa8d_8x8_internal_%1
629 LOAD_DIFF_8x4P m0, m1, m2, m3, m8, m9, r0, r2
630 LOAD_DIFF_8x4P m4, m5, m6, m7, m8, m9, r10, r11
632 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
633 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
634 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
636 ABS4 m0, m1, m2, m3, m8, m9
637 ABS4 m4, m5, m6, m7, m8, m9
647 cglobal x264_pixel_sa8d_8x8_%1, 4,6
650 call x264_pixel_sa8d_8x8_internal_%1
657 cglobal x264_pixel_sa8d_16x16_%1, 4,6
660 call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
664 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8]
668 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride+8]
672 call x264_pixel_sa8d_8x8_internal_%1 ; pix[8*stride]
681 cglobal x264_pixel_sa8d_8x8_internal_%1
682 LOAD_DIFF_8x4P m0, m1, m2, m3, m6, m7
686 LOAD_DIFF_8x4P m4, m5, m6, m7, m2, m2
689 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
690 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [esp+4], [esp+20]
691 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
714 %endmacro ; SATDS_SSE2
716 %macro SA8D_16x16_32 1
718 cglobal x264_pixel_sa8d_8x8_%1, 4,7
724 call x264_pixel_sa8d_8x8_internal_%1
732 cglobal x264_pixel_sa8d_16x16_%1, 4,7
738 call x264_pixel_sa8d_8x8_internal_%1
742 call x264_pixel_sa8d_8x8_internal_%1
749 call x264_pixel_sa8d_8x8_internal_%1
755 mova [esp+48-mmsize], m0
756 call x264_pixel_sa8d_8x8_internal_%1
757 paddusw m0, [esp+48-mmsize]
779 %endif ; !ARCH_X86_64
780 %endmacro ; SA8D_16x16_32
784 ;=============================================================================
786 ;=============================================================================
788 %macro INTRA_SA8D_SSE2 1
791 ;-----------------------------------------------------------------------------
792 ; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
793 ;-----------------------------------------------------------------------------
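; Sketch of the trick shared by the intra x3 cost functions in this file:
; the V/H/DC predictions are flat along rows or columns, so their 2-D Hadamard
; transforms are nonzero only along one row, one column, or the DC position.
; For example, for an 8x8 prediction P that is constant down each column,
;     H8 * P * H8 = [ 8*hadamard1d(top_edge) ; 0 ; ... ; 0 ]
; so the cost against all three predictions can be derived from a single 2-D
; transform of fenc plus 1-D transforms of the left/top edges (passed in via
; edges[2][8] here), instead of three full SA8D computations. Scale factors
; are glossed over in this sketch.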
794 cglobal x264_intra_sa8d_x3_8x8_core_%1
797 movq m0, [r0+0*FENC_STRIDE]
798 movq m1, [r0+1*FENC_STRIDE]
799 movq m2, [r0+2*FENC_STRIDE]
800 movq m3, [r0+3*FENC_STRIDE]
801 movq m4, [r0+4*FENC_STRIDE]
802 movq m5, [r0+5*FENC_STRIDE]
803 movq m6, [r0+6*FENC_STRIDE]
804 movq m7, [r0+7*FENC_STRIDE]
813 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
814 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
815 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
818 movzx edi, word [r1+0]
829 ABS4 m8, m9, m10, m11, m12, m13
840 ABS2 m10, m11, m13, m14
847 movdqa m14, m15 ; 7x8 sum
849 movdqa m8, [r1+0] ; left edge
855 ABS1 m9, m11 ; 1x8 sum
864 punpcklqdq m0, m4 ; transpose
865 movdqa m1, [r1+16] ; top edge
868 psrldq m2, 2 ; 8x7 sum
869 psubw m0, m1 ; 8x1 sum
874 movdqa m7, [pw_1 GLOBAL]
890 movq [r2], m3 ; i8x8_v, i8x8_h
892 movd [r2+8], m3 ; i8x8_dc
895 %endmacro ; INTRA_SA8D_SSE2
898 ; out: m0..m3 = hadamard coefs
903 movd m0, [r0+0*FENC_STRIDE]
904 movd m1, [r0+1*FENC_STRIDE]
905 movd m2, [r0+2*FENC_STRIDE]
906 movd m3, [r0+3*FENC_STRIDE]
911 HADAMARD4_1D m0, m1, m2, m3
912 TRANSPOSE4x4W 0, 1, 2, 3, 4
913 HADAMARD4_1D m0, m1, m2, m3
914 SAVE_MM_PERMUTATION load_hadamard
917 %macro SCALAR_SUMSUB 4
926 %macro SCALAR_HADAMARD_LEFT 5 ; y, 4x tmp
928 shl %1d, 5 ; log2(FDEC_STRIDE)
930 movzx %2d, byte [r1+%1-1+0*FDEC_STRIDE]
931 movzx %3d, byte [r1+%1-1+1*FDEC_STRIDE]
932 movzx %4d, byte [r1+%1-1+2*FDEC_STRIDE]
933 movzx %5d, byte [r1+%1-1+3*FDEC_STRIDE]
937 SCALAR_SUMSUB %2d, %3d, %4d, %5d
938 SCALAR_SUMSUB %2d, %4d, %3d, %5d
939 mov [left_1d+2*%1+0], %2w
940 mov [left_1d+2*%1+2], %3w
941 mov [left_1d+2*%1+4], %4w
942 mov [left_1d+2*%1+6], %5w
945 %macro SCALAR_HADAMARD_TOP 5 ; x, 4x tmp
946 movzx %2d, byte [r1+%1-FDEC_STRIDE+0]
947 movzx %3d, byte [r1+%1-FDEC_STRIDE+1]
948 movzx %4d, byte [r1+%1-FDEC_STRIDE+2]
949 movzx %5d, byte [r1+%1-FDEC_STRIDE+3]
950 SCALAR_SUMSUB %2d, %3d, %4d, %5d
951 SCALAR_SUMSUB %2d, %4d, %3d, %5d
952 mov [top_1d+2*%1+0], %2w
953 mov [top_1d+2*%1+2], %3w
954 mov [top_1d+2*%1+4], %4w
955 mov [top_1d+2*%1+6], %5w
958 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
960 pshufw %4, %1, 01001110b
961 pshufw %5, %2, 01001110b
962 pshufw %6, %3, 01001110b
969 pshufw %4, %1, 01001110b
970 pshufw %5, %2, 01001110b
971 pshufw %6, %3, 01001110b
979 mov qword [sums+0], 0
980 mov qword [sums+8], 0
981 mov qword [sums+16], 0
1010 ; in: m0..m3 (4x4), m7 (3x4)
1011 ; out: m0 v, m4 h, m5 dc
1013 %macro SUM4x3 3 ; dc, left, top
1021 punpckldq m0, m2 ; transpose
1025 ABS2 m4, m5, m2, m3 ; 1x4 sum
1026 ABS1 m0, m1 ; 4x1 sum
1029 %macro INTRA_SATDS_MMX 1
1031 ;-----------------------------------------------------------------------------
1032 ; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
1033 ;-----------------------------------------------------------------------------
1034 cglobal x264_intra_satd_x3_4x4_%1, 2,6
1036 ; stack is 16-byte aligned because the ABI says so
1037 %define top_1d rsp-8 ; size 8
1038 %define left_1d rsp-16 ; size 8
1042 ; stack is 16-byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
1044 %define top_1d esp+8
1051 SCALAR_HADAMARD_LEFT 0, r0, r3, r4, r5
1053 SCALAR_HADAMARD_TOP 0, r0, r3, r4, r5
1054 lea t0d, [t0d + r0d + 4]
1059 SUM4x3 t0d, [left_1d], [top_1d]
1063 psrlq m1, 16 ; 4x3 sum
1066 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
1070 movd [r2+0], m0 ; i4x4_v satd
1071 movd [r2+4], m4 ; i4x4_h satd
1072 movd [r2+8], m5 ; i4x4_dc satd
1092 ;-----------------------------------------------------------------------------
1093 ; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
1094 ;-----------------------------------------------------------------------------
1095 cglobal x264_intra_satd_x3_16x16_%1, 0,7
1097 %assign stack_pad 88
1099 %assign stack_pad 88 + ((stack_offset+88+4)&15)
1101 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
1103 %define sums rsp+64 ; size 24
1104 %define top_1d rsp+32 ; size 32
1105 %define left_1d rsp ; size 32
1113 SCALAR_HADAMARD_LEFT t0, r3, r4, r5, r6
1115 SCALAR_HADAMARD_TOP t0, r3, r4, r5, r6
1132 SUM4x3 t2d, [left_1d+8*r3], [top_1d+8*r4]
1135 paddw m0, [sums+0] ; i16x16_v satd
1136 paddw m4, [sums+8] ; i16x16_h satd
1137 paddw m5, [sums+16] ; i16x16_dc satd
1146 add r0, 4*FENC_STRIDE-16
1157 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
1163 movd [r2+8], m2 ; i16x16_dc satd
1164 movd [r2+4], m1 ; i16x16_h satd
1165 movd [r2+0], m0 ; i16x16_v satd
1169 ;-----------------------------------------------------------------------------
1170 ; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
1171 ;-----------------------------------------------------------------------------
1172 cglobal x264_intra_satd_x3_8x8c_%1, 0,6
1173 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
1175 %define sums rsp+48 ; size 24
1176 %define dc_1d rsp+32 ; size 16
1177 %define top_1d rsp+16 ; size 16
1178 %define left_1d rsp ; size 16
1185 SCALAR_HADAMARD_LEFT t0, t2, r3, r4, r5
1186 SCALAR_HADAMARD_TOP t0, t2, r3, r4, r5
1191 movzx t2d, word [left_1d+0]
1192 movzx r3d, word [top_1d+0]
1193 movzx r4d, word [left_1d+8]
1194 movzx r5d, word [top_1d+8]
1205 mov [dc_1d+ 0], t2d ; tl
1206 mov [dc_1d+ 4], r5d ; tr
1207 mov [dc_1d+ 8], r4d ; bl
1208 mov [dc_1d+12], r3d ; br
1221 SUM4x3 [r5+4*r4], [left_1d+8*r3], [top_1d+8*r4]
1224 paddw m0, [sums+16] ; i4x4_v satd
1225 paddw m4, [sums+8] ; i4x4_h satd
1226 paddw m5, [sums+0] ; i4x4_dc satd
1235 add r0, 4*FENC_STRIDE-8
1248 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
1250 movd [r2+0], m0 ; i8x8c_dc satd
1251 movd [r2+4], m1 ; i8x8c_h satd
1252 movd [r2+8], m2 ; i8x8c_v satd
1255 %endmacro ; INTRA_SATDS_MMX
1258 %macro ABS_MOV_SSSE3 2
1262 %macro ABS_MOV_MMX 2
1268 %define ABS_MOV ABS_MOV_MMX
1270 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
1271 ; out: [tmp]=hadamard4, m0=satd
1272 cglobal x264_hadamard_ac_4x4_mmxext
1281 HADAMARD4_1D m0, m1, m2, m3
1282 TRANSPOSE4x4W 0, 1, 2, 3, 4
1283 HADAMARD4_1D m0, m1, m2, m3
1296 SAVE_MM_PERMUTATION x264_hadamard_ac_4x4_mmxext
1299 cglobal x264_hadamard_ac_2x2_mmxext
1304 HADAMARD4_1D m0, m1, m2, m3
1307 SAVE_MM_PERMUTATION x264_hadamard_ac_2x2_mmxext
1310 cglobal x264_hadamard_ac_8x8_mmxext
1311 mova m6, [mask_ac4 GLOBAL]
1313 call x264_hadamard_ac_4x4_mmxext
1317 call x264_hadamard_ac_4x4_mmxext
1321 call x264_hadamard_ac_4x4_mmxext
1325 call x264_hadamard_ac_4x4_mmxext
1328 mova [rsp+gprsize+8], m5 ; save satd
1329 call x264_hadamard_ac_2x2_mmxext
1336 call x264_hadamard_ac_2x2_mmxext
1343 call x264_hadamard_ac_2x2_mmxext
1350 mova [rsp+gprsize], m6 ; save sa8d
1352 SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_mmxext
1355 %macro HADAMARD_AC_WXH_MMX 2
1356 cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
1357 %assign pad 16-gprsize-(stack_offset&15)
1362 call x264_hadamard_ac_8x8_mmxext
1367 call x264_hadamard_ac_8x8_mmxext
1372 lea r0, [r0+ysub*4+8]
1374 call x264_hadamard_ac_8x8_mmxext
1378 call x264_hadamard_ac_8x8_mmxext
1383 paddusw m0, [rsp+0x10]
1384 paddusw m1, [rsp+0x18]
1388 paddusw m1, [rsp+0x28]
1389 paddusw m2, [rsp+0x30]
1391 paddusw m1, [rsp+0x38]
1393 pand m3, [pw_1 GLOBAL]
1410 add rsp, 128+%1*%2/4+pad
1412 %endmacro ; HADAMARD_AC_WXH_MMX
1414 HADAMARD_AC_WXH_MMX 16, 16
1415 HADAMARD_AC_WXH_MMX 8, 16
1416 HADAMARD_AC_WXH_MMX 16, 8
1417 HADAMARD_AC_WXH_MMX 8, 8
1419 %macro HADAMARD_AC_SSE2 1
1421 ; in: r0=pix, r1=stride, r2=stride*3
1422 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
1423 cglobal x264_hadamard_ac_8x8_%1
1429 %define spill0 [rsp+gprsize]
1430 %define spill1 [rsp+gprsize+16]
1431 %define spill2 [rsp+gprsize+32]
1443 HADAMARD4_1D m0, m1, m2, m3
1454 HADAMARD4_1D m4, m5, m6, m7
1457 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
1459 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,spill0,spill1
1461 HADAMARD4_1D m0, m1, m2, m3
1462 HADAMARD4_1D m4, m5, m6, m7
1471 pand m1, [mask_ac4 GLOBAL]
1487 mova [rsp+gprsize+32], m1 ; save satd
1494 SBUTTERFLY qdq, 0, 4, 7
1495 SBUTTERFLY qdq, 1, 5, 7
1496 SBUTTERFLY qdq, 2, 6, 7
1497 SUMSUB_BADC m0, m4, m1, m5
1501 pand m0, [mask_ac8 GLOBAL]
1508 SBUTTERFLY qdq, 3, 7, 4
1518 mova [rsp+gprsize+16], m0 ; save sa8d
1519 SAVE_MM_PERMUTATION x264_hadamard_ac_8x8_%1
1522 HADAMARD_AC_WXH_SSE2 16, 16, %1
1523 HADAMARD_AC_WXH_SSE2 8, 16, %1
1524 HADAMARD_AC_WXH_SSE2 16, 8, %1
1525 HADAMARD_AC_WXH_SSE2 8, 8, %1
1526 %endmacro ; HADAMARD_AC_SSE2
1528 ; struct { int satd, sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
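; Rough semantics of the hadamard_ac kernels (a sketch inferred from the
; mask_ac4/mask_ac8 constants and the "save satd"/"save sa8d" spills above;
; note the input is a single pixel block, not a difference):
;   satd part: sum over the four 4x4 sub-blocks of |2-D 4x4 Hadamard coefs|,
;              with each sub-block's DC term zeroed (mask_ac4)
;   sa8d part: sum of |2-D 8x8 Hadamard coefs| with the block DC zeroed
;              (mask_ac8)
; The _%1x%2 wrappers below accumulate and rescale the two sums and return
; the pair described by the struct comment above.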
1529 %macro HADAMARD_AC_WXH_SSE2 3
1530 cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
1531 %assign pad 16-gprsize-(stack_offset&15)
1535 call x264_hadamard_ac_8x8_%3
1540 call x264_hadamard_ac_8x8_%3
1545 lea r0, [r0+ysub*4+8]
1547 call x264_hadamard_ac_8x8_%3
1551 call x264_hadamard_ac_8x8_%3
1556 paddusw m0, [rsp+0x30]
1557 paddusw m1, [rsp+0x40]
1560 paddusw m0, [rsp+0x50]
1561 paddusw m1, [rsp+0x60]
1562 paddusw m0, [rsp+0x70]
1563 paddusw m1, [rsp+0x80]
1570 shr edx, 2 - (%1*%2 >> 8)
1576 add rsp, 16+%1*%2/2+pad
1578 %endmacro ; HADAMARD_AC_WXH_SSE2
1583 cextern x264_pixel_sa8d_8x8_internal_mmxext
1584 SA8D_16x16_32 mmxext
1587 %define ABS1 ABS1_MMX
1588 %define ABS2 ABS2_MMX
1591 INTRA_SA8D_SSE2 sse2
1592 INTRA_SATDS_MMX mmxext
1593 HADAMARD_AC_SSE2 sse2
1594 %define ABS1 ABS1_SSSE3
1595 %define ABS2 ABS2_SSSE3
1596 %define ABS_MOV ABS_MOV_SSSE3
1597 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
1600 INTRA_SA8D_SSE2 ssse3
1601 INTRA_SATDS_MMX ssse3
1602 HADAMARD_AC_SSE2 ssse3
1603 SATDS_SSE2 ssse3_phadd
1607 ;=============================================================================
1609 ;=============================================================================
1611 ;-----------------------------------------------------------------------------
1612 ; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
1613 ; const uint8_t *pix2, int stride2, int sums[2][4] )
1614 ;-----------------------------------------------------------------------------
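; Rough C reference for this kernel (a sketch, not the exact C in pixel.c:
; two horizontally adjacent 4x4 blocks, accumulating the four per-block sums
; that ssim_end consumes below):
;
; static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                              const uint8_t *pix2, int stride2, int sums[2][4] )
; {
;     for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;     {
;         int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;             {
;                 int a = pix1[x+y*stride1];
;                 int b = pix2[x+y*stride2];
;                 s1  += a;          /* sum of pix1         */
;                 s2  += b;          /* sum of pix2         */
;                 ss  += a*a + b*b;  /* sum of both squares */
;                 s12 += a*b;        /* cross sum           */
;             }
;         sums[z][0] = s1; sums[z][1] = s2;
;         sums[z][2] = ss; sums[z][3] = s12;
;     }
; }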
1615 cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
1640 movdqa m7, [pw_1 GLOBAL]
1668 ;-----------------------------------------------------------------------------
1669 ; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
1670 ;-----------------------------------------------------------------------------
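; Per-window scalar equivalent of the SSIM term evaluated below (a sketch using
; the s1/s2/ss/s12 names from the register comments that follow; each window
; combines four 4x4 block sums, i.e. 64 pixels):
;
; static const int ssim_c1 = 416, ssim_c2 = 235963;  /* same values as the data above */
;
; static float ssim_end1( int s1, int s2, int ss, int s12 )
; {
;     int vars  = ss*64 - s1*s1 - s2*s2;
;     int covar = s12*64 - s1*s2;
;     return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
;          / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
; }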
1671 cglobal x264_pixel_ssim_end4_sse2, 3,3
1686 movdqa m5, [ssim_c1 GLOBAL]
1687 movdqa m6, [ssim_c2 GLOBAL]
1688 TRANSPOSE4x4D 0, 1, 2, 3, 4
1690 ; s1=m0, s2=m1, ss=m2, s12=m3
1693 pmaddwd m4, m0 ; s1*s2
1695 pmaddwd m0, m0 ; s1*s1 + s2*s2
1699 psubd m3, m4 ; covar*2
1705 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
1706 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
1707 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
1708 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
1714 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
1717 lea r3, [mask_ff + 16 GLOBAL]
1718 movdqu m1, [r3 + r2*4]
1720 movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
1736 ;=============================================================================
1737 ; Successive Elimination ADS
1738 ;=============================================================================
1740 %macro ADS_START 1 ; unroll_size
1765 %define ABS1 ABS1_MMX
1767 ;-----------------------------------------------------------------------------
1768 ; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
1769 ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
1770 ;-----------------------------------------------------------------------------
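; Rough C reference for the ads kernels (a sketch; ads2/ads1 use 2 and 1 DC
; terms respectively, and the asm versions below write a mask per candidate
; and leave building the mvs[] list to x264_pixel_ads_mvs further down):
;
; static int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
;                        uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
; {
;     int nmv = 0;
;     for( int i = 0; i < width; i++, sums++ )
;     {
;         int ads = abs( enc_dc[0] - sums[0] )        /* 4 sub-block DC     */
;                 + abs( enc_dc[1] - sums[8] )        /* differences ...    */
;                 + abs( enc_dc[2] - sums[delta] )
;                 + abs( enc_dc[3] - sums[delta+8] )
;                 + cost_mvx[i];                      /* plus mv cost       */
;         if( ads < thresh )
;             mvs[nmv++] = i;                         /* keep this candidate */
;     }
;     return nmv;
; }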
1771 cglobal x264_pixel_ads4_mmxext, 4,7
1775 pshufw mm6, mm6, 0xAA
1777 pshufw mm4, mm4, 0xAA
1787 movq mm3, [r1+r2+16]
1796 pshufw mm1, [r10+8], 0
1798 pshufw mm1, [ebp+stack_offset+28], 0
1806 cglobal x264_pixel_ads2_mmxext, 4,7
1810 pshufw mm6, mm6, 0xAA
1827 cglobal x264_pixel_ads1_mmxext, 4,7
1849 cglobal x264_pixel_ads4_%1, 4,7
1851 pshuflw xmm7, xmm4, 0
1852 pshuflw xmm6, xmm4, 0xAA
1853 pshufhw xmm5, xmm4, 0
1854 pshufhw xmm4, xmm4, 0xAA
1855 punpcklqdq xmm7, xmm7
1856 punpcklqdq xmm6, xmm6
1857 punpckhqdq xmm5, xmm5
1858 punpckhqdq xmm4, xmm4
1860 pshuflw xmm8, r6m, 0
1861 punpcklqdq xmm8, xmm8
1864 movdqu xmm11, [r1+r2]
1867 movdqu xmm1, [r1+16]
1874 movdqu xmm3, [r1+r2+16]
1893 movdqu xmm1, [r1+16]
1898 movdqu xmm2, [r1+r2]
1899 movdqu xmm3, [r1+r2+16]
1907 movd xmm1, [ebp+stack_offset+28]
1909 pshuflw xmm1, xmm1, 0
1910 punpcklqdq xmm1, xmm1
1918 cglobal x264_pixel_ads2_%1, 4,7
1921 pshuflw xmm7, xmm6, 0
1922 pshuflw xmm6, xmm6, 0xAA
1923 pshuflw xmm5, xmm5, 0
1924 punpcklqdq xmm7, xmm7
1925 punpcklqdq xmm6, xmm6
1926 punpcklqdq xmm5, xmm5
1930 movdqu xmm1, [r1+r2]
1944 cglobal x264_pixel_ads1_%1, 4,7
1947 pshuflw xmm7, xmm7, 0
1948 pshuflw xmm6, xmm6, 0
1949 punpcklqdq xmm7, xmm7
1950 punpcklqdq xmm6, xmm6
1954 movdqu xmm1, [r1+16]
1958 movdqu xmm3, [r3+16]
1973 %define ABS1 ABS1_SSSE3
1976 ; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
1979 ; *(uint32_t*)(masks+width) = 0;
1980 ; for( i=0; i<width; i+=8 )
1982 ; uint64_t mask = *(uint64_t*)(masks+i);
1983 ; if( !mask ) continue;
1984 ; for( j=0; j<8; j++ )
1985 ; if( mask & (255<<j*8) )
1990 cglobal x264_pixel_ads_mvs
1998 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
1999 mov dword [rsp+r5], 0
2012 test edi, 0xff<<(%1*8)
2033 ; no PROLOGUE, inherit from x264_pixel_ads1
2034 mov ebx, [ebp+stack_offset+20] ; mvs
2035 mov edi, [ebp+stack_offset+24] ; width
2036 mov dword [esp+edi], 0
2044 mov ebp, [esp+esi+4]
2045 mov edx, [esp+esi+8]
2052 test %2, 0xff<<(%1*8)