;*****************************************************************************
;* pixel-sse2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;*          Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

ssim_c1: times 4 dd 416    ; .01*.01*255*255*64
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
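
; For reference, the arithmetic behind the two SSIM constants above: they are
; the usual C1=(K1*L)^2 and C2=(K2*L)^2 with K1=.01, K2=.03, L=255, pre-scaled
; to match the integer sums used by the ssim code below:
;   .01*.01*255*255*64    = 416.16    -> 416
;   .03*.03*255*255*64*63 = 235962.72 -> 235963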

%macro HADDD 2 ; sum junk
    pmaddwd %1, [pw_1 GLOBAL]

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x16_%1
    movdqu  xmm1, [rdx+rcx]
    movdqu  xmm3, [rdx+rcx]
    psadbw  xmm1, [rdi+rsi]
    psadbw  xmm3, [rdi+rsi]
    movdqu  xmm5, [rdx+rcx]
    movdqu  xmm7, [rdx+rcx]
    psadbw  xmm5, [rdi+rsi]
    psadbw  xmm7, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm6, [rdx+rcx]
    movdqu  xmm1, [rdx+rcx]
    psadbw  xmm6, [rdi+rsi]
    psadbw  xmm1, [rdi+rsi]
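
; For reference, the scalar equivalent of the SAD functions in this file
; (a C sketch, not part of the library; sad_WxH is a hypothetical name, the
; argument order mirrors the prototype above):
;
;   int sad_WxH( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++, pix1 += i_stride1, pix2 += i_stride2 )
;           for( int x = 0; x < W; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }
;
; With xmm registers, psadbw performs 16 of those byte-wise absolute
; differences per instruction, accumulating one partial sum in each 64-bit
; half of the destination register.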

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_16x8_%1
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]
    movdqu  xmm2, [rdx+rcx]
    movdqu  xmm4, [rdx+rcx]
    psadbw  xmm2, [rdi+rsi]
    psadbw  xmm4, [rdi+rsi]

%macro SAD_X3_START_1x16P 0
    movdqa  xmm3, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]

%macro SAD_X3_1x16P 2
    movdqa  xmm3, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]

%macro SAD_X3_2x16P 1
    SAD_X3_1x16P FENC_STRIDE, parm5q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]

%macro SAD_X4_START_1x16P 0
    movdqa  xmm7, [parm1q]
    movdqu  xmm0, [parm2q]
    movdqu  xmm1, [parm3q]
    movdqu  xmm2, [parm4q]
    movdqu  xmm3, [parm5q]

%macro SAD_X4_1x16P 2
    movdqa  xmm7, [parm1q+%1]
    movdqu  xmm4, [parm2q+%2]
    movdqu  xmm5, [parm3q+%2]
    movdqu  xmm6, [parm4q+%2]
    movdqu  xmm8, [parm5q+%2]

%macro SAD_X4_2x16P 1
    SAD_X4_1x16P FENC_STRIDE, parm6q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]

    movd    [parm6q+0], xmm0
    movd    [parm6q+4], xmm1
    movd    [parm6q+8], xmm2

;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                    uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sad_x%1_%2x%3_%4

SAD_X 3, 16, 16, sse2
SAD_X 4, 16, 16, sse2
SAD_X 3, 16, 16, sse3
SAD_X 4, 16, 16, sse3
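
; The sad_x3/x4 functions compare one encoded block against 3 (resp. 4)
; candidate reference positions that share a stride, saving the reloads of
; fenc.  A C sketch of the x3 variant (not part of the library; sad_x3_WxH
; and sad_WxH are hypothetical names, the arguments match the prototype
; above):
;
;   void sad_x3_WxH( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                    uint8_t *pix2, int i_stride, int scores[3] )
;   {
;       scores[0] = sad_WxH( fenc, FENC_STRIDE, pix0, i_stride );
;       scores[1] = sad_WxH( fenc, FENC_STRIDE, pix1, i_stride );
;       scores[2] = sad_WxH( fenc, FENC_STRIDE, pix2, i_stride );
;   }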

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus
; have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok, though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method makes it often slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
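
; A sketch of that realignment (C-like pseudocode, not part of this file),
; assuming the reference pointer has been rounded down to 16-byte alignment
; and `a` is the original misalignment:
;
;   // equivalent to one unaligned 16-byte load from ref+a
;   row  = low128( concat( ref[16..31], ref[0..15] ) >> (8*a) );
;   sad += psadbw( row, fenc_row );
;
; Since PALIGNR's shift count must be an immediate, the code below generates
; one copy of the inner loop per alignment value and selects the right one
; with a computed jump.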

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP 1 ; alignment
    movdqa  xmm1, [rdx+16]
    movdqa  xmm2, [rdx+rcx+16]
    palignr xmm1, [rdx], %1
    palignr xmm2, [rdx+rcx], %1
    psadbw  xmm2, [rdi+rsi]

%macro SAD16_CACHELINE_FUNC 1 ; height
cglobal x264_pixel_sad_16x%1_cache64_ssse3
    jle     x264_pixel_sad_16x%1_sse2
    lea     r10, [sad_w16_align1 - 64 GLOBAL]
    lea     r10, [sad_w16_align1 - 64 + rax]

%macro SAD8_CACHELINE_FUNC 1 ; height
cglobal x264_pixel_sad_8x%1_cache64_mmxext
    jle     x264_pixel_sad_8x%1_mmxext
    movd    mm6, [sw_64 GLOBAL]
    movq    mm2, [parm3q+parm4q+8]
    movq    mm4, [parm3q+parm4q]
    psadbw  mm2, [parm1q+parm2q]
    lea     parm3q, [parm3q+2*parm4q]
    lea     parm1q, [parm1q+2*parm2q]

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
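
; In C-like pseudocode (not part of this file; the exact encoding of the test
; lives in CHECK_SPLIT below, the constants here are illustrative):
;
;   // a W-byte row starting at pix crosses a 64-byte cacheline iff its
;   // misalignment within the line reaches past the end of the line
;   if( ((intptr_t)pix0 & 63) > 64 - width
;    || ((intptr_t)pix1 & 63) > 64 - width
;    || ((intptr_t)pix2 & 63) > 64 - width )
;       /* take the split path: one cache64 sad call per candidate */ ;
;   else
;       /* jump to the normal sse2/mmxext sad_x3 */ ;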

%macro CHECK_SPLIT 2 ; pix, width

%macro SADX3_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
cglobal x264_pixel_sad_x3_%1x%2_cache64_%4
    CHECK_SPLIT parm2d, %1
    CHECK_SPLIT parm3d, %1
    CHECK_SPLIT parm4d, %1
    jmp     x264_pixel_sad_x3_%1x%2_%3
    mov     parm2q, FENC_STRIDE
    call    x264_pixel_sad_%1x%2_cache64_%4
    call    x264_pixel_sad_%1x%2_cache64_%4
    call    x264_pixel_sad_%1x%2_cache64_%4

%macro SADX4_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
cglobal x264_pixel_sad_x4_%1x%2_cache64_%4
    CHECK_SPLIT parm2d, %1
    CHECK_SPLIT parm3d, %1
    CHECK_SPLIT parm4d, %1
    CHECK_SPLIT parm5d, %1
    jmp     x264_pixel_sad_x4_%1x%2_%3
    mov     parm2q, FENC_STRIDE
    call    x264_pixel_sad_%1x%2_cache64_%4
    call    x264_pixel_sad_%1x%2_cache64_%4
    call    x264_pixel_sad_%1x%2_cache64_%4
    call    x264_pixel_sad_%1x%2_cache64_%4

%macro SADX34_CACHELINE_FUNC 4
    SADX3_CACHELINE_FUNC %1, %2, %3, %4
    SADX4_CACHELINE_FUNC %1, %2, %3, %4

cextern x264_pixel_sad_8x16_mmxext
cextern x264_pixel_sad_8x8_mmxext
cextern x264_pixel_sad_8x4_mmxext
cextern x264_pixel_sad_x3_8x16_mmxext
cextern x264_pixel_sad_x3_8x8_mmxext
cextern x264_pixel_sad_x4_8x16_mmxext
cextern x264_pixel_sad_x4_8x8_mmxext

; instantiate the aligned sads

SAD8_CACHELINE_FUNC 4
SAD8_CACHELINE_FUNC 8
SAD8_CACHELINE_FUNC 16
SADX34_CACHELINE_FUNC 8, 16, mmxext, mmxext
SADX34_CACHELINE_FUNC 8, 8, mmxext, mmxext

SAD16_CACHELINE_FUNC 8
SAD16_CACHELINE_FUNC 16
SAD16_CACHELINE_LOOP i

SADX34_CACHELINE_FUNC 16, 16, sse2, ssse3
SADX34_CACHELINE_FUNC 16, 8, sse2, ssse3

%macro SSD_INC_2x16P_SSE2 0
    movdqu  xmm3, [rdi+rsi]
    movdqu  xmm4, [rdx+rcx]

%macro SSD_START_SSE2 0
    pxor    xmm7, xmm7 ; zero
    pxor    xmm0, xmm0 ; xmm0 holds the sum

%macro SSD_END_SSE2 0

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x16_sse2
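
; For reference, what the SSD functions compute (a C sketch, not part of the
; library; ssd_WxH is a hypothetical name):
;
;   int ssd_WxH( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++, pix1 += i_stride1, pix2 += i_stride2 )
;           for( int x = 0; x < W; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;       return sum;
;   }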

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssd_16x8_sse2

    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4

    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
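
; The SUMSUB_BADC chains above are the butterflies of the 1-D Hadamard
; transform used by HADAMARD1x4/HADAMARD1x8.  For the 4-point case this is
; equivalent to (a C sketch, not part of the library; the outputs land in a
; permuted register order, which doesn't matter since satd only sums their
; absolute values):
;
;   void hadamard4( int d[4] )
;   {
;       int s01 = d[0] + d[1], t01 = d[0] - d[1];
;       int s23 = d[2] + d[3], t23 = d[2] - d[3];
;       d[0] = s01 + s23;  d[1] = t01 + t23;
;       d[2] = s01 - s23;  d[3] = t01 - t23;
;   }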

;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro HADAMARD4x1_SSSE3 5 ; ABCD-T -> ADTC
;    PHSUMSUB %1, %2, %5
;    PHSUMSUB %3, %4, %2
;    PHSUMSUB %1, %3, %4
;    PHSUMSUB %5, %2, %3

%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers

%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
    SBUTTERFLY dqa, dq, %1, %2, %5
    SBUTTERFLY dqa, dq, %3, %4, %2
    SBUTTERFLY dqa, qdq, %1, %3, %4
    SBUTTERFLY dqa, qdq, %5, %2, %3

%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
    SBUTTERFLY dqa, wd, %1, %2, %5
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, dq, %1, %3, %4
    SBUTTERFLY2 dqa, dq, %5, %2, %3
    SBUTTERFLY dqa, qdq, %1, %3, %2
    SBUTTERFLY2 dqa, qdq, %4, %5, %3

%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
    SBUTTERFLY dqa, wd, %1, %2, %9
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %9, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2

%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]

%macro LOAD_DIFF_4x8P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [parm1q], [parm3q]
    LOAD_DIFF_8P %2, %6, [parm1q+parm2q], [parm3q+parm4q]
    LOAD_DIFF_8P %3, %5, [parm1q+2*parm2q], [parm3q+2*parm4q]
    LOAD_DIFF_8P %4, %6, [parm1q+r10], [parm3q+r11]

%macro SUM1x8_SSE2 3 ; 01 junk sum

%macro SUM4x4_SSE2 4 ; 02 13 junk sum

%macro SUM8x4_SSE2 7 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6) sum

%macro SUM8x4_SSSE3 7 ; a02 a13 . b02 b13 . sum

%macro SATD_TWO_SSE2 0
    LOAD_DIFF_4x8P  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
    HADAMARD1x4     xmm0, xmm1, xmm2, xmm3
    SUM8x4          xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
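
; The SATD functions below sum the absolute values of Hadamard-transformed
; 4x4 difference blocks (rows, transpose, columns, as in SATD_TWO_SSE2
; above), and the WxH variants sum that over all 4x4 sub-blocks.  A C sketch
; of the per-4x4 cost (not part of the library; satd_4x4 is a hypothetical
; name, hadamard4 as sketched earlier, and the final halving is x264's
; normalization convention):
;
;   int satd_4x4( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2 )
;   {
;       int d[4][4], sum = 0;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;               d[y][x] = pix1[y*i_stride1+x] - pix2[y*i_stride2+x];
;       for( int y = 0; y < 4; y++ )
;           hadamard4( d[y] );                                  // rows
;       for( int x = 0; x < 4; x++ )                            // columns
;       {
;           int c[4] = { d[0][x], d[1][x], d[2][x], d[3][x] };
;           hadamard4( c );
;           sum += abs(c[0]) + abs(c[1]) + abs(c[2]) + abs(c[3]);
;       }
;       return sum >> 1;
;   }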

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x16_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x16_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_16x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x8_%1

;-----------------------------------------------------------------------------
; int x264_pixel_satd_8x4_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_satd_8x4_%1

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_sa8d_8x8_%1
    LOAD_DIFF_4x8P xmm0, xmm1, xmm2, xmm3, xmm8, xmm8
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    LOAD_DIFF_4x8P xmm4, xmm5, xmm6, xmm7, xmm8, xmm8

    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    HADAMARD1x8  xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1

    SUM8x4 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
    SUM8x4 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10

    add     r8d, eax ; preserve rounding for 16x16
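
; sa8d is the 8x8 analogue of satd: an 8x8 Hadamard transform of the
; difference block whose absolute coefficients are summed.  The raw per-block
; sum is accumulated in r8d so that the 16x16 version below can apply the
; final rounding once over all four 8x8 blocks (hence "preserve rounding for
; 16x16" above).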

;-----------------------------------------------------------------------------
; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
;; violates calling convention
cglobal x264_pixel_sa8d_16x16_%1
    call    x264_pixel_sa8d_8x8_%1 ; pix[0]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8*stride]
    lea     r10, [3*parm2q-2]
    lea     r11, [3*parm4q-2]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8]
    lea     parm1q, [parm1q+4*parm2q]
    lea     parm3q, [parm3q+4*parm4q]
    call    x264_pixel_sa8d_8x8_%1 ; pix[8*stride+8]

%define SUM8x4 SUM8x4_SSE2
%define SUM8x4 SUM8x4_SSSE3

;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
cglobal x264_intra_sa8d_x3_8x8_core_sse2
    movq    xmm0, [parm1q+0*FENC_STRIDE]
    movq    xmm7, [parm1q+1*FENC_STRIDE]
    movq    xmm6, [parm1q+2*FENC_STRIDE]
    movq    xmm3, [parm1q+3*FENC_STRIDE]
    movq    xmm5, [parm1q+4*FENC_STRIDE]
    movq    xmm1, [parm1q+5*FENC_STRIDE]
    movq    xmm8, [parm1q+6*FENC_STRIDE]
    movq    xmm2, [parm1q+7*FENC_STRIDE]

    HADAMARD1x8  xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
    TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
    HADAMARD1x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

    movzx   edi, word [parm2q+0]
    add     di, word [parm2q+16]
    SUM8x4_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
    SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
    SUM1x8_SSE2 xmm8, xmm10, xmm15
    movdqa  xmm14, xmm15 ; 7x8 sum
    movdqa  xmm8, [parm2q+0] ; left edge
    SUM1x8_SSE2 xmm8, xmm10, xmm14
    SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
    punpcklqdq xmm0, xmm4 ; transpose
    movdqa  xmm1, [parm2q+16] ; top edge
    psrldq  xmm2, 2 ; 8x7 sum
    psubw   xmm0, xmm1 ; 8x1 sum
    SUM1x8_SSE2 xmm0, xmm1, xmm2
    mov     [parm3q+4], eax ; i8x8_h sa8d
    mov     [parm3q+8], eax ; i8x8_dc sa8d
    mov     [parm3q+0], eax ; i8x8_v sa8d
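
; The idea behind the core above, roughly: for the V, H and DC prediction
; modes the transform of the predicted block is nonzero only in the first
; transformed row, first transformed column, or DC coefficient respectively,
; so the 8x8 Hadamard of fenc is computed once and the caller-supplied edge
; sums in edges[2][8] are substituted into those positions, giving all three
; costs (stored to res[] as v, h, dc above) from a single transform.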

;-----------------------------------------------------------------------------
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
;                                       const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_4x4x2_core_sse2
    movdqa  xmm8, [pw_1 GLOBAL]
    pshufd  xmm5, xmm3, 0xB1
    pshufd  xmm6, xmm4, 0xB1
    pshufd  xmm1, xmm1, 0xD8
    punpckldq xmm3, xmm4
    punpckhdq xmm5, xmm4
    movq    [parm5q+ 0], xmm1
    movq    [parm5q+ 8], xmm3
    movq    [parm5q+16], xmm1
    movq    [parm5q+24], xmm5
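
; A C sketch of what the 4x4x2 core gathers, one set of sums per 4x4 block
; (not part of the library; the element order s1, s2, ss, s12 follows the
; layout expected by ssim_end4 below and should be treated as an assumption
; here):
;
;   void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
;                         const uint8_t *pix2, int stride2, int sums[2][4] )
;   {
;       for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for( int y = 0; y < 4; y++ )
;               for( int x = 0; x < 4; x++ )
;               {
;                   int a = pix1[y*stride1+x], b = pix2[y*stride2+x];
;                   s1 += a;  s2 += b;  ss += a*a + b*b;  s12 += a*b;
;               }
;           sums[z][0] = s1;  sums[z][1] = s2;
;           sums[z][2] = ss;  sums[z][3] = s12;
;       }
;   }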

;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ssim_end4_sse2
    movdqa  xmm0, [parm1q+ 0]
    movdqa  xmm1, [parm1q+16]
    movdqa  xmm2, [parm1q+32]
    movdqa  xmm3, [parm1q+48]
    movdqa  xmm4, [parm1q+64]
    paddd   xmm0, [parm2q+ 0]
    paddd   xmm1, [parm2q+16]
    paddd   xmm2, [parm2q+32]
    paddd   xmm3, [parm2q+48]
    paddd   xmm4, [parm2q+64]
    movdqa  xmm5, [ssim_c1 GLOBAL]
    movdqa  xmm6, [ssim_c2 GLOBAL]
    TRANSPOSE4x4D xmm0, xmm1, xmm2, xmm3, xmm4

    ; s1=xmm0, s2=xmm3, ss=xmm4, s12=xmm2
    pmaddwd xmm1, xmm0 ; s1*s2
    pmaddwd xmm0, xmm0 ; s1*s1 + s2*s2
    psubd   xmm2, xmm1 ; covar*2
    psubd   xmm4, xmm0 ; vars
    cvtdq2ps xmm0, xmm0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
    cvtdq2ps xmm1, xmm1 ; (float)(s1*s2*2 + ssim_c1)
    cvtdq2ps xmm2, xmm2 ; (float)(covar*2 + ssim_c2)
    cvtdq2ps xmm4, xmm4 ; (float)(vars + ssim_c2)
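
; From here the standard SSIM formula is evaluated per 4x4 group, matching
; the comments above:
;
;   ssim = (2*s1*s2 + c1) * (2*covar + c2)
;          -----------------------------------
;          (s1*s1 + s2*s2 + c1) * (vars + c2)
;
; with c1/c2 the pre-scaled ssim_c1/ssim_c2 constants from the top of the
; file.  The divps below produces the four quotients at once; they are then
; masked down to `width` valid groups (the mask_ff load) and summed.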
    divps   xmm1, xmm0 ; ssim
    lea     rax, [mask_ff + 16 GLOBAL]
    movdqu  xmm3, [rax + parm3q*4]
    movdqu  xmm3, [mask_ff + parm3q*4 + 16]
    pshuflw xmm1, xmm0, 0xE