;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2015 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001@163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86util.asm"

ch_shuf:     times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0

cextern deinterleave_shufd

;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
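; A reference sketch of the operation (our reading of the constants used
; below, not quoted from the C source): with log2_denom = 5 and
; weight1 + weight2 = 64,
;   dst[x] = clip( (src1[x]*weight1 + src2[x]*weight2 + 32) >> 6 )
; which is what the pw_32 rounding constant in BIWEIGHT_START_MMX implements.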
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0
DECLARE_REG_TMP 0,1,2,3,4,5,7,8
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0
    lea    t4, [t4+t5*2*SIZEOF_PIXEL]
    lea    t2, [t2+t3*2*SIZEOF_PIXEL]
    lea    t0, [t0+t1*2*SIZEOF_PIXEL]
%macro BIWEIGHT_START_MMX 0
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
%macro BIWEIGHT_START_MMX 0
    SPLATW m2, m2        ; weight_dst
    psubw  m3, m2        ; weight_src
    mova   m4, [pw_32]   ; rounding
%endif ;HIGH_BIT_DEPTH
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    SPLATW m3, m3        ; weight_dst,src
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
%endif ;HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
    mova   m7, [pw_pixel_max]
%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
%endif ;HIGH_BIT_DEPTH
    movhps [t0+SIZEOF_PIXEL*t1], m6
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
    BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
%else ;!HIGH_BIT_DEPTH
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
cglobal pixel_avg_weight_w16
    vinserti128 m0, m0, [t2+t3], 1
    vinserti128 m1, m1, [t4+t5], 1
    SBUTTERFLY bw, 0, 1, 2
    vextracti128 [t0+t1], m0, 1
%endif ;HIGH_BIT_DEPTH

;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
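; Reference operation (a hedged sketch; the exact constants come from the
; weight_t struct loaded via r4 in WEIGHT_START below):
;   dst[x] = clip( ((src[x] * scale + (1 << (denom-1))) >> denom) + offset )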
%macro WEIGHT_START 1
    mova   m0, [r4+ 0]   ; 1<<denom
    movd   m2, [r4+32]   ; denom
    mova   m4, [pw_pixel_max]
    paddw  m2, [sq_1]    ; denom+1
%macro WEIGHT_TWO_ROW 4
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    WEIGHT %1+x,    %1+x+mmsize/2
    WEIGHT %1+r3+x, %1+r3+x+mmsize/2
%else ; !HIGH_BIT_DEPTH
%macro WEIGHT_START 1
    vbroadcasti128 m3, [r4]
    vbroadcasti128 m4, [r4+16]
%if notcpuflag(ssse3)
; src1, src2, dst1, dst2, fast
%macro WEIGHT_ROWx2 5
    movh   m1, [%1+mmsize/2]
    movh   m7, [%2+mmsize/2]
    paddsw m0, m4 ; 1<<(denom-1)+(offset<<denom)
; src1, src2, dst1, dst2, width, fast
    vinserti128 m0, m0, [%2], 1
    vextracti128 [%4], m0, 1
    vinserti128 m0, m0, [%2], 1
    vextracti128 xm1, m0, 1
    paddsw m0, m4 ; 1<<(denom-1)+(offset<<denom)
    movd   [%3], m0 ; width 2 can write garbage for the last 2 bytes
%macro WEIGHT_TWO_ROW 4
    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
    WEIGHT_COL   %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
cglobal mc_weight_w%1, 6,6,8
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
    ; we can merge the shift step into the scale factor
    ; if (m3<<7) doesn't overflow an int16_t
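    ; (Illustrative note, not in the original source: a rounding multiply in
    ; the style of pmulhrsw computes (a*b + 0x4000) >> 15, so folding the
    ; shift into a premultiplied scale removes the separate shift step, which
    ; is presumably only safe while scale<<7 still fits in an int16_t.)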
    WEIGHT_TWO_ROW r2, r0, %1, 0
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
    WEIGHT_TWO_ROW r2, r0, %1, 1
%macro OFFSET_TWO_ROW 4
%if (%3*SIZEOF_PIXEL-x) >= mmsize
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%if x >= %3*SIZEOF_PIXEL

;-----------------------------------------------------------------------------
;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
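; Sketch of the operation (an assumption inferred from the add/sub variants,
; not stated in this excerpt): when the weight reduces to a pure offset, only
;   dst[x] = clip( src[x] +/- offset )
; is applied, using saturating arithmetic instead of the full multiply path.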
cglobal mc_offset%2_w%1, 6,6
    mova   m3, [pw_pixel_max]
    OFFSET_TWO_ROW r2, r0, %1, %2
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                     pixel *src2, intptr_t src2_stride, int weight );
;-----------------------------------------------------------------------------
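; Dispatch note (ours, matching the branches below): an explicit weight makes
; the entry point tail-call pixel_avg_weight_wN; otherwise the plain average
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1
; is used, which is exactly the pavgb/pavgw semantics of the wN helpers.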
cglobal pixel_avg_%1x%2
    jne pixel_avg_weight_w%1 %+ SUFFIX
%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
    jmp pixel_avg_w%1_avx2
%if mmsize == 16 && %1 == 16
    jz  pixel_avg_w%1_sse2
    jmp pixel_avg_w%1_mmx2

;-----------------------------------------------------------------------------
; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                    pixel *src2, intptr_t src2_stride, int height, int weight );
;-----------------------------------------------------------------------------
cglobal pixel_avg_w%1
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    %2     m1, [t2+x+SIZEOF_PIXEL*t3]
    pavgw  m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
    pavgb  m1, [t4+x+SIZEOF_PIXEL*t5]
    %3     [t0+x+SIZEOF_PIXEL*t1], m1
AVG_FUNC 4, movq, movq
AVG_FUNC 8, movq, movq
AVG_FUNC 16, movq, movq
AVG_FUNC 4, movq, movq
AVG_FUNC 8, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
%else ;!HIGH_BIT_DEPTH
AVG_FUNC 4, movd, movd
AVG_FUNC 8, movq, movq
AVG_FUNC 16, movq, movq
AVG_FUNC 16, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; pixel avg2
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst,  intptr_t dst_stride,
;                     uint16_t *src1, intptr_t src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
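; Sketch (our reading of the loads below, not a comment from the original):
; avg2 blends two reference rows for halfpel interpolation,
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1
; with the second source addressed via the fixed r4/r6 displacements.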
cglobal pixel_avg2_w%1, 6,7,4
%if cpuflag(avx) || mmsize == 8
cglobal pixel_avg2_w%1, 6,7,8
    %2     m3, [r2+r3*2+mmsize]
    pavgw  m1, [r2+r4+mmsize]
    pavgw  m3, [r2+r6+mmsize]
    %2     m5, [r2+r4+mmsize]
    %2     m7, [r2+r6+mmsize]
    %3     [r0+r1*2+mmsize], m3
AVG2_W_TWO 8, movu, mova
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova

cglobal pixel_avg2_w10_mmx2, 6,7
    movu   m3, [r2+r3*2+ 0]
    movu   m4, [r2+r3*2+ 8]
    movh   m5, [r2+r3*2+16]
    mova   [r0+r1*2+ 0], m3
    mova   [r0+r1*2+ 8], m4
    movh   [r0+r1*2+16], m5

cglobal pixel_avg2_w16_mmx2, 6,7
    movu   m4, [r2+r3*2+ 0]
    movu   m5, [r2+r3*2+ 8]
    movu   m6, [r2+r3*2+16]
    movu   m7, [r2+r3*2+24]
    mova   [r0+r1*2+ 0], m4
    mova   [r0+r1*2+ 8], m5
    mova   [r0+r1*2+16], m6
    mova   [r0+r1*2+24], m7

cglobal pixel_avg2_w18_mmx2, 6,7
%macro PIXEL_AVG_W18 0
cglobal pixel_avg2_w18, 6,7
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst,  intptr_t dst_stride,
;                     uint8_t *src1, intptr_t src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
cglobal pixel_avg2_w%1_mmx2, 6,7
cglobal pixel_avg2_w%1_mmx2, 6,7
cglobal pixel_avg2_w20_mmx2, 6,7
    movd   mm5, [r4+r3+16]
    pavgb  mm1, [r4+r2+8]
    pavgb  mm2, [r4+r2+16]
    pavgb  mm4, [r4+r6+8]
    pavgb  mm5, [r4+r6+16]
    movd   [r0+r1+16], mm5

cglobal pixel_avg2_w16_sse2, 6,7
cglobal pixel_avg2_w20_sse2, 6,7
    movd   mm5, [r4+r3+16]
    pavgb  mm4, [r4+r2+16]
    pavgb  mm5, [r4+r6+16]
    movd   [r0+r1+16], mm5

cglobal pixel_avg2_w20, 6,7
; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
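; Illustration (ours, assuming the 64-byte cachelines used elsewhere in this
; file): an unaligned 16-byte load starting at offset 0x38 covers bytes
; 0x38-0x47 and therefore straddles two cachelines; on the affected CPUs that
; load is slow, so the check functions branch to the MMX split-load path.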
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq   mm3, [r2+r4+%1]
    movq   mm2, [r2+r4+8+%1]
%macro AVG_CACHELINE_FUNC 2
pixel_avg2_w%1_cache_mmx2:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd

%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
; w12 isn't needed because w16 is just as fast if there's no cacheline split
%define cachesplit pixel_avg2_w16_cache_mmx2
%define cachesplit pixel_avg2_w%1_cache_mmx2
cglobal pixel_avg2_w%1_cache%2_%3
    cmp    eax, (%2-%1-(%1 % 8))
    jbe    pixel_avg2_w%1_%3
    jb     pixel_avg2_w%1_%3
%if 0 ; or %1==8 - but the extra branch seems too expensive
    jz     pixel_avg2_w%1_%3
    jz     pixel_avg2_w%1_%3
%if mmsize==16 || (%1==8 && %2==64)
AVG_CACHELINE_FUNC %1, %2

AVG_CACHELINE_CHECK  8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
%if ARCH_X86_64 == 0
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK  8, 32, mmx2
AVG_CACHELINE_CHECK 12, 32, mmx2
AVG_CACHELINE_CHECK 16, 32, mmx2
AVG_CACHELINE_CHECK 20, 32, mmx2
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

; computed jump assumes this loop is exactly 48 bytes
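; (Explanatory sketch, not from the original: the dispatcher further down
; computes r6 = (offset + align*2)*3 and then shifts it left by 4, i.e.
; jump = (offset + align*2)*48, so it can index straight into this table of
; loop bodies; the trailing "times ... nop" padding keeps each body at the
; assumed 48-byte size.)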
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
avg_w16_align%1_%2_ssse3:
    movdqa  xmm1, [r2+r4+16]
    palignr xmm1, [r2+r4], %2
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2&15
    jg      avg_w16_align%1_%2_ssse3
; make sure the first ones don't end up short
    times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop

cglobal pixel_avg2_w16_cache64_ssse3
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
    jb  x264_pixel_avg2_w16_sse2
    jz  x264_pixel_avg2_w16_sse2
    lea r6, [r6*3]          ; (offset + align*2)*3
    shl r6, 4               ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
    lea r7, [avg_w16_addr]
    lea r6, [avg_w16_addr + r6]
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; pixel copy
;=============================================================================
%macro COPY2 2-4 0, 1
    movu m0, [r2+%3*mmsize]
    movu m1, [r2+%4*mmsize]
    movu m2, [r2+r3+%3*mmsize]
    movu m3, [r2+r3+%4*mmsize]
    mova [r0+%3*mmsize], m0
    mova [r0+%4*mmsize], m1
    mova [r0+r1+%3*mmsize], m2
    mova [r0+r1+%4*mmsize], m3
    movu m0, [r2+r3*2+%3*mmsize]
    movu m1, [r2+r3*2+%4*mmsize]
    movu m2, [r2+%2+%3*mmsize]
    movu m3, [r2+%2+%4*mmsize]
    mova [r0+r1*2+%3*mmsize], m0
    mova [r0+r1*2+%4*mmsize], m1
    mova [r0+%1+%3*mmsize], m2
    mova [r0+%1+%4*mmsize], m3

;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
;                  uint8_t *src, intptr_t i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal mc_copy_w4_mmx, 4,6
%if HIGH_BIT_DEPTH == 0
%assign %%w %1*SIZEOF_PIXEL/mmsize
cglobal mc_copy_w%1, 5,7
INIT_XMM aligned, sse
INIT_YMM aligned, avx
;=============================================================================
; prefetch
;=============================================================================
; assumes 64 byte cachelines
; FIXME doesn't cover all pixels in high depth and/or 4:4:4

;-----------------------------------------------------------------------------
; void prefetch_fenc( pixel *pix_y,  intptr_t stride_y,
;                     pixel *pix_uv, intptr_t stride_uv, int mb_x )
;-----------------------------------------------------------------------------
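; (Note added for clarity, not in the original: prefetcht0 pulls a whole
; cacheline into every cache level, which is why the 64-byte assumption above
; matters; the address arithmetic below simply selects the rows of the
; macroblock about to be encoded so they are resident before analysis reads
; them.)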
%macro PREFETCH_FENC 1
cglobal prefetch_fenc_%1, 5,5
    lea   r0, [r0+r4*4+64*SIZEOF_PIXEL]
    lea   r2, [r2+rax*2+64*SIZEOF_PIXEL]
cglobal prefetch_fenc_%1, 0,3
    lea   r0, [r0+r2*4+64*SIZEOF_PIXEL]
    lea   r0, [r0+r2*2+64*SIZEOF_PIXEL]
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref, 3,3
    lea   r0, [r0+r2*8+64*SIZEOF_PIXEL]
    prefetcht0 [r0+r1*2]
    prefetcht0 [r0+r1*2]
;=============================================================================
; chroma MC
;=============================================================================
DECLARE_REG_TMP 6,7,8
DECLARE_REG_TMP 0,1,2
%macro MC_CHROMA_START 1
    movsxdifnidn t0, t0d
    add   r3, t0            ; src += (dx>>3) + (dy>>3) * src_stride
%macro UNPACK_UNALIGNED 4
    punpckhwd %3, %1, %2
    shufps %2, %1, %3, q3131
    shufps %1, %3, q2020
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
;                 uint8_t *src, intptr_t src_stride,
;                 int dx, int dy,
;                 int width, int height )
;-----------------------------------------------------------------------------
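; Reference operation (a sketch of the standard eighth-pel bilinear chroma
; filter that the constants below implement; dx,dy are the fractional parts):
;   dst[x] = ( (8-dx)*(8-dy)*A + dx*(8-dy)*B
;            + (8-dx)*   dy *C + dx*   dy *D + 32 ) >> 6
; with A,B,C,D the four neighbouring source samples; the packed constant
; "(x<<24) + ((8-x)<<16) + (y<<8) + (8-y)" built below carries these factors.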
    add   t2d, 0x80008      ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
    jl    mc_chroma_mmx2 %+ .skip_prologue
    pshufw m7, m5, q3232
    pshufw m6, m5, q0000
    pshufw m5, m5, q1111
    cmp   dword r7m, 4      ; flags were clobbered by WIN64_SPILL_XMM
    pshufd m7, m5, q1111
    pshufd m6, m5, q0000
    pshufd m5, m5, q1111
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m0, m1, [r3+2]
%endif ; HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3+r4
%else ; !HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
%endif ; HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%define multy0 [rsp-8]
    UNPACK_UNALIGNED m0, m2, m4, r3
    UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
    movu  m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
%else ; !HIGH_BIT_DEPTH
    movu  m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
%endif ; HIGH_BIT_DEPTH
    movh  [r0+mmsize/2], m1
    movh  [r1+mmsize/2], m1
    movhps [r1+mmsize/2], m1
%else ; !HIGH_BIT_DEPTH
    pshufw m1, m0, q0020
    pshufw m0, m0, q0031
    pshufd m0, m0, q3120
%endif ; HIGH_BIT_DEPTH
    lea   r3, [t2+8*SIZEOF_PIXEL]
    lea   r0, [t0+4*SIZEOF_PIXEL]
    lea   r1, [t1+4*SIZEOF_PIXEL]
    add   r3, 8*SIZEOF_PIXEL
    add   r0, 4*SIZEOF_PIXEL
    add   r1, 4*SIZEOF_PIXEL
%if ARCH_X86_64 ; too many regs for x86_32
    RESET_MM_PERMUTATION
%assign stack_offset stack_offset - stack_size_padded
%assign stack_size_padded 0
%assign xmm_regs_used 0
    mov   r6d, r4d          ; pel_offset = dx ? 2 : src_stride
    mov   r6d, 2*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH && mmsize == 16
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    sub   r2, 4*SIZEOF_PIXEL
    sub   r4, 8*SIZEOF_PIXEL
    mov   r7, 4*SIZEOF_PIXEL
    mov   r8, 8*SIZEOF_PIXEL
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA

%macro MC_CHROMA_SSSE3 0
    MC_CHROMA_START 10-cpuflag(avx2)
    imul  t2d, t0d          ; (x*255+8)*y
    imul  r5d, t0d          ; (x*255+8)*(8-y)
%if cpuflag(cache64)
    lea   t1, [ch_shuf_adj]
    movddup xm5, [t1 + t0*4]
    movddup xm5, [ch_shuf_adj + t0*4]
    paddb xm5, [ch_shuf]
    vpbroadcastw m6, xm6
    vpbroadcastw m7, xm7
%define shiftround m8
%define shiftround [pw_512]
    vinserti128 m0, m0, [r3+r4], 1
    vinserti128 m1, m1, [r3+r4*2], 1
    pmulhrsw m0, shiftround
    vextracti128 xm1, m0, 1
    vinserti128 m0, m0, [r3+8], 1
    vinserti128 m3, m3, [r3+r4+8], 1
    pmaddubsw m1, m0, m7
    pmaddubsw m2, m3, m6
    pmaddubsw m3, m3, m7
    vinserti128 m0, m0, [r3+r4*2+8], 1
    pmaddubsw m4, m0, m6
    pmulhrsw m1, shiftround
    pmulhrsw m3, shiftround
    mova  m2, [deinterleave_shufd]
    vextracti128 xm2, m1, 1
    pmaddubsw m2, m1, m7
    pmulhrsw m1, shiftround
    pmulhrsw m3, shiftround
    pmulhrsw m0, shiftround ; x + 32 >> 6
    pmulhrsw m1, shiftround
    pshufd m0, m0, q3120
    movu  m3, [r3+r4*2+8]
    pmulhrsw m2, shiftround
    pmulhrsw m3, shiftround
    pshufd m2, m2, q3120
%else ; !HIGH_BIT_DEPTH

INIT_XMM ssse3, cache64
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH