;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001@163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86util.asm"

pw_512:      times 16 dw 512
ch_shuf:     times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0

cextern deinterleave_shufd
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
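; Per-pixel reference behaviour under those assumptions (hedged C sketch;
; variable names are illustrative, not taken from mc.c):
;   w1 = i_weight; w2 = 64 - i_weight;
;   dst[x] = x264_clip_pixel( (src1[x]*w1 + src2[x]*w2 + 32) >> 6 );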
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0

DECLARE_REG_TMP 0,1,2,3,4,5,7,8
%macro AVG_START 0-1 0

DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0

    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
    lea  t0, [t0+t1*2*SIZEOF_PIXEL]
%macro BIWEIGHT_START_MMX 0
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
%macro BIWEIGHT_START_MMX 0
    SPLATW m2, m2      ; weight_dst
    psubw  m3, m2      ; weight_src
    mova   m4, [pw_32] ; rounding
%endif ;HIGH_BIT_DEPTH
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    SPLATW m3, m3        ; weight_dst,src
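; The SSSE3 variant interleaves the two sources bytewise and keeps both
; weights packed as (weight_dst,weight_src) byte pairs, so a single
; pmaddubsw forms src1*w1 + src2*w2 per pixel; pmulhrsw against pw_512
; then performs the (+32)>>6 rounding shift in one instruction.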
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
%endif ;HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
    mova m7, [pw_pixel_max]
%if mmsize == 16 && %1 == mmsize/(2*SIZEOF_PIXEL)
    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
%endif ;HIGH_BIT_DEPTH
    movhps [t0+SIZEOF_PIXEL*t1], m6
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
    BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
%else ;!HIGH_BIT_DEPTH
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
cglobal pixel_avg_weight_w16
    vinserti128 m0, m0, [t2+t3], 1
    vinserti128 m1, m1, [t4+t5], 1
    SBUTTERFLY bw, 0, 1, 2
    vextracti128 [t0+t1], m0, 1
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
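; Per-pixel reference behaviour (hedged C sketch; clipping applies as in the
; code below, and the names mirror x264's weight_t fields):
;   dst[x] = x264_clip_pixel( ((src[x]*i_scale + (1<<(i_denom-1))) >> i_denom)
;                             + i_offset );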
%macro WEIGHT_START 1
    mova  m0, [r4+ 0]        ; 1<<denom
    movd  m2, [r4+32]        ; denom
    mova  m4, [pw_pixel_max]
    paddw m2, [sq_1]         ; denom+1

%macro WEIGHT_TWO_ROW 4
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    WEIGHT %1+x, %1+x+mmsize/2
    WEIGHT %1+r3+x, %1+r3+x+mmsize/2
%else ; !HIGH_BIT_DEPTH

%macro WEIGHT_START 1
    vbroadcasti128 m3, [r4]
    vbroadcasti128 m4, [r4+16]
%if notcpuflag(ssse3)

; src1, src2, dst1, dst2, fast
%macro WEIGHT_ROWx2 5
    movh m1, [%1+mmsize/2]
    movh m7, [%2+mmsize/2]
    paddsw m0, m4 ; 1<<(denom-1)+(offset<<denom)

; src1, src2, dst1, dst2, width, fast
    vinserti128 m0, m0, [%2], 1
    vextracti128 [%4], m0, 1
    vinserti128 m0, m0, [%2], 1
    vextracti128 xm1, m0, 1
    paddsw m0, m4 ; 1<<(denom-1)+(offset<<denom)
    movd [%3], m0 ; width 2 can write garbage for the last 2 bytes

%macro WEIGHT_TWO_ROW 4
    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
    WEIGHT_COL   %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
cglobal mc_weight_w%1, 6,6,8
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
    ; we can merge the shift step into the scale factor
    ; if (m3<<7) doesn't overflow an int16_t
    WEIGHT_TWO_ROW r2, r0, %1, 0
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
    WEIGHT_TWO_ROW r2, r0, %1, 1
%macro OFFSET_TWO_ROW 4
%if (%3*SIZEOF_PIXEL-x) >= mmsize
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%if x >= %3*SIZEOF_PIXEL
;-----------------------------------------------------------------------------
; void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
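; Reference behaviour (hedged C sketch; %2 below selects add or sub, and the
; 8-bit path gets the clip for free from saturating adds/subs):
;   dst[x] = x264_clip_pixel( src[x] + w->i_offset );  (mc_offsetadd)
;   dst[x] = x264_clip_pixel( src[x] - w->i_offset );  (mc_offsetsub)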
cglobal mc_offset%2_w%1, 6,6
    mova m3, [pw_pixel_max]
    OFFSET_TWO_ROW r2, r0, %1, %2
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                     pixel *src2, intptr_t src2_stride, int weight );
;-----------------------------------------------------------------------------
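; Reference behaviour (hedged C sketch): a plain rounding average; any
; non-default weight instead tail-calls pixel_avg_weight_w* (see the jne
; dispatch below):
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;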
cglobal pixel_avg_%1x%2
    jne pixel_avg_weight_w%1 %+ SUFFIX
%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
    jmp pixel_avg_w%1_avx2
%if mmsize == 16 && %1 == 16
    jz  pixel_avg_w%1_sse2
    jmp pixel_avg_w%1_mmx2
;-----------------------------------------------------------------------------
; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                    pixel *src2, intptr_t src2_stride, int height, int weight );
;-----------------------------------------------------------------------------
cglobal pixel_avg_w%1
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    %2    m1, [t2+x+SIZEOF_PIXEL*t3]
    pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
    pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
    %3    [t0+x+SIZEOF_PIXEL*t1], m1
AVG_FUNC  4, movq, movq
AVG_FUNC  8, movq, movq
AVG_FUNC 16, movq, movq
AVG_FUNC  4, movq, movq
AVG_FUNC  8, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
%else ;!HIGH_BIT_DEPTH
AVG_FUNC  4, movd, movd
AVG_FUNC  8, movq, movq
AVG_FUNC 16, movq, movq
AVG_FUNC 16, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; pixel avg2
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst,  intptr_t dst_stride,
;                     uint16_t *src1, intptr_t src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
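; Averages two prediction sources that share a stride, as used for halfpel
; interpolation (hedged C sketch):
;   for( int y = 0; y < height; y++ )
;       for( int x = 0; x < N; x++ )
;           dst[y*dst_stride+x] = ( src1[y*src_stride+x]
;                                 + src2[y*src_stride+x] + 1 ) >> 1;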
cglobal pixel_avg2_w%1, 6,7,4
cglobal pixel_avg2_w%1, 6,7,8
    %2    m3, [r2+r3*2+mmsize]
    pavgw m1, [r2+r4+mmsize]
    pavgw m3, [r2+r6+mmsize]
    %2    m5, [r2+r4+mmsize]
    %2    m7, [r2+r6+mmsize]
    %3    [r0+r1*2+mmsize], m3
AVG2_W_TWO  8, movu, mova
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova
cglobal pixel_avg2_w10_mmx2, 6,7
    movu m3, [r2+r3*2+ 0]
    movu m4, [r2+r3*2+ 8]
    movh m5, [r2+r3*2+16]
    mova [r0+r1*2+ 0], m3
    mova [r0+r1*2+ 8], m4
    movh [r0+r1*2+16], m5
cglobal pixel_avg2_w16_mmx2, 6,7
    movu m4, [r2+r3*2+ 0]
    movu m5, [r2+r3*2+ 8]
    movu m6, [r2+r3*2+16]
    movu m7, [r2+r3*2+24]
    mova [r0+r1*2+ 0], m4
    mova [r0+r1*2+ 8], m5
    mova [r0+r1*2+16], m6
    mova [r0+r1*2+24], m7
cglobal pixel_avg2_w18_mmx2, 6,7
cglobal pixel_avg2_w18_sse2, 6,7,6
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst,  intptr_t dst_stride,
;                     uint8_t *src1, intptr_t src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
cglobal pixel_avg2_w%1_mmx2, 6,7
cglobal pixel_avg2_w%1_mmx2, 6,7
cglobal pixel_avg2_w20_mmx2, 6,7
    pavgb mm1, [r4+r2+8]
    pavgb mm2, [r4+r2+16]
    pavgb mm4, [r4+r6+8]
    pavgb mm5, [r4+r6+16]
    movd  [r0+r1+16], mm5
cglobal pixel_avg2_w16_sse2, 6,7
    movdqu xmm2, [r2+r3]
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    movdqa [r0+r1], xmm2
cglobal pixel_avg2_w20_%1, 6,7
    movdqu xmm2, [r4+r3]
%ifidn %1, sse2_misalign
    movd   mm5,  [r4+r3+16]
    movdqu xmm1, [r4+r2]
    movdqu xmm3, [r4+r6]
    movd   mm5,  [r4+r3+16]
    pavgb  mm4,  [r4+r2+16]
    pavgb  mm5,  [r4+r6+16]
    movdqa [r0+r1], xmm2
    movd   [r0+r1+16], mm5
AVG2_W20 sse2_misalign
cglobal pixel_avg2_w20, 6,7
; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
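; Split detection, roughly (illustrative; the thunks below fold the width
; rounding into the immediate): a load of width w starting at src crosses a
; cacheline iff ((uintptr_t)src & (cacheline-1)) + w > cacheline.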
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq mm3, [r2+r4+%1]
    movq mm2, [r2+r4+8+%1]
%macro AVG_CACHELINE_FUNC 2
pixel_avg2_w%1_cache_mmx2:
    AVG_CACHELINE_LOOP  0, movq
    AVG_CACHELINE_LOOP  8, movq
    AVG_CACHELINE_LOOP 16, movd
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
; w12 isn't needed because w16 is just as fast if there's no cacheline split
%define cachesplit pixel_avg2_w16_cache_mmx2
%define cachesplit pixel_avg2_w%1_cache_mmx2
cglobal pixel_avg2_w%1_cache%2_%3
    cmp eax, (%2-%1-(%1 % 8))
    jbe pixel_avg2_w%1_%3
    jb  pixel_avg2_w%1_%3
%if 0 ; or %1==8 - but the extra branch seems too expensive
    jz  pixel_avg2_w%1_%3
    jz  pixel_avg2_w%1_%3
%if mmsize == 16 || (%1 == 8 && %2 == 64)
    AVG_CACHELINE_FUNC %1, %2
AVG_CACHELINE_CHECK  8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
%if ARCH_X86_64 == 0
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK  8, 32, mmx2
AVG_CACHELINE_CHECK 12, 32, mmx2
AVG_CACHELINE_CHECK 16, 32, mmx2
AVG_CACHELINE_CHECK 20, 32, mmx2
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
avg_w16_align%1_%2_ssse3:
    movdqa  xmm1, [r2+r4+16]
    palignr xmm1, [r2+r4], %2
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2&15
    jg avg_w16_align%1_%2_ssse3
    ; make sure the first ones don't end up short
    times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
cglobal pixel_avg2_w16_cache64_ssse3
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
    jb pixel_avg2_w16_sse2
    jz pixel_avg2_w16_sse2
    lea r6, [r6*3]          ; (offset + align*2)*3
    shl r6, 4               ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
    lea r7, [avg_w16_addr]
    lea r6, [avg_w16_addr + r6]
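; Each unrolled alignment variant must therefore assemble to exactly 48
; bytes (3*16, hence the lea*3 + shl 4 above and the nop padding in
; AVG16_CACHELINE_LOOP_SSSE3) for this computed jump to land on the right
; entry point.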
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; pixel copy
;=============================================================================
%macro COPY2 2-4 0, 1
    movu m0, [r2+%3*mmsize]
    movu m1, [r2+%4*mmsize]
    movu m2, [r2+r3+%3*mmsize]
    movu m3, [r2+r3+%4*mmsize]
    movu m4, [r2+r3*2+%3*mmsize]
    movu m5, [r2+r3*2+%4*mmsize]
    movu m6, [r2+%2+%3*mmsize]
    movu m7, [r2+%2+%4*mmsize]
    mova [r0+%3*mmsize], m0
    mova [r0+%4*mmsize], m1
    mova [r0+r1+%3*mmsize], m2
    mova [r0+r1+%4*mmsize], m3
    mova [r0+r1*2+%3*mmsize], m4
    mova [r0+r1*2+%4*mmsize], m5
    mova [r0+%1+%3*mmsize], m6
    mova [r0+%1+%4*mmsize], m7
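; (A reading of the macro above, stated as an assumption rather than a spec:
; each COPY2 moves four rows of two mmsize-wide vectors, with %1/%2 the
; dst/src offsets of the fourth row and %3/%4 selecting the vector pair.)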
;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
;                  uint8_t *src, intptr_t i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal mc_copy_w4_mmx, 4,6
%if HIGH_BIT_DEPTH == 0
%assign %%w %1*SIZEOF_PIXEL/mmsize
cglobal mc_copy_w%1, 5,7,8*(%%w/2)
INIT_XMM aligned, sse
;=============================================================================
; prefetch
;=============================================================================
; assumes 64 byte cachelines
; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
; void prefetch_fenc( pixel *pix_y,  intptr_t stride_y,
;                     pixel *pix_uv, intptr_t stride_uv, int mb_x )
;-----------------------------------------------------------------------------
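; Intent (a hedged reading of the code below, not a contract): advance pix_y
; and pix_uv to the current macroblock's rows using mb_x, then prefetcht0 a
; handful of luma lines plus the corresponding chroma lines ahead of analysis.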
%macro PREFETCH_FENC 1
cglobal prefetch_fenc_%1, 5,5
    lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
    lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
cglobal prefetch_fenc_%1, 0,3
    lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
    lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref, 3,3
    lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
    prefetcht0 [r0+r1*2]
    prefetcht0 [r0+r1*2]
;=============================================================================
; chroma MC
;=============================================================================
DECLARE_REG_TMP 6,7,8
DECLARE_REG_TMP 0,1,2
%macro MC_CHROMA_START 1
    movsxdifnidn t0, t0d
    add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%macro UNPACK_UNALIGNED 4
    punpckhwd %3, %1, %2
    shufps %2, %1, %3, q3131
    shufps %1, %3, q2020
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
%if mmsize == 8 || cpuflag(misalign)
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
;                 uint8_t *src, intptr_t src_stride,
;                 int dx, int dy,
;                 int width, int height )
;-----------------------------------------------------------------------------
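; Bilinear chroma interpolation per H.264, with cx = dx&7 and cy = dy&7
; (hedged C sketch matching the packed-weight comment below):
;   dst[x] = ( src[x]        *(8-cx)*(8-cy) + src[x+1]       *cx*(8-cy)
;            + src[x+stride] *(8-cx)*cy     + src[x+stride+1] *cx*cy
;            + 32 ) >> 6;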
    add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
    jl mc_chroma_mmx2 %+ .skip_prologue
    pshufw m7, m5, q3232
    pshufw m6, m5, q0000
    pshufw m5, m5, q1111
    cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
    pshufd m7, m5, q1111
    pshufd m6, m5, q0000
    pshufd m5, m5, q1111
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m0, m1, [r3+2]
%endif ; HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3+r4
%else ; !HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
%endif ; HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%define multy0 [rsp-8]
    UNPACK_UNALIGNED m0, m2, m4, r3
    UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
    movu m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
%else ; !HIGH_BIT_DEPTH
    movu m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
%endif ; HIGH_BIT_DEPTH
    movh   [r0+mmsize/2], m1
    movh   [r1+mmsize/2], m1
    movhps [r1+mmsize/2], m1
%else ; !HIGH_BIT_DEPTH
    pshufw m1, m0, q0020
    pshufw m0, m0, q0031
    pshufd m0, m0, q3120
%endif ; HIGH_BIT_DEPTH
    lea r3, [t2+8*SIZEOF_PIXEL]
    lea r0, [t0+4*SIZEOF_PIXEL]
    lea r1, [t1+4*SIZEOF_PIXEL]
    add r3, 8*SIZEOF_PIXEL
    add r0, 4*SIZEOF_PIXEL
    add r1, 4*SIZEOF_PIXEL
%if ARCH_X86_64 ; too many regs for x86_32
    RESET_MM_PERMUTATION
%if xmm_regs_used > 6
%assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
%assign xmm_regs_used 6
    mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
    mov r6d, 2*SIZEOF_PIXEL
%if HIGH_BIT_DEPTH && mmsize == 16
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    sub r2, 4*SIZEOF_PIXEL
    sub r4, 8*SIZEOF_PIXEL
    mov r7, 4*SIZEOF_PIXEL
    mov r8, 8*SIZEOF_PIXEL
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
%macro MC_CHROMA_SSSE3 0
    imul t2d, t0d ; (x*255+8)*y
    imul r5d, t0d ; (x*255+8)*(8-y)
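; Note on the constant above: x*255+8 == (x<<8) + (8-x), i.e. it packs the
; byte pair (x, 8-x) into one word, so a single imul by y (or 8-y) yields
; both bilinear byte weights at once, ready for pmaddubsw further down.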
%if cpuflag(cache64)
    lea t1, [ch_shuf_adj]
    movddup xm5, [t1 + t0*4]
    movddup xm5, [ch_shuf_adj + t0*4]
    paddb xm5, [ch_shuf]
    vpbroadcastw m6, xm6
    vpbroadcastw m7, xm7
%define shiftround m8
%define shiftround [pw_512]
    vinserti128 m0, m0, [r3+r4], 1
    vinserti128 m1, m1, [r3+r4*2], 1
    pmulhrsw m0, shiftround
    vextracti128 xm1, m0, 1
    vinserti128 m0, m0, [r3+8], 1
    vinserti128 m3, m3, [r3+r4+8], 1
    pmaddubsw m1, m0, m7
    pmaddubsw m2, m3, m6
    pmaddubsw m3, m3, m7
    vinserti128 m0, m0, [r3+r4*2+8], 1
    pmaddubsw m4, m0, m6
    pmulhrsw m1, shiftround
    pmulhrsw m3, shiftround
    mova m2, [deinterleave_shufd]
    vextracti128 xm2, m1, 1
    pmaddubsw m2, m1, m7
    pmulhrsw m1, shiftround
    pmulhrsw m3, shiftround
    pmulhrsw m0, shiftround ; (x + 32) >> 6
    pmulhrsw m1, shiftround
    pshufd m0, m0, q3120
    movu m3, [r3+r4*2+8]
    pmulhrsw m2, shiftround
    pmulhrsw m3, shiftround
    pshufd m2, m2, q3120
%else ; !HIGH_BIT_DEPTH
INIT_XMM sse2, misalign
INIT_XMM ssse3, cache64
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH