;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001@163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86util.asm"

ch_shuf:     db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0

;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
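; A rough C reference for the biweight kernel below (illustrative sketch
; only, not part of the build):
;     static void biweight_ref( uint8_t *dst, const uint8_t *src1,
;                               const uint8_t *src2, int i_weight, int n )
;     {   /* weight2 = 64 - weight1, log2_denom = 5 -> round with +32, >>6 */
;         for( int i = 0; i < n; i++ )
;             dst[i] = x264_clip_uint8( (src1[i]*i_weight + src2[i]*(64-i_weight) + 32) >> 6 );
;     }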
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0
%macro BIWEIGHT_START_MMX 0
    SPLATW m2, m2       ; weight_dst
    psubw  m3, m2       ; weight_src
    mova   m4, [pw_32]  ; rounding
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    SPLATW m3, m3       ; weight_dst,src
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
;-----------------------------------------------------------------------------
; void pixel_avg_weight_w16( uint8_t *dst, int i_dst_stride, uint8_t *src1, int i_src1_stride, uint8_t *src2, int i_src2_stride, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
cglobal pixel_avg_weight_w%2_%1
%if %2==8 && mmsize==16
    BIWEIGHT [t2+t3], [t4+t5]
    BIWEIGHT_ROW t0+x,    t2+x,    t4+x,    %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
AVG_WEIGHT mmxext, 16
%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8,  7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
AVG_WEIGHT ssse3, 8,  7
AVG_WEIGHT ssse3, 16, 7

;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
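; Explicit weighting computes, per pixel, roughly the following (illustrative
; C sketch only, not part of the build; the asm folds the offset into the
; rounding constant as the WEIGHT macro comments note):
;     static void weight_ref( uint8_t *dst, const uint8_t *src,
;                             int scale, int denom, int offset, int n )
;     {
;         for( int i = 0; i < n; i++ )
;             dst[i] = x264_clip_uint8( ((src[i]*scale + (1<<(denom-1))) >> denom) + offset );
;     }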
%ifdef HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
    movd m2, [r4+32] ; denom
    movd m3, [r4+36] ; scale
    mov  TMP_REG, [r4+40] ; offset
    shl  TMP_REG, BIT_DEPTH-7
    mova m4, [pw_pixel_max]
    psllw m0, m2       ; 1<<denom
    movd  m1, TMP_REG  ; 1+(offset<<(BIT_DEPTH-8+1))
    psllw m3, 1        ; scale<<1
    paddw m2, [sq_1]   ; denom+1
%macro WEIGHT 2 ; (src1, src2)
%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    WEIGHT %1+x, %1+x+mmsize/2
    WEIGHT %1+r3+x, %1+r3+x+mmsize/2
%else ; !HIGH_BIT_DEPTH
%macro WEIGHT_START 1
%if (%1 == 20 || %1 == 12) && mmsize == 16
%macro WEIGHT_START_SSSE3 1
%if %1 == 20 || %1 == 12
;; macro to weight mmsize bytes taking half from %1 and half from %2
%macro WEIGHT 2 ; (src1,src2)
    punpcklbw m0, m2 ; setup
    punpcklbw m1, m2 ; setup
    paddsw    m0, m6 ; 1<<(denom-1)+(offset<<denom)
    paddsw    m1, m6 ; 1<<(denom-1)+(offset<<denom)
%macro WEIGHT_SSSE3 2
%macro WEIGHT_SAVE_ROW 3 ; (src,dst,width)
    movd [%2], %1 ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_ROW 3 ; (src,dst,width)
    WEIGHT %1, (%1+(mmsize/2))
    packuswb m0, m1 ; put bytes into m0
    WEIGHT_SAVE_ROW m0, %2, %3
%macro WEIGHT_SAVE_COL 2 ; (dst,size)
    movd [%1], m0 ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_COL 3 ; (src,dst,width)
%if %3 <= 4 && mmsize == 16
    WEIGHT_SAVE_COL %2, %3
    WEIGHT_SAVE_COL %2, %3
%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
    WEIGHT_ROW (%1+x),    (%2+x),    mmsize ; weight one mmsize-wide chunk of row 0
    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight one mmsize-wide chunk of row 1
    WEIGHT_COL (%1+x), (%2+x), (%3-x)
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
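; The weight_t argument is dereferenced by fixed byte offsets (the +32/+36/+40
; loads in WEIGHT_START above), so the asm assumes a layout roughly like this
; illustrative sketch (the first 32 bytes are prebroadcast word constants):
;     typedef struct
;     {
;         int16_t cachea[8], cacheb[8]; /* bytes 0..31 */
;         int32_t i_denom;              /* byte offset 32 */
;         int32_t i_scale;              /* byte offset 36 */
;         int32_t i_offset;             /* byte offset 40 */
;     } weight_t;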
%define HEIGHT_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
%ifdef HIGH_BIT_DEPTH
%assign NUMREGS NUMREGS+1
cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
    WEIGHT_TWO_ROW r2, r0, %1
%ifdef HIGH_BIT_DEPTH
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
%macro OFFSET_TWO_ROW 4
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
;-----------------------------------------------------------------------------
; void mc_offset_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
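; Both offset variants reduce to a saturating per-pixel add/subtract of a
; constant (the u/d forms of OFFSET_OP above select paddusb vs. psubusb); a
; rough C model (illustrative only, offset taken as negative for the sub
; variant):
;     for( int i = 0; i < n; i++ )
;         dst[i] = x264_clip_uint8( src[i] + offset );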
cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
    OFFSET_TWO_ROW r2, r0, %1, %3

;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg_4x4( uint8_t *dst,  int dst_stride,
;                     uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
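; When weight == 32 the biprediction is a plain rounded average, one pavgb per
; vector; otherwise it dispatches to the weighted kernels above. A rough C
; model of the dispatch (illustrative only):
;     if( weight != 32 )
;         pixel_avg_weight_wN( dst, ..., weight );
;     else
;         for( int i = 0; i < n; i++ )
;             dst[i] = ( src1[i] + src2[i] + 1 ) >> 1; /* pavgb rounding */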
cglobal pixel_avg_%1x%2_%3
    jne pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    jz  pixel_avg_w%1_sse2
    jmp pixel_avg_w%1_mmxext

;-----------------------------------------------------------------------------
; void pixel_avg_w4( uint8_t *dst,  int dst_stride,
;                    uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
;                    int height, int weight );
;-----------------------------------------------------------------------------
AVG_FUNC pixel_avg_w4_mmxext, movd, movd
AVG_FUNC pixel_avg_w8_mmxext, movq, movq
cglobal pixel_avg_w16_mmxext
AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa

;=============================================================================
; pixel avg2
;=============================================================================
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst,  int dst_stride,
;                     uint16_t *src1, int src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
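; pavgw gives the same rounded halfpel average as the 8-bit pavgb path, just
; on 16-bit pixels; a rough scalar model (illustrative only):
;     for( int i = 0; i < width; i++ )
;         dst[i] = (uint16_t)(( src1[i] + src2[i] + 1 ) >> 1);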
cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16)
cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16)
    %2    m3, [r2+r3*2+mmsize]
    pavgw m1, [r2+r4+mmsize]
    pavgw m3, [r2+r6+mmsize]
    %2    m5, [r2+r4+mmsize]
    %2    m7, [r2+r6+mmsize]
    %3    [r0+r1*2+mmsize], m3
AVG2_W_TWO  8, movu, mova, mmxext
AVG2_W_TWO 10, movd, movd, sse2
AVG2_W_TWO 16, movu, mova, sse2
cglobal pixel_avg2_w10_mmxext, 6,7
    movu m3, [r2+r3*2+ 0]
    movu m4, [r2+r3*2+ 8]
    movh m5, [r2+r3*2+16]
    mova [r0+r1*2+ 0], m3
    mova [r0+r1*2+ 8], m4
    movh [r0+r1*2+16], m5
cglobal pixel_avg2_w16_mmxext, 6,7
    movu m4, [r2+r3*2+ 0]
    movu m5, [r2+r3*2+ 8]
    movu m6, [r2+r3*2+16]
    movu m7, [r2+r3*2+24]
    mova [r0+r1*2+ 0], m4
    mova [r0+r1*2+ 8], m5
    mova [r0+r1*2+16], m6
    mova [r0+r1*2+24], m7
cglobal pixel_avg2_w18_mmxext, 6,7
cglobal pixel_avg2_w18_sse2, 6,7,6
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst,  int dst_stride,
;                     uint8_t *src1, int src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
cglobal pixel_avg2_w%1_mmxext, 6,7
cglobal pixel_avg2_w%1_mmxext, 6,7
cglobal pixel_avg2_w20_mmxext, 6,7
    pavgb mm2, [r4+r2+16]
    pavgb mm5, [r4+r6+16]
cglobal pixel_avg2_w16_sse2, 6,7
cglobal pixel_avg2_w20_%1, 6,7
%ifidn %1, sse2_misalign
    pavgb mm4, [r4+r2+16]
    pavgb mm5, [r4+r6+16]
AVG2_W20 sse2_misalign

; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
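; A rough C model of the check each stub performs (illustrative only; `w` is
; the access width, `line` the assumed cacheline size):
;     int ofs = (uintptr_t)src & (line-1);
;     if( ofs + w <= line )                  /* row loads stay inside one line */
;         pixel_avg2_wN_isa( ... );          /* fast path, plain loads */
;     else
;         pixel_avg2_wN_cache_mmxext( ... ); /* shift-and-merge workaround */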
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq mm2, [r2+r4+8+%1]
%macro AVG_CACHELINE_FUNC 2
pixel_avg2_w%1_cache_mmxext:
    AVG_CACHELINE_LOOP 0,  movq
    AVG_CACHELINE_LOOP 8,  movq
    AVG_CACHELINE_LOOP 16, movd
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
; w12 isn't needed because w16 is just as fast if there's no cacheline split
%define cachesplit pixel_avg2_w16_cache_mmxext
%define cachesplit pixel_avg2_w%1_cache_mmxext
cglobal pixel_avg2_w%1_cache%2_%3
    and eax, 0x1f|(%2>>1)
    cmp eax, (32-%1-(%1 % 8))|(%2>>1)
    jbe pixel_avg2_w%1_%3
%if 0 ; or %1==8 - but the extra branch seems too expensive
    AVG_CACHELINE_FUNC %1, %2
%elif %1==8 && %2==64
    AVG_CACHELINE_FUNC %1, %2
AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; (src1 alignment, src2 alignment)
avg_w16_align%1_%2_ssse3:
    movdqa  xmm1, [r2+r4+16]
    palignr xmm1, [r2+r4], %2
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2&15
    jg avg_w16_align%1_%2_ssse3
    times 13 db 0x90 ; make sure the first ones don't end up short
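; Each avg_w16_alignX_Y variant above is padded to exactly 48 bytes so the
; dispatcher below can reach it by arithmetic on the alignment offsets rather
; than through a jump table; roughly (illustrative only):
;     target = avg_w16_addr + idx*48; /* idx built from the src offsets &15 */
;     goto *target;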
cglobal pixel_avg2_w16_cache64_ssse3
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
    jb pixel_avg2_w16_sse2
    jz pixel_avg2_w16_sse2
    lea r6, [r6*3]     ; (offset + align*2)*3
    shl r6, 4          ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
    lea r11, [avg_w16_addr]
    lea r6, [avg_w16_addr + r6]
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%endif ; !HIGH_BIT_DEPTH

;=============================================================================
; pixel copy
;=============================================================================
%ifdef HIGH_BIT_DEPTH
    COPY4 %1, %2, %3, %4
cglobal mc_copy_w4_mmx, 4,6
    COPY4 mova, mova, r4, r5
    COPY4 movu, mova, r4, r5
cglobal mc_copy_w16_mmx, 5,7
    COPY_TWO mova, movu, r5, r6, mmsize*0, mmsize*1
    COPY_TWO mova, movu, r5, r6, mmsize*2, mmsize*3
cglobal mc_copy_w%2_%4, 5,7,%5
    COPY_%1 mova, %3, r5, r6, 0, mmsize
MC_COPY TWO,  8, movu, mmx, 0
MC_COPY ONE,  8, movu, sse2, 0
MC_COPY TWO, 16, movu, sse2, 8
MC_COPY TWO, 16, mova, aligned_sse2, 8
%endif ; HIGH_BIT_DEPTH

%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
;                  uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
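; Functionally all of these are just strided row copies; a rough C model
; (illustrative only):
;     static void mc_copy_ref( uint8_t *dst, int i_dst_stride,
;                              uint8_t *src, int i_src_stride, int w, int h )
;     {
;         for( int y = 0; y < h; y++ )
;             memcpy( dst + y*i_dst_stride, src + y*i_src_stride, w );
;     }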
cglobal mc_copy_w4_mmx, 4,6
    COPY4 movd, movd, r4, r5
    COPY4 movd, movd, r4, r5
cglobal mc_copy_w8_mmx, 5,7
    COPY4 movq, movq, r5, r6
cglobal mc_copy_w16_mmx, 5,7
    movq mm5, [r2+r3*2+8]
    movq [r0+r1*2+8], mm5
%macro COPY_W16_SSE2 2
    COPY4 movdqa, %2, r5, r6
COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
%endif ; !HIGH_BIT_DEPTH

;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines

;-----------------------------------------------------------------------------
; void prefetch_fenc( uint8_t *pix_y,  int stride_y,
;                     uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
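; A rough C model of what this issues via prefetcht0, reconstructed from the
; visible address math (illustrative only; the &3 masking of mb_x and the row
; counts are assumptions):
;     pix_y  += (mb_x & 3) * 4 * stride_y  + 64;
;     pix_uv += (mb_x & 3) * 2 * stride_uv + 64;
;     for( int i = 0; i < 4; i++ )
;         __builtin_prefetch( &pix_y[i*stride_y] );
;     for( int i = 0; i < 2; i++ )
;         __builtin_prefetch( &pix_uv[i*stride_uv] );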
cglobal prefetch_fenc_mmxext, 5,5
    lea r0, [r0+r4*4+64]
    lea r2, [r2+rax*2+64]
cglobal prefetch_fenc_mmxext, 0,3
    lea r0, [r0+r2*4+64]
    lea r0, [r0+r2*2+64]
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void prefetch_ref( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmxext, 3,3
    lea r0, [r0+r2*8+64]
    prefetcht0 [r0+r1*2]
    prefetcht0 [r0+r1*2]

;=============================================================================
; chroma MC
;=============================================================================
DECLARE_REG_TMP 10,11,6
DECLARE_REG_TMP 0,1,2
%macro MC_CHROMA_START 0
    movsxdifnidn t0, t0d
    add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%ifdef HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 4
    shufps %1, %3, 10001000b
    shufps %2, %3, 11011101b
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED_MEM 3
%macro UNPACK_UNALIGNED_LOAD 3
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
;                 uint8_t *src, int src_stride,
;                 int dx, int dy,
;                 int width, int height )
;-----------------------------------------------------------------------------
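; The kernel is standard 1/8-pel bilinear chroma interpolation with x = dx&7,
; y = dy&7 (the four weights sum to 64, so round with +32 and shift by 6); a
; rough per-plane C reference (illustrative only):
;     d[i] = ( s[i]*(8-x)*(8-y) + s[i+1]*x*(8-y)
;            + s[i+stride]*(8-x)*y + s[i+stride+1]*x*y + 32 ) >> 6;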
cglobal mc_chroma_%1, 0,6
    add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
    jl mc_chroma_mmxext %+ .skip_prologue
    cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m0, m1, [r3+2]
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3+r4
%else ; !HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%define multy0 [rsp-8]
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m2, m4, r3
    UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
    movu m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
%ifdef HIGH_BIT_DEPTH
    UNPACK_UNALIGNED m0, m1, m2, r3
    UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
%else ; !HIGH_BIT_DEPTH
    movu m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
    movh   [r0+mmsize/2], m1
    movh   [r1+mmsize/2], m1
    movhps [r1+mmsize/2], m1
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    lea r3, [t2+8*SIZEOF_PIXEL]
    lea r0, [t0+4*SIZEOF_PIXEL]
    lea r1, [t1+4*SIZEOF_PIXEL]
    add r3, 8*SIZEOF_PIXEL
    add r0, 4*SIZEOF_PIXEL
    add r1, 4*SIZEOF_PIXEL
%ifdef ARCH_X86_64 ; too many regs for x86_32
    RESET_MM_PERMUTATION
%if xmm_regs_used > 6
%assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
%assign xmm_regs_used 6
    mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
    mov r6d, 2*SIZEOF_PIXEL
%ifdef HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
    SBUTTERFLY wd, 0, 2, 6
    SBUTTERFLY wd, 1, 3, 7
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%endif ; HIGH_BIT_DEPTH
    sub r2, 4*SIZEOF_PIXEL
    sub r4, 8*SIZEOF_PIXEL
    mov r10, 4*SIZEOF_PIXEL
    mov r11, 8*SIZEOF_PIXEL
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA

%macro MC_CHROMA_SSSE3 0-1
cglobal mc_chroma_ssse3%1, 0,6,9
    imul t2d, t0d ; (x*255+8)*y
    imul r5d, t0d ; (x*255+8)*(8-y)
    lea t1, [ch_shuf_adj]
    movddup m5, [t1 + t0*4]
    movddup m5, [ch_shuf_adj + t0*4]
    movu m3, [r3+r4*2+8]
%ifdef HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
MC_CHROMA sse2_misalign
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
MC_CHROMA_SSSE3 _cache64
%endif ; HIGH_BIT_DEPTH