1 ;*****************************************************************************
2 ;* mc-a.asm: x86 motion compensation
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2011 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Dylan Yudaken <dyudaken@gmail.com>
10 ;* Holger Lubitz <holger@lubitz.org>
;* Min Chen <chenm001@163.com>
12 ;* Oskar Arvidsson <oskar@irock.se>
14 ;* This program is free software; you can redistribute it and/or modify
15 ;* it under the terms of the GNU General Public License as published by
16 ;* the Free Software Foundation; either version 2 of the License, or
17 ;* (at your option) any later version.
19 ;* This program is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;* GNU General Public License for more details.
24 ;* You should have received a copy of the GNU General Public License
25 ;* along with this program; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
28 ;* This program is also available under a commercial proprietary license.
29 ;* For more information, contact us at licensing@x264.com.
30 ;*****************************************************************************
33 %include "x86util.asm"
37 ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
38 ch_shuf_adj: times 8 db 0
57 ;=============================================================================
58 ; implicit weighted biprediction
59 ;=============================================================================
60 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
62 DECLARE_REG_TMP 0,1,2,3,4,5,10,11
63 %macro AVG_START 0-1 0
70 DECLARE_REG_TMP 1,2,3,4,5,6,1,2
71 %macro AVG_START 0-1 0
84 lea t4, [t4+t5*2*SIZEOF_PIXEL]
85 lea t2, [t2+t3*2*SIZEOF_PIXEL]
86 lea t0, [t0+t1*2*SIZEOF_PIXEL]
102 %macro BIWEIGHT_START_MMX 0
114 %else ;!HIGH_BIT_DEPTH
115 %macro BIWEIGHT_MMX 2
127 %macro BIWEIGHT_START_MMX 0
129 SPLATW m2, m2 ; weight_dst
131 psubw m3, m2 ; weight_src
132 mova m4, [pw_32] ; rounding
135 %endif ;HIGH_BIT_DEPTH
137 %macro BIWEIGHT_SSSE3 2
146 %macro BIWEIGHT_START_SSSE3 0
147 movzx t6d, byte r6m ; FIXME x86_64
154 SPLATW m3, m3 ; weight_dst,src
157 %ifdef HIGH_BIT_DEPTH
158 %macro BIWEIGHT_ROW 4
166 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
173 %else ;!HIGH_BIT_DEPTH
174 %macro BIWEIGHT_ROW 4
181 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
187 %endif ;HIGH_BIT_DEPTH
189 ;-----------------------------------------------------------------------------
190 ; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
191 ;-----------------------------------------------------------------------------
192 %macro AVG_WEIGHT 1-2 0
193 cglobal pixel_avg_weight_w%1
196 %ifdef HIGH_BIT_DEPTH
197 mova m7, [pw_pixel_max]
200 %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
203 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
204 %ifdef HIGH_BIT_DEPTH
207 %else ;!HIGH_BIT_DEPTH
209 %endif ;HIGH_BIT_DEPTH
211 movhps [t0+SIZEOF_PIXEL*t1], m6
214 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
215 BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
216 BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
223 %define BIWEIGHT BIWEIGHT_MMX
224 %define BIWEIGHT_START BIWEIGHT_START_MMX
229 %ifdef HIGH_BIT_DEPTH
234 %else ;!HIGH_BIT_DEPTH
238 %define BIWEIGHT BIWEIGHT_SSSE3
239 %define BIWEIGHT_START BIWEIGHT_START_SSSE3
245 %endif ;HIGH_BIT_DEPTH
247 ;=============================================================================
248 ; P frame explicit weighted prediction
249 ;=============================================================================
251 %ifdef HIGH_BIT_DEPTH
252 %macro WEIGHT_START 1 ; (width)
253 mova m0, [r4+ 0] ; 1<<denom
255 movd m2, [r4+32] ; denom
256 mova m4, [pw_pixel_max]
257 paddw m2, [sq_1] ; denom+1
260 %macro WEIGHT 2 ; (src1, src2)
272 %macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
274 %rep (%3+mmsize/2-1)/(mmsize/2)
275 %if %3-x/2 <= 4 && mmsize == 16
281 WEIGHT %1+x, %1+x+mmsize/2
283 WEIGHT %1+r3+x, %1+r3+x+mmsize/2
293 %else ; !HIGH_BIT_DEPTH
295 %macro WEIGHT_START 1
300 %if (%1 == 20 || %1 == 12) && mmsize == 16
309 %macro WEIGHT_START_SSSE3 1
313 %if %1 == 20 || %1 == 12
320 ;; macro to weight mmsize bytes taking half from %1 and half from %2
321 %macro WEIGHT 2 ; (src1,src2)
324 punpcklbw m0, m2 ;setup
325 punpcklbw m1, m2 ;setup
328 paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
329 paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
334 %macro WEIGHT_SSSE3 2
347 %macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
353 movd [%2], %1 ; width 2 can write garbage for last 2 bytes
357 %macro WEIGHT_ROW 3 ; (src,dst,width)
359 WEIGHT %1, (%1+(mmsize/2))
360 packuswb m0, m1 ;put bytes into m0
361 WEIGHT_SAVE_ROW m0, %2, %3
364 %macro WEIGHT_SAVE_COL 2 ;(dst,size)
372 movd [%1], m0 ; width 2 can write garbage for last 2 bytes
377 %macro WEIGHT_COL 3 ; (src,dst,width)
378 %if %3 <= 4 && mmsize == 16
382 WEIGHT_SAVE_COL %2, %3
386 WEIGHT_SAVE_COL %2, %3
391 %macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
395 WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
396 WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
399 WEIGHT_COL (%1+x),(%2+x),(%3-x)
408 %endif ; HIGH_BIT_DEPTH
410 ;-----------------------------------------------------------------------------
411 ;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
412 ;-----------------------------------------------------------------------------
417 %define HEIGHT_REG r5d
422 %define LOAD_HEIGHT mov r4d, r5m
423 %define HEIGHT_REG r4d
427 %ifdef HIGH_BIT_DEPTH
428 %assign NUMREGS NUMREGS+1
433 cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
438 WEIGHT_TWO_ROW r2, r0, %1
456 %ifdef HIGH_BIT_DEPTH
464 %define WEIGHT WEIGHT_SSSE3
465 %define WEIGHT_START WEIGHT_START_SSSE3
481 %ifdef HIGH_BIT_DEPTH
496 %macro OFFSET_TWO_ROW 4
499 %if (%3*SIZEOF_PIXEL-x) >= mmsize
500 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
503 %ifdef HIGH_BIT_DEPTH
504 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
506 OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
510 %if x >= %3*SIZEOF_PIXEL
516 ;-----------------------------------------------------------------------------
517 ;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
518 ;-----------------------------------------------------------------------------
520 cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
523 %ifdef HIGH_BIT_DEPTH
525 mova m3, [pw_pixel_max]
530 OFFSET_TWO_ROW r2, r0, %1, %2
556 %ifdef HIGH_BIT_DEPTH
568 ;=============================================================================
570 ;=============================================================================
572 ;-----------------------------------------------------------------------------
573 ; void pixel_avg_4x4( pixel *dst, int dst_stride,
574 ; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
575 ;-----------------------------------------------------------------------------
577 cglobal pixel_avg_%1x%2
580 jne pixel_avg_weight_w%1 %+ SUFFIX
581 %if mmsize == 16 && %1 == 16
583 jz pixel_avg_w%1_sse2
585 jmp pixel_avg_w%1_mmx2
588 ;-----------------------------------------------------------------------------
589 ; void pixel_avg_w4( pixel *dst, int dst_stride,
590 ; pixel *src1, int src1_stride, pixel *src2, int src2_stride,
591 ; int height, int weight );
592 ;-----------------------------------------------------------------------------
595 cglobal pixel_avg_w%1
599 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
601 %2 m1, [t2+x+SIZEOF_PIXEL*t3]
602 %ifdef HIGH_BIT_DEPTH
604 pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
605 %else ;!HIGH_BIT_DEPTH
607 pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
610 %3 [t0+x+SIZEOF_PIXEL*t1], m1
616 %ifdef HIGH_BIT_DEPTH
619 AVG_FUNC 4, movq, movq
624 AVG_FUNC 8, movq, movq
629 AVG_FUNC 16, movq, movq
634 AVG_FUNC 4, movq, movq
639 AVG_FUNC 8, movdqu, movdqa
644 AVG_FUNC 16, movdqu, movdqa
648 %else ;!HIGH_BIT_DEPTH
651 AVG_FUNC 4, movd, movd
656 AVG_FUNC 8, movq, movq
661 AVG_FUNC 16, movq, movq
666 AVG_FUNC 16, movdqu, movdqa
683 %endif ;HIGH_BIT_DEPTH
687 ;=============================================================================
689 ;=============================================================================
691 %ifdef HIGH_BIT_DEPTH
692 ;-----------------------------------------------------------------------------
693 ; void pixel_avg2_wN( uint16_t *dst, int dst_stride,
694 ; uint16_t *src1, int src_stride,
695 ; uint16_t *src2, int height );
696 ;-----------------------------------------------------------------------------
698 cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
723 cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16)
730 %2 m3, [r2+r3*2+mmsize]
733 pavgw m1, [r2+r4+mmsize]
735 pavgw m3, [r2+r6+mmsize]
738 %2 m5, [r2+r4+mmsize]
740 %2 m7, [r2+r6+mmsize]
749 %3 [r0+r1*2+mmsize], m3
759 AVG2_W_TWO 8, movu, mova
762 AVG2_W_TWO 10, movd, movd
763 AVG2_W_TWO 16, movu, mova
766 cglobal pixel_avg2_w10_mmx2, 6,7
773 movu m3, [r2+r3*2+ 0]
774 movu m4, [r2+r3*2+ 8]
775 movh m5, [r2+r3*2+16]
785 mova [r0+r1*2+ 0], m3
786 mova [r0+r1*2+ 8], m4
787 movh [r0+r1*2+16], m5
794 cglobal pixel_avg2_w16_mmx2, 6,7
802 movu m4, [r2+r3*2+ 0]
803 movu m5, [r2+r3*2+ 8]
804 movu m6, [r2+r3*2+16]
805 movu m7, [r2+r3*2+24]
818 mova [r0+r1*2+ 0], m4
819 mova [r0+r1*2+ 8], m5
820 mova [r0+r1*2+16], m6
821 mova [r0+r1*2+24], m7
828 cglobal pixel_avg2_w18_mmx2, 6,7
853 cglobal pixel_avg2_w18_sse2, 6,7,6
873 %endif ; HIGH_BIT_DEPTH
875 %ifndef HIGH_BIT_DEPTH
876 ;-----------------------------------------------------------------------------
877 ; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
878 ; uint8_t *src1, int src_stride,
879 ; uint8_t *src2, int height );
880 ;-----------------------------------------------------------------------------
882 cglobal pixel_avg2_w%1_mmx2, 6,7
904 cglobal pixel_avg2_w%1_mmx2, 6,7
930 cglobal pixel_avg2_w20_mmx2, 6,7
942 pavgb mm2, [r4+r2+16]
945 pavgb mm5, [r4+r6+16]
958 cglobal pixel_avg2_w16_sse2, 6,7
977 cglobal pixel_avg2_w20_%1, 6,7
983 %ifidn %1, sse2_misalign
996 pavgb mm4, [r4+r2+16]
997 pavgb mm5, [r4+r6+16]
1001 movdqa [r0+r1], xmm2
1002 movd [r0+r1+16], mm5
1010 AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check functions call the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
1028 %macro AVG_CACHELINE_START 0
1029 %assign stack_offset 0
1040 %macro AVG_CACHELINE_LOOP 2
1043 movq mm3, [r2+r4+%1]
1044 movq mm2, [r2+r4+8+%1]
1055 %macro AVG_CACHELINE_FUNC 2
1056 pixel_avg2_w%1_cache_mmx2:
1058 AVG_CACHELINE_LOOP 0, movq
1060 AVG_CACHELINE_LOOP 8, movq
1062 AVG_CACHELINE_LOOP 16, movd
1072 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
1074 ;w12 isn't needed because w16 is just as fast if there's no cacheline split
1075 %define cachesplit pixel_avg2_w16_cache_mmx2
1077 %define cachesplit pixel_avg2_w%1_cache_mmx2
1079 cglobal pixel_avg2_w%1_cache%2_%3
1082 cmp eax, (%2-%1-(%1 % 8))
1084 jbe pixel_avg2_w%1_%3
1086 jb pixel_avg2_w%1_%3
1088 %if 0 ; or %1==8 - but the extra branch seems too expensive
1095 jz pixel_avg2_w%1_%3
1099 jz pixel_avg2_w%1_%3
1102 %if mmsize==16 || (%1==8 && %2==64)
1103 AVG_CACHELINE_FUNC %1, %2
1110 AVG_CACHELINE_CHECK 8, 64, mmx2
1111 AVG_CACHELINE_CHECK 12, 64, mmx2
1113 AVG_CACHELINE_CHECK 16, 64, mmx2
1114 AVG_CACHELINE_CHECK 20, 64, mmx2
1115 AVG_CACHELINE_CHECK 8, 32, mmx2
1116 AVG_CACHELINE_CHECK 12, 32, mmx2
1117 AVG_CACHELINE_CHECK 16, 32, mmx2
1118 AVG_CACHELINE_CHECK 20, 32, mmx2
1121 AVG_CACHELINE_CHECK 16, 64, sse2
1122 AVG_CACHELINE_CHECK 20, 64, sse2
1124 ; computed jump assumes this loop is exactly 48 bytes
1125 %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
1127 avg_w16_align%1_%2_ssse3:
1133 movdqa xmm1, [r2+r4+16]
1134 palignr xmm1, [r2+r4], %2
1138 movdqa xmm1, [r2+16]
1139 palignr xmm1, [r2], %1
1143 movdqa xmm1, [r2+16]
1144 movdqa xmm2, [r2+r4+16]
1145 palignr xmm1, [r2], %1
1146 palignr xmm2, [r2+r4], %2&15
1153 jg avg_w16_align%1_%2_ssse3
1156 times 13 db 0x90 ; make sure the first ones don't end up short
1160 cglobal pixel_avg2_w16_cache64_ssse3
1161 %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
1165 jb x264_pixel_avg2_w16_sse2
1168 jz x264_pixel_avg2_w16_sse2
1175 lea r6, [r6*3] ;(offset + align*2)*3
1177 shl r6, 4 ;jump = (offset + align*2)*48
1178 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
1180 lea r11, [avg_w16_addr]
1183 lea r6, [avg_w16_addr + r6]
1195 AVG16_CACHELINE_LOOP_SSSE3 j, j
1196 AVG16_CACHELINE_LOOP_SSSE3 j, k
1200 %endif ; !HIGH_BIT_DEPTH
1202 ;=============================================================================
1204 ;=============================================================================
1226 movu m4, [r2+r3*2+%3]
1227 movu m5, [r2+r3*2+%4]
1234 mova [r0+r1*2+%3], m4
1235 mova [r0+r1*2+%4], m5
1240 ;-----------------------------------------------------------------------------
1241 ; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
1242 ; uint8_t *src, int i_src_stride, int i_height )
1243 ;-----------------------------------------------------------------------------
1245 cglobal mc_copy_w4_mmx, 4,6
1251 %ifndef HIGH_BIT_DEPTH
1262 %ifdef HIGH_BIT_DEPTH
1263 cglobal mc_copy_w16_mmx, 5,7
1268 COPY_TWO r5, r6, mmsize*0, mmsize*1
1269 COPY_TWO r5, r6, mmsize*2, mmsize*3
1277 cglobal mc_copy_w%2, 5,7,%2-8
1282 COPY_%1 r5, r6, 0, mmsize
1295 INIT_XMM aligned, sse2
1297 %endif ; HIGH_BIT_DEPTH
1299 %ifndef HIGH_BIT_DEPTH
1301 cglobal mc_copy_w%2, 5,7
1305 %1 r5, r6, 0, mmsize
1315 MC_COPY COPY_TWO, 16
1318 ; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
1319 ; but with SSE3 the overhead is zero, so there's no reason not to include it.
1322 INIT_XMM aligned, sse2
1324 %endif ; !HIGH_BIT_DEPTH
1328 ;=============================================================================
1330 ;=============================================================================
1331 ; FIXME assumes 64 byte cachelines
1333 ;-----------------------------------------------------------------------------
1334 ; void prefetch_fenc( uint8_t *pix_y, int stride_y,
1335 ; uint8_t *pix_uv, int stride_uv, int mb_x )
1336 ;-----------------------------------------------------------------------------
1339 cglobal prefetch_fenc_mmx2, 5,5
1343 lea r0, [r0+r4*4+64]
1351 lea r2, [r2+rax*2+64]
1357 cglobal prefetch_fenc_mmx2, 0,3
1363 lea r0, [r0+r2*4+64]
1375 lea r0, [r0+r2*2+64]
1379 %endif ; ARCH_X86_64
1381 ;-----------------------------------------------------------------------------
1382 ; void prefetch_ref( uint8_t *pix, int stride, int parity )
1383 ;-----------------------------------------------------------------------------
1384 cglobal prefetch_ref_mmx2, 3,3
1387 lea r0, [r0+r2*8+64]
1391 prefetcht0 [r0+r1*2]
1396 prefetcht0 [r0+r1*2]
1402 ;=============================================================================
1404 ;=============================================================================
1407 DECLARE_REG_TMP 10,11,6
1409 DECLARE_REG_TMP 0,1,2
1412 %macro MC_CHROMA_START 0
1424 movsxdifnidn t0, t0d
1425 add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
1428 %ifdef HIGH_BIT_DEPTH
1429 %macro UNPACK_UNALIGNED 4
1432 punpckhwd %3, %1, %2
1439 shufps %2, %1, %3, 11011101b
1440 shufps %1, %3, 10001000b
1443 %else ; !HIGH_BIT_DEPTH
1444 %macro UNPACK_UNALIGNED 3
1445 %if mmsize == 8 || cpuflag(misalign)
1452 %endif ; HIGH_BIT_DEPTH
1454 ;-----------------------------------------------------------------------------
1455 ; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
1456 ; uint8_t *src, int src_stride,
1458 ; int width, int height )
1459 ;-----------------------------------------------------------------------------
1461 cglobal mc_chroma, 0,6
1477 add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
1482 jl mc_chroma_mmx2 %+ .skip_prologue
1499 cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
1507 %ifdef HIGH_BIT_DEPTH
1509 UNPACK_UNALIGNED m0, m1, m2, r3
1512 UNPACK_UNALIGNED m0, m1, [r3+2]
1516 %endif ; HIGH_BIT_DEPTH
1523 %ifdef HIGH_BIT_DEPTH
1524 UNPACK_UNALIGNED m0, m1, m2, r3+r4
1526 %else ; !HIGH_BIT_DEPTH
1528 UNPACK_UNALIGNED m0, m1, [r3+r4+2]
1533 %endif ; HIGH_BIT_DEPTH
1543 %ifdef HIGH_BIT_DEPTH
1551 %else ; !HIGH_BIT_DEPTH
1560 %endif ; HIGH_BIT_DEPTH
1574 %define multy0 [rsp-8]
1593 %ifdef HIGH_BIT_DEPTH
1594 UNPACK_UNALIGNED m0, m2, m4, r3
1595 UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
1598 movu m1, [r3+mmsize/2]
1599 UNPACK_UNALIGNED m0, m2, [r3+2]
1600 UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
1617 %ifdef HIGH_BIT_DEPTH
1618 UNPACK_UNALIGNED m0, m1, m2, r3
1622 UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
1626 %else ; !HIGH_BIT_DEPTH
1628 movu m1, [r3+mmsize/2]
1629 UNPACK_UNALIGNED m0, m2, [r3+2]
1630 UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
1641 %endif ; HIGH_BIT_DEPTH
1655 %ifdef HIGH_BIT_DEPTH
1657 movh [r0+mmsize/2], m1
1662 movh [r1+mmsize/2], m1
1665 movhps [r1+mmsize/2], m1
1667 %else ; !HIGH_BIT_DEPTH
1679 %endif ; HIGH_BIT_DEPTH
1693 lea r3, [t2+8*SIZEOF_PIXEL]
1694 lea r0, [t0+4*SIZEOF_PIXEL]
1695 lea r1, [t1+4*SIZEOF_PIXEL]
1700 add r3, 8*SIZEOF_PIXEL
1701 add r0, 4*SIZEOF_PIXEL
1702 add r1, 4*SIZEOF_PIXEL
1708 %ifdef ARCH_X86_64 ; too many regs for x86_32
1709 RESET_MM_PERMUTATION
1711 %if xmm_regs_used > 6
1712 %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
1713 %assign xmm_regs_used 6
1719 mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
1723 mov r6d, 2*SIZEOF_PIXEL
1725 %ifdef HIGH_BIT_DEPTH
1746 %ifdef HIGH_BIT_DEPTH
1759 SBUTTERFLY wd, 0, 2, 6
1760 SBUTTERFLY wd, 1, 3, 7
1761 SBUTTERFLY wd, 0, 2, 6
1762 SBUTTERFLY wd, 1, 3, 7
1764 SBUTTERFLY wd, 0, 2, 6
1765 SBUTTERFLY wd, 1, 3, 7
1767 %else ; !HIGH_BIT_DEPTH
1779 %endif ; HIGH_BIT_DEPTH
1790 %ifdef HIGH_BIT_DEPTH
1803 %else ; !HIGH_BIT_DEPTH
1822 %endif ; HIGH_BIT_DEPTH
1830 sub r2, 4*SIZEOF_PIXEL
1831 sub r4, 8*SIZEOF_PIXEL
1832 mov r10, 4*SIZEOF_PIXEL
1833 mov r11, 8*SIZEOF_PIXEL
1838 %endif ; ARCH_X86_64
1839 %endmacro ; MC_CHROMA
1842 %macro MC_CHROMA_SSSE3 0
1843 cglobal mc_chroma, 0,6,9
1853 imul t2d, t0d ; (x*255+8)*y
1854 imul r5d, t0d ; (x*255+8)*(8-y)
1857 %if cpuflag(cache64)
1861 lea t1, [ch_shuf_adj]
1862 movddup m5, [t1 + t0*4]
1864 movddup m5, [ch_shuf_adj + t0*4]
1888 pmaddubsw m2, m1, m7
1949 movu m3, [r3+r4*2+8]
1975 %ifdef HIGH_BIT_DEPTH
1982 %else ; !HIGH_BIT_DEPTH
1985 INIT_XMM sse2, misalign
1991 INIT_XMM ssse3, cache64
1994 MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
1995 %endif ; HIGH_BIT_DEPTH