1 ;******************************************************************************
2 ;* VP9 inverse transform x86 SIMD optimizations
4 ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
24 %include "vp9itxfm_template.asm"
39 pd_3fff: times 4 dd 0x3fff
41 ; FIXME these should probably be shared between 8bpp and 10/12bpp
42 pw_m11585_11585: times 4 dw -11585, 11585
43 pw_11585_11585: times 8 dw 11585
44 pw_m15137_6270: times 4 dw -15137, 6270
45 pw_6270_15137: times 4 dw 6270, 15137
46 pw_11585x2: times 8 dw 11585*2
48 pw_5283_13377: times 4 dw 5283, 13377
49 pw_9929_13377: times 4 dw 9929, 13377
50 pw_15212_m13377: times 4 dw 15212, -13377
51 pw_15212_9929: times 4 dw 15212, 9929
52 pw_m5283_m15212: times 4 dw -5283, -15212
53 pw_13377x2: times 8 dw 13377*2
54 pw_m13377_13377: times 4 dw -13377, 13377
55 pw_13377_0: times 4 dw 13377, 0
56 pw_9929_m5283: times 4 dw 9929, -5283
58 pw_3196_16069: times 4 dw 3196, 16069
59 pw_m16069_3196: times 4 dw -16069, 3196
60 pw_13623_9102: times 4 dw 13623, 9102
61 pw_m9102_13623: times 4 dw -9102, 13623
63 pw_1606_16305: times 4 dw 1606, 16305
64 pw_m16305_1606: times 4 dw -16305, 1606
65 pw_12665_10394: times 4 dw 12665, 10394
66 pw_m10394_12665: times 4 dw -10394, 12665
67 pw_7723_14449: times 4 dw 7723, 14449
68 pw_m14449_7723: times 4 dw -14449, 7723
69 pw_15679_4756: times 4 dw 15679, 4756
70 pw_m4756_15679: times 4 dw -4756, 15679
71 pw_15137_6270: times 4 dw 15137, 6270
72 pw_m6270_15137: times 4 dw -6270, 15137
74 pw_804_16364: times 4 dw 804, 16364
75 pw_m16364_804: times 4 dw -16364, 804
76 pw_12140_11003: times 4 dw 12140, 11003
77 pw_m11003_12140: times 4 dw -11003, 12140
78 pw_7005_14811: times 4 dw 7005, 14811
79 pw_m14811_7005: times 4 dw -14811, 7005
80 pw_15426_5520: times 4 dw 15426, 5520
81 pw_m5520_15426: times 4 dw -5520, 15426
82 pw_16069_3196: times 4 dw 16069, 3196
83 pw_m3196_16069: times 4 dw -3196, 16069
84 pw_3981_15893: times 4 dw 3981, 15893
85 pw_m15893_3981: times 4 dw -15893, 3981
86 pw_14053_8423: times 4 dw 14053, 8423
87 pw_m8423_14053: times 4 dw -8423, 14053
88 pw_9760_13160: times 4 dw 9760, 13160
89 pw_m13160_9760: times 4 dw -13160, 9760
90 pw_16207_2404: times 4 dw 16207, 2404
91 pw_m2404_16207: times 4 dw -2404, 16207
92 pw_9102_13623: times 4 dw 9102, 13623
93 pw_m13623_9102: times 4 dw -13623, 9102
94 pw_m11585_m11585: times 8 dw -11585
96 pw_m3196_m16069: times 4 dw -3196, -16069
97 pw_m13623_m9102: times 4 dw -13623, -9102
98 pw_m6270_m15137: times 4 dw -6270, -15137
136 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
138 mova m%4, [%7+strideq]
146 mova [%7+strideq], m%4
149 %macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
154 mova [%1+%%y+%%x], %4
155 %assign %%x (%%x+mmsize)
161 ; the input coefficients are scaled up by 2 bit (which we downscale immediately
162 ; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
163 ; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
164 ; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
165 ; add 2 bits, we need to scale before converting to word in 12bpp, since the
166 ; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
167 ; we can scale after converting to words (which is half the instructions),
168 ; since the input is only 14+sign bit, which fits in 15+sign words directly.
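; for reference, a rough scalar sketch of one iwht_1d pass (hedged: this
; follows the common C reference for the VP9 lossless WHT; variable names are
; illustrative, and the >> 2 downscale only happens in the first pass):
;   a = in0 >> 2;  c = in1 >> 2;  d = in2 >> 2;  b = in3 >> 2;
;   a += c;
;   d -= b;
;   e  = (a - d) >> 1;
;   b  = e - b;
;   c  = e - c;
;   a -= b;
;   d += c;
;   out0 = a;  out1 = b;  out2 = c;  out3 = d;
; the transform is orthonormal up to the scaling, which is what gives the
; "1 bit per iwht_1d" growth bound mentioned above.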
170 %macro IWHT4_FN 2 ; bpp, max
171 cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
173 mova m0, [blockq+0*16+0]
174 mova m1, [blockq+1*16+0]
176 mova m4, [blockq+0*16+8]
177 mova m5, [blockq+1*16+8]
185 packssdw m0, [blockq+0*16+8]
186 packssdw m1, [blockq+1*16+8]
190 mova m2, [blockq+2*16+0]
191 mova m3, [blockq+3*16+0]
193 mova m4, [blockq+2*16+8]
194 mova m5, [blockq+3*16+8]
202 packssdw m2, [blockq+2*16+8]
203 packssdw m3, [blockq+3*16+8]
209 TRANSPOSE4x4W 0, 1, 2, 3, 4
213 VP9_STORE_2X 0, 1, 4, 5, 6, 7
214 lea dstq, [dstq+strideq*2]
215 VP9_STORE_2X 2, 3, 4, 5, 6, 7
216 ZERO_BLOCK blockq, 16, 4, m6
225 %macro VP9_IDCT4_WRITEOUT 0
244 VP9_STORE_2X 0, 1, 6, 7, 4, 5
245 lea dstq, [dstq+2*strideq]
246 VP9_STORE_2X 2, 3, 6, 7, 4, 5
249 %macro DC_ONLY 2 ; shift, zero
250 mov coefd, dword [blockq]
256 add coefd, ((1 << (%1 - 1)) << 14) + 8192
260 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
261 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
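; for reference, a scalar sketch of the 1-D idct4 that the 4x4 idct functions
; below implement (hedged: this mirrors the usual C reference; the SIMD code
; reaches the same result with packed word math and different rounding
; placement):
;   t0 = ((in0 + in2) * 11585 + 8192) >> 14
;   t1 = ((in0 - in2) * 11585 + 8192) >> 14
;   t2 = (in1 *  6270 - in3 * 15137 + 8192) >> 14
;   t3 = (in1 * 15137 + in3 *  6270 + 8192) >> 14
;   out0 = t0 + t3;  out1 = t1 + t2;  out2 = t1 - t2;  out3 = t0 - t3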
264 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
273 mova m5, [pw_11585x2]
277 DEFINE_ARGS dst, stride, block, coef
284 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
286 VP9_STORE_2X 0, 0, 6, 7, 4, 5
287 lea dstq, [dstq+2*strideq]
288 VP9_STORE_2X 0, 0, 6, 7, 4, 5
292 mova m0, [blockq+0*16+0]
293 mova m1, [blockq+1*16+0]
294 packssdw m0, [blockq+0*16+8]
295 packssdw m1, [blockq+1*16+8]
296 mova m2, [blockq+2*16+0]
297 mova m3, [blockq+3*16+0]
298 packssdw m2, [blockq+2*16+8]
299 packssdw m3, [blockq+3*16+8]
302 mova m6, [pw_11585x2]
304 mova m7, [pd_8192] ; rounding
306 TRANSPOSE4x4W 0, 1, 2, 3, 4
310 ZERO_BLOCK blockq, 16, 4, m4
321 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
322 %if WIN64 && notcpuflag(ssse3)
325 movdqa xmm5, [pd_8192]
326 mova m0, [blockq+0*16+0]
327 mova m1, [blockq+1*16+0]
328 packssdw m0, [blockq+0*16+8]
329 packssdw m1, [blockq+1*16+8]
330 mova m2, [blockq+2*16+0]
331 mova m3, [blockq+3*16+0]
332 packssdw m2, [blockq+2*16+8]
333 packssdw m3, [blockq+3*16+8]
336 mova m6, [pw_11585x2]
338 %ifnidn %1%3, iadstiadst
342 TRANSPOSE4x4W 0, 1, 2, 3, 4
346 ZERO_BLOCK blockq, 16, 4, m4
352 IADST4_FN idct, IDCT4, iadst, IADST4
353 IADST4_FN iadst, IADST4, idct, IDCT4
354 IADST4_FN iadst, IADST4, iadst, IADST4
357 IADST4_FN idct, IDCT4, iadst, IADST4
358 IADST4_FN iadst, IADST4, idct, IDCT4
359 IADST4_FN iadst, IADST4, iadst, IADST4
361 ; inputs and outputs are dwords, coefficients are words
363 ; dst1 = (src1 * coef1 + src2 * coef2 + rnd) >> 14
364 ; dst2 = (src1 * coef2 - src2 * coef1 + rnd) >> 14
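; this is evaluated by splitting each dword source into its low 14 bits and
; its (arithmetic) top bits, so that every pmaddwd product fits in a dword.
; A scalar sketch (illustrative names; >> means an arithmetic shift):
;   lo1 = src1 & 0x3fff;  hi1 = src1 >> 14;   (src1 == (hi1 << 14) + lo1)
;   lo2 = src2 & 0x3fff;  hi2 = src2 >> 14;
;   dst1 = hi1 * coef1 + hi2 * coef2 + ((lo1 * coef1 + lo2 * coef2 + rnd) >> 14)
;   dst2 = hi1 * coef2 - hi2 * coef1 + ((lo1 * coef2 - lo2 * coef1 + rnd) >> 14)
; which is exact, since the hi1/hi2 contributions are multiples of 1 << 14.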
365 %macro SUMSUB_MUL 6 ; src/dst 1-2, tmp1-2, coef1-2
366 pand m%3, m%1, [pd_3fff]
367 pand m%4, m%2, [pd_3fff]
372 punpckhwd m%2, m%4, m%3
374 pmaddwd m%3, m%4, [pw_%6_%5]
375 pmaddwd m%1, m%2, [pw_%6_%5]
376 pmaddwd m%4, [pw_m%5_%6]
377 pmaddwd m%2, [pw_m%5_%6]
386 %macro IDCT4_12BPP_1D 0-6 0, 1, 2, 3, 4, 5
387 SUMSUB_MUL %1, %3, %5, %6, 11585, 11585
388 SUMSUB_MUL %2, %4, %5, %6, 15137, 6270
389 SUMSUB_BA d, %2, %1, %5
390 SUMSUB_BA d, %4, %3, %5
394 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
395 movh m%1, [dstq+strideq*0]
396 movh m%2, [dstq+strideq*2]
397 movhps m%1, [dstq+strideq*1]
398 movhps m%2, [dstq+stride3q ]
405 movh [dstq+strideq*0], m%1
406 movhps [dstq+strideq*1], m%1
407 movh [dstq+strideq*2], m%2
408 movhps [dstq+stride3q ], m%2
411 %macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
422 STORE_4x4 %2, %4, %1, %3, %5, %6
426 cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
430 ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
431 ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
432 ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
433 ; dword. After the final shift (4), the result is 13+sign bits, so we
434 ; don't need any additional processing to fit it in a word
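; in scalar terms, the dc-only path below computes roughly (a sketch; the
; (8 << 14) term folds the final (x + 8) >> 4 rounding into the second shift):
;   dc = block[0]
;   dc = (dc * 11585 + 8192) >> 14
;   dc = (dc * 11585 + 8192 + (8 << 14)) >> (14 + 4)
;   dst[x] = clip(dst[x] + dc, 0, 4095) for all 16 pixels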
435 DEFINE_ARGS dst, stride, block, coef
439 pshuflw m0, m0, q0000
442 DEFINE_ARGS dst, stride, stride3
443 lea stride3q, [strideq*3]
444 STORE_4x4 1, 3, 0, 0, m4, m5
448 DEFINE_ARGS dst, stride, block, eob
449 mova m0, [blockq+0*16]
450 mova m1, [blockq+1*16]
451 mova m2, [blockq+2*16]
452 mova m3, [blockq+3*16]
455 TRANSPOSE4x4D 0, 1, 2, 3, 4
459 ZERO_BLOCK blockq, 16, 4, m4
462 DEFINE_ARGS dst, stride, stride3
463 lea stride3q, [strideq*3]
465 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
493 ; out0 = ( 5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd) >> 14
494 ; out1 = ( 9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd) >> 14
495 ; out2 = (13377 * in0               - 13377 * in2 + 13377 * in3 + rnd) >> 14
496 ; out3 = (15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd) >> 14
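; (all four multipliers are round(2 * sqrt(2) / 3 * sin(k * pi / 9) * 16384)
; for k = 1..4, i.e. 5283, 9929, 13377 and 15212, and rnd is 8192, the same
; 14-bit fixed-point rounding used throughout this file)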
497 %macro IADST4_12BPP_1D 0
498 pand m4, m0, [pd_3fff]
499 pand m5, m1, [pd_3fff]
506 pand m5, m2, [pd_3fff]
507 pand m6, m3, [pd_3fff]
514 SCRATCH 1, 8, rsp+0*mmsize, a
515 SCRATCH 5, 9, rsp+1*mmsize, b
517 ; m1/3 have the high bits of 0,1,2,3
518 ; m4/5 have the low bits of 0,1,2,3
521 pmaddwd m7, reg_b, [pw_15212_9929]
522 pmaddwd m6, m4, [pw_5283_13377]
523 pmaddwd m2, m3, [pw_15212_9929]
524 pmaddwd m0, reg_a, [pw_5283_13377]
527 pmaddwd m7, reg_b, [pw_m13377_13377]
528 pmaddwd m2, m4, [pw_13377_0]
529 pmaddwd m1, m3, [pw_m13377_13377]
530 pmaddwd m5, reg_a, [pw_13377_0]
540 pmaddwd m1, reg_b, [pw_m5283_m15212]
541 pmaddwd m6, m4, [pw_9929_13377]
542 pmaddwd m7, m3, [pw_m5283_m15212]
543 pmaddwd m5, reg_a, [pw_9929_13377]
546 UNSCRATCH 5, 9, rsp+1*mmsize, b
547 pmaddwd m5, [pw_9929_m5283]
548 pmaddwd m4, [pw_15212_m13377]
549 pmaddwd m3, [pw_9929_m5283]
550 UNSCRATCH 1, 8, rsp+0*mmsize, a
551 pmaddwd m1, [pw_15212_m13377]
564 %macro IADST4_12BPP_FN 4
565 cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
566 mova m0, [blockq+0*16]
567 mova m1, [blockq+1*16]
568 mova m2, [blockq+2*16]
569 mova m3, [blockq+3*16]
572 TRANSPOSE4x4D 0, 1, 2, 3, 4
576 ZERO_BLOCK blockq, 16, 4, m4
579 DEFINE_ARGS dst, stride, stride3
580 lea stride3q, [strideq*3]
582 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
587 IADST4_12BPP_FN idct, IDCT4, iadst, IADST4
588 IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
589 IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
591 ; the following line has not been executed at the end of this macro:
592 ; UNSCRATCH 6, 8, rsp+%3*mmsize
593 %macro IDCT8_1D 1-3 2 * mmsize, 17 ; src, src_stride, stack_offset
598 IDCT4_12BPP_1D 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
599 SCRATCH 4, 8, rsp+(%3+0)*mmsize
600 SCRATCH 6, 9, rsp+(%3+1)*mmsize
605 SUMSUB_MUL 1, 7, 4, 6, 16069, 3196 ; m1=t7a, m7=t4a
606 SUMSUB_MUL 5, 3, 4, 6, 9102, 13623 ; m5=t6a, m3=t5a
607 SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
608 SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
609 SUMSUB_MUL 1, 7, 4, 6, 11585, 11585 ; m1=t6, m7=t5
610 SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
611 SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
612 UNSCRATCH 4, 8, rsp+(%3+0)*mmsize
613 UNSCRATCH 6, 9, rsp+(%3+1)*mmsize
614 SCRATCH 2, 8, rsp+(%3+0)*mmsize
615 SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
616 SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
617 SWAP 0, 5, 4, 6, 2, 7
620 %macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
644 ; FIXME on x86-32 we could also use the intermediate storage (rsp[0-15]) as
645 ; temp storage, instead of allocating two more stack slots. This doesn't
646 ; matter much, but it's something...
648 cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
649 17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
650 dst, stride, block, eob
655 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
656 ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
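; (e.g. |dc| stays below 1 << 16 and 11585 < 1 << 14, so dc * 11585 + 8192
; stays well below 1 << 31 and the plain 32-bit DC_ONLY sequence suffices)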
658 DEFINE_ARGS dst, stride, block, coef
662 pshuflw m1, m1, q0000
664 DEFINE_ARGS dst, stride, cnt
667 STORE_2x8 3, 4, 1, m2, m0
668 lea dstq, [dstq+strideq*2]
674 mova [rsp+16*mmsize], m0
675 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
681 lea ptrq, [default_8x8]
682 movzx cntd, byte [ptrq+cntq-1]
684 movzx cntd, byte [default_8x8+cntq-1]
692 TRANSPOSE4x4D 0, 1, 2, 3, 6
693 mova [ptrq+ 0*mmsize], m0
694 mova [ptrq+ 2*mmsize], m1
695 mova [ptrq+ 4*mmsize], m2
696 mova [ptrq+ 6*mmsize], m3
697 UNSCRATCH 6, 8, rsp+17*mmsize
698 TRANSPOSE4x4D 4, 5, 6, 7, 0
699 mova [ptrq+ 1*mmsize], m4
700 mova [ptrq+ 3*mmsize], m5
701 mova [ptrq+ 5*mmsize], m6
702 mova [ptrq+ 7*mmsize], m7
708 ; zero-pad the remainder (skipped cols)
712 lea blockq, [blockq+skipq*(mmsize/2)]
715 mova [ptrq+mmsize*0], m0
716 mova [ptrq+mmsize*1], m0
717 mova [ptrq+mmsize*2], m0
718 mova [ptrq+mmsize*3], m0
724 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
725 lea stride3q, [strideq*3]
732 PRELOAD 9, rsp+16*mmsize, max
733 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
734 lea dstq, [dstq+strideq*4]
735 UNSCRATCH 0, 8, rsp+17*mmsize
736 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
739 lea dstq, [dstbakq+8]
748 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
751 %macro DC_ONLY_64BIT 2 ; shift, zero
753 movsxd coefq, dword [blockq]
759 add coefq, ((1 << (%1 - 1)) << 14) + 8192
762 mov coefd, dword [blockq]
764 DEFINE_ARGS dst, stride, cnt, coef, coefl
777 add coefd, 1 << (%1 - 1)
783 cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
784 17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
785 dst, stride, block, eob
788 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
790 ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
791 ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
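; (e.g. |dc| can reach about 1 << 18, and (1 << 18) * 11585 is roughly 3.0e9,
; which no longer fits in a signed 32-bit register; DC_ONLY_64BIT above does
; a single 64-bit imul on x86-64 and a split two-stage multiply on x86-32)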
792 DEFINE_ARGS dst, stride, block, coef, coefl
796 pshuflw m1, m1, q0000
798 DEFINE_ARGS dst, stride, cnt
801 STORE_2x8 3, 4, 1, m2, m0
802 lea dstq, [dstq+strideq*2]
807 ; inputs and outputs are dwords, coefficients are words
809 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
810 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
811 %macro SUMSUB_MUL_D 6 ; src/dst 1-2, dst3-4, coef1-2
812 pand m%3, m%1, [pd_3fff]
813 pand m%4, m%2, [pd_3fff]
818 punpckhwd m%2, m%4, m%3
820 pmaddwd m%3, m%4, [pw_%6_%5]
821 pmaddwd m%1, m%2, [pw_%6_%5]
822 pmaddwd m%4, [pw_m%5_%6]
823 pmaddwd m%2, [pw_m%5_%6]
826 ; dst1 = (src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd) >> 14
827 ; dst2 = (src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd) >> 14
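; i.e. SUMSUB_MUL_D above leaves each product split into a high part
; ((src >> 14) * coef) and a low part ((src & 0x3fff) * coef), and
; SUMSUB_PACK_D combines two such results with a single rounding at the end.
; A scalar sketch for dst1 (illustrative names; dst2 uses a subtraction):
;   sum_hi = hi_b + hi_a
;   sum_lo = lo_b + lo_a
;   dst1   = sum_hi + ((sum_lo + rnd) >> 14)
; which matches (full_b + full_a + rnd) >> 14 exactly, while keeping every
; intermediate within dword range.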
828 %macro SUMSUB_PACK_D 5 ; src/dst 1-2, src3-4, tmp
829 SUMSUB_BA d, %1, %2, %5
830 SUMSUB_BA d, %3, %4, %5
848 ; the following line has not been executed at the end of this macro:
849 ; UNSCRATCH 6, 8, rsp+17*mmsize
850 %macro IADST8_1D 1 ; src
851 mova m0, [%1+ 0*mmsize]
852 mova m3, [%1+ 6*mmsize]
853 mova m4, [%1+ 8*mmsize]
854 mova m7, [%1+14*mmsize]
855 SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606 ; m7/1=t0a, m0/2=t1a
856 SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665 ; m3/5=t4a, m4/6=t5a
857 SCRATCH 0, 8, rsp+17*mmsize
858 SUMSUB_PACK_D 3, 7, 5, 1, 0 ; m3=t0, m7=t4
859 UNSCRATCH 0, 8, rsp+17*mmsize
860 SUMSUB_PACK_D 4, 0, 6, 2, 1 ; m4=t1, m0=t5
862 SCRATCH 3, 8, rsp+17*mmsize
863 SCRATCH 4, 9, rsp+18*mmsize
864 SCRATCH 7, 10, rsp+19*mmsize
865 SCRATCH 0, 11, rsp+20*mmsize
867 mova m1, [%1+ 2*mmsize]
868 mova m2, [%1+ 4*mmsize]
869 mova m5, [%1+10*mmsize]
870 mova m6, [%1+12*mmsize]
871 SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723 ; m5/8=t2a, m2/9=t3a
872 SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679 ; m1/10=t6a, m6/11=t7a
873 SCRATCH 2, 12, rsp+21*mmsize
874 SUMSUB_PACK_D 1, 5, 7, 3, 2 ; m1=t2, m5=t6
875 UNSCRATCH 2, 12, rsp+21*mmsize
876 SUMSUB_PACK_D 6, 2, 0, 4, 3 ; m6=t3, m2=t7
878 UNSCRATCH 7, 10, rsp+19*mmsize
879 UNSCRATCH 0, 11, rsp+20*mmsize
880 SCRATCH 1, 10, rsp+19*mmsize
881 SCRATCH 6, 11, rsp+20*mmsize
883 SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270 ; m7/8=t4a, m0/9=t5a
884 SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137 ; m2/10=t7a, m5/11=t6a
885 SCRATCH 2, 12, rsp+21*mmsize
886 SUMSUB_PACK_D 5, 7, 6, 3, 2 ; m5=-out1, m7=t6
887 UNSCRATCH 2, 12, rsp+21*mmsize
889 SUMSUB_PACK_D 2, 0, 1, 4, 3 ; m2=out6, m0=t7
890 SUMSUB_MUL 7, 0, 3, 4, 11585, 11585 ; m7=out2, m0=-out5
893 UNSCRATCH 3, 8, rsp+17*mmsize
894 UNSCRATCH 4, 9, rsp+18*mmsize
895 UNSCRATCH 1, 10, rsp+19*mmsize
896 UNSCRATCH 6, 11, rsp+20*mmsize
897 SCRATCH 2, 8, rsp+17*mmsize
898 SCRATCH 0, 9, rsp+18*mmsize
900 SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
901 SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
903 SUMSUB_MUL 3, 4, 2, 0, 11585, 11585 ; m3=-out3, m4=out4
906 UNSCRATCH 0, 9, rsp+18*mmsize
913 cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
914 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
915 dst, stride, block, eob
919 mova [rsp+16*mmsize], m0
920 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
927 movzx cntd, byte [ptrq+cntq-1]
929 movzx cntd, byte [%5_8x8+cntq-1]
937 TRANSPOSE4x4D 0, 1, 2, 3, 6
938 mova [ptrq+ 0*mmsize], m0
939 mova [ptrq+ 2*mmsize], m1
940 mova [ptrq+ 4*mmsize], m2
941 mova [ptrq+ 6*mmsize], m3
942 UNSCRATCH 6, 8, rsp+17*mmsize
943 TRANSPOSE4x4D 4, 5, 6, 7, 0
944 mova [ptrq+ 1*mmsize], m4
945 mova [ptrq+ 3*mmsize], m5
946 mova [ptrq+ 5*mmsize], m6
947 mova [ptrq+ 7*mmsize], m7
953 ; zero-pad the remainder (skipped cols)
957 lea blockq, [blockq+skipq*(mmsize/2)]
960 mova [ptrq+mmsize*0], m0
961 mova [ptrq+mmsize*1], m0
962 mova [ptrq+mmsize*2], m0
963 mova [ptrq+mmsize*3], m0
969 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
970 lea stride3q, [strideq*3]
977 PRELOAD 9, rsp+16*mmsize, max
978 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
979 lea dstq, [dstq+strideq*4]
980 UNSCRATCH 0, 8, rsp+17*mmsize
981 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
984 lea dstq, [dstbakq+8]
993 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
996 cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
997 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
998 dst, stride, block, eob
1000 jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
1004 IADST8_FN idct, IDCT8, iadst, IADST8, row
1005 IADST8_FN iadst, IADST8, idct, IDCT8, col
1006 IADST8_FN iadst, IADST8, iadst, IADST8, default
1008 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
1009 IDCT8_1D %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
1010 ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
1011 SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
1012 SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
1013 SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
1014 SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
1015 SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
1016 mova [rsp+(%3+0)*mmsize], m5 ; t5
1017 mova [rsp+(%3+1)*mmsize], m7 ; t7
1019 mova m0, [%1+ 1*%2] ; in1
1020 mova m3, [%1+ 7*%2] ; in7
1021 mova m4, [%1+ 9*%2] ; in9
1022 mova m7, [%1+15*%2] ; in15
1024 SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
1025 SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
1026 SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
1027 SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
1028 SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a
1030 mova m1, [%1+ 3*%2] ; in3
1031 mova m2, [%1+ 5*%2] ; in5
1032 mova m5, [%1+11*%2] ; in11
1033 mova m6, [%1+13*%2] ; in13
1035 SCRATCH 0, 9, rsp+(%4+1)*mmsize
1036 SCRATCH 7, 10, rsp+(%4+2)*mmsize
1038 SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
1039 SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
1040 SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
1041 SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
1043 SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a
1045 UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
1046 SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
1047 SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
1048 SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
1049 SCRATCH 5, 10, rsp+(%4+2)*mmsize
1050 SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
1051 UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
1052 SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
1053 SCRATCH 6, 9, rsp+(%4+1)*mmsize
1054 SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a
1056 ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
1059 UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
1060 SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
1061 SCRATCH 5, 15, rsp+(%4+7)*mmsize
1062 UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
1063 SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
1064 SCRATCH 5, 14, rsp+(%4+6)*mmsize
1065 UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
1066 SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
1067 SCRATCH 5, 13, rsp+(%4+5)*mmsize
1068 UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
1069 SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
1070 SCRATCH 5, 12, rsp+(%4+4)*mmsize
1071 UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
1072 SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
1073 SCRATCH 4, 11, rsp+(%4+3)*mmsize
1074 mova m4, [rsp+(%3+0)*mmsize]
1075 SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
1076 mova [rsp+(%3+0)*mmsize], m5
1077 UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
1078 UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
1079 SCRATCH 2, 8, rsp+(%4+0)*mmsize
1080 SCRATCH 1, 9, rsp+(%4+1)*mmsize
1081 UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
1082 SCRATCH 0, 10, rsp+(%4+2)*mmsize
1083 mova m0, [rsp+(%3+1)*mmsize]
1084 SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
1085 SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
1087 SWAP 0, 3, 1, 7, 2, 6, 4
1089 ; output order: 8-11|r67-70=out0-3
1091 ; 12-15|r71-74=out12-15
1095 cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
1096 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1097 dst, stride, block, eob
1102 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
1103 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
1105 DEFINE_ARGS dst, stride, block, coef
1109 pshuflw m1, m1, q0000
1111 DEFINE_ARGS dst, stride, cnt
1114 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1115 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
1116 lea dstq, [dstq+strideq*2]
1122 mova [rsp+64*mmsize], m0
1123 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1129 lea ptrq, [default_16x16]
1130 movzx cntd, byte [ptrq+cntq-1]
1132 movzx cntd, byte [default_16x16+cntq-1]
1140 TRANSPOSE4x4D 0, 1, 2, 3, 7
1141 mova [ptrq+ 1*mmsize], m0
1142 mova [ptrq+ 5*mmsize], m1
1143 mova [ptrq+ 9*mmsize], m2
1144 mova [ptrq+13*mmsize], m3
1145 mova m7, [rsp+65*mmsize]
1146 TRANSPOSE4x4D 4, 5, 6, 7, 0
1147 mova [ptrq+ 2*mmsize], m4
1148 mova [ptrq+ 6*mmsize], m5
1149 mova [ptrq+10*mmsize], m6
1150 mova [ptrq+14*mmsize], m7
1151 UNSCRATCH 0, 8, rsp+67*mmsize
1152 UNSCRATCH 1, 9, rsp+68*mmsize
1153 UNSCRATCH 2, 10, rsp+69*mmsize
1154 UNSCRATCH 3, 11, rsp+70*mmsize
1155 TRANSPOSE4x4D 0, 1, 2, 3, 7
1156 mova [ptrq+ 0*mmsize], m0
1157 mova [ptrq+ 4*mmsize], m1
1158 mova [ptrq+ 8*mmsize], m2
1159 mova [ptrq+12*mmsize], m3
1160 UNSCRATCH 4, 12, rsp+71*mmsize
1161 UNSCRATCH 5, 13, rsp+72*mmsize
1162 UNSCRATCH 6, 14, rsp+73*mmsize
1163 UNSCRATCH 7, 15, rsp+74*mmsize
1164 TRANSPOSE4x4D 4, 5, 6, 7, 0
1165 mova [ptrq+ 3*mmsize], m4
1166 mova [ptrq+ 7*mmsize], m5
1167 mova [ptrq+11*mmsize], m6
1168 mova [ptrq+15*mmsize], m7
1169 add ptrq, 16 * mmsize
1174 ; zero-pad the remainder (skipped cols)
1178 lea blockq, [blockq+skipq*(mmsize/2)]
1181 mova [ptrq+mmsize*0], m0
1182 mova [ptrq+mmsize*1], m0
1183 mova [ptrq+mmsize*2], m0
1184 mova [ptrq+mmsize*3], m0
1185 mova [ptrq+mmsize*4], m0
1186 mova [ptrq+mmsize*5], m0
1187 mova [ptrq+mmsize*6], m0
1188 mova [ptrq+mmsize*7], m0
1189 add ptrq, 8 * mmsize
1194 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1195 lea stride3q, [strideq*3]
1202 lea dstq, [dstq+strideq*4]
1203 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1204 lea dstq, [dstq+strideq*4]
1205 mova m0, [rsp+65*mmsize]
1206 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
1209 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1213 UNSCRATCH 0, 8, rsp+67*mmsize
1214 UNSCRATCH 1, 9, rsp+68*mmsize
1215 UNSCRATCH 2, 10, rsp+69*mmsize
1216 UNSCRATCH 3, 11, rsp+70*mmsize
1217 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1219 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1220 lea dstq, [dstbakq+stride3q*4]
1222 lea dstq, [dstq+stride3q*4]
1224 UNSCRATCH 4, 12, rsp+71*mmsize
1225 UNSCRATCH 5, 13, rsp+72*mmsize
1226 UNSCRATCH 6, 14, rsp+73*mmsize
1227 UNSCRATCH 0, 15, rsp+74*mmsize
1228 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
1242 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
1246 cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
1247 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1248 dst, stride, block, eob
1251 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
1253 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
1254 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
1255 DEFINE_ARGS dst, stride, block, coef, coefl
1259 pshuflw m1, m1, q0000
1261 DEFINE_ARGS dst, stride, cnt
1264 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1265 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
1266 lea dstq, [dstq+strideq*2]
1271 ; r65-69 are available for spills
1272 ; r70-77 are available on x86-32 only (x86-64 should use m8-15)
1273 ; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
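; notation reminder (a rough summary of the SCRATCH/UNSCRATCH helpers used
; below): "rNN" in these comments means the stack slot rsp+NN*mmsize. On
; x86-64, SCRATCH x, y, rsp+NN*mmsize simply renames mx to my (m8-m15), while
; on x86-32 it spills mx to that stack slot; UNSCRATCH is the reverse, and the
; optional name argument defines reg_<name> as whichever of the two is in use.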
1274 %macro IADST16_1D 1 ; src
1275 mova m0, [%1+ 0*4*mmsize] ; in0
1276 mova m1, [%1+ 7*4*mmsize] ; in7
1277 mova m2, [%1+ 8*4*mmsize] ; in8
1278 mova m3, [%1+15*4*mmsize] ; in15
1279 SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
1280 SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
1281 SCRATCH 0, 8, rsp+70*mmsize
1282 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
1283 UNSCRATCH 0, 8, rsp+70*mmsize
1284 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
1285 mova [rsp+67*mmsize], m1
1286 SCRATCH 2, 9, rsp+71*mmsize
1287 SCRATCH 3, 12, rsp+74*mmsize
1288 SCRATCH 0, 13, rsp+75*mmsize
1290 mova m0, [%1+ 3*4*mmsize] ; in3
1291 mova m1, [%1+ 4*4*mmsize] ; in4
1292 mova m2, [%1+11*4*mmsize] ; in11
1293 mova m3, [%1+12*4*mmsize] ; in12
1294 SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
1295 SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
1296 SCRATCH 1, 10, rsp+72*mmsize
1297 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
1298 UNSCRATCH 1, 10, rsp+72*mmsize
1299 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
1300 SCRATCH 0, 15, rsp+77*mmsize
1301 SCRATCH 3, 11, rsp+73*mmsize
1303 UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
1304 UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
1305 SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
1306 SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
1307 SCRATCH 1, 12, rsp+74*mmsize
1308 SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
1309 UNSCRATCH 1, 12, rsp+74*mmsize
1310 SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
1311 mova [rsp+65*mmsize], m2
1312 mova [rsp+66*mmsize], m1
1313 SCRATCH 0, 8, rsp+70*mmsize
1314 SCRATCH 3, 12, rsp+74*mmsize
1316 mova m0, [%1+ 2*4*mmsize] ; in2
1317 mova m1, [%1+ 5*4*mmsize] ; in5
1318 mova m2, [%1+10*4*mmsize] ; in10
1319 mova m3, [%1+13*4*mmsize] ; in13
1320 SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
1321 SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
1322 SCRATCH 0, 10, rsp+72*mmsize
1323 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
1324 UNSCRATCH 0, 10, rsp+72*mmsize
1325 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
1326 mova [rsp+68*mmsize], m1
1327 mova [rsp+69*mmsize], m2
1328 SCRATCH 3, 13, rsp+75*mmsize
1329 SCRATCH 0, 14, rsp+76*mmsize
1331 mova m0, [%1+ 1*4*mmsize] ; in1
1332 mova m1, [%1+ 6*4*mmsize] ; in6
1333 mova m2, [%1+ 9*4*mmsize] ; in9
1334 mova m3, [%1+14*4*mmsize] ; in14
1335 SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
1336 SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
1337 SCRATCH 1, 10, rsp+72*mmsize
1338 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
1339 UNSCRATCH 1, 10, rsp+72*mmsize
1340 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a
1342 UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
1343 UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
1344 SCRATCH 0, 13, rsp+75*mmsize
1345 SCRATCH 3, 14, rsp+76*mmsize
1346 SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
1347 SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
1348 SCRATCH 0, 10, rsp+72*mmsize
1349 SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
1350 UNSCRATCH 0, 10, rsp+72*mmsize
1351 SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a
1353 UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
1354 UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
1355 SCRATCH 2, 8, rsp+70*mmsize
1356 SCRATCH 1, 12, rsp+74*mmsize
1357 SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
1358 SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
1359 SCRATCH 2, 10, rsp+72*mmsize
1360 SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
1361 UNSCRATCH 2, 10, rsp+72*mmsize
1362 SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a
1365 UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
1366 mova m2, [rsp+68*mmsize] ; t2a
1367 UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
1368 UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
1369 SCRATCH 4, 10, rsp+72*mmsize
1370 SCRATCH 5, 13, rsp+75*mmsize
1371 UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
1372 UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
1373 SCRATCH 0, 14, rsp+76*mmsize
1374 SCRATCH 3, 15, rsp+77*mmsize
1375 mova m0, [rsp+67*mmsize] ; t0a
1376 SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
1377 SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
1378 SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
1379 SCRATCH 4, 9, rsp+71*mmsize
1380 mova m3, [rsp+69*mmsize] ; t3a
1381 SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7
1383 mova [rsp+67*mmsize], m5
1384 mova [rsp+68*mmsize], m6
1385 mova [rsp+69*mmsize], m7
1386 SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
1387 SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
1388 SCRATCH 1, 11, rsp+73*mmsize
1389 SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
1391 UNSCRATCH 1, 11, rsp+73*mmsize
1392 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
1393 SCRATCH 2, 11, rsp+73*mmsize
1394 UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
1395 SCRATCH 3, 12, rsp+74*mmsize
1397 UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
1398 mova m4, [rsp+65*mmsize] ; t8a
1399 mova m5, [rsp+66*mmsize] ; t9a
1400 SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
1402 SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
1403 UNSCRATCH 6, 9, rsp+71*mmsize ; t0
1404 UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
1405 SCRATCH 3, 9, rsp+71*mmsize
1406 SCRATCH 2, 14, rsp+76*mmsize
1408 SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
1409 mova [rsp+65*mmsize], m0
1410 SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
1411 UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
1412 SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5
1414 mova m2, [rsp+68*mmsize] ; t2
1415 SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
1416 SCRATCH 2, 8, rsp+70*mmsize
1417 mova m2, [rsp+67*mmsize] ; t1
1418 mova m3, [rsp+69*mmsize] ; t3
1419 mova [rsp+67*mmsize], m7
1420 SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
1422 SCRATCH 3, 15, rsp+77*mmsize
1423 SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
1424 mova m7, [rsp+67*mmsize]
1427 SWAP 2, 5, 4, 6, 7, 3
1431 cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
1432 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1433 dst, stride, block, eob
1437 mova [rsp+64*mmsize], m0
1438 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1444 lea ptrq, [%7_16x16]
1445 movzx cntd, byte [ptrq+cntq-1]
1447 movzx cntd, byte [%7_16x16+cntq-1]
1455 TRANSPOSE4x4D 0, 1, 2, 3, 7
1456 mova [ptrq+ 1*mmsize], m0
1457 mova [ptrq+ 5*mmsize], m1
1458 mova [ptrq+ 9*mmsize], m2
1459 mova [ptrq+13*mmsize], m3
1460 mova m7, [rsp+65*mmsize]
1461 TRANSPOSE4x4D 4, 5, 6, 7, 0
1462 mova [ptrq+ 2*mmsize], m4
1463 mova [ptrq+ 6*mmsize], m5
1464 mova [ptrq+10*mmsize], m6
1465 mova [ptrq+14*mmsize], m7
1466 UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
1467 UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
1468 UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
1469 UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
1470 TRANSPOSE4x4D 0, 1, 2, 3, 7
1471 mova [ptrq+ 0*mmsize], m0
1472 mova [ptrq+ 4*mmsize], m1
1473 mova [ptrq+ 8*mmsize], m2
1474 mova [ptrq+12*mmsize], m3
1475 UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
1476 UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
1477 UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
1478 UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
1479 TRANSPOSE4x4D 4, 5, 6, 7, 0
1480 mova [ptrq+ 3*mmsize], m4
1481 mova [ptrq+ 7*mmsize], m5
1482 mova [ptrq+11*mmsize], m6
1483 mova [ptrq+15*mmsize], m7
1484 add ptrq, 16 * mmsize
1489 ; zero-pad the remainder (skipped cols)
1493 lea blockq, [blockq+skipq*(mmsize/2)]
1496 mova [ptrq+mmsize*0], m0
1497 mova [ptrq+mmsize*1], m0
1498 mova [ptrq+mmsize*2], m0
1499 mova [ptrq+mmsize*3], m0
1500 mova [ptrq+mmsize*4], m0
1501 mova [ptrq+mmsize*5], m0
1502 mova [ptrq+mmsize*6], m0
1503 mova [ptrq+mmsize*7], m0
1504 add ptrq, 8 * mmsize
1509 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1510 lea stride3q, [strideq*3]
1517 lea dstq, [dstq+strideq*4]
1518 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1519 lea dstq, [dstq+strideq*4]
1520 mova m0, [rsp+65*mmsize]
1521 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
1524 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1528 UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
1529 UNSCRATCH 1, 9, rsp+(%6+1)*mmsize
1530 UNSCRATCH 2, 10, rsp+(%6+2)*mmsize
1531 UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
1532 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1534 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1535 lea dstq, [dstbakq+stride3q*4]
1537 lea dstq, [dstq+stride3q*4]
1539 UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
1540 UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
1541 UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
1542 UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
1543 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
1557 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
1560 cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
1561 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1562 dst, stride, block, eob
1564 jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
1568 IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
1569 IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
1570 IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
1572 %macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
1573 IDCT16_1D %2, 2 * %3, 272, 257
1575 mova [rsp+257*mmsize], m8
1576 mova [rsp+258*mmsize], m9
1577 mova [rsp+259*mmsize], m10
1578 mova [rsp+260*mmsize], m11
1579 mova [rsp+261*mmsize], m12
1580 mova [rsp+262*mmsize], m13
1581 mova [rsp+263*mmsize], m14
1582 mova [rsp+264*mmsize], m15
1584 mova [rsp+265*mmsize], m0
1585 mova [rsp+266*mmsize], m1
1586 mova [rsp+267*mmsize], m2
1587 mova [rsp+268*mmsize], m3
1588 mova [rsp+269*mmsize], m4
1589 mova [rsp+270*mmsize], m5
1590 mova [rsp+271*mmsize], m6
1593 ; r265-272: t4/5a/6a/7/8/9a/10/11a
1594 ; r261-264: t12a/13/14a/15
1595 ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit
1597 mova m0, [%2+ 1*%3] ; in1
1598 mova m1, [%2+15*%3] ; in15
1599 mova m2, [%2+17*%3] ; in17
1600 mova m3, [%2+31*%3] ; in31
1601 SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
1602 SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
1603 SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
1604 SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
1605 SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
1606 SCRATCH 0, 8, rsp+275*mmsize
1607 SCRATCH 2, 9, rsp+276*mmsize
1609 ; end of stage 1-3 first quart
1611 mova m0, [%2+ 7*%3] ; in7
1612 mova m2, [%2+ 9*%3] ; in9
1613 mova m4, [%2+23*%3] ; in23
1614 mova m5, [%2+25*%3] ; in25
1615 SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
1616 SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
1617 SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
1618 SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
1619 SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a
1621 ; end of stage 1-3 second quart
1623 SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
1624 SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
1625 UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
1626 UNSCRATCH 7, 9, rsp+276*mmsize ; t31
1627 mova [rsp+273*mmsize], m4
1628 mova [rsp+274*mmsize], m0
1629 SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
1630 SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
1631 SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
1632 SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
1633 SCRATCH 3, 10, rsp+277*mmsize
1634 SCRATCH 1, 11, rsp+278*mmsize
1635 SCRATCH 7, 12, rsp+279*mmsize
1636 SCRATCH 6, 13, rsp+280*mmsize
1637 SCRATCH 5, 14, rsp+281*mmsize
1638 SCRATCH 2, 15, rsp+282*mmsize
1640 ; end of stage 4-5 first half
1642 mova m0, [%2+ 5*%3] ; in5
1643 mova m1, [%2+11*%3] ; in11
1644 mova m2, [%2+21*%3] ; in21
1645 mova m3, [%2+27*%3] ; in27
1646 SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
1647 SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
1648 SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
1649 SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
1650 SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
1651 SCRATCH 0, 8, rsp+275*mmsize
1652 SCRATCH 2, 9, rsp+276*mmsize
1654 ; end of stage 1-3 third quart
1656 mova m0, [%2+ 3*%3] ; in3
1657 mova m2, [%2+13*%3] ; in13
1658 mova m4, [%2+19*%3] ; in19
1659 mova m5, [%2+29*%3] ; in29
1660 SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
1661 SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
1662 SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
1663 SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
1664 SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a
1666 ; end of stage 1-3 fourth quart
1668 SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
1669 SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
1670 UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
1671 UNSCRATCH 7, 9, rsp+276*mmsize ; t27
1672 SCRATCH 3, 8, rsp+275*mmsize
1673 SCRATCH 1, 9, rsp+276*mmsize
1674 SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
1675 SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
1676 SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
1677 SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a
1679 ; end of stage 4-5 second half
1681 UNSCRATCH 1, 12, rsp+279*mmsize ; t28
1682 UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
1683 SCRATCH 4, 12, rsp+279*mmsize
1684 SCRATCH 0, 13, rsp+280*mmsize
1685 SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
1686 SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
1687 UNSCRATCH 0, 14, rsp+281*mmsize ; t30
1688 UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
1689 SCRATCH 2, 14, rsp+281*mmsize
1690 SCRATCH 5, 15, rsp+282*mmsize
1691 SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
1692 SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24
1694 mova m2, [rsp+273*mmsize] ; t16a
1695 mova m5, [rsp+274*mmsize] ; t17
1696 mova [rsp+273*mmsize], m6
1697 mova [rsp+274*mmsize], m7
1698 UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
1699 UNSCRATCH 7, 11, rsp+278*mmsize ; t19
1700 SCRATCH 4, 10, rsp+277*mmsize
1701 SCRATCH 0, 11, rsp+278*mmsize
1702 UNSCRATCH 4, 12, rsp+279*mmsize ; t20
1703 UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
1704 SCRATCH 3, 12, rsp+279*mmsize
1705 SCRATCH 1, 13, rsp+280*mmsize
1706 SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
1707 SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
1708 UNSCRATCH 3, 8, rsp+275*mmsize ; t22
1709 UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
1710 SCRATCH 0, 8, rsp+275*mmsize
1711 SCRATCH 4, 9, rsp+276*mmsize
1712 SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
1713 SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23
1717 UNSCRATCH 0, 10, rsp+277*mmsize ; t24
1718 UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
1719 SCRATCH 1, 10, rsp+277*mmsize
1720 SCRATCH 3, 11, rsp+278*mmsize
1721 SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
1722 SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
1723 UNSCRATCH 1, 12, rsp+279*mmsize ; t26
1724 UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
1725 SCRATCH 0, 12, rsp+279*mmsize
1726 SCRATCH 4, 13, rsp+280*mmsize
1727 SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
1728 SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a
1732 mova m0, [rsp+269*mmsize] ; t8
1733 mova m4, [rsp+270*mmsize] ; t9a
1734 mova [rsp+269*mmsize], m1 ; t26a
1735 mova [rsp+270*mmsize], m3 ; t27
1736 mova m3, [rsp+271*mmsize] ; t10
1737 SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
1738 SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
1739 SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
1740 mova m1, [rsp+272*mmsize] ; t11a
1741 mova [rsp+271*mmsize], m0
1742 SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20
1745 TRANSPOSE4x4D 2, 5, 6, 7, 0
1746 mova [ptrq+ 2*mmsize], m2
1747 mova [ptrq+10*mmsize], m5
1748 mova [ptrq+18*mmsize], m6
1749 mova [ptrq+26*mmsize], m7
1752 lea dstq, [dstq+strideq*8]
1753 ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
1755 mova m2, [rsp+271*mmsize]
1757 TRANSPOSE4x4D 1, 3, 4, 2, 0
1758 mova [ptrq+ 5*mmsize], m1
1759 mova [ptrq+13*mmsize], m3
1760 mova [ptrq+21*mmsize], m4
1761 mova [ptrq+29*mmsize], m2
1763 lea dstq, [dstq+stride3q*4]
1764 ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
1767 ; end of last stage + store for out8-11 and out20-23
1769 UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
1770 UNSCRATCH 1, 8, rsp+275*mmsize ; t18
1771 UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
1772 UNSCRATCH 3, 10, rsp+277*mmsize ; t16
1773 mova m7, [rsp+261*mmsize] ; t12a
1774 mova m6, [rsp+262*mmsize] ; t13
1775 mova m5, [rsp+263*mmsize] ; t14a
1776 SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
1777 SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
1778 SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
1779 mova m4, [rsp+264*mmsize] ; t15
1780 SCRATCH 7, 8, rsp+275*mmsize
1781 SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16
1784 TRANSPOSE4x4D 0, 1, 2, 3, 7
1785 mova [ptrq+ 3*mmsize], m0
1786 mova [ptrq+11*mmsize], m1
1787 mova [ptrq+19*mmsize], m2
1788 mova [ptrq+27*mmsize], m3
1792 lea dstq, [dstbakq+stride3q*4]
1796 lea dstq, [dstq+stride3q*4]
1798 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
1800 UNSCRATCH 0, 8, rsp+275*mmsize ; out19
1802 TRANSPOSE4x4D 4, 5, 6, 0, 7
1803 mova [ptrq+ 4*mmsize], m4
1804 mova [ptrq+12*mmsize], m5
1805 mova [ptrq+20*mmsize], m6
1806 mova [ptrq+28*mmsize], m0
1808 lea dstq, [dstq+strideq*4]
1809 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
1812 ; end of last stage + store for out12-19
1817 mova m7, [rsp+257*mmsize] ; t0
1818 mova m6, [rsp+258*mmsize] ; t1
1819 mova m5, [rsp+259*mmsize] ; t2
1820 mova m4, [rsp+260*mmsize] ; t3
1821 mova m0, [rsp+274*mmsize] ; t31
1822 mova m1, [rsp+273*mmsize] ; t30a
1823 UNSCRATCH 2, 15, rsp+282*mmsize ; t29
1824 SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
1825 SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
1826 SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
1827 SCRATCH 0, 9, rsp+276*mmsize
1828 UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
1829 SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28
1832 TRANSPOSE4x4D 4, 5, 6, 7, 0
1833 mova [ptrq+ 7*mmsize], m4
1834 mova [ptrq+15*mmsize], m5
1835 mova [ptrq+23*mmsize], m6
1836 mova [ptrq+31*mmsize], m7
1843 lea dstq, [dstq+stride3q*4]
1844 ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
1846 UNSCRATCH 7, 9, rsp+276*mmsize ; out0
1848 TRANSPOSE4x4D 7, 1, 2, 3, 0
1849 mova [ptrq+ 0*mmsize], m7
1850 mova [ptrq+ 8*mmsize], m1
1851 mova [ptrq+16*mmsize], m2
1852 mova [ptrq+24*mmsize], m3
1855 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1859 ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
1861 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1865 ; end of last stage + store for out0-3 and out28-31
1870 mova m7, [rsp+265*mmsize] ; t4
1871 mova m6, [rsp+266*mmsize] ; t5a
1872 mova m5, [rsp+267*mmsize] ; t6a
1873 mova m4, [rsp+268*mmsize] ; t7
1874 mova m0, [rsp+270*mmsize] ; t27
1875 mova m1, [rsp+269*mmsize] ; t26a
1876 UNSCRATCH 2, 13, rsp+280*mmsize ; t25
1877 SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
1878 SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
1879 SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
1880 UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
1881 SCRATCH 7, 9, rsp+276*mmsize
1882 SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24
1885 TRANSPOSE4x4D 0, 1, 2, 3, 7
1886 mova [ptrq+ 1*mmsize], m0
1887 mova [ptrq+ 9*mmsize], m1
1888 mova [ptrq+17*mmsize], m2
1889 mova [ptrq+25*mmsize], m3
1893 lea dstq, [dstbakq+strideq*4]
1896 lea dstq, [dstq+strideq*4]
1898 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
1900 UNSCRATCH 0, 9, rsp+276*mmsize ; out27
1902 TRANSPOSE4x4D 4, 5, 6, 0, 7
1903 mova [ptrq+ 6*mmsize], m4
1904 mova [ptrq+14*mmsize], m5
1905 mova [ptrq+22*mmsize], m6
1906 mova [ptrq+30*mmsize], m0
1909 lea dstq, [dstbakq+stride3q*8]
1912 lea dstq, [dstq+stride3q*8]
1914 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
1917 ; end of last stage + store for out4-7 and out24-27
1921 cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
1922 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1923 dst, stride, block, eob
1928 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
1929 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
1931 DEFINE_ARGS dst, stride, block, coef
1935 pshuflw m1, m1, q0000
1937 DEFINE_ARGS dst, stride, cnt
1940 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1941 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
1948 mova [rsp+256*mmsize], m0
1949 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1955 lea ptrq, [default_32x32]
1956 movzx cntd, byte [ptrq+cntq-1]
1958 movzx cntd, byte [default_32x32+cntq-1]
1966 add ptrq, 32 * mmsize
1971 ; zero-pad the remainder (skipped cols)
1975 lea blockq, [blockq+skipq*(mmsize/4)]
1978 mova [ptrq+mmsize*0], m0
1979 mova [ptrq+mmsize*1], m0
1980 mova [ptrq+mmsize*2], m0
1981 mova [ptrq+mmsize*3], m0
1982 mova [ptrq+mmsize*4], m0
1983 mova [ptrq+mmsize*5], m0
1984 mova [ptrq+mmsize*6], m0
1985 mova [ptrq+mmsize*7], m0
1986 add ptrq, 8 * mmsize
1991 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1992 lea stride3q, [strideq*3]
2010 ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
2014 cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
2015 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
2016 dst, stride, block, eob
2019 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
2021 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
2022 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
2023 DEFINE_ARGS dst, stride, block, coef, coefl
2027 pshuflw m1, m1, q0000
2029 DEFINE_ARGS dst, stride, cnt
2032 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
2033 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize