1 ;*******************************************************************************
2 ;* SIMD-optimized IDCT functions for HEVC decoding
3 ;* Copyright (c) 2014 Pierre-Edouard LEPERE
4 ;* Copyright (c) 2014 James Almer
5 ;* Copyright (c) 2016 Alexandra Hájková
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Packed 32-bit constants, each value replicated into 4 dword lanes.
; They are referenced through token pasting (pd_ %+ <value>), e.g. as the
; rounding addend 1 << (shift - 1): 2048 for shift 12, 512 for shift 10.
29 pd_2048: times 4 dd 2048
30 pd_512: times 4 dd 512
; Packed 16-bit coefficient pairs: every label below repeats one (a, b)
; word pair 4 times. They are used as pmaddwd operands against registers
; holding interleaved source words.
32 ; 4x4 transform coeffs
34 pw_64_m64: times 4 dw 64, -64
35 pw_83_36: times 4 dw 83, 36
36 pw_36_m83: times 4 dw 36, -83
38 ; 8x8 transform coeffs
39 pw_89_75: times 4 dw 89, 75
40 pw_50_18: times 4 dw 50, 18
42 pw_75_m18: times 4 dw 75, -18
43 pw_m89_m50: times 4 dw -89, -50
45 pw_50_m89: times 4 dw 50, -89
46 pw_18_75: times 4 dw 18, 75
48 pw_18_m50: times 4 dw 18, -50
49 pw_75_m89: times 4 dw 75, -89
51 ; 16x16 transformation coeffs
; The label marks the start of the table; call sites index it in 64-byte
; steps (trans_coeffs16 + k * 64).
52 trans_coeffs16: times 4 dw 90, 87
92 ; 32x32 transform coeffs
; Referenced further down as [trans_coeff32 + 15 * 128].
93 trans_coeff32: times 8 dw 90
239 ; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = block dimension, used for both sides in the symbol name (%1x%1)
241 ; %2 = number of loops
; %3 = value placed in the {8,10} slot of the symbol name; it also sets
;      the size of the rounding term added below
; cglobal arguments: 1 function argument, 2 GPRs, 1 vector register.
244 cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
; tmp = first coefficient of the block, sign-extended from 16 bits
245 movsx tmpd, word [coeffq]
; rounding term: (1 << (14 - %3)) + 1
246 add tmpd, (1 << (14 - %3)) + 1
; rename the second GPR: from here on it is cnt, not tmp
250 DEFINE_ARGS coeff, cnt
; write m0 to four consecutive mmsize-wide slots starting at coeffq
253 mova [coeffq+mmsize*0], m0
254 mova [coeffq+mmsize*1], m0
255 mova [coeffq+mmsize*2], m0
256 mova [coeffq+mmsize*3], m0
; NOTE(review): the negative offsets imply coeffq has been advanced past
; the next four slots before these stores - confirm against the loop code.
258 mova [coeffq+mmsize*-4], m0
259 mova [coeffq+mmsize*-3], m0
260 mova [coeffq+mmsize*-2], m0
261 mova [coeffq+mmsize*-1], m0
; DC-only variant without a store loop.
; %1 = block dimension, used for both sides in the symbol name (%1x%1)
; %2 = suffix of the symbol name; it also sets the size of the rounding term
269 %macro IDCT_DC_NL 2 ; No loop
; cglobal arguments: 1 function argument, 2 GPRs, 1 vector register.
270 cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
; tmp = first coefficient of the block, sign-extended from 16 bits
271 movsx tmpd, word [coeffq]
; rounding term: (1 << (14 - %2)) + 1
272 add tmpd, (1 << (14 - %2)) + 1
; write m0 to eight consecutive mmsize-wide slots starting at coeffq
276 mova [coeffq+mmsize*0], m0
277 mova [coeffq+mmsize*1], m0
278 mova [coeffq+mmsize*2], m0
279 mova [coeffq+mmsize*3], m0
281 mova [coeffq+mmsize*4], m0
282 mova [coeffq+mmsize*5], m0
283 mova [coeffq+mmsize*6], m0
284 mova [coeffq+mmsize*7], m0
289 ; IDCT 4x4, expects input in m0, m1
; %1 - shift amount (psrad count; also selects the rounding constant)
291 ; %2 - 1/0 - SCALE and Transpose or not
292 ; %3 - 1/0 add constant or not
294 ; interleaves src0 with src2 to m0
295 ; and src1 with src3 to m1
296 ; src0: 00 01 02 03 m0: 00 20 01 21 02 22 03 23
297 ; src1: 10 11 12 13 -->
298 ; src2: 20 21 22 23 m1: 10 30 11 31 12 32 13 33
301 SBUTTERFLY wd, 0, 1, 2
; each pmaddwd multiplies an interleaved word pair by a coefficient pair
; and adds the two products into one dword
303 pmaddwd m2, m0, [pw_64] ; e0
304 pmaddwd m3, m1, [pw_83_36] ; o0
305 pmaddwd m0, [pw_64_m64] ; e1
306 pmaddwd m1, [pw_36_m83] ; o1
; rounding constant 1 << (%1 - 1), read from the pd_<value> table of that name
309 %assign %%add 1 << (%1 - 1)
310 mova m4, [pd_ %+ %%add]
; butterfly on dwords; per the shift comments below it leaves the sums
; in m3/m1 and the differences in m2/m0
315 SUMSUB_BADC d, 3, 2, 1, 0, 4
318 psrad m3, %1 ; e0 + o0
319 psrad m1, %1 ; e1 + o1
320 psrad m2, %1 ; e0 - o0
321 psrad m0, %1 ; e1 - o1
; two word-interleave passes over m3/m0, with m1 as the temporary
326 SBUTTERFLY wd, 3, 0, 1
327 SBUTTERFLY wd, 3, 0, 1
; Derive the second-stage scaling parameters from %1:
;   shift   = 20 - %1
;   c_add   = 1 << (shift - 1), i.e. half of 1 << shift
;   arr_add = the pd_<c_add> label (pd_2048 for %1 = 8, pd_512 for %1 = 10)
335 %assign shift (20 - %1)
336 %assign c_add (1 << (shift - 1))
337 %define arr_add pd_ %+ c_add
341 ; %2 - register add constant
343 ; shift = 20 - bit_depth
; Load four 64-bit row segments, addressed relative to r0, into two registers.
349 ; %1, %2 - registers to load packed 16 bit values to
350 ; %3, %4, %5, %6 - vertical offsets
351 ; %7 - horizontal offset
; Note the pairing: %1 receives rows %3 (low half) and %5 (high half),
; %2 receives rows %4 (low half) and %6 (high half).
353 movq %1, [r0 + %3 + %7]
354 movhps %1, [r0 + %5 + %7]
355 movq %2, [r0 + %4 + %7]
356 movhps %2, [r0 + %6 + %7]
359 ; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; cglobal arguments: 1 function argument (coeffs), 1 GPR, 5 vector registers.
362 cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
; m1 = bytes 16..31 of the coefficient block
364 mova m1, [coeffsq + 16]
; write m1 back to the same 16-byte slot
370 mova [coeffsq + 16], m1
374 ; scale, pack (clip16) and store the residuals 0 e8[0] + o8[0] --> + %1
375 ; 4 at one time (4 columns) 1 e8[1] + o8[1]
376 ; from %5: e8/16 + o8/16, with %1 offset ...
377 ; and %3: e8/16 - o8/16, with %2 offset 6 e8[1] - o8[1]
378 ; %4 - shift 7 e8[0] - o8[0] --> + %2
; low 64 bits of %5 go to coeffsq + %1, high 64 bits to coeffsq + %2
; NOTE(review): presumably %3 has been packed into the high half of %5
; before these stores - confirm.
383 movq [coeffsq + %1], %5
384 movhps [coeffsq + %2], %5
387 ; %1 - horizontal offset
; %2 - forwarded to STORE_8 as its shift argument
389 ; %3, %4 - transform coeffs
390 ; %5 - vertical offset for e8 + o8
391 ; %6 - vertical offset for e8 - o8
392 ; %7 - register with e8 inside
394 ; %9 - register to store e8 +o8
395 ; %10 - register to store e8 - o8
; m6 holds o8 here; the sum goes to m7, the difference overwrites %7
401 paddd m7, m6, %7 ; o8 + e8
402 psubd %7, m6 ; e8 - o8
; both store addresses are the vertical offset plus the horizontal offset %1
404 STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
411 ; 8x4 residuals are processed and stored
412 ; %1 - horizontal offset
; %2 - shift, forwarded to TR_4x4 and E8_O8
414 ; %3 - offset of the even row
415 ; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
416 ; %5 - offset of the odd row
; %6 - forwarded unchanged as the 8th argument of E8_O8
418 ; %7 - 1/0 add a constant in TR_4x4 or not
419 ; I want to add a constant for 8x8 transform but not for 16x16 and 32x32
421 ; load 4 columns of even rows
; m0 <- rows at 0 and %4 * %3, m1 <- rows at 2 * %4 * %3 and 3 * %4 * %3
422 LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1
424 TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only
426 ; load 4 columns of odd rows
; m4 <- rows at 1x and 5x of %4 * %5, m5 <- rows at 3x and 7x of %4 * %5
427 LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1
430 ; 10 11 12 13 m4: 10 30 11 31 12 32 13 33
433 ; m5: 50 70 51 71 52 72 53 73
435 SBUTTERFLY wd, 4, 5, 6
; one E8_O8 per output row pair k / 7 - k (k = 0..3); m0..m3 carry the
; matching e8 values, and each call names its own pair of result registers
437 E8_O8 %1, %2, [pw_89_75], [pw_50_18], 0, %5 * 7, m0, %6, m8, m15
438 E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5, %5 * 6, m1, %6, m9, m14
439 E8_O8 %1, %2, [pw_50_m89], [pw_18_75], %5 * 2, %5 * 5, m2, %6, m10, m13
440 E8_O8 %1, %2, [pw_18_m50], [pw_75_m89], %5 * 3, %5 * 4, m3, %6, m11, m12
; Store two registers as four 64-bit row segments, addressed relative to r0.
; %1, %2 - source registers
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
; Pairing differs from LOAD_BLOCK: %1 is written to rows %3 (low half) and
; %4 (high half), %2 to rows %5 (low half) and %6 (high half).
443 %macro STORE_PACKED 7
444 movq [r0 + %3 + %7], %1
445 movhps [r0 + %4 + %7], %1
446 movq [r0 + %5 + %7], %2
447 movhps [r0 + %6 + %7], %2
450 ; transpose 4x4 block packed
451 ; in %1 and %2 registers
452 ; %3 - temporary register
; %1, %2 and %3 are register numbers (passed straight to SBUTTERFLY),
; not register names.
453 %macro TRANSPOSE_4x4 3
; interleave at word granularity, then at dword granularity
454 SBUTTERFLY wd, %1, %2, %3
455 SBUTTERFLY dq, %1, %2, %3
; Exchange two 4x4 blocks (i and j), transposing each one on the way.
458 ; %1 - horizontal offset of the block i
459 ; %2 - vertical offset of the block i
460 ; %3 - width in bytes
461 ; %4 - vertical offset for the block j
462 ; %5 - horizontal offset for the block j
; block j -> m4/m5, then transposed with m6 as the temporary
465 LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
466 TRANSPOSE_4x4 4, 5, 6
; block i -> m6/m7
469 LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
; the transposed block j is written to the position of block i
471 STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
473 ; transpose and store M_i
; NOTE(review): block i was loaded into m6/m7 above, so it has to be moved
; into m4/m5 before this transpose - confirm the register exchange.
476 TRANSPOSE_4x4 4, 5, 6
; result is written to the position of block j
477 STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
; Transpose one 4x4 block in place: load, transpose, store to the same rows.
480 ; %1 - horizontal offset
481 ; %2 - vertical offset of the block
482 ; %3 - width in bytes
483 %macro TRANSPOSE_BLOCK 3
; four rows starting at vertical offset %2, %3 bytes apart
484 LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
; m6 is the temporary
485 TRANSPOSE_4x4 4, 5, 6
486 STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
; Emits the helper that transposes an 8x8 block as four 4x4 sub-blocks.
; Rows are 16 bytes apart; the second block column starts at byte 8 and the
; second block row at byte 64.
489 %macro TRANSPOSE_8x8 0
; Declared with no arguments and no registers. The block macros it expands
; address memory through r0, so it relies on r0 as left by its caller.
490 cglobal hevc_idct_transpose_8x8, 0, 0, 0
491 ; M1 M2 ^T = M1^t M3^t
; top-left block: transpose in place
495 TRANSPOSE_BLOCK 0, 0, 16
; off-diagonal blocks: transpose each and exchange their positions
498 SWAP_BLOCKS 0, 64, 16, 0, 8
; bottom-right block: transpose in place
501 TRANSPOSE_BLOCK 8, 64, 16
506 ; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; cglobal arguments: 1 function argument (coeffs), 1 GPR, 8 vector registers.
509 cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
; first pass, shift 7: two 4-column groups at horizontal byte offsets 0 and 8
; (even rows 32 bytes apart, odd rows 16 bytes apart, constant added in TR_4x4)
510 TR_8x4 0, 7, 32, 1, 16, 8, 1
511 TR_8x4 8, 7, 32, 1, 16, 8, 1
; transpose between the two passes
513 call hevc_idct_transpose_8x8_ %+ cpuname
; second pass: same layout, scaled by `shift` instead of 7
516 TR_8x4 0, shift, 32, 1, 16, 8, 1
517 TR_8x4 8, shift, 32, 1, 16, 8, 1
; final transpose, done as a tail call
519 TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
522 ; store intermediate e32 coeffs on stack
524 ; from m10: e8 + o8, with %6 offset
525 ; and %3: e8 - o8, with %7 offset
526 ; %4 - shift, unused here
532 ; %1, %2 - transform constants
533 ; %3, %4 - regs with interleaved coeffs
534 ; %5 - 1/0 SWAP or add
535 ; %6, %7 - registers for intermediate sums
536 ; %8 - accumulator register
548 ; %1 - transform coeffs
549 ; %2, %3 offsets for storing e+o/e-o back to coeffsq
; %4 - forwarded to the store macro as its 4th argument
; %6 - suffix that selects the store macro (STORE_%6)
553 ; %7 - register with e16
554 ; %8, %9 - stack offsets for storing e+o/e-o
; o16 is accumulated in m7 from four 16-byte coefficient rows at %1; the
; first ADD_ROWS passes 1 for its SWAP-or-add flag, the second passes 0
556 ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m5, m6, m7
557 ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7
; the sum goes to m4, the difference overwrites %7
563 paddd m4, m7, %7 ; o16 + e16
564 psubd %7, m7 ; e16 - o16
565 STORE_%6 %2, %3, %7, %4, m4, %8, %9
569 ; produce 8x4 matrix of e16 coeffs
570 ; for 4 first rows and store it on stack (128 bytes)
; fixed shift 7; the trailing 0 means no constant is added inside TR_4x4
571 TR_8x4 %1, 7, %4, %5, %6, %8, 0
; odd input rows 1, 3, ..., 15, each scaled by %9 * %6, at horizontal offset %1
574 LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
575 LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1
; interleave the loaded rows at word granularity, m4 is the temporary
577 SBUTTERFLY wd, 0, 1, 4
578 SBUTTERFLY wd, 2, 3, 4
; one E16_O16 per output row pair k / 15 - k (k = 0..7): coefficient rows
; advance by 64 bytes, the e16 input comes from m8..m15, and the stack
; offsets are k * 16 and (15 - k) * 16
580 E16_O16 trans_coeffs16, 0 + %1, 15 * %6 + %1, %2, %3, %7, m8, 0, 15 * 16
582 E16_O16 trans_coeffs16 + 64, %6 + %1, 14 * %6 + %1, %2, m8, %7, m9, 16, 14 * 16
583 E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
584 E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
585 E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
586 E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
587 E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1, %2, m8, %7, m14, 6 * 16, 9 * 16
588 E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1, %2, m8, %7, m15, 7 * 16, 8 * 16
; Emits the helper that transposes a 16x16 block as a 4x4 grid of 4x4
; sub-blocks. Rows are 32 bytes apart; block columns step by 8 bytes and
; block rows by 128 bytes.
591 %macro TRANSPOSE_16x16 0
; Declared with no arguments and no registers. The block macros it expands
; address memory through r0, so it relies on r0 as left by its caller.
592 cglobal hevc_idct_transpose_16x16, 0, 0, 0
593 ; M1 M2 M3 M4 ^T m1 m5 m9 m13 M_i^T = m_i
594 ; M5 M6 M7 M8 --> m2 m6 m10 m14
595 ; M9 M10 M11 M12 m3 m7 m11 m15
596 ; M13 M14 M15 M16 m4 m8 m12 m16
; Diagonal blocks are transposed in place (TRANSPOSE_BLOCK); each
; off-diagonal block is transposed and exchanged with its mirror block
; (SWAP_BLOCKS).
599 TRANSPOSE_BLOCK 0, 0, 32
602 SWAP_BLOCKS 0, 128, 32, 0, 8
604 SWAP_BLOCKS 0, 256, 32, 0, 16
606 SWAP_BLOCKS 0, 384, 32, 0, 24
609 TRANSPOSE_BLOCK 8, 128, 32
612 SWAP_BLOCKS 8, 256, 32, 128, 16
614 SWAP_BLOCKS 8, 384, 32, 128, 24
617 TRANSPOSE_BLOCK 16, 256, 32
620 SWAP_BLOCKS 16, 384, 32, 256, 24
623 TRANSPOSE_BLOCK 24, 384, 32
628 ; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; cglobal arguments: 1 function argument (coeffs), 2 GPRs, 16 vector registers.
631 cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
; first pass: shift 7 with pd_64 as the add constant; the horizontal offset
; is 8 * r1 bytes
; NOTE(review): r1 is presumably a loop counter over 4-column groups - confirm.
634 TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
; transpose between the two passes
638 call hevc_idct_transpose_16x16_ %+ cpuname
; second pass: scaled by `shift` with arr_add as the add constant
643 TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
; final transpose, done as a tail call
647 TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
650 ; scale, pack (clip16) and store the residuals 0 e32[0] + o32[0] --> %1
651 ; 4 at one time (4 columns) 1 e32[1] + o32[1]
652 ; %1 - address to store e32 + o32
653 ; %2 - address to store e32 - o32
654 ; %5 - reg with e32 + o32 ...
655 ; %3 - reg with e32 - o32 30 e32[1] - o32[1]
656 ; %4 - shift 31 e32[0] - o32[0] --> %2
665 ; %1 - transform coeffs
666 ; (the stack offset for e32 is %5, see below)
667 ; %2, %3 offsets for storing e+o/e-o back to coeffsq
; %4 - forwarded to STORE_32 as its 4th argument
669 ; %5 - stack offset of e32
; o32 is accumulated in m10 from eight 16-byte coefficient rows at %1; the
; first ADD_ROWS passes 1 for its SWAP-or-add flag, the others pass 0
671 ADD_ROWS [%1], [%1 + 16], m0, m1, 1, m8, m9, m10
672 ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
673 ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
674 ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10
; m11 = m14 + the value read from the stack at rsp + %5
; NOTE(review): m14 presumably holds the rounding constant - confirm.
676 paddd m11, m14, [rsp + %5]
677 paddd m12, m10, m11 ; o32 + e32
678 psubd m11, m10 ; e32 - o32
679 STORE_32 %2, %3, m11, %4, m12
682 ; %1 - horizontal offset
; 16-point stage over this column group, shift 7 with pd_64 as add constant
685 TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0
; odd input rows 1, 3, ..., 31 (rows are 64 bytes apart) into m0..m7
687 LOAD_BLOCK m0, m1, 64, 3 * 64, 5 * 64, 7 * 64, %1
688 LOAD_BLOCK m2, m3, 9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
689 LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
690 LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1
; interleave each register pair at word granularity, m8 is the temporary
692 SBUTTERFLY wd, 0, 1, 8
693 SBUTTERFLY wd, 2, 3, 8
694 SBUTTERFLY wd, 4, 5, 8
695 SBUTTERFLY wd, 6, 7, 8
; r2 = coefficient table address 15 * 128 bytes into trans_coeff32
704 lea r2, [trans_coeff32 + 15 * 128]
; r3 = start of this column group, r4 = the same column 16 rows further down
705 lea r3, [coeffsq + %1]
706 lea r4, [r3 + 16 * 64]
; NOTE(review): r5 is used both as the stack offset and, scaled by 4, as the
; store offset; it is presumably a loop index set outside these lines - confirm.
709 E32_O32 r2, r3 + r5 * 4, r4, shift, r5
; Emits the helper that transposes a 32x32 block as an 8x8 grid of 4x4
; sub-blocks. Rows are 64 bytes apart; block columns step by 8 bytes and
; block rows by 256 bytes. The M<n> labels number the diagonal blocks.
716 %macro TRANSPOSE_32x32 0
; Declared with no arguments and no registers. The block macros it expands
; address memory through r0, so it relies on r0 as left by its caller.
717 cglobal hevc_idct_transpose_32x32, 0, 0, 0
725 TRANSPOSE_BLOCK 0, 0, 64 ; M1
; NOTE(review): r1, r2 and r3 are used as offsets in the SWAP_BLOCKS calls
; below; they are presumably loop variables set outside these lines - confirm.
729 SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
734 TRANSPOSE_BLOCK 8, 256, 64 ; M9
739 SWAP_BLOCKS 8, r2, 64, 256, r3
745 TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
750 SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
756 TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
761 SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
767 TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
772 SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
; from block row 5 on, the remaining swaps are written out with constants
778 TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
779 SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
780 SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8
782 TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
783 SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8
785 TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
790 ; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; cglobal arguments: 1 function argument (coeffs), 6 GPRs, 16 vector
; registers and 256 bytes of stack.
793 cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
; first pass over the column group at horizontal offset 8 * r1 bytes
; NOTE(review): r1 is presumably a loop counter over 4-column groups - confirm.
796 TR_32x4 8 * r1, %1, 1
; transpose between the two passes
800 call hevc_idct_transpose_32x32_ %+ cpuname
; second pass: same arguments except that the last one is 0 instead of 1
804 TR_32x4 8 * r1, %1, 0
; final transpose, done as a tail call
808 TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
811 %macro INIT_IDCT_DC 1
821 %if HAVE_AVX2_EXTERNAL
825 %endif ;HAVE_AVX2_EXTERNAL