1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; byte-splat constants for the loop filters below:
; - pb_F8 / pb_FE: bit masks (0b11111000 / 0b11111110) — NOTE(review): their
;   use sites are in portions of this file not visible in this chunk; confirm
;   they mask off the low filter-tap bits as in the usual VP8 filter math
; - pb_27_63 / pb_18_63 / pb_9_63: interleaved (w, 63) byte pairs; a
;   [pb_18_63] load appears in MBEDGE_LOOPFILTER ("pipelining"), so these
;   look like the a*w/63 weight constants of filter_mbedge — TODO confirm
;   against the elided pmaddubsw code
31 pb_F8: times 16 db 0xF8
32 pb_FE: times 16 db 0xFE
33 pb_27_63: times 8 db 27, 63
34 pb_18_63: times 8 db 18, 63
35 pb_9_63: times 8 db 9, 63
45 ;-----------------------------------------------------------------------------
46 ; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
47 ;-----------------------------------------------------------------------------
49 ; macro called with 7 mm register indexes as argument, and 4 regular registers
51 ; first 4 mm registers will carry the transposed pixel data
52 ; the other three are scratchspace (one would be sufficient, but this allows
53 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
55 ; first two regular registers are buf+4*stride and buf+5*stride
56 ; third is -stride, fourth is +stride
57 %macro READ_8x4_INTERLEAVED 11
58 ; interleave 8 (A-H) rows of 4 pixels each
59 movd m%1, [%8+%10*4] ; A0-3
60 movd m%5, [%9+%10*4] ; B0-3
61 movd m%2, [%8+%10*2] ; C0-3
62 movd m%6, [%8+%10] ; D0-3
65 movd m%4, [%9+%11] ; G0-3
66 punpcklbw m%1, m%5 ; A/B interleaved
67 movd m%5, [%9+%11*2] ; H0-3
68 punpcklbw m%2, m%6 ; C/D interleaved
69 punpcklbw m%3, m%7 ; E/F interleaved
70 punpcklbw m%4, m%5 ; G/H interleaved
73 ; macro called with 7 mm register indexes as argument, and 5 regular registers
74 ; first 11 mean the same as READ_8x4_INTERLEAVED above
75 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
76 ; will be set to second regular register + 8*stride at the end
77 %macro READ_16x4_INTERLEAVED 12
78 ; transpose 16 (A-P) rows of 4 pixels each
81 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
82 movd m%1, [%8+%10*4] ; A0-3
83 movd m%3, [%12+%10*4] ; I0-3
84 movd m%2, [%8+%10*2] ; C0-3
85 movd m%4, [%12+%10*2] ; K0-3
86 movd m%6, [%8+%10] ; D0-3
87 movd m%5, [%12+%10] ; L0-3
88 movd m%7, [%12] ; M0-3
90 punpcklbw m%1, m%3 ; A/I
92 punpcklbw m%2, m%4 ; C/K
93 punpcklbw m%6, m%5 ; D/L
94 punpcklbw m%3, m%7 ; E/M
95 punpcklbw m%2, m%6 ; C/D/K/L interleaved
97 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
98 movd m%5, [%9+%10*4] ; B0-3
99 movd m%4, [%12+%10*4] ; J0-3
100 movd m%7, [%9] ; F0-3
101 movd m%6, [%12] ; N0-3
102 punpcklbw m%5, m%4 ; B/J
103 punpcklbw m%7, m%6 ; F/N
104 punpcklbw m%1, m%5 ; A/B/I/J interleaved
105 punpcklbw m%3, m%7 ; E/F/M/N interleaved
106 movd m%4, [%9+%11] ; G0-3
107 movd m%6, [%12+%11] ; O0-3
108 movd m%5, [%9+%11*2] ; H0-3
109 movd m%7, [%12+%11*2] ; P0-3
110 punpcklbw m%4, m%6 ; G/O
111 punpcklbw m%5, m%7 ; H/P
112 punpcklbw m%4, m%5 ; G/H/O/P interleaved
115 ; write 4 mm registers of 2 dwords each
116 ; first four arguments are mm register indexes containing source data
117 ; last four are registers containing buf+4*stride, buf+5*stride,
118 ; -stride and +stride
120 ; write out (2 dwords per register)
135 ; write 4 xmm registers of 4 dwords each
136 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
137 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
138 ; we add 1*stride to the third regular register in the process
139 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
140 ; same memory region), or 8 if they cover two separate buffers (third one points to
141 ; a different memory region than the first two), allowing for more optimal code for
144 ; write out (4 dwords per register), start with dwords zero
195 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
196 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
198 ; 3 is a general-purpose register that we will clobber
200 ; 3 is a pointer to the destination's 5th line
201 ; 4 is a pointer to the destination's 4th line
202 ; 5/6 is -stride and +stride
233 pextrw [%3+%4*4], %1, 0
234 pextrw [%2+%4*4], %1, 1
235 pextrw [%3+%4*2], %1, 2
236 pextrw [%3+%4 ], %1, 3
239 pextrw [%2+%5 ], %1, 6
240 pextrw [%2+%5*2], %1, 7
270 %macro SIMPLE_LOOPFILTER 2
271 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
272 %if mmsize == 8 ; mmx/mmxext
278 SPLATB_REG m7, flim, m0 ; splat "flim" into register
280 ; set up indexes to address 4 rows
282 DEFINE_ARGS dst1, mstride, stride, cntr, dst2
284 DEFINE_ARGS dst1, mstride, stride, dst3, dst2
286 mov strideq, mstrideq
289 lea dst1q, [dst1q+4*strideq-2]
292 %if mmsize == 8 ; mmx / mmxext
296 ; read 4 half/full rows of pixels
297 mova m0, [dst1q+mstrideq*2] ; p1
298 mova m1, [dst1q+mstrideq] ; p0
299 mova m2, [dst1q] ; q0
300 mova m3, [dst1q+ strideq] ; q1
302 lea dst2q, [dst1q+ strideq]
304 %if mmsize == 8 ; mmx/mmxext
305 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
307 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
309 TRANSPOSE4x4W 0, 1, 2, 3, 4
313 mova m5, m2 ; m5=backup of q0
314 mova m6, m1 ; m6=backup of p0
315 psubusb m1, m2 ; p0-q0
316 psubusb m2, m6 ; q0-p0
317 por m1, m2 ; FFABS(p0-q0)
318 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
322 psubusb m3, m0 ; q1-p1
323 psubusb m0, m4 ; p1-q1
324 por m3, m0 ; FFABS(p1-q1)
328 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
330 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
334 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
336 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
340 psubsb m5, m0 ; q0-p0 (signed)
343 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
344 pand m2, m3 ; apply filter mask (m3)
348 paddsb m2, [pb_4] ; f1<<3=a+4
349 paddsb m1, [pb_3] ; f2<<3=a+3
351 pand m1, m3 ; cache f2<<3
355 pcmpgtb m0, m2 ; which values are <0?
356 psubb m3, m2 ; -f1<<3
362 paddusb m4, m3 ; q0-f1
366 pcmpgtb m0, m1 ; which values are <0?
367 psubb m3, m1 ; -f2<<3
373 psubusb m6, m3 ; p0+f2
378 mova [dst1q+mstrideq], m6
381 SBUTTERFLY bw, 6, 4, 0
383 %if mmsize == 16 ; sse2
387 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
388 lea dst2q, [dst3q+mstrideq+1]
392 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
394 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
398 %if mmsize == 8 ; mmx/mmxext
401 add dst1q, 8 ; advance 8 cols = pixels
403 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
; instantiate the simple loop filter for each ISA; the INIT_MMX/INIT_XMM
; cpuflag directives separating these groups are elided from this chunk, so
; which group maps to which ISA cannot be determined here.
; 2nd macro argument = GPR count passed to cglobal (h variants need one more
; register than v for the transpose addressing)
415 SIMPLE_LOOPFILTER v, 4
416 SIMPLE_LOOPFILTER h, 5
418 SIMPLE_LOOPFILTER v, 4
419 SIMPLE_LOOPFILTER h, 5
423 SIMPLE_LOOPFILTER v, 3
424 SIMPLE_LOOPFILTER h, 5
426 SIMPLE_LOOPFILTER v, 3
427 SIMPLE_LOOPFILTER h, 5
429 SIMPLE_LOOPFILTER h, 5
431 ;-----------------------------------------------------------------------------
432 ; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
433 ; int flimE, int flimI, int hev_thr);
434 ;-----------------------------------------------------------------------------
436 %macro INNER_LOOPFILTER 2
438 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
439 %ifidn %1, v ; [3]=hev() result
440 %define stack_size mmsize * -4
441 %else ; h ; extra storage space for transposes
442 %define stack_size mmsize * -5
447 cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
449 cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
457 ; splat function arguments
458 SPLATB_REG m0, flimEq, m7 ; E
459 SPLATB_REG m1, flimIq, m7 ; I
460 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
462 %define m_flimE [rsp]
463 %define m_flimI [rsp+mmsize]
464 %define m_hevthr [rsp+mmsize*2]
465 %define m_maskres [rsp+mmsize*3]
466 %define m_p0backup [rsp+mmsize*3]
467 %define m_q0backup [rsp+mmsize*4]
476 %define m_maskres m12
477 %define m_p0backup m12
478 %define m_q0backup m8
480 ; splat function arguments
481 SPLATB_REG m_flimE, flimEq, m7 ; E
482 SPLATB_REG m_flimI, flimIq, m7 ; I
483 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
487 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
489 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
492 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
494 mov strideq, mstrideq
497 lea dst1q, [dst1q+strideq*4-4]
499 lea dst8q, [dst8q+strideq*4-4]
507 lea dst2q, [dst1q+strideq]
509 %if %2 == 8 && mmsize == 16
514 movrow m0, [dst1q+mstrideq*4] ; p3
515 movrow m1, [dst2q+mstrideq*4] ; p2
516 movrow m2, [dst1q+mstrideq*2] ; p1
517 movrow m5, [dst2q] ; q1
518 movrow m6, [dst2q+ strideq*1] ; q2
519 movrow m7, [dst2q+ strideq*2] ; q3
520 %if mmsize == 16 && %2 == 8
521 movhps m0, [dst8q+mstrideq*4]
522 movhps m2, [dst8q+mstrideq*2]
524 movhps m1, [dst8q+mstrideq*4]
526 movhps m6, [dst8q+ strideq ]
527 movhps m7, [dst8q+ strideq*2]
530 %elif mmsize == 8 ; mmx/mmxext (h)
531 ; read 8 rows of 8px each
532 movu m0, [dst1q+mstrideq*4]
533 movu m1, [dst2q+mstrideq*4]
534 movu m2, [dst1q+mstrideq*2]
535 movu m3, [dst1q+mstrideq ]
538 movu m6, [dst2q+ strideq ]
541 TRANSPOSE4x4B 0, 1, 2, 3, 7
543 movu m7, [dst2q+ strideq*2]
544 TRANSPOSE4x4B 4, 5, 6, 7, 1
545 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
546 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
547 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
549 mova m_q0backup, m2 ; store q0
550 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
551 mova m_p0backup, m5 ; store p0
558 lea dst8q, [dst1q+ strideq*8]
561 ; read 16 rows of 8px each, interleave
562 movh m0, [dst1q+mstrideq*4]
563 movh m1, [dst8q+mstrideq*4]
564 movh m2, [dst1q+mstrideq*2]
565 movh m5, [dst8q+mstrideq*2]
566 movh m3, [dst1q+mstrideq ]
567 movh m6, [dst8q+mstrideq ]
570 punpcklbw m0, m1 ; A/I
571 punpcklbw m2, m5 ; C/K
572 punpcklbw m3, m6 ; D/L
573 punpcklbw m4, m7 ; E/M
576 movh m1, [dst2q+mstrideq*4]
577 movh m6, [dst8q+mstrideq*4]
580 punpcklbw m1, m6 ; B/J
581 punpcklbw m5, m7 ; F/N
582 movh m6, [dst2q+ strideq ]
583 movh m7, [dst8q+ strideq ]
584 punpcklbw m6, m7 ; G/O
587 TRANSPOSE4x4B 0, 1, 2, 3, 7
593 movh m7, [dst2q+ strideq*2]
594 movh m1, [dst8q+ strideq*2]
595 punpcklbw m7, m1 ; H/P
596 TRANSPOSE4x4B 4, 5, 6, 7, 1
597 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
598 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
599 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
605 mova m_q0backup, m2 ; store q0
607 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
611 mova m_p0backup, m5 ; store p0
619 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
622 psubusb m4, m0 ; p2-p3
623 psubusb m0, m1 ; p3-p2
624 por m0, m4 ; abs(p3-p2)
628 psubusb m4, m1 ; p1-p2
629 psubusb m1, m2 ; p2-p1
630 por m1, m4 ; abs(p2-p1)
634 psubusb m4, m7 ; q2-q3
635 psubusb m7, m6 ; q3-q2
636 por m7, m4 ; abs(q3-q2)
640 psubusb m4, m6 ; q1-q2
641 psubusb m6, m5 ; q2-q1
642 por m6, m4 ; abs(q2-q1)
644 %if notcpuflag(mmxext)
651 pcmpeqb m0, m3 ; abs(p3-p2) <= I
652 pcmpeqb m1, m3 ; abs(p2-p1) <= I
653 pcmpeqb m7, m3 ; abs(q3-q2) <= I
654 pcmpeqb m6, m3 ; abs(q2-q1) <= I
664 ; normal_limit and high_edge_variance for p1-p0, q1-q0
665 SWAP 7, 3 ; now m7 is zero
667 movrow m3, [dst1q+mstrideq ] ; p0
668 %if mmsize == 16 && %2 == 8
669 movhps m3, [dst8q+mstrideq ]
681 psubusb m1, m3 ; p1-p0
682 psubusb m6, m2 ; p0-p1
683 por m1, m6 ; abs(p1-p0)
684 %if notcpuflag(mmxext)
688 pcmpeqb m1, m7 ; abs(p1-p0) <= I
689 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
693 pmaxub m0, m1 ; max_I
694 SWAP 1, 4 ; max_hev_thresh
697 SWAP 6, 4 ; now m6 is I
699 movrow m4, [dst1q] ; q0
700 %if mmsize == 16 && %2 == 8
712 psubusb m1, m5 ; q0-q1
713 psubusb m7, m4 ; q1-q0
714 por m1, m7 ; abs(q1-q0)
715 %if notcpuflag(mmxext)
720 pcmpeqb m1, m6 ; abs(q1-q0) <= I
721 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
723 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
731 pcmpeqb m0, m7 ; max(abs(..)) <= I
732 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
737 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
743 mova m6, m4 ; keep copies of p0/q0 around for later use
745 psubusb m1, m4 ; p0-q0
746 psubusb m6, m3 ; q0-p0
747 por m1, m6 ; abs(q0-p0)
748 paddusb m1, m1 ; m1=2*abs(q0-p0)
754 psubusb m7, m5 ; p1-q1
755 psubusb m6, m2 ; q1-p1
756 por m7, m6 ; abs(q1-p1)
759 psrlq m7, 1 ; abs(q1-p1)/2
760 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
762 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
763 pand m0, m7 ; normal_limit result
765 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
766 %ifdef m8 ; x86-64 && sse2
769 %else ; x86-32 or mmx/mmxext
770 %define m_pb_80 [pb_80]
776 psubsb m1, m7 ; (signed) q0-p0
781 psubsb m6, m7 ; (signed) p1-q1
786 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
805 paddusb m3, m1 ; p0+f2
816 paddusb m4, m1 ; q0-f1
823 %if notcpuflag(mmxext)
830 %if notcpuflag(mmxext)
844 paddusb m5, m1 ; q1-a
845 paddusb m2, m0 ; p1+a
849 movrow [dst1q+mstrideq*2], m2
850 movrow [dst1q+mstrideq ], m3
852 movrow [dst1q+ strideq ], m5
853 %if mmsize == 16 && %2 == 8
854 movhps [dst8q+mstrideq*2], m2
855 movhps [dst8q+mstrideq ], m3
857 movhps [dst8q+ strideq ], m5
864 TRANSPOSE4x4B 2, 3, 4, 5, 6
866 %if mmsize == 8 ; mmx/mmxext (h)
867 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
869 lea dst8q, [dst8q+mstrideq +2]
870 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
884 lea dst1q, [dst1q+ strideq*8-2]
; instantiate the inner loop filter: four ISA groups (INIT_* directives
; elided from this chunk), each providing v/h x luma(16)/chroma(8) variants.
; 2nd macro argument = block size: 16 -> vp8_*_loop_filter16y_inner,
; 8 -> vp8_*_loop_filter8uv_inner (see the cglobal lines in the macro)
899 INNER_LOOPFILTER v, 16
900 INNER_LOOPFILTER h, 16
901 INNER_LOOPFILTER v, 8
902 INNER_LOOPFILTER h, 8
905 INNER_LOOPFILTER v, 16
906 INNER_LOOPFILTER h, 16
907 INNER_LOOPFILTER v, 8
908 INNER_LOOPFILTER h, 8
912 INNER_LOOPFILTER v, 16
913 INNER_LOOPFILTER h, 16
914 INNER_LOOPFILTER v, 8
915 INNER_LOOPFILTER h, 8
918 INNER_LOOPFILTER v, 16
919 INNER_LOOPFILTER h, 16
920 INNER_LOOPFILTER v, 8
921 INNER_LOOPFILTER h, 8
923 ;-----------------------------------------------------------------------------
924 ; void ff_vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
925 ; int flimE, int flimI, int hev_thr);
926 ;-----------------------------------------------------------------------------
928 %macro MBEDGE_LOOPFILTER 2
930 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
931 %if mmsize == 16 ; [3]=hev() result
932 ; [4]=filter tmp result
933 ; [5]/[6] = p2/q2 backup
934 ; [7]=lim_res sign result
935 %define stack_size mmsize * -7
936 %else ; 8 ; extra storage space for transposes
937 %define stack_size mmsize * -8
942 cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
944 cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
952 ; splat function arguments
953 SPLATB_REG m0, flimEq, m7 ; E
954 SPLATB_REG m1, flimIq, m7 ; I
955 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
957 %define m_flimE [rsp]
958 %define m_flimI [rsp+mmsize]
959 %define m_hevthr [rsp+mmsize*2]
960 %define m_maskres [rsp+mmsize*3]
961 %define m_limres [rsp+mmsize*4]
962 %define m_p0backup [rsp+mmsize*3]
963 %define m_q0backup [rsp+mmsize*4]
964 %define m_p2backup [rsp+mmsize*5]
965 %define m_q2backup [rsp+mmsize*6]
967 %define m_limsign [rsp]
969 %define m_limsign [rsp+mmsize*7]
975 %else ; sse2 on x86-64
979 %define m_maskres m12
981 %define m_p0backup m12
982 %define m_q0backup m8
983 %define m_p2backup m13
984 %define m_q2backup m14
987 ; splat function arguments
988 SPLATB_REG m_flimE, flimEq, m7 ; E
989 SPLATB_REG m_flimI, flimIq, m7 ; I
990 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
994 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
996 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
999 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
1001 mov strideq, mstrideq
1004 lea dst1q, [dst1q+strideq*4-4]
1005 %if %2 == 8 ; chroma
1006 lea dst8q, [dst8q+strideq*4-4]
1014 lea dst2q, [dst1q+ strideq ]
1016 %if %2 == 8 && mmsize == 16
1021 movrow m0, [dst1q+mstrideq*4] ; p3
1022 movrow m1, [dst2q+mstrideq*4] ; p2
1023 movrow m2, [dst1q+mstrideq*2] ; p1
1024 movrow m5, [dst2q] ; q1
1025 movrow m6, [dst2q+ strideq ] ; q2
1026 movrow m7, [dst2q+ strideq*2] ; q3
1027 %if mmsize == 16 && %2 == 8
1028 movhps m0, [dst8q+mstrideq*4]
1029 movhps m2, [dst8q+mstrideq*2]
1031 movhps m1, [dst8q+mstrideq*4]
1033 movhps m6, [dst8q+ strideq ]
1034 movhps m7, [dst8q+ strideq*2]
1037 %elif mmsize == 8 ; mmx/mmxext (h)
1038 ; read 8 rows of 8px each
1039 movu m0, [dst1q+mstrideq*4]
1040 movu m1, [dst2q+mstrideq*4]
1041 movu m2, [dst1q+mstrideq*2]
1042 movu m3, [dst1q+mstrideq ]
1045 movu m6, [dst2q+ strideq ]
1048 TRANSPOSE4x4B 0, 1, 2, 3, 7
1050 movu m7, [dst2q+ strideq*2]
1051 TRANSPOSE4x4B 4, 5, 6, 7, 1
1052 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1053 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1054 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1056 mova m_q0backup, m2 ; store q0
1057 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1058 mova m_p0backup, m5 ; store p0
1065 lea dst8q, [dst1q+ strideq*8 ]
1068 ; read 16 rows of 8px each, interleave
1069 movh m0, [dst1q+mstrideq*4]
1070 movh m1, [dst8q+mstrideq*4]
1071 movh m2, [dst1q+mstrideq*2]
1072 movh m5, [dst8q+mstrideq*2]
1073 movh m3, [dst1q+mstrideq ]
1074 movh m6, [dst8q+mstrideq ]
1077 punpcklbw m0, m1 ; A/I
1078 punpcklbw m2, m5 ; C/K
1079 punpcklbw m3, m6 ; D/L
1080 punpcklbw m4, m7 ; E/M
1083 movh m1, [dst2q+mstrideq*4]
1084 movh m6, [dst8q+mstrideq*4]
1087 punpcklbw m1, m6 ; B/J
1088 punpcklbw m5, m7 ; F/N
1089 movh m6, [dst2q+ strideq ]
1090 movh m7, [dst8q+ strideq ]
1091 punpcklbw m6, m7 ; G/O
1094 TRANSPOSE4x4B 0, 1, 2, 3, 7
1100 movh m7, [dst2q+ strideq*2]
1101 movh m1, [dst8q+ strideq*2]
1102 punpcklbw m7, m1 ; H/P
1103 TRANSPOSE4x4B 4, 5, 6, 7, 1
1104 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1105 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1106 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1112 mova m_q0backup, m2 ; store q0
1114 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1118 mova m_p0backup, m5 ; store p0
1126 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1129 psubusb m4, m0 ; p2-p3
1130 psubusb m0, m1 ; p3-p2
1131 por m0, m4 ; abs(p3-p2)
1135 psubusb m4, m1 ; p1-p2
1137 psubusb m1, m2 ; p2-p1
1138 por m1, m4 ; abs(p2-p1)
1142 psubusb m4, m7 ; q2-q3
1143 psubusb m7, m6 ; q3-q2
1144 por m7, m4 ; abs(q3-q2)
1148 psubusb m4, m6 ; q1-q2
1150 psubusb m6, m5 ; q2-q1
1151 por m6, m4 ; abs(q2-q1)
1153 %if notcpuflag(mmxext)
1160 pcmpeqb m0, m3 ; abs(p3-p2) <= I
1161 pcmpeqb m1, m3 ; abs(p2-p1) <= I
1162 pcmpeqb m7, m3 ; abs(q3-q2) <= I
1163 pcmpeqb m6, m3 ; abs(q2-q1) <= I
1173 ; normal_limit and high_edge_variance for p1-p0, q1-q0
1174 SWAP 7, 3 ; now m7 is zero
1176 movrow m3, [dst1q+mstrideq ] ; p0
1177 %if mmsize == 16 && %2 == 8
1178 movhps m3, [dst8q+mstrideq ]
1190 psubusb m1, m3 ; p1-p0
1191 psubusb m6, m2 ; p0-p1
1192 por m1, m6 ; abs(p1-p0)
1193 %if notcpuflag(mmxext)
1196 psubusb m6, m_hevthr
1197 pcmpeqb m1, m7 ; abs(p1-p0) <= I
1198 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1202 pmaxub m0, m1 ; max_I
1203 SWAP 1, 4 ; max_hev_thresh
1206 SWAP 6, 4 ; now m6 is I
1208 movrow m4, [dst1q] ; q0
1209 %if mmsize == 16 && %2 == 8
1221 psubusb m1, m5 ; q0-q1
1222 psubusb m7, m4 ; q1-q0
1223 por m1, m7 ; abs(q1-q0)
1224 %if notcpuflag(mmxext)
1227 psubusb m7, m_hevthr
1229 pcmpeqb m1, m6 ; abs(q1-q0) <= I
1230 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1232 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1239 psubusb m6, m_hevthr
1240 pcmpeqb m0, m7 ; max(abs(..)) <= I
1241 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1246 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1252 mova m6, m4 ; keep copies of p0/q0 around for later use
1254 psubusb m1, m4 ; p0-q0
1255 psubusb m6, m3 ; q0-p0
1256 por m1, m6 ; abs(q0-p0)
1257 paddusb m1, m1 ; m1=2*abs(q0-p0)
1263 psubusb m7, m5 ; p1-q1
1264 psubusb m6, m2 ; q1-p1
1265 por m7, m6 ; abs(q1-p1)
1268 psrlq m7, 1 ; abs(q1-p1)/2
1269 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1271 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1272 pand m0, m7 ; normal_limit result
1274 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1275 %ifdef m8 ; x86-64 && sse2
1278 %else ; x86-32 or mmx/mmxext
1279 %define m_pb_80 [pb_80]
1285 psubsb m1, m7 ; (signed) q0-p0
1290 psubsb m6, m7 ; (signed) p1-q1
1297 mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
1304 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
1322 paddusb m3, m1 ; p0+f2
1333 paddusb m4, m1 ; q0-f1
1335 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
1348 pcmpgtb m0, m1 ; which are negative
1350 punpcklbw m6, m7 ; interleave with "1" for rounding
1353 punpcklbw m6, m0 ; signed byte->word
1363 SWAP 0, 10 ; don't lose lim_sign copy
1376 mova m_maskres, m6 ; backup for later in filter
1385 packsswb m6, m1 ; a0
1391 mova m6, [pb_18_63] ; pipelining
1395 paddusb m3, m0 ; p0+a0
1396 psubusb m4, m0 ; q0-a0
1425 packsswb m6, m1 ; a1
1435 paddusb m2, m0 ; p1+a1
1436 psubusb m5, m0 ; q1-a1
1470 packsswb m6, m1 ; a1
1484 paddusb m1, m7 ; p1+a1
1485 psubusb m6, m7 ; q1-a1
1489 movrow [dst2q+mstrideq*4], m1
1490 movrow [dst1q+mstrideq*2], m2
1491 movrow [dst1q+mstrideq ], m3
1494 movrow [dst2q+ strideq ], m6
1495 %if mmsize == 16 && %2 == 8
1497 movhps [dst8q+mstrideq*2], m1
1498 movhps [dst8q+mstrideq ], m2
1502 movhps [dst8q+ strideq ], m5
1503 movhps [dst8q+ strideq*2], m6
1510 TRANSPOSE4x4B 1, 2, 3, 4, 0
1511 SBUTTERFLY bw, 5, 6, 0
1513 %if mmsize == 8 ; mmx/mmxext (h)
1514 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
1516 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
1518 lea dst8q, [dst8q+mstrideq+1]
1519 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
1520 lea dst1q, [dst2q+mstrideq+4]
1521 lea dst8q, [dst8q+mstrideq+4]
1525 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
1527 lea dst2q, [dst8q+ strideq ]
1529 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
1534 %if %2 == 8 ; chroma
1543 lea dst1q, [dst1q+ strideq*8-5]
1551 %else ; mmsize == 16
; instantiate the macroblock-edge loop filter: ISA groups separated by
; INIT_* directives that are elided from this chunk.
; 2nd macro argument = block size: 16 -> vp8_*_loop_filter16y_mbedge,
; 8 -> vp8_*_loop_filter8uv_mbedge (see the cglobal lines in the macro);
; the final h-only pair presumably belongs to an ISA where only the
; horizontal transposed path differs — TODO confirm against elided INIT lines
1558 MBEDGE_LOOPFILTER v, 16
1559 MBEDGE_LOOPFILTER h, 16
1560 MBEDGE_LOOPFILTER v, 8
1561 MBEDGE_LOOPFILTER h, 8
1564 MBEDGE_LOOPFILTER v, 16
1565 MBEDGE_LOOPFILTER h, 16
1566 MBEDGE_LOOPFILTER v, 8
1567 MBEDGE_LOOPFILTER h, 8
1571 MBEDGE_LOOPFILTER v, 16
1572 MBEDGE_LOOPFILTER h, 16
1573 MBEDGE_LOOPFILTER v, 8
1574 MBEDGE_LOOPFILTER h, 8
1577 MBEDGE_LOOPFILTER v, 16
1578 MBEDGE_LOOPFILTER h, 16
1579 MBEDGE_LOOPFILTER v, 8
1580 MBEDGE_LOOPFILTER h, 8
1583 MBEDGE_LOOPFILTER h, 16
1584 MBEDGE_LOOPFILTER h, 8