1 ;*****************************************************************************
2 ;* x86-optimized functions for removegrain filter
4 ;* Copyright (C) 2015 James Darnley
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License along
19 ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 ;*****************************************************************************
28 %include "libavutil/x86/x86util.asm"
; Rounded 16.16 fixed-point reciprocal of 9: multiplying a word by this with
; pmulhuw (which keeps the high 16 bits of the product) approximates an
; unsigned divide by 9 — used to average the 3x3 pixel neighbourhood sum
; (see the pmulhuw m1, [pw_div9] in mode 20).
34 pw_div9: times 16 dw ((1<<16)+4)/9
38 ;*** Preprocessor helpers
40 %define a1 srcq+stride_n-1
41 %define a2 srcq+stride_n
42 %define a3 srcq+stride_n+1
46 %define a6 srcq+stride_p-1
47 %define a7 srcq+stride_p
48 %define a8 srcq+stride_p+1
50 ; %1 dest simd register
51 ; %2 source memory location
52 ; %3 zero location (simd register/memory)
70 ; %1 zero location (simd register/memory)
71 %macro LOAD_SQUARE_16 1
84 ; %2 simd register to hold maximums
85 ; %3 simd register to hold minimums
86 ; %4 temp location (simd register/memory)
94 SORT_PAIR ub, m1, m8, m9
95 SORT_PAIR ub, m2, m7, m10
96 SORT_PAIR ub, m3, m6, m11
97 SORT_PAIR ub, m4, m5, m12
101 %macro SORT_AXIS_16 0
102 SORT_PAIR sw, m1, m8, m9
103 SORT_PAIR sw, m2, m7, m10
104 SORT_PAIR sw, m3, m6, m11
105 SORT_PAIR sw, m4, m5, m12
108 ; The loop doesn't need to do all the iterations. It could stop when the right
109 ; pixels are in the right registers.
116 SORT_PAIR ub, m %+ i , m %+ j , m9
124 ; %1 dest simd register
125 ; %2 source (simd register/memory)
126 ; %3 temp simd register
134 ; %1 dest simd register
135 ; %2 source (simd register/memory)
136 ; %3 temp simd register
144 ; %1 simd register that holds the "false" values and will hold the result
145 ; %2 simd register that holds the "true" values
146 ; %3 location (simd register/memory) that hold the mask
149 vpblendvb %1, %1, %2, %3
161 cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
164 %define stride_p strideq
211 cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
214 %define stride_p strideq
230 cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
233 %define stride_p strideq
249 cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
252 %define stride_p strideq
268 cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
271 %define stride_p strideq
293 ABS_DIFF m9, m0, m1 ; c1
294 ABS_DIFF m10, m0, m2 ; c2
295 ABS_DIFF m11, m0, m3 ; c3
296 ABS_DIFF m12, m0, m4 ; c4
300 pminub m9, m12 ; mindiff
306 ; Notice the order here: c1, c3, c2, c4
318 cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
321 %define stride_p strideq
324 ; Some register saving suggestions: the zero can be somewhere other than a
325 ; register, the center pixels could be on the stack.
336 CLIPW m9, m1, m8 ; clip1
337 CLIPW m10, m2, m7 ; clip2
338 CLIPW m11, m3, m6 ; clip3
339 CLIPW m12, m4, m5 ; clip4
350 ABS_DIFF_W m1, m0, m13
351 ABS_DIFF_W m2, m0, m14
352 ABS_DIFF_W m3, m0, m13
353 ABS_DIFF_W m4, m0, m14
362 ; As the differences (d1..d4) can only be positive, there is no need to
363 ; clip to zero. Also, the maximum positive value is less than 768.
381 sub pixelsd, mmsize/2
385 ; This is just copy-pasted straight from mode 6 with the left shifts removed.
386 cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
389 %define stride_p strideq
392 ; Can this be done without unpacking?
403 CLIPW m9, m1, m8 ; clip1
404 CLIPW m10, m2, m7 ; clip2
405 CLIPW m11, m3, m6 ; clip3
406 CLIPW m12, m4, m5 ; clip4
417 ABS_DIFF_W m1, m0, m13
418 ABS_DIFF_W m2, m0, m14
419 ABS_DIFF_W m3, m0, m13
420 ABS_DIFF_W m4, m0, m14
442 sub pixelsd, mmsize/2
446 ; This is just copy-pasted straight from mode 6 with a few changes.
447 cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
450 %define stride_p strideq
462 CLIPW m9, m1, m8 ; clip1
463 CLIPW m10, m2, m7 ; clip2
464 CLIPW m11, m3, m6 ; clip3
465 CLIPW m12, m4, m5 ; clip4
480 ABS_DIFF_W m1, m0, m13
481 ABS_DIFF_W m2, m0, m14
482 ABS_DIFF_W m3, m0, m13
483 ABS_DIFF_W m4, m0, m14
488 ; As the differences (d1..d4) can only be positive, there is no need to
489 ; clip to zero. Also, the maximum positive value is less than 768.
507 sub pixelsd, mmsize/2
511 cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
514 %define stride_p strideq
525 CLIPUB m9, m1, m8 ; clip1
526 CLIPUB m10, m2, m7 ; clip2
527 CLIPUB m11, m3, m6 ; clip3
528 CLIPUB m12, m4, m5 ; clip4
555 cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
558 %define stride_p strideq
568 movu m3, [a5] ; load pixel
570 ABS_DIFF m4, m0, m7 ; absolute difference from center
571 pminub m1, m4 ; mindiff
572 pcmpeqb m4, m1 ; if (difference == mindiff)
573 BLEND m2, m3, m4 ; return pixel
624 cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
627 %define stride_p strideq
662 sub pixelsd, mmsize/2
666 cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
669 %define stride_p strideq
705 cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
708 %define stride_p strideq
718 ABS_DIFF_W m9, m8, m12
719 ABS_DIFF_W m10, m7, m13
720 ABS_DIFF_W m11, m6, m14
738 SORT_PAIR ub, m1, m8, m0
739 SORT_PAIR ub, m2, m7, m9
740 SORT_PAIR ub, m3, m6, m14
754 sub pixelsd, mmsize/2
758 cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
761 %define stride_p strideq
790 cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
793 %define stride_p strideq
802 ABS_DIFF m10, m0, m12
803 pmaxub m9, m10 ; m9 = d1
807 ABS_DIFF m10, m0, m12
808 ABS_DIFF m11, m0, m13
809 pmaxub m10, m11 ; m10 = d2
813 ABS_DIFF m11, m0, m13
814 ABS_DIFF m12, m0, m14
815 pmaxub m11, m12 ; m11 = d3
819 ABS_DIFF m12, m0, m14
820 ABS_DIFF m13, m0, m15
821 pmaxub m12, m13 ; m12 = d4
826 pminub m13, m12 ; m13 = mindiff
840 CLIPUB m13, m1, m8 ; m13 = ret...d1
849 por m14, m11 ; m14 = ret...d3
858 por m15, m10 ; m15 = ret...d2
867 por m1, m12 ; m1 = ret...d4
877 cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
880 %define stride_p strideq
913 sub pixelsd, mmsize/2
917 cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
920 %define stride_p strideq
949 pmulhuw m1, [pw_div9]
956 sub pixelsd, mmsize/2
960 cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
963 %define stride_p strideq
1000 punpckhbw m4, m2, m0
1002 punpckhbw m5, m3, m0
1016 punpckhbw m4, m2, m0
1018 punpckhbw m5, m3, m0
1039 cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
1042 %define stride_p strideq
1043 %define stride_n r4q
1081 cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
1084 %define stride_p strideq
1085 %define stride_n r4q
1096 psubw m9, m1 ; linediff1
1097 psubw m10, m2 ; linediff2
1098 psubw m11, m3 ; linediff3
1099 psubw m12, m4 ; linediff4
1123 pminsw m10, m14 ; u2
1138 sub pixelsd, mmsize/2
1142 cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
1145 %define stride_p strideq
1146 %define stride_n r4q
1158 psubw m9, m1 ; linediff1
1159 psubw m10, m2 ; linediff2
1160 psubw m11, m3 ; linediff3
1161 psubw m12, m4 ; linediff4
1163 psubw m1, [rsp] ; td1
1164 psubw m2, [rsp] ; td2
1165 psubw m3, [rsp] ; td3
1166 psubw m4, [rsp] ; td4
1195 pminsw m10, m13 ; u2
1196 pminsw m11, m14 ; u3
1197 pminsw m12, m15 ; u4
1201 pmaxsw m1, m3 ; d without max(d,0)
1202 pmaxsw m9, m11 ; u without max(u,0)
1215 sub pixelsd, mmsize/2