1 ;*****************************************************************************
2 ;* x86-optimized functions for gblur filter
4 ;* This file is part of FFmpeg.
6 ;* FFmpeg is free software; you can redistribute it and/or
7 ;* modify it under the terms of the GNU Lesser General Public
8 ;* License as published by the Free Software Foundation; either
9 ;* version 2.1 of the License, or (at your option) any later version.
11 ;* FFmpeg is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;* Lesser General Public License for more details.
16 ;* You should have received a copy of the GNU Lesser General Public
17 ;* License along with FFmpeg; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 ;******************************************************************************
21 %include "libavutil/x86/x86util.asm"
25 ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
26 ; float nu, float bscale)
;------------------------------------------------------------------------------
; horiz_slice -- in-place horizontal recursive filter over rows of floats.
;
; C-side prototype, as given by the comment preceding this block:
;   void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
;                            float nu, float bscale)
;
; Register roles visible in this block:
;   m0      nu (multiplier of the recurrences; how it is loaded is not shown)
;   m1      multiplier applied to the boundary sample (bscale per the comments)
;   m2..m4  nu^2, nu^3, nu^4, derived from m0 below
;   m5      carried sample from the previous step (p0 in the comments)
;   m6      accumulator s for the four samples being updated
;   m7      neighbour vector built from m6 and m5
;   m8      passed as the last FMULADD_PS operand
;           (presumably the macro's scratch register -- confirm in x86util.asm)
;
; NOTE(review): loop labels, counters and branches are not visible in this
; view; the comments below only describe the instructions that are shown.
;------------------------------------------------------------------------------
; NOTE(review): two cglobal declarations for the same symbol appear back to
; back; the second additionally names nu and bscale as arguments. Presumably
; they belong to different preprocessor branches -- confirm.
30 cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
32 cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
; Re-declare the register names without nu/bscale, so the named registers are
; ptr, width, height, steps, x, y, step, stride, remain.
37 DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
; Widen the 32-bit width to the full register so it can be used in addressing.
39 movsxdifnidn widthq, widthd
; Precompute the powers of nu needed by the 4-wide unrolled recurrence.
41 mulss m2, m0, m0 ; nu ^ 2
42 mulss m3, m2, m0 ; nu ^ 3
43 mulss m4, m3, m0 ; nu ^ 4
49 ; w = w - ((w - 1) & 3)
; Scale one sample in place: ptr[x] = m1 * ptr[x]. The product also stays in
; m5, which the vector code below reads as p0.
65 mulss m5, m1, [ptrq + xq * 4]
66 movss [ptrq + xq * 4], m5
69 ; Here we are vectorizing the c version by 4
70 ; for (x = 1; x < width; x++)
71 ; ptr[x] += nu * ptr[x - 1];
72 ; let p0 stands for ptr[x-1], the data from last loop
73 ; and [p1,p2,p3,p4] be the vector data for this loop.
74 ; Unrolling the loop, we get:
; p1' = p1 + p0*nu
76 ; p2' = p2 + p1*nu + p0*nu^2
77 ; p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
78 ; p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
79 ; so we can do it in simd:
80 ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
81 ; [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
; [0,0,0,p0]*nu^4
; Rightward vector step: load four samples, build the shifted neighbour
; vectors by byte-shifting one float (4 bytes) at a time, and accumulate.
85 movu m6, [ptrq + xq * 4] ; s = [p1,p2,p3,p4]
86 pslldq m7, m6, 4 ; [0, p1,p2,p3]
87 movss m7, m5 ; [p0,p1,p2,p3]
88 FMULADD_PS m6, m7, m0, m6, m8 ; s += [p0,p1,p2,p3] * nu
89 pslldq m7, 4 ; [0,p0,p1,p2]
90 FMULADD_PS m6, m7, m2, m6, m8 ; s += [0,p0,p1,p2] * nu^2
; NOTE(review): the comments on the next two FMULADD_PS lines describe m7
; shifted by one more lane each time, but no shift of m7 is visible between
; them in this view -- confirm against the complete file.
92 FMULADD_PS m6, m7, m3, m6, m8 ; s += [0,0,p0,p1] * nu^3
94 FMULADD_PS m6, m7, m4, m6, m8 ; s += [0,0,0,p0] * nu^4
95 movu [ptrq + xq * 4], m6
; Broadcast lane 3 of the result (p4') into m5: it is p0 for the next group.
96 shufps m5, m6, m6, q3333
; Scalar rightward step.
; NOTE(review): the comment says "+= nu * ptr[x-1]", but no multiply by nu is
; visible between the load and the add in this view -- confirm.
106 ; ptr[x] += nu * ptr[x-1]
107 movss m5, [ptrq + 4*xq - 4]
109 addss m5, [ptrq + 4*xq]
110 movss [ptrq + 4*xq], m5
; Scale the sample at index x by m1 before the leftward pass; the product
; stays in m5.
115 ; ptr[width - 1] *= bscale
117 mulss m5, m1, [ptrq + 4*xq]
118 movss [ptrq + 4*xq], m5
123 ; ptr[x - 1] += nu * ptr[x];
124 ; The idea here is basically the same as filter rightwards.
125 ; But we need to take care as the data layout is different.
126 ; Let p0 stands for the ptr[x], which is the data from last loop.
127 ; The way we do it in simd as below:
128 ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
129 ; + [p-3, p-2, p-1, p0] * nu
130 ; + [p-2, p-1, p0, 0] * nu^2
131 ; + [p-1, p0, 0, 0] * nu^3
132 ; + [p0, 0, 0, 0] * nu^4
; Leftward vector step: mirror image of the rightward one. The neighbour
; vector is shifted right and lane 3 is taken from m5 (blend mask 0x8).
135 movu m6, [ptrq + xq * 4] ; s = [p-4, p-3, p-2, p-1]
136 psrldq m7, m6, 4 ; [p-3, p-2, p-1, 0 ]
137 blendps m7, m5, 0x8 ; [p-3, p-2, p-1, p0 ]
138 FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu
; NOTE(review): the comments on the next three FMULADD_PS lines describe m7
; shifted right by one more lane each time, but no shift of m7 is visible
; between them in this view -- confirm against the complete file.
140 FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0, 0] * nu^2
142 FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0, 0, 0] * nu^3
144 FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0, 0, 0, 0] * nu^4
145 movu [ptrq + xq * 4], m6
146 shufps m5, m6, m6, 0 ; m5 = [p-4', p-4', p-4', p-4']
; Scalar leftward step.
; NOTE(review): as in the rightward scalar step, the multiply by nu implied
; by the comment is not visible in this view -- confirm.
154 ; ptr[x-1] += nu * ptr[x]
155 movss m5, [ptrq + 4*xq]
157 addss m5, [ptrq + 4*xq - 4]
158 movss [ptrq + 4*xq - 4], m5
164 ; reset aligned width for next line
;------------------------------------------------------------------------------
; POSTSCALE_SLICE -- macro whose body declares postscale_slice and multiplies
; packed floats read from ptr by postscale, storing the result back in place.
;
; Arguments named by cglobal: ptr, length, postscale, min, max.
; Vector registers used in the visible lines:
;   m0  postscale broadcast to every lane
;   m1  min broadcast to every lane
;   m2  max broadcast to every lane
;   m3  data loaded from / stored to [ptrq + lengthq]
;
; lengthq is added to ptrq without a scale factor, so it acts as a byte
; offset in the addressing below.
;
; NOTE(review): the matching %endmacro, the loop control and any use of m1/m2
; (presumably a clamp to [min, max]) are not visible in this view -- confirm
; against the complete file.
;------------------------------------------------------------------------------
187 %macro POSTSCALE_SLICE 0
188 cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
; Broadcast the scalar arguments so they can be applied to whole vectors.
193 VBROADCASTSS m0, postscalem
194 VBROADCASTSS m1, minm
195 VBROADCASTSS m2, maxm
; NOTE(review): max is broadcast into m2 a second time here; presumably this
; line and the one above sit in different preprocessor branches -- confirm.
201 VBROADCASTSS m2, maxm
; m3 = postscale * data (three-operand form with a memory source).
210 mulps m3, m0, [ptrq + lengthq]
; NOTE(review): this plain load writes m3 directly after the multiply above;
; presumably the two lines are alternatives in different preprocessor
; branches -- confirm.
212 movu m3, [ptrq + lengthq]
; Store the vector back to the address it was read from.
217 movu [ptrq+lengthq], m3
228 %if HAVE_AVX2_EXTERNAL