1 ;******************************************************************************
2 ;* linear least squares model
4 ;* Copyright (c) 2013 Loren Merritt
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
; Pull in FFmpeg's shared x86 helper macros (cglobal, ADDPD_MEM, mova,
; cpuflag, struc support, ...).
23 %include "x86util.asm"
; Row length padded by 4 qwords over MAX_VARS — presumably so every
; covariance row start stays vector-aligned; TODO confirm against the
; matching C header (libavutil/lls.h).
28 %define MAX_VARS_ALIGN (MAX_VARS+4)
; Byte distance between consecutive rows of the covariance matrix.
29 %define COVAR_STRIDE MAX_VARS_ALIGN*8
; Address of covariance element at column x, row y, relative to the
; current row pointer held in covarq.
30 %define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
; Tail of the LLSModel struc (the "struc" opening line is outside this
; excerpt): accumulated covariance, solved coefficients, and variances.
33 .covariance: resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
34 .coeff: resq MAX_VARS*MAX_VARS
35 .variance: resq MAX_VARS
;------------------------------------------------------------------------------
; void update_lls(LLSModel *m, const double *var)   -- SSE2 variant
; Accumulates var[i]*var[j] pairwise products into m->covariance, working on
; the (lower-triangular) matrix in 2-wide packed-double blocks.
; cglobal args: 2 named args, 5 GPRs, 8 XMM regs.
; NOTE(review): this excerpt omits the loop labels, branches and several
; instruction runs between the lines below — read alongside the full file.
;------------------------------------------------------------------------------
50 cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
; i = number of independent variables (loop bound).
52 mov id, [ctxq + LLSModel.indep_count]
53 lea varq, [varq + iq*8]
; Diagonal 2x2 block: only 3 of the 4 products are needed (symmetry).
57 ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
58 mova m1, [varq + iq*8]
59 mova m3, [varq + iq*8 + 16]
; covarq trails covar2q by -16 bytes' worth; hence the (-2,y) offsets below.
66 lea covarq, [covar2q + 16]
67 ADDPD_MEM COVAR(-2,0), m0
68 ADDPD_MEM COVAR(-2,1), m1
; Off-diagonal: full 4x4 block of products, accumulated two columns at a time.
73 ; Compute all 16 pairwise products of a 4x4 block
78 ADDPD_MEM COVAR(0,0), m0
79 ADDPD_MEM COVAR(0,1), m1
80 ADDPD_MEM COVAR(0,2), m2
81 ADDPD_MEM COVAR(0,3), m3
; Preload next pair of var[j] values for the following column pair.
82 mova m3, [varq + jq*8 + 16]
87 ADDPD_MEM COVAR(2,0), m0
88 ADDPD_MEM COVAR(2,1), m1
89 ADDPD_MEM COVAR(2,2), m2
90 ADDPD_MEM COVAR(2,3), m3
91 mova m3, [varq + jq*8 + 32]
103 ADDPD_MEM COVAR(0,0), m4
104 ADDPD_MEM COVAR(0,1), m5
105 ADDPD_MEM COVAR(0,2), m6
106 ADDPD_MEM COVAR(0,3), m7
; Advance to the next 4-row band of the covariance matrix.
109 add covar2q, 4*COVAR_STRIDE+32
; Scalar/2-wide tail: from here on the row pointer is covar2q.
115 %define covarq covar2q
117 movsd m0, [varq + iq*8]
119 mulpd m0, [varq + jq*8]
120 ADDPD_MEM COVAR(0,0), m0
122 add covarq, COVAR_STRIDE
;------------------------------------------------------------------------------
; UPDATE_LLS macro body -- AVX variant of update_lls, with an FMA3 path
; selected by cpuflag(fma3) (the %macro and %if openings are outside this
; excerpt; the matching %endmacro is below). Processes 4 doubles per ymm op.
; cglobal args: 3 named args, 6 GPRs, 8 XMM/YMM regs.
;------------------------------------------------------------------------------
129 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
; count = number of independent variables; count2 = count-2 (unrolled bound).
131 mov countd, [ctxq + LLSModel.indep_count]
132 lea count2d, [countq-2]
; Diagonal 4x4 block: only the 10 unique products are accumulated (symmetry).
135 ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
; ymm1 = var[i..i+3]; ymm4..ymm7 = each var[i+k] splatted across a register.
136 mova ymm1, [varq + iq*8]
137 vbroadcastsd ymm4, [varq + iq*8]
138 vbroadcastsd ymm5, [varq + iq*8 + 8]
139 vbroadcastsd ymm6, [varq + iq*8 + 16]
140 vbroadcastsd ymm7, [varq + iq*8 + 24]
; xmm3 = var[i+2..i+3], used for the half-width rows of the triangle.
141 vextractf128 xmm3, ymm1, 1
; FMA3 path: covar += var_vec * broadcast in a single fused op per row.
143 mova ymm0, COVAR(iq ,0)
144 mova xmm2, COVAR(iq+2,2)
145 fmaddpd ymm0, ymm1, ymm4, ymm0
146 fmaddpd xmm2, xmm3, xmm6, xmm2
147 fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
148 fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
149 mova COVAR(iq ,0), ymm0
150 mova COVAR(iq ,1), ymm1
151 mova COVAR(iq+2,2), xmm2
152 mova COVAR(iq+2,3), xmm3
; Non-FMA path: separate multiply then read-modify-write accumulate.
154 vmulpd ymm0, ymm1, ymm4
155 vmulpd ymm1, ymm1, ymm5
156 vmulpd xmm2, xmm3, xmm6
157 vmulpd xmm3, xmm3, xmm7
158 ADDPD_MEM COVAR(iq ,0), ymm0
159 ADDPD_MEM COVAR(iq ,1), ymm1
160 ADDPD_MEM COVAR(iq+2,2), xmm2
161 ADDPD_MEM COVAR(iq+2,3), xmm3
162 %endif ; cpuflag(fma3)
; Off-diagonal: full 4x4 block -- ymm3 = var[j..j+3] times each broadcast
; var[i+k] (ymm4..ymm7), accumulated into four covariance rows.
167 ; Compute all 16 pairwise products of a 4x4 block
168 mova ymm3, [varq + jq*8]
; FMA3 path.
170 mova ymm0, COVAR(jq, 0)
171 mova ymm1, COVAR(jq, 1)
172 mova ymm2, COVAR(jq, 2)
173 fmaddpd ymm0, ymm3, ymm4, ymm0
174 fmaddpd ymm1, ymm3, ymm5, ymm1
175 fmaddpd ymm2, ymm3, ymm6, ymm2
; ymm3 is consumed last so it can serve as both multiplicand and result.
176 fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
177 mova COVAR(jq, 0), ymm0
178 mova COVAR(jq, 1), ymm1
179 mova COVAR(jq, 2), ymm2
180 mova COVAR(jq, 3), ymm3
; Non-FMA path.
182 vmulpd ymm0, ymm3, ymm4
183 vmulpd ymm1, ymm3, ymm5
184 vmulpd ymm2, ymm3, ymm6
185 vmulpd ymm3, ymm3, ymm7
186 ADDPD_MEM COVAR(jq,0), ymm0
187 ADDPD_MEM COVAR(jq,1), ymm1
188 ADDPD_MEM COVAR(jq,2), ymm2
189 ADDPD_MEM COVAR(jq,3), ymm3
190 %endif ; cpuflag(fma3)
; Same 4x4 accumulation at xmm width (2 doubles) for a narrower tail block.
197 mova xmm3, [varq + jq*8]
199 mova xmm0, COVAR(jq, 0)
200 mova xmm1, COVAR(jq, 1)
201 mova xmm2, COVAR(jq, 2)
202 fmaddpd xmm0, xmm3, xmm4, xmm0
203 fmaddpd xmm1, xmm3, xmm5, xmm1
204 fmaddpd xmm2, xmm3, xmm6, xmm2
205 fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
206 mova COVAR(jq, 0), xmm0
207 mova COVAR(jq, 1), xmm1
208 mova COVAR(jq, 2), xmm2
209 mova COVAR(jq, 3), xmm3
211 vmulpd xmm0, xmm3, xmm4
212 vmulpd xmm1, xmm3, xmm5
213 vmulpd xmm2, xmm3, xmm6
214 vmulpd xmm3, xmm3, xmm7
215 ADDPD_MEM COVAR(jq,0), xmm0
216 ADDPD_MEM COVAR(jq,1), xmm1
217 ADDPD_MEM COVAR(jq,2), xmm2
218 ADDPD_MEM COVAR(jq,3), xmm3
219 %endif ; cpuflag(fma3)
; Advance past the 4-row band just processed.
222 add covarq, 4*COVAR_STRIDE
; Final scalar-pair tail: xmm0 = {var[i], var[i]} (vmovddup duplicates the
; low double), multiplied by var[j..j+1] and accumulated.
229 vmovddup xmm0, [varq + iq*8]
231 mova xmm1, [varq + jq*8]
232 fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
233 mova COVAR(jq,0), xmm0
235 vmulpd xmm0, [varq + jq*8]
236 ADDPD_MEM COVAR(jq,0), xmm0
237 %endif ; cpuflag(fma3)
239 add covarq, COVAR_STRIDE
244 %endmacro ; UPDATE_LLS
; Instantiate UPDATE_LLS only when the build can emit these ISA extensions
; (the INIT_*/UPDATE_LLS invocations and matching %endif lines are outside
; this excerpt).
246 %if HAVE_AVX_EXTERNAL
250 %if HAVE_FMA3_EXTERNAL
;------------------------------------------------------------------------------
; double evaluate_lls(LLSModel *m, const double *var, int order)
; Dot product of var[] with the coefficient row selected by 'order'
; (coeff + order*MAX_VARS). cglobal args: 3 named args, 4 GPRs, 2 XMM regs.
; NOTE(review): excerpt ends mid-function; the reduction/return and loop
; labels are not visible here.
;------------------------------------------------------------------------------
256 cglobal evaluate_lls, 3,4,2, ctx, var, order, i
257 ; This function is often called on the same buffer as update_lls, but with
258 ; an offset. They can't both be aligned.
259 ; Load halves rather than movu to avoid store-forwarding stalls, since the
260 ; input was initialized immediately prior to this function using scalar math.
; coefs = &ctx->coeff[order * MAX_VARS]
263 imul orderd, MAX_VARS
264 lea coefsq, [ctxq + LLSModel.coeff + orderq*8]
; Build a packed pair from two scalar halves (see comment above re: stalls).
266 movhpd m0, [varq + 8]
268 lea coefsq, [coefsq + iq*8]
269 lea varq, [varq + iq*8]
; Main pairwise multiply-accumulate over var[] * coefs[].
273 movsd m1, [varq + iq*8]
274 movhpd m1, [varq + iq*8 + 8]
275 mulpd m1, [coefsq + iq*8]
; Scalar tail for an odd element count.
280 movsd m1, [varq + iq*8]
281 mulsd m1, [coefsq + iq*8]