1 ;******************************************************************************
2 ;* MMX optimized DSP utils
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
; Byte constants, presumably pshufb shuffle masks for the SSSE3/SSE4
; left-prediction code below: a -1 mask byte (high bit set) zeroes the
; destination byte; other bytes select a source byte index. The name
; spells the 16 mask bytes in hex ('z' = zeroed). TODO confirm against
; the elided loop bodies.
26 pb_zzzzzzzz77777777: times 8 db -1
; NOTE(review): only the 8 "zz" bytes are visible here; the "77777777"
; half of this 16-byte constant presumably sits on an adjacent line not
; shown in this excerpt.
28 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
29 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
31 section .text align=16
; Convert 6 planar float channels to one interleaved int16 stream.
; %1 = ISA name suffix (sse / 3dnow / 3dn2) appended to the symbol.
42 %macro FLOAT_TO_INT16_INTERLEAVE6 1
43 ; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
44 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
; len is read directly from its argument slot, so no GPR is reserved for it
49 %define lend dword r2m
; fetch channel pointers src[1]..src[5]; src[0] is reached through srcq.
; NOTE(review): the addressing below uses srcq+srcNq, so these registers
; are presumably rebased to offsets relative to srcq by code in the
; elided lines — confirm against the full file.
51 mov src1q, [srcq+1*gprsize]
52 mov src2q, [srcq+2*gprsize]
53 mov src3q, [srcq+3*gprsize]
54 mov src4q, [srcq+4*gprsize]
55 mov src5q, [srcq+5*gprsize]
; convert two packed floats per channel to two int32s in MMX registers
; (cvtps2pi is re-%defined to 3DNow! pf2id for the 3dnow/3dn2 builds)
64 cvtps2pi mm1, [srcq+src1q]
65 cvtps2pi mm2, [srcq+src2q]
66 cvtps2pi mm3, [srcq+src3q]
67 cvtps2pi mm4, [srcq+src4q]
68 cvtps2pi mm5, [srcq+src5q]
89 %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
; Instantiate the per-ISA variants of float_to_int16_interleave6.
91 %define pswapd PSWAPD_SSE
92 FLOAT_TO_INT16_INTERLEAVE6 sse
; 3DNow! builds: float->int conversion via pf2id instead of SSE cvtps2pi
93 %define cvtps2pi pf2id
94 %define pswapd PSWAPD_3DN1
95 FLOAT_TO_INT16_INTERLEAVE6 3dnow
; NOTE(review): the 3dn2 build presumably redefines pswapd to a
; PSWAPD_3DN2 variant on a line elided from this excerpt — confirm.
97 FLOAT_TO_INT16_INTERLEAVE6 3dn2
; Integer dot-product kernels; %1 = ISA name suffix for the symbols.
102 %macro SCALARPRODUCT 1
103 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
104 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
; dot-product inner loop: multiply int16 pairs and accumulate int32
; partial sums with pmaddwd, two mmsize blocks per iteration.
; NOTE(review): orderq is used as a running byte offset; its setup
; (scaling/negation) and the loop/reduction code are not visible here.
112 movu m0, [v1q + orderq]
113 movu m1, [v1q + orderq + mmsize]
114 pmaddwd m0, [v2q + orderq]
115 pmaddwd m1, [v2q + orderq + mmsize]
133 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
134 cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
; per iteration: unaligned loads of v2/v3, aligned loads of v1; the
; elided code presumably accumulates v1.v2 and computes v1 += mul*v3
; (per the C signature above) before the stores below.
149 movu m0, [v2q + orderq]
150 movu m1, [v2q + orderq + mmsize]
151 mova m4, [v1q + orderq]
152 mova m5, [v1q + orderq + mmsize]
153 movu m2, [v3q + orderq]
154 movu m3, [v3q + orderq + mmsize]
; write the updated v1 elements back in place (v1 must be aligned)
163 mova [v1q + orderq], m2
164 mova [v1q + orderq + mmsize], m3
; One unrolled step of the SSSE3 scalarproduct_and_madd loop.
; %1 is presumably the byte misalignment of v2/v3 relative to v1,
; resolved with palignr in the elided lines — TODO confirm.
184 %macro SCALARPRODUCT_LOOP 1
; NOTE(review): two alternative load sequences of the same addresses
; appear below, presumably selected by conditional assembly on %1
; (misaligned path loads bracketing aligned vectors to shift from):
190 mova m4, [v2q + orderq]
191 mova m0, [v2q + orderq + mmsize]
195 mova m5, [v3q + orderq]
196 mova m2, [v3q + orderq + mmsize]
; aligned path: plain aligned loads, no shifting needed
200 mova m0, [v2q + orderq]
201 mova m1, [v2q + orderq + mmsize]
202 mova m2, [v3q + orderq]
203 mova m3, [v3q + orderq + mmsize]
; v1 is aligned, so it can be referenced directly as memory operands
205 %define t0 [v1q + orderq]
206 %define t1 [v1q + orderq + mmsize]
; store the updated v1 block back in place
221 mova [v1q + orderq], m2
222 mova [v1q + orderq + mmsize], m3
; SSSE3 version: handles v2/v3 misalignment relative to v1 by expanding
; one SCALARPRODUCT_LOOP per possible alignment and falling through them.
229 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
230 cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
; prime the bracketing aligned vectors for the shifted loads
240 mova m4, [v2q + orderq]
241 mova m5, [v3q + orderq]
242 ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
; chain of unrolled steps, one per 2-byte misalignment (only the
; 14/12/10 instantiations are visible in this excerpt)
257 SCALARPRODUCT_LOOP 14
258 SCALARPRODUCT_LOOP 12
259 SCALARPRODUCT_LOOP 10
; HuffYUV median predictor, 8 pixels at a time:
; pred = median(left, top, left + top - topleft), then add the residual.
275 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
276 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
; seed mm4 with the carried-in top-left pixel (*left_top)
279 movd mm4, [left_topq]
284 psubb mm0, mm4 ; t-tl
296 psubb mm0, mm4 ; t-tl
302 paddb mm4, mm3 ; t-tl+l
; median of the three candidates via byte min/max (pmin/pmax pairs;
; the pminub half is in the elided lines), then add the residual
307 pmaxub mm3, mm5 ; median
308 paddb mm3, mm2 ; +residual
; scalar tail: reload the neighbouring bytes one pixel at a time
328 movzx r2d, byte [dstq-1]
330 movzx r2d, byte [topq-1]
; Inner loop of the HuffYUV left-prediction; %1 selects the aligned-src
; variant. NOTE(review): the loop body is almost entirely elided from
; this excerpt — only the high-half store is visible.
335 %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
; store the upper 8 output bytes of the 16-byte block
361 movhps [dstq+wq+8], m0
; HuffYUV left prediction (running byte prefix sum across the row),
; SSSE3 version.
373 ; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
375 cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
; load constants — presumably pshufb masks that propagate partial sums
; across the vector (see the pb_* tables above); TODO confirm against
; the elided loop body
377 mova m5, [pb_7 GLOBAL]
378 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
379 mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
; SSE4 version of the left predictor: same entry sequence with its own
; constants, then reuses the SSSE3 routine's loop body.
385 cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
386 mova m5, [pb_f GLOBAL]
387 mova m6, [pb_zzzzzzzz77777777 GLOBAL]
388 mova m4, [pb_zzzz3333zzzzbbbb GLOBAL]
389 mova m3, [pb_zz11zz55zz99zzdd GLOBAL]
; tail-shares the common loop: jump past the ssse3 prologue
393 jnz add_hfyu_left_prediction_ssse3.skip_prologue