/*
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */


#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
#include "libavutil/x86/asm.h"
#include "libavfilter/vf_spp.h"

#if HAVE_MMX_INLINE
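/* Hard thresholding: coefficients whose magnitude does not exceed the
 * threshold are forced to zero and the survivors are scaled down by 3
 * bits, using saturating MMX arithmetic instead of per-coefficient
 * branches. The DC coefficient is handled separately in C below. */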
static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp * ((1<<4) - bias) - 1;

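/* Requantize 16 coefficients (four registers of four words) per expansion:
 * the saturating sub/add sequence zeroes small levels and rounds the
 * survivors, psraw scales them down by 3 bits, and the punpck block
 * transposes the 4x4 word groups into their output order. */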
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
    "movq " #src0 ", %%mm0      \n"                                     \
    "movq " #src1 ", %%mm1      \n"                                     \
    "movq " #src2 ", %%mm2      \n"                                     \
    "movq " #src3 ", %%mm3      \n"                                     \
    "psubw %%mm4, %%mm0         \n"                                     \
    "psubw %%mm4, %%mm1         \n"                                     \
    "psubw %%mm4, %%mm2         \n"                                     \
    "psubw %%mm4, %%mm3         \n"                                     \
    "paddusw %%mm5, %%mm0       \n"                                     \
    "paddusw %%mm5, %%mm1       \n"                                     \
    "paddusw %%mm5, %%mm2       \n"                                     \
    "paddusw %%mm5, %%mm3       \n"                                     \
    "paddw %%mm6, %%mm0         \n"                                     \
    "paddw %%mm6, %%mm1         \n"                                     \
    "paddw %%mm6, %%mm2         \n"                                     \
    "paddw %%mm6, %%mm3         \n"                                     \
    "psubusw %%mm6, %%mm0       \n"                                     \
    "psubusw %%mm6, %%mm1       \n"                                     \
    "psubusw %%mm6, %%mm2       \n"                                     \
    "psubusw %%mm6, %%mm3       \n"                                     \
    "psraw $3, %%mm0            \n"                                     \
    "psraw $3, %%mm1            \n"                                     \
    "psraw $3, %%mm2            \n"                                     \
    "psraw $3, %%mm3            \n"                                     \
                                                                        \
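    /* 4x4 word transpose of mm0..mm3 */                                \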
    "movq %%mm0, %%mm7          \n"                                     \
    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
    "movq %%mm1, %%mm2          \n"                                     \
    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
    "movq %%mm0, %%mm3          \n"                                     \
    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
                                                                        \
    "movq %%mm0, " #dst0 "      \n"                                     \
    "movq %%mm7, " #dst1 "      \n"                                     \
    "movq %%mm3, " #dst2 "      \n"                                     \
    "movq %%mm1, " #dst3 "      \n"

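    /* Broadcast the three 32-bit thresholds to all four words of
     * mm4, mm5 and mm6 (movd + two packssdw). */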
    __asm__ volatile(
        "movd %2, %%mm4             \n"
        "movd %3, %%mm5             \n"
        "movd %4, %%mm6             \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm6, %%mm6      \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm6, %%mm6      \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate than needed?
    );
    dst[0] = (src[0] + 4) >> 3;
}

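/* Soft thresholding: the magnitude of every coefficient is reduced by
 * the threshold (levels at or below it collapse to zero), then rounded
 * and scaled down by 3 bits. The DC coefficient is again handled
 * separately in C. */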
static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp * ((1<<4) - bias) - 1;

#undef REQUANT_CORE
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
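    /* pcmpgtw builds an all-ones mask for negative words; xoring with  \
     * it approximates negation, so psubusw shrinks |x| by the          \
     * threshold (clamping at zero) and the second xor restores the     \
     * sign. */                                                         \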
    "movq " #src0 ", %%mm0      \n"                                     \
    "movq " #src1 ", %%mm1      \n"                                     \
    "pxor %%mm6, %%mm6          \n"                                     \
    "pxor %%mm7, %%mm7          \n"                                     \
    "pcmpgtw %%mm0, %%mm6       \n"                                     \
    "pcmpgtw %%mm1, %%mm7       \n"                                     \
    "pxor %%mm6, %%mm0          \n"                                     \
    "pxor %%mm7, %%mm1          \n"                                     \
    "psubusw %%mm4, %%mm0       \n"                                     \
    "psubusw %%mm4, %%mm1       \n"                                     \
    "pxor %%mm6, %%mm0          \n"                                     \
    "pxor %%mm7, %%mm1          \n"                                     \
    "movq " #src2 ", %%mm2      \n"                                     \
    "movq " #src3 ", %%mm3      \n"                                     \
    "pxor %%mm6, %%mm6          \n"                                     \
    "pxor %%mm7, %%mm7          \n"                                     \
    "pcmpgtw %%mm2, %%mm6       \n"                                     \
    "pcmpgtw %%mm3, %%mm7       \n"                                     \
    "pxor %%mm6, %%mm2          \n"                                     \
    "pxor %%mm7, %%mm3          \n"                                     \
    "psubusw %%mm4, %%mm2       \n"                                     \
    "psubusw %%mm4, %%mm3       \n"                                     \
    "pxor %%mm6, %%mm2          \n"                                     \
    "pxor %%mm7, %%mm3          \n"                                     \
                                                                        \
    "paddsw %%mm5, %%mm0        \n"                                     \
    "paddsw %%mm5, %%mm1        \n"                                     \
    "paddsw %%mm5, %%mm2        \n"                                     \
    "paddsw %%mm5, %%mm3        \n"                                     \
    "psraw $3, %%mm0            \n"                                     \
    "psraw $3, %%mm1            \n"                                     \
    "psraw $3, %%mm2            \n"                                     \
    "psraw $3, %%mm3            \n"                                     \
                                                                        \
    "movq %%mm0, %%mm7          \n"                                     \
    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
    "movq %%mm1, %%mm2          \n"                                     \
    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
    "movq %%mm0, %%mm3          \n"                                     \
    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
                                                                        \
    "movq %%mm0, " #dst0 "      \n"                                     \
    "movq %%mm7, " #dst1 "      \n"                                     \
    "movq %%mm3, " #dst2 "      \n"                                     \
    "movq %%mm1, " #dst3 "      \n"

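    /* Broadcast the threshold (mm4) and the rounding constant 4 (mm5)
     * to all four words. */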
    __asm__ volatile(
        "movd %2, %%mm4             \n"
        "movd %3, %%mm5             \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate than needed?
    );

    dst[0] = (src[0] + 4) >> 3;
}

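/* Add the per-line dither, scale the 16-bit coefficients down to pixel
 * range and store them as clamped 8-bit values, 8 pixels per iteration. */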
static void store_slice_mmx(uint8_t *dst, const int16_t *src,
                            int dst_stride, int src_stride,
                            int width, int height, int log2_scale,
                            const uint8_t dither[8][8])
{
    int y;

    for (y = 0; y < height; y++) {
        uint8_t *dst1 = dst;
        const int16_t *src1 = src;
        __asm__ volatile(
            "movq (%3), %%mm3           \n"
            "movq (%3), %%mm4           \n"
            "movd %4, %%mm2             \n"
            "pxor %%mm0, %%mm0          \n"
            "punpcklbw %%mm0, %%mm3     \n"
            "punpckhbw %%mm0, %%mm4     \n"
            "psraw %%mm2, %%mm3         \n"
            "psraw %%mm2, %%mm4         \n"
            "movd %5, %%mm2             \n"
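            /* mm3/mm4: dither row expanded to words and pre-shifted
             * right by log2_scale; mm2: final shift amount
             * (MAX_LEVEL - log2_scale). */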
            "1:                         \n"
            "movq (%0), %%mm0           \n"
            "movq 8(%0), %%mm1          \n"
            "paddw %%mm3, %%mm0         \n"
            "paddw %%mm4, %%mm1         \n"
            "psraw %%mm2, %%mm0         \n"
            "psraw %%mm2, %%mm1         \n"
            "packuswb %%mm1, %%mm0      \n"
            "movq %%mm0, (%1)           \n"
            "add $16, %0                \n"
            "add $8, %1                 \n"
            "cmp %2, %1                 \n"
            " jb 1b                     \n"
            : "+r" (src1), "+r"(dst1)
            : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
        );
        src += src_stride;
        dst += dst_stride;
    }
}

#endif /* HAVE_MMX_INLINE */

av_cold void ff_spp_init_x86(SPPContext *s)
{
#if HAVE_MMX_INLINE
    int cpu_flags = av_get_cpu_flags();

    if (cpu_flags & AV_CPU_FLAG_MMX) {
        int64_t bps;
        s->store_slice = store_slice_mmx;
        av_opt_get_int(s->dct, "bits_per_sample", 0, &bps);
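        /* The MMX requantizers hard-code the coefficient layout used by
         * the 8-bit DCT, so they are only selected when the DCT runs at
         * 8 bits per sample. */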
        if (bps <= 8) {
            switch (s->mode) {
            case 0: s->requantize = hardthresh_mmx; break;
            case 1: s->requantize = softthresh_mmx; break;
            }
        }
    }
#endif
}