/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

/* Copy 64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction given by `load` and storing it with the one given by `store`.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")

/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
#   define ASM_SSE2(cpu, op) do {       \
        if (cpu & CPU_CAPABILITY_SSE2)  \
            asm volatile (op);          \
    } while (0)
#else
#   define ASM_SSE2(cpu, op)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 */
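/* The unaligned parameter gives the number of leading bytes to copy one by
 * one so that the 64-byte SSE copies below read src at 16-byte aligned
 * addresses; dst and dst_pitch must already be 16-byte aligned. */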
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

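/* Copy from a 16-byte aligned source (the cache) to the destination,
 * using non-temporal stores when the destination row is aligned. */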
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

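/* De-interleave a packed UV plane (UVUV...) from a 16-byte aligned source
 * into two separate U and V planes. */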
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
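    /* pshufb control used by the SSSE3 path: gather the even (U) bytes into
     * the low half and the odd (V) bytes into the high half of each register.
     * mask is used by the SSE2 path to keep only the even (U) byte of each
     * 16-bit word. */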
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

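/* LOAD64 reads 64 interleaved bytes from src into xmm0-xmm3; STORE2X32
 * writes the low 8 bytes of each register to dst1 and the high 8 bytes
 * to dst2. */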
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

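/* Copy a whole plane from USWC memory to ordinary memory, going through the
 * 16-byte aligned cache one block of lines at a time. */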
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        /* Number of leading bytes to copy until src is 16-byte aligned */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     unaligned,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
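
/* Copy an interleaved UV plane from USWC memory into two separate planes,
 * going through the cache one block of lines at a time. */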
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        /* Number of leading bytes to copy until src is 16-byte aligned */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     unaligned,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}

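/* Allocate the 16-byte aligned intermediate buffer used by the copy
 * functions above; its size is width rounded up to a multiple of 16,
 * with a minimum of 4096 bytes. */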
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->buffer = vlc_memalign(&cache->base, 16, cache->size);
    if (!cache->base)
        return VLC_EGENERIC;
    return VLC_SUCCESS;
}
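
/* Release the intermediate buffer. */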
void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}

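/* Copy an NV12 source (Y plane + interleaved UV plane), typically a video
 * surface in USWC memory, into a planar three-plane destination picture,
 * splitting the interleaved chroma. */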
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}
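
/* Copy a planar YV12 source, typically a video surface in USWC memory,
 * plane by plane into the destination picture. */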
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}

#undef ASM_SSE2
#undef COPY64