]> git.sesse.net Git - vlc/blob - modules/video_chroma/copy.c
mediacodec: skip prerolled frames
[vlc] / modules / video_chroma / copy.c
1 /*****************************************************************************
2  * copy.c: Fast YV12/NV12 copy
3  *****************************************************************************
4  * Copyright (C) 2010 Laurent Aimar
5  * $Id$
6  *
7  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8  *
9  * This program is free software; you can redistribute it and/or modify it
10  * under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with this program; if not, write to the Free Software Foundation,
21  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23
24 #ifdef HAVE_CONFIG_H
25 # include "config.h"
26 #endif
27
28 #include <vlc_common.h>
29 #include <vlc_picture.h>
30 #include <vlc_cpu.h>
31 #include <assert.h>
32
33 #include "copy.h"
34
35 int CopyInitCache(copy_cache_t *cache, unsigned width)
36 {
37 #ifdef CAN_COMPILE_SSE2
38     cache->size = __MAX((width + 0x3f) & ~ 0x3f, 4096);
39     cache->buffer = vlc_memalign(64, cache->size);
40     if (!cache->buffer)
41         return VLC_EGENERIC;
42 #else
43     (void) cache; (void) width;
44 #endif
45     return VLC_SUCCESS;
46 }
47
48 void CopyCleanCache(copy_cache_t *cache)
49 {
50 #ifdef CAN_COMPILE_SSE2
51     vlc_free(cache->buffer);
52     cache->buffer = NULL;
53     cache->size   = 0;
54 #else
55     (void) cache;
56 #endif
57 }
58
#ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */

/* Single 16-byte transfer through %xmm1.  The load/store mnemonics are
 * supplied by the caller so one macro covers aligned (movdqa), unaligned
 * (movdqu) and streaming (movntdqa/movntdq) variants. */
#define COPY16(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

/* Four back-to-back 16-byte transfers (64 bytes) through %xmm1-%xmm4. */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

/* When the compiler is not already targeting a given SSE level, replace the
 * vlc_CPU_*() detection helpers with a test of a local `cpu` capability
 * bitmask.  This relies on a variable named `cpu` being in scope at every
 * use site below (each SSE function takes a `cpu` parameter for this). */
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif
95
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * Copies `height` rows of `width` bytes.  dst must be 16-byte aligned and
 * dst_pitch a multiple of 16 (asserted below); src may have any alignment.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    VLC_UNUSED(cpu);  /* `cpu` is only read via the vlc_CPU_*() overrides */
#endif
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    /* Fence around the whole copy -- NOTE(review): presumably to order the
     * streaming accesses against the USWC producer/consumer; confirm. */
    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        /* Bytes remaining until src reaches 16-byte alignment (0..15). */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = unaligned;

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                /* Both pointers aligned: streaming loads, aligned stores. */
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                /* Copy a 16-byte head (unaligned load, aligned store --
                 * dst row starts aligned), then run the 64-byte loop from
                 * the first aligned src offset.  Bytes [unaligned, 16) are
                 * written twice; on narrow rows the head store may reach
                 * past `width` but stays within the 16-byte-multiple
                 * destination pitch. */
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            /* SSE2 fallback: same structure, plain (non-streaming) loads. */
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        /* Scalar tail for the remaining (width - x) bytes of the row. */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    asm volatile ("mfence");
}
148
/* Copy a pitched rectangle from a 16-byte-aligned source (the staging
 * cache) back to ordinary memory.  When dst is also 16-byte aligned,
 * non-temporal stores (movntdq) are used so the written pixels do not
 * evict useful cache lines; otherwise unaligned stores are used.
 * src alignment and pitch requirements are asserted below. */
VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        /* dst alignment can change per row (pitch need not be 16-aligned). */
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        /* Scalar tail for the last few bytes of the row. */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
175
/* De-interleave a packed chroma plane: even source bytes go to dstu, odd
 * bytes to dstv (see the scalar tail for the reference semantics).
 * src must be 16-byte aligned with a 16-byte-multiple pitch (asserted).
 * 32 output pixels (64 input bytes) are processed per vector iteration. */
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    VLC_UNUSED(cpu);
#endif
    /* pshufb control: gather even bytes into the low quadword and odd
     * bytes into the high quadword of each register. */
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    /* Word mask keeping the low (even-position) byte of each 16-bit pair,
     * for the SSE2 mask-and-pack fallback. */
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

/* Load 64 source bytes (32 interleaved pairs) into %xmm0-%xmm3. */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Scatter %xmm0-%xmm3: the low quadword of each register goes to dst1,
 * the high quadword to dst2 (32 bytes to each destination). */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
        {
            /* pshufb puts the even bytes in the LOW quadword, so
             * dst1 = dstu and dst2 = dstv. */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            /* SSE2: psrlw isolates the odd bytes in place, pand+the mask
             * isolates the even bytes in %xmm4-%xmm6 (and %xmm7 for the
             * fourth register, reusing the mask register as scratch once
             * it is no longer needed).  packuswb then leaves the ODD bytes
             * in the LOW quadwords, so the operand binding is reversed
             * w.r.t. the SSSE3 path: dst1 = dstv, dst2 = dstu. */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        /* Scalar tail: reference de-interleave semantics. */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
263
264 static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
265                           const uint8_t *src, size_t src_pitch,
266                           uint8_t *cache, size_t cache_size,
267                           unsigned width, unsigned height, unsigned cpu)
268 {
269     const unsigned w16 = (width+15) & ~15;
270     const unsigned hstep = cache_size / w16;
271     assert(hstep > 0);
272
273     for (unsigned y = 0; y < height; y += hstep) {
274         const unsigned hblock =  __MIN(hstep, height - y);
275
276         /* Copy a bunch of line into our cache */
277         CopyFromUswc(cache, w16,
278                      src, src_pitch,
279                      width, hblock, cpu);
280
281         /* Copy from our cache to the destination */
282         Copy2d(dst, dst_pitch,
283                cache, w16,
284                width, hblock);
285
286         /* */
287         src += src_pitch * hblock;
288         dst += dst_pitch * hblock;
289     }
290 }
291
292 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
293                             uint8_t *dstv, size_t dstv_pitch,
294                             const uint8_t *src, size_t src_pitch,
295                             uint8_t *cache, size_t cache_size,
296                             unsigned width, unsigned height, unsigned cpu)
297 {
298     const unsigned w16 = (2*width+15) & ~15;
299     const unsigned hstep = cache_size / w16;
300     assert(hstep > 0);
301
302     for (unsigned y = 0; y < height; y += hstep) {
303         const unsigned hblock =  __MIN(hstep, height - y);
304
305         /* Copy a bunch of line into our cache */
306         CopyFromUswc(cache, w16, src, src_pitch,
307                      2*width, hblock, cpu);
308
309         /* Copy from our cache to the destination */
310         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
311                     cache, w16, width, hblock, cpu);
312
313         /* */
314         src  += src_pitch  * hblock;
315         dstu += dstu_pitch * hblock;
316         dstv += dstv_pitch * hblock;
317     }
318 }
319
/* SSE fast path: copy an NV12 picture (src[0] = luma, src[1] = interleaved
 * chroma pairs) into a three-plane destination.  The first byte of each
 * chroma pair lands in dst->p[2] and the second in dst->p[1], mirroring
 * the C fallback CopyFromNv12() below.  NOTE(review): presumably this
 * matches VLC's YV12 in-memory plane order -- confirm against callers. */
static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  width, height, cpu);
    /* Chroma is 2x2 subsampled; odd dimensions are rounded up. */
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    (width+1)/2, (height+1)/2, cpu);
    /* Restore the FPU state after SSE usage.  NOTE(review): no MMX
     * instructions are visible above; emms looks precautionary -- confirm. */
    asm volatile ("emms");
}
336
337 static void SSE_CopyFromYv12(picture_t *dst,
338                              uint8_t *src[3], size_t src_pitch[3],
339                              unsigned width, unsigned height,
340                              copy_cache_t *cache, unsigned cpu)
341 {
342     for (unsigned n = 0; n < 3; n++) {
343         const unsigned d = n > 0 ? 2 : 1;
344         SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
345                       src[n], src_pitch[n],
346                       cache->buffer, cache->size,
347                       (width+d-1)/d, (height+d-1)/d, cpu);
348     }
349     asm volatile ("emms");
350 }
351 #undef COPY64
352 #endif /* CAN_COMPILE_SSE2 */
353
354 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
355                       const uint8_t *src, size_t src_pitch,
356                       unsigned width, unsigned height)
357 {
358     for (unsigned y = 0; y < height; y++) {
359         memcpy(dst, src, width);
360         src += src_pitch;
361         dst += dst_pitch;
362     }
363 }
364
365 static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
366                         uint8_t *dstv, size_t dstv_pitch,
367                         const uint8_t *src, size_t src_pitch,
368                         unsigned width, unsigned height)
369 {
370     for (unsigned y = 0; y < height; y++) {
371         for (unsigned x = 0; x < width; x++) {
372             dstu[x] = src[2*x+0];
373             dstv[x] = src[2*x+1];
374         }
375         src  += src_pitch;
376         dstu += dstu_pitch;
377         dstv += dstv_pitch;
378     }
379 }
380
381 void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
382                   unsigned width, unsigned height,
383                   copy_cache_t *cache)
384 {
385 #ifdef CAN_COMPILE_SSE2
386     unsigned cpu = vlc_CPU();
387     if (vlc_CPU_SSE2())
388         return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
389                                 cache, cpu);
390 #else
391     (void) cache;
392 #endif
393
394     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
395               src[0], src_pitch[0],
396               width, height);
397     SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
398                 dst->p[1].p_pixels, dst->p[1].i_pitch,
399                 src[1], src_pitch[1],
400                 width/2, height/2);
401 }
402
403 void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
404                   unsigned width, unsigned height,
405                   copy_cache_t *cache)
406 {
407 #ifdef CAN_COMPILE_SSE2
408     unsigned cpu = vlc_CPU();
409     if (vlc_CPU_SSE2())
410         return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
411                                 cache, cpu);
412 #else
413     (void) cache;
414 #endif
415
416      CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
417                src[0], src_pitch[0], width, height);
418      CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
419                src[1], src_pitch[1], width / 2, height / 2);
420      CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
421                src[2], src_pitch[2], width / 2, height / 2);
422 }