/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

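/* Allocate the 16-byte-aligned bounce buffer used by the SSE2 copy paths
 * below. It is sized to hold at least one line padded to a multiple of
 * 16 bytes, and never less than 4096 bytes. Without SSE2 support no cache
 * is needed. */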
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->buffer = vlc_memalign(16, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    vlc_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}

#ifdef CAN_COMPILE_SSE2
/* Copy 64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction given by "load" and storing it with the one given by "store".
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

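/* When the target instruction set is not guaranteed at compile time
 * (no __SSE2__/__SSSE3__/__SSE4_1__), test the runtime CPU capability
 * flags passed in via the local "cpu" variable instead. */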
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available: movntdqa
 * performs streaming loads, which is what makes reading back from
 * write-combining memory fast.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = 0;

        for (; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

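/* Copy from the cacheable bounce buffer to the destination, using movntdq
 * streaming stores on aligned lines so the copy does not evict useful data
 * from the CPU caches. */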
VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

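/* De-interleave a packed UVUVUV... plane (as in NV12) into two separate
 * U and V planes. */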
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
    VLC_UNUSED(cpu);
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

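        /* With SSSE3, a single pshufb per register gathers the even (U)
         * bytes into the low quadword and the odd (V) bytes into the high
         * quadword. The SSE2 fallback gets the same split from a
         * mask/shift/pack sequence, but packuswb leaves the odd (V) bytes
         * in the low quadword, hence the swapped dst1/dst2 operands below. */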
#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

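/* Copy a plane in blocks sized to fit the bounce buffer: each block of
 * lines is read from USWC memory into the buffer (with streaming loads
 * when SSE4.1 is available), then written out with streaming stores. */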
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
    asm volatile ("mfence");
}

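/* Same blocking scheme as SSE_CopyPlane, except that the cached lines hold
 * interleaved UV samples which are then split into the two chroma planes. */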
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16, src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w2_16, width, hblock, cpu);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
    asm volatile ("mfence");
}

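/* NV12 -> YV12: the even (U) bytes of the source UV plane go to plane 2 and
 * the odd (V) bytes to plane 1, matching the Y/V/U plane order of the
 * destination picture. */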
static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  width, height, cpu);
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    width/2, height/2, cpu);
    asm volatile ("emms");
}

static void SSE_CopyFromYv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      width/d, height/d, cpu);
    }
    asm volatile ("emms");
}
#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

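/* Plain C fallbacks, used when SSE2 is unavailable at runtime or cannot be
 * compiled in. */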
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

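/* Public entry point: copy an NV12 picture (src[0] = Y plane, src[1] =
 * interleaved UV plane) into a planar YV12 destination, taking the SSE2
 * fast path when the CPU supports it. */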
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              width, height);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                width/2, height/2);
}

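/* Public entry point: copy a YV12 picture given as three separate planes
 * (full-resolution luma, half-resolution chroma) into the destination. */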
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], width / 2, height / 2);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], width / 2, height / 2);
}
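
/* A minimal usage sketch (hypothetical caller, not part of this file):
 * the cache is typically created once per decoder or filter instance and
 * reused for every frame.
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, width) != VLC_SUCCESS)
 *         return VLC_EGENERIC;
 *     ...
 *     CopyFromNv12(dst_picture, src_planes, src_pitches,
 *                  width, height, &cache);
 *     ...
 *     CopyCleanCache(&cache);
 */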