/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>

#include <assert.h>

#include "copy.h"

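/* This file implements fast copies from NV12 and YV12 video surfaces into
 * VLC pictures.  Both are 4:2:0 layouts: YV12 carries three separate planes
 * (Y, then V, then U at half resolution in each dimension), while NV12
 * carries a full-size Y plane followed by a single half-size plane of
 * interleaved U/V byte pairs, which is why the NV12 path needs the
 * SplitPlanes()/SSE_SplitUV() helpers below. */
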
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 4096);
    cache->buffer = vlc_memalign(64, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    vlc_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}

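/* Typical usage, as a minimal sketch (the surrounding decoder logic and the
 * 1280x720 geometry are illustrative assumptions, not part of this file):
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, 1280))   // width in pixels
 *         return VLC_EGENERIC;
 *     // ... for each decoded NV12 frame:
 *     //     CopyFromNv12(picture, planes, pitches, 1280, 720, &cache);
 *     CopyCleanCache(&cache);
 */
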
#ifdef CAN_COMPILE_SSE2

/* Copy 16/64 bytes from srcp to dstp, loading data with the SSE>=2
 * instruction "load" and storing data with the SSE>=2 instruction "store".
 */
#define COPY16(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

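/* The load/store mnemonics are passed in as strings and spliced into the asm
 * template, so these two macros cover every combination used below: aligned
 * (movdqa) and unaligned (movdqu) accesses, plus the non-temporal variants
 * (movntdqa loads from USWC memory, movntdq streaming stores). */
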
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

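/* With the redefinitions above, each SSE helper below tests the "cpu" flag
 * word it receives from vlc_CPU() at run time; when the compiler is already
 * targeting that ISA level (e.g. -mssse3 defines __SSSE3__), the stock
 * vlc_CPU_*() checks from <vlc_cpu.h> are presumably constant-folded and are
 * left untouched. */
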
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    (void) cpu;
#endif
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

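    /* Each row is copied in up to three steps: an unaligned 16-byte head to
     * reach source alignment, 64-byte blocks using aligned loads (movntdqa
     * under SSE4.1 is what actually makes USWC reads fast), and a bytewise
     * tail. */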
    for (unsigned y = 0; y < height; y++) {
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = unaligned;

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    asm volatile ("mfence");
}

static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

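    /* The source here is the staging cache, hence always aligned (movdqa
     * loads).  Stores are non-temporal (movntdq) when the destination is
     * aligned, so the copied pixels do not evict useful cache lines, and
     * plain movdqu otherwise. */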
    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
#ifndef CAN_COMPILE_SSSE3
    (void) cpu;
#endif
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

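    /* Two ways to deinterleave 64 bytes of UV pairs per iteration: with
     * SSSE3, one pshufb per register gathers the U bytes into the low
     * quadword and the V bytes into the high one; without it, pand with the
     * mask keeps the even (U) bytes while psrlw $8 brings the odd (V) bytes
     * down, and packuswb repacks words into bytes.  The two variants leave U
     * and V in opposite halves, which is why the [dst1]/[dst2] bindings are
     * swapped between them. */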
    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3()) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

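    /* The plane is processed in blocks of hstep rows staged through the
     * 64-byte-aligned cache buffer: CopyFromUswc() drains the USWC surface
     * sequentially, then Copy2d() pushes the now-cacheable rows to the
     * destination with streaming stores. */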
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock);

        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
}

static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned width, unsigned height, unsigned cpu)
{
    const unsigned w16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

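    /* Same blocking scheme as SSE_CopyPlane(), except each staged row holds
     * 2*width bytes of interleaved UV pairs that SSE_SplitUV() then
     * separates into the two destination planes. */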
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16, src, src_pitch,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, width, hblock, cpu);

        src  += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
}

static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  width, height, cpu);
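    /* In a YV12 picture plane 1 is V and plane 2 is U, so the U halves of
     * the NV12 byte pairs go to dst->p[2] and the V halves to dst->p[1];
     * the chroma plane is half-size in both dimensions, rounded up. */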
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    (width+1)/2, (height+1)/2, cpu);
    asm volatile ("emms");
}

static void SSE_CopyFromYv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned width, unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
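    /* Plane 0 is the full-size luma; planes 1 and 2 are the 4:2:0 chroma
     * planes, copied at half size in each dimension, rounded up. */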
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      (width+d-1)/d, (height+d-1)/d, cpu);
    }
    asm volatile ("emms");
}

#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

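/* Plain C fallbacks, used when SSE2 is not available at build or run time */
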
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, width);
        src += src_pitch;
        dst += dst_pitch;
    }
}

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              width, height);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                (width+1)/2, (height+1)/2);
}

void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();

    if (vlc_CPU_SSE2())
        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], width, height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], width / 2, height / 2);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], width / 2, height / 2);
}