1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
28 #include <vlc_common.h>
29 #include <vlc_picture.h>
/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 * Four 16-byte XMM transfers per invocation; dstp/srcp must satisfy the
 * alignment requirement of the chosen instructions (movdqa/movntdqa/movntdq
 * need 16-byte aligned operands, movdqu does not). */
#define COPY64(dstp, srcp, load, store) \
    load " 0(%[src]), %%xmm1\n" \
    load " 16(%[src]), %%xmm2\n" \
    load " 32(%[src]), %%xmm3\n" \
    load " 48(%[src]), %%xmm4\n" \
    store " %%xmm1, 0(%[dst])\n" \
    store " %%xmm2, 16(%[dst])\n" \
    store " %%xmm3, 32(%[dst])\n" \
    store " %%xmm4, 48(%[dst])\n" \
    : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
/* Map the vlc_CPU_SSE* feature tests onto the `cpu` flag word that every
 * helper in this file receives, so the branches test the caller-provided
 * flags instead of re-querying the CPU each time. */
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
/* SSE2 guaranteed at build time: emit op unconditionally. */
# define ASM_SSE2(cpu, op) asm volatile (op)
/* Runtime-checked variant: emit op only when the cpu flags report SSE2. */
# define ASM_SSE2(cpu, op) do { \
    if (cpu & VLC_CPU_SSE2) \
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
/* SSE2 cannot be compiled at all: op degenerates to a no-op. */
# define ASM_SSE2(cpu, op)
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 * Copies width bytes per row for height rows, src_pitch/dst_pitch apart.
 * dst must be 16-byte aligned with a 16-byte multiple pitch (asserted);
 * src may be arbitrarily aligned — a scalar prologue realigns each row. */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
    /* Aligned SSE stores below require a 16-byte aligned destination. */
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
    /* Full memory fence before streaming from USWC memory (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of leading bytes until src reaches a 16-byte boundary. */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        /* Scalar copy of the unaligned head of the row. */
        for (; x < unaligned; x++)
#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            /* movntdqa: streaming (non-temporal) load — the efficient way
             * to read USWC memory; aligned vs unaligned store chosen per
             * destination alignment. */
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
#ifdef CAN_COMPILE_SSE2
        /* SSE2 fallback: plain aligned loads instead of streaming loads. */
        if (vlc_CPU_SSE2()) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        /* Scalar copy of the remaining tail (< 64 bytes) of the row. */
        for (; x < width; x++)
/* Plain 2D copy from ordinary (cacheable) memory, e.g. from the bounce
 * cache filled by CopyFromUswc, into the destination picture.
 * src must be 16-byte aligned with a 16-byte multiple pitch (asserted);
 * dst alignment is detected per row. */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
    /* Aligned SSE loads below require a 16-byte aligned source. */
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
    /* Full memory fence before the copy (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
#ifdef CAN_COMPILE_SSE2
        /* Pick the store flavour from the destination's row alignment. */
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (vlc_CPU_SSE2()) {
            /* movntdq: non-temporal store — avoids polluting the cache
             * with pixels that will not be re-read (aligned dst path). */
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        /* Scalar copy of the remaining tail of the row. */
        for (; x < width; x++)
/* De-interleave an NV12-style packed chroma plane (U0 V0 U1 V1 ...) into
 * two separate planes: even bytes go to dstu, odd bytes to dstv.
 * width counts output samples per plane, so 2*width source bytes are read
 * per row. src must be 16-byte aligned (asserted). */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
    /* pshufb control: gathers even source bytes into the low 8 bytes of
     * an XMM register and odd bytes into the high 8 (SSSE3 path). */
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    /* pand mask keeping only even (U) bytes of each 16-bit pair
     * (SSE2 path). */
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
    /* Full memory fence before the split (SSE2 only). */
    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Load 64 interleaved source bytes into xmm0-xmm3. */
        "movdqa 0(%[src]), %%xmm0\n" \
        "movdqa 16(%[src]), %%xmm1\n" \
        "movdqa 32(%[src]), %%xmm2\n" \
        "movdqa 48(%[src]), %%xmm3\n"
        /* Write low halves (separated U) to dst1 and high halves
         * (separated V) to dst2, 32 bytes to each. */
        "movq %%xmm0, 0(%[dst1])\n" \
        "movq %%xmm1, 8(%[dst1])\n" \
        "movhpd %%xmm0, 0(%[dst2])\n" \
        "movhpd %%xmm1, 8(%[dst2])\n" \
        "movq %%xmm2, 16(%[dst1])\n" \
        "movq %%xmm3, 24(%[dst1])\n" \
        "movhpd %%xmm2, 16(%[dst2])\n" \
        "movhpd %%xmm3, 24(%[dst2])\n"
#ifdef CAN_COMPILE_SSSE3
        /* SSSE3 path: one pshufb per register splits even/odd bytes. */
        for (x = 0; x < (width & ~31); x += 32) {
            "movdqu (%[shuffle]), %%xmm7\n"
            "pshufb %%xmm7, %%xmm0\n"
            "pshufb %%xmm7, %%xmm1\n"
            "pshufb %%xmm7, %%xmm2\n"
            "pshufb %%xmm7, %%xmm3\n"
            : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
#ifdef CAN_COMPILE_SSE2
        /* SSE2 path: mask + pack to separate the byte pairs.
         * NOTE(review): operand names are swapped here — dst2 is bound to
         * dstu and dst1 to dstv, presumably matching how this sequence
         * packs U vs V halves; confirm against the full asm body. */
        if (vlc_CPU_SSE2()) {
            for (x = 0; x < (width & ~31); x += 32) {
                "movdqu (%[mask]), %%xmm7\n"
                "movdqa %%xmm0, %%xmm4\n"
                "movdqa %%xmm1, %%xmm5\n"
                "movdqa %%xmm2, %%xmm6\n"
                "pand %%xmm7, %%xmm4\n"
                "pand %%xmm7, %%xmm5\n"
                "pand %%xmm7, %%xmm6\n"
                "packuswb %%xmm4, %%xmm0\n"
                "packuswb %%xmm5, %%xmm1\n"
                "pand %%xmm3, %%xmm7\n"
                "packuswb %%xmm6, %%xmm2\n"
                "packuswb %%xmm7, %%xmm3\n"
                : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
        /* Scalar tail: split the remaining interleaved pairs. */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
/* Copy one full plane from (possibly USWC) source memory to the
 * destination, going through the 16-byte-aligned bounce cache in blocks
 * of hstep rows: USWC -> cache (CopyFromUswc), cache -> dst (Copy2d). */
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
    /* Cache row pitch: width rounded up to a multiple of 16 bytes. */
    const unsigned w16 = (width+15) & ~15;
    /* Number of whole rows that fit in the bounce cache per iteration. */
    const unsigned hstep = cache_size / w16;
    for (unsigned y = 0; y < height; y += hstep) {
        /* Last block may be shorter than hstep. */
        const unsigned hblock = __MIN(hstep, height - y);
        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16,
        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
        /* Advance both pointers past the rows just copied. */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    /* Final fence so the non-temporal stores are globally visible. */
    ASM_SSE2(cpu, "mfence");
/* Copy an interleaved NV12 chroma plane from (possibly USWC) source
 * memory and split it into separate U and V planes, going through the
 * bounce cache block-by-block: USWC -> cache, then cache -> dstu/dstv
 * via SplitUV. width/height are per-output-plane dimensions. */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
    /* Cache row pitch: 2*width source bytes rounded up to 16. */
    const unsigned w2_16 = (2*width+15) & ~15;
    /* Number of whole interleaved rows that fit in the bounce cache. */
    const unsigned hstep = cache_size / w2_16;
    for (unsigned y = 0; y < height; y += hstep) {
        /* Last block may be shorter than hstep. */
        const unsigned hblock = __MIN(hstep, height - y);
        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w2_16,
                     2*width, hblock, cpu);
        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
        /* Advance all three pointers past the rows just processed. */
        src += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    /* Final fence so the non-temporal stores are globally visible. */
    ASM_SSE2(cpu, "mfence");
/* Allocate the bounce cache used by CopyFromNv12/CopyFromYv12.
 * The buffer is 16-byte aligned (required by the SSE copy loops) and sized
 * to at least one 16-byte-rounded row of `width` bytes, minimum 4 KiB.
 * Returns an error code when the allocation fails — TODO confirm exact
 * return convention against the full function body. */
int CopyInitCache(copy_cache_t *cache, unsigned width)
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->buffer = vlc_memalign(16, cache->size);
/* Release the bounce cache allocated by CopyInitCache.
 * The pointer is reset to NULL so a double clean is harmless. */
void CopyCleanCache(copy_cache_t *cache)
    vlc_free(cache->buffer);
    cache->buffer = NULL;
/* Copy an NV12 frame (luma plane + interleaved UV plane) from `src` into
 * the planar picture `dst`: the luma plane is copied as-is, the packed
 * chroma plane is de-interleaved into two half-size planes.
 * NOTE(review): dstu goes to dst->p[2] and dstv to dst->p[1] — presumably
 * the YV12 (Y, V, U) plane order of the destination; verify with callers. */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
    /* Query CPU features once; every helper branches on these flags. */
    const unsigned cpu = vlc_CPU();
    /* Luma: plain plane copy at full resolution. */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
    /* Chroma: split interleaved UV into separate half-size planes. */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);
    /* Clear MMX state after the SIMD copies (SSE2 builds only). */
    ASM_SSE2(cpu, "emms");
/* Copy a planar YV12 frame from the three `src` planes into the planar
 * picture `dst`. Plane 0 (luma) is full size; planes 1 and 2 (chroma)
 * are copied at half width and half height (4:2:0 subsampling). */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
    /* Query CPU features once; every helper branches on these flags. */
    const unsigned cpu = vlc_CPU();
    for (unsigned n = 0; n < 3; n++) {
        /* d == 2 halves the dimensions for the two chroma planes. */
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    /* Clear MMX state after the SIMD copies (SSE2 builds only). */
    ASM_SSE2(cpu, "emms");