From: Laurent Aimar
Date: Sat, 8 May 2010 09:16:22 +0000 (+0200)
Subject: Moved out optimized VA nv12/yv12 copy functions from dxva2.
X-Git-Tag: 1.2.0-pre1~6726
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=0777bcbfcb4ce001cfb74693f32d8bf0d451be15;p=vlc

Moved out optimized VA nv12/yv12 copy functions from dxva2.
---
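The extracted interface is deliberately small: one bounce-buffer object (copy_cache_t) plus two copy entry points, declared in copy.h below. A minimal usage sketch, assuming an NV12 source surface already mapped at `base` with byte pitch `pitch` and a pre-allocated destination picture_t; the wrapper name and variables are illustrative, not part of the patch, and error handling is elided:

    #include "copy.h"   /* VLC headers (vlc_common.h etc.) assumed included */

    static int ExtractNv12(picture_t *picture, uint8_t *base, size_t pitch,
                           unsigned width, unsigned height)
    {
        copy_cache_t cache;
        if (CopyInitCache(&cache, width))   /* 16-byte aligned bounce buffer */
            return VLC_EGENERIC;

        /* NV12 layout: luma plane, then one interleaved UV plane. */
        uint8_t *plane[2]     = { base, base + pitch * height };
        size_t   src_pitch[2] = { pitch, pitch / 2 };  /* mirrors the dxva2.c call below */

        CopyFromNv12(picture, plane, src_pitch, width, height, &cache);
        CopyCleanCache(&cache);
        return VLC_SUCCESS;
    }

In dxva2.c itself the cache is created once per decoder (DxCreateVideoConversion) and reused for every frame rather than allocated per call.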
diff --git a/modules/codec/avcodec/Modules.am b/modules/codec/avcodec/Modules.am
index db3195eca2..2a40979a0f 100644
--- a/modules/codec/avcodec/Modules.am
+++ b/modules/codec/avcodec/Modules.am
@@ -10,6 +10,8 @@ libavcodec_plugin_la_SOURCES = \
 	chroma.c \
 	vaapi.c \
 	dxva2.c \
+	copy.c \
+	copy.h \
 	va.h \
 	$(NULL)
 if ENABLE_SOUT
diff --git a/modules/codec/avcodec/copy.c b/modules/codec/avcodec/copy.c
new file mode 100644
index 0000000000..6bc0d8207c
--- /dev/null
+++ b/modules/codec/avcodec/copy.c
@@ -0,0 +1,355 @@
+/*****************************************************************************
+ * copy.c: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2010 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_picture.h>
+#include <vlc_cpu.h>
+#include <assert.h>
+
+#include "copy.h"
+
+/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction load and
+ * storing data with the SSE>=2 instruction store.
+ */
+#define COPY64(dstp, srcp, load, store) \
+    asm volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        load " 16(%[src]), %%xmm2\n"    \
+        load " 32(%[src]), %%xmm3\n"    \
+        load " 48(%[src]), %%xmm4\n"    \
+        store " %%xmm1,  0(%[dst])\n"   \
+        store " %%xmm2, 16(%[dst])\n"   \
+        store " %%xmm3, 32(%[dst])\n"   \
+        store " %%xmm4, 48(%[dst])\n"   \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
+
+/* Execute the instruction op only if SSE2 is supported. */
+#ifdef CAN_COMPILE_SSE2
+# define ASM_SSE2(cpu, op) do {         \
+    if (cpu & CPU_CAPABILITY_SSE2)      \
+        asm volatile (op);              \
+} while (0)
+#else
+# define ASM_SSE2(cpu, op)
+#endif
+
+/* Optimized copy from "Uncacheable Speculative Write Combining" memory
+ * as used by some video surfaces.
+ * XXX It is really efficient only when SSE4.1 is available.
+ */
+static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
+                         const uint8_t *src, size_t src_pitch,
+                         unsigned unaligned,
+                         unsigned width, unsigned height,
+                         unsigned cpu)
+{
+    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+
+    ASM_SSE2(cpu, "mfence");
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x;
+
+        for (x = 0; x < unaligned; x++)
+            dst[x] = src[x];
+
+#ifdef CAN_COMPILE_SSE4_1
+        if (cpu & CPU_CAPABILITY_SSE4_1) {
+            if (!unaligned) {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
+            } else {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
+            }
+        } else
+#endif
+#ifdef CAN_COMPILE_SSE2
+        if (cpu & CPU_CAPABILITY_SSE2) {
+            if (!unaligned) {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+            } else {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+            }
+        }
+#endif
+
+        for (; x < width; x++)
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+static void Copy2d(uint8_t *dst, size_t dst_pitch,
+                   const uint8_t *src, size_t src_pitch,
+                   unsigned width, unsigned height,
+                   unsigned cpu)
+{
+    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+    ASM_SSE2(cpu, "mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x = 0;
+        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
+
+#ifdef CAN_COMPILE_SSE2
+        if (cpu & CPU_CAPABILITY_SSE2) {
+            if (!unaligned) {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+            } else {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+            }
+        }
+#endif
+
+        for (; x < width; x++)
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
+                    uint8_t *dstv, size_t dstv_pitch,
+                    const uint8_t *src, size_t src_pitch,
+                    unsigned width, unsigned height, unsigned cpu)
+{
+    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
+                                1, 3, 5, 7, 9, 11, 13, 15 };
+    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
+                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
+
+    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+    ASM_SSE2(cpu, "mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x = 0;
+
+#define LOAD64 \
+    "movdqa  0(%[src]), %%xmm0\n" \
+    "movdqa 16(%[src]), %%xmm1\n" \
+    "movdqa 32(%[src]), %%xmm2\n" \
+    "movdqa 48(%[src]), %%xmm3\n"
+
+#define STORE2X32 \
+    "movq   %%xmm0,  0(%[dst1])\n" \
+    "movq   %%xmm1,  8(%[dst1])\n" \
+    "movhpd %%xmm0,  0(%[dst2])\n" \
+    "movhpd %%xmm1,  8(%[dst2])\n" \
+    "movq   %%xmm2, 16(%[dst1])\n" \
+    "movq   %%xmm3, 24(%[dst1])\n" \
+    "movhpd %%xmm2, 16(%[dst2])\n" \
+    "movhpd %%xmm3, 24(%[dst2])\n"
+
+#ifdef CAN_COMPILE_SSSE3
+        if (cpu & CPU_CAPABILITY_SSSE3) {
+            for (x = 0; x < (width & ~31); x += 32) {
+                asm volatile (
+                    "movdqu (%[shuffle]), %%xmm7\n"
+                    LOAD64
+                    "pshufb  %%xmm7, %%xmm0\n"
+                    "pshufb  %%xmm7, %%xmm1\n"
+                    "pshufb  %%xmm7, %%xmm2\n"
+                    "pshufb  %%xmm7, %%xmm3\n"
+                    STORE2X32
+                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
+            }
+        } else
+#endif
+#ifdef CAN_COMPILE_SSE2
+        if (cpu & CPU_CAPABILITY_SSE2) {
+            for (x = 0; x < (width & ~31); x += 32) {
+                asm volatile (
+                    "movdqu (%[mask]), %%xmm7\n"
+                    LOAD64
+                    "movdqa   %%xmm0, %%xmm4\n"
+                    "movdqa   %%xmm1, %%xmm5\n"
+                    "movdqa   %%xmm2, %%xmm6\n"
+                    "psrlw    $8,     %%xmm0\n"
+                    "psrlw    $8,     %%xmm1\n"
+                    "pand     %%xmm7, %%xmm4\n"
+                    "pand     %%xmm7, %%xmm5\n"
+                    "pand     %%xmm7, %%xmm6\n"
+                    "packuswb %%xmm4, %%xmm0\n"
+                    "packuswb %%xmm5, %%xmm1\n"
+                    "pand     %%xmm3, %%xmm7\n"
+                    "psrlw    $8,     %%xmm2\n"
+                    "psrlw    $8,     %%xmm3\n"
+                    "packuswb %%xmm6, %%xmm2\n"
+                    "packuswb %%xmm7, %%xmm3\n"
+                    STORE2X32
+                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
+            }
+        }
+#endif
+#undef STORE2X32
+#undef LOAD64
+
+        for (; x < width; x++) {
+            dstu[x] = src[2*x+0];
+            dstv[x] = src[2*x+1];
+        }
+        src  += src_pitch;
+        dstu += dstu_pitch;
+        dstv += dstv_pitch;
+    }
+}
+
+static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
+                      uint8_t *cache, size_t cache_size,
+                      unsigned width, unsigned height,
+                      unsigned cpu)
+{
+    const unsigned w16 = (width+15) & ~15;
+    const unsigned hstep = cache_size / w16;
+    assert(hstep > 0);
+
+    for (unsigned y = 0; y < height; y += hstep) {
+        const unsigned unaligned = (intptr_t)src & 0x0f;
+        const unsigned hblock =  __MIN(hstep, height - y);
+
+        /* Copy a bunch of lines into our cache */
+        CopyFromUswc(cache, w16,
+                     src, src_pitch,
+                     unaligned,
+                     width, hblock, cpu);
+
+        /* Copy from our cache to the destination */
+        Copy2d(dst, dst_pitch,
+               cache, w16,
+               width, hblock, cpu);
+
+        /* */
+        src += src_pitch * hblock;
+        dst += dst_pitch * hblock;
+    }
+
+    ASM_SSE2(cpu, "mfence");
+}
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        uint8_t *cache, size_t cache_size,
+                        unsigned width, unsigned height,
+                        unsigned cpu)
+{
+    const unsigned w2_16 = (2*width+15) & ~15;
+    const unsigned hstep = cache_size / w2_16;
+    assert(hstep > 0);
+
+    for (unsigned y = 0; y < height; y += hstep) {
+        const unsigned unaligned = (intptr_t)src & 0x0f;
+        const unsigned hblock =  __MIN(hstep, height - y);
+
+        /* Copy a bunch of lines into our cache */
+        CopyFromUswc(cache, w2_16,
+                     src, src_pitch,
+                     unaligned,
+                     2*width, hblock, cpu);
+
+        /* Copy from our cache to the destination */
+        SplitUV(dstu, dstu_pitch,
+                dstv, dstv_pitch,
+                cache, w2_16,
+                width, hblock, cpu);
+
+        /* */
+        src  += src_pitch  * hblock;
+        dstu += dstu_pitch * hblock;
+        dstv += dstv_pitch * hblock;
+    }
+
+    ASM_SSE2(cpu, "mfence");
+}
+
+int CopyInitCache(copy_cache_t *cache, unsigned width)
+{
+    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
+    cache->base = malloc(16 + cache->size);
+    if (cache->base == NULL) {
+        cache->buffer = NULL;
+        return VLC_EGENERIC;
+    }
+    cache->buffer = &cache->base[16 - ((intptr_t)cache->base & 0x0f)];
+    return VLC_SUCCESS;
+}
+void CopyCleanCache(copy_cache_t *cache)
+{
+    free(cache->base);
+
+    cache->base   = NULL;
+    cache->buffer = NULL;
+    cache->size   = 0;
+}
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache)
+{
+    const unsigned cpu = vlc_CPU();
+
+    /* */
+    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+              src[0], src_pitch[0],
+              cache->buffer, cache->size,
+              width, height, cpu);
+    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+                dst->p[1].p_pixels, dst->p[1].i_pitch,
+                src[1], src_pitch[1],
+                cache->buffer, cache->size,
+                width/2, height/2, cpu);
+
+    ASM_SSE2(cpu, "emms");
+}
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache)
+{
+    const unsigned cpu = vlc_CPU();
+
+    /* */
+    for (unsigned n = 0; n < 3; n++) {
+        const unsigned d = n > 0 ? 2 : 1;
+        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
+                  src[n], src_pitch[n],
+                  cache->buffer, cache->size,
+                  width/d, height/d, cpu);
+    }
+    ASM_SSE2(cpu, "emms");
+}
+
+#undef ASM_SSE2
+#undef COPY64
+
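The SSSE3 and SSE2 paths in SplitUV above compute exactly what the scalar tail loop does: deinterleave NV12 chroma pairs into separate U and V planes. A plain-C sketch of the per-row semantics, for reference only:

    #include <stdint.h>

    /* Reference for one row of SplitUV: src holds interleaved U/V bytes
     * (NV12 chroma); width is counted in output pixels per plane. */
    static void SplitUVRow(uint8_t *dstu, uint8_t *dstv,
                           const uint8_t *src, unsigned width)
    {
        for (unsigned x = 0; x < width; x++) {
            dstu[x] = src[2*x + 0]; /* even bytes: U */
            dstv[x] = src[2*x + 1]; /* odd bytes:  V */
        }
    }

Note that the SSE2 variant binds [dst1] to dstv and [dst2] to dstu, the reverse of the SSSE3 variant. This appears intentional: after "psrlw $8" and "packuswb", the odd (V) bytes land in the low quadwords that STORE2X32 writes through [dst1], so swapping the bindings keeps the two paths equivalent.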
diff --git a/modules/codec/avcodec/copy.h b/modules/codec/avcodec/copy.h
new file mode 100644
index 0000000000..312bf4c4f3
--- /dev/null
+++ b/modules/codec/avcodec/copy.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+ * copy.h: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2009 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifndef _VLC_AVCODEC_COPY_H
+#define _VLC_AVCODEC_COPY_H 1
+
+typedef struct {
+    uint8_t *base;
+    uint8_t *buffer;
+    size_t  size;
+} copy_cache_t;
+
+int  CopyInitCache(copy_cache_t *cache, unsigned width);
+void CopyCleanCache(copy_cache_t *cache);
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache);
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache);
+
+#endif
+
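Worth spelling out is how the cache sizes the staged copies: CopyInitCache rounds the width up to a multiple of 16 and clamps to at least 4096 bytes, and CopyPlane then moves hstep = cache_size / w16 rows per pass through the bounce buffer (USWC streaming loads in, non-temporal stores out). A worked sketch of that arithmetic, using values implied by the code above:

    #include <stddef.h>

    /* Rows per staging pass, as computed by CopyInitCache and CopyPlane. */
    static unsigned StagedRowsPerPass(unsigned width)      /* e.g. width = 1920 */
    {
        unsigned w16        = (width + 15) & ~15;          /* 1920 -> 1920 */
        size_t   cache_size = w16 > 4096 ? w16 : 4096;     /* __MAX(w16, 4096) = 4096 */
        return cache_size / w16;                           /* = 2 rows per pass */
    }

So for common HD widths the copy is staged two rows at a time, small enough for the buffer to stay cache-resident between CopyFromUswc and Copy2d.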
diff --git a/modules/codec/avcodec/dxva2.c b/modules/codec/avcodec/dxva2.c
index 74c7d12085..eb29869e9f 100644
--- a/modules/codec/avcodec/dxva2.c
+++ b/modules/codec/avcodec/dxva2.c
@@ -47,6 +47,7 @@
 
 #include "avcodec.h"
 #include "va.h"
+#include "copy.h"
 
 #ifdef HAVE_AVCODEC_DXVA2
@@ -252,9 +253,7 @@ typedef struct
 
     /* Option conversion */
     D3DFORMAT    output;
-    uint8_t      *surface_cache_base;
-    uint8_t      *surface_cache;
-    size_t       surface_cache_size;
+    copy_cache_t surface_cache;
 
     /* */
     struct dxva_context hw;
@@ -297,13 +296,6 @@ static int DxResetVideoDecoder(vlc_va_dxva2_t *);
 static void DxCreateVideoConversion(vlc_va_dxva2_t *);
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
 
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height);
-
 /* */
 static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
                  int width, int height)
@@ -356,7 +348,7 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
 
-    if (!va->surface_cache)
+    if (!va->surface_cache.buffer)
         return VLC_EGENERIC;
 
     /* */
@@ -370,14 +362,33 @@
     }
 
     if (va->render == MAKEFOURCC('Y','V','1','2')) {
-        CopyFromYv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[3] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+                                 + (lock.Pitch/2) * (va->surface_height/2)
+        };
+        size_t  pitch[3] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+            lock.Pitch / 2,
+        };
+        CopyFromYv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     } else {
         assert(va->render == MAKEFOURCC('N','V','1','2'));
-        CopyFromNv12(picture, &lock,
-                     va->surface_cache, va->surface_cache_size,
-                     va->surface_width, va->surface_height);
+        uint8_t *plane[2] = {
+            lock.pBits,
+            (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+        };
+        size_t  pitch[2] = {
+            lock.Pitch,
+            lock.Pitch / 2,
+        };
+        CopyFromNv12(picture, plane, pitch,
+                     va->surface_width, va->surface_height,
+                     &va->surface_cache);
     }
 
     /* */
@@ -954,319 +965,12 @@ static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
         va->output = va->render;
         break;
     }
-    va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
-    va->surface_cache_base = malloc(16 + va->surface_cache_size);
-    va->surface_cache = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
+    CopyInitCache(&va->surface_cache, va->surface_width);
 }
 
 static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
 {
-    free(va->surface_cache_base);
-    va->surface_cache_base = NULL;
-    va->surface_cache = NULL;
-    va->surface_cache_size = 0;
+    CopyCleanCache(&va->surface_cache);
 }
-
-/* Copy 64 bytes from srcp to dsp loading data with the SSE>=2 instruction load and
- * storing data with the SSE>=2 instruction store.
- */
-#define COPY64(dstp, srcp, load, store) \
-    asm volatile (                      \
-        load "  0(%[src]), %%xmm1\n"    \
-        load " 16(%[src]), %%xmm2\n"    \
-        load " 32(%[src]), %%xmm3\n"    \
-        load " 48(%[src]), %%xmm4\n"    \
-        store " %%xmm1,  0(%[dst])\n"   \
-        store " %%xmm2, 16(%[dst])\n"   \
-        store " %%xmm3, 32(%[dst])\n"   \
-        store " %%xmm4, 48(%[dst])\n"   \
-        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
-
-/* Execute the instruction op only if SSE2 is supported. */
-#ifdef CAN_COMPILE_SSE2
-# define ASM_SSE2(cpu, op) do {         \
-    if (cpu & CPU_CAPABILITY_SSE2)      \
-        asm volatile (op);              \
-} while (0)
-#else
-# define ASM_SSE2(cpu, op)
-#endif
-
-/* Optimized copy from "Uncacheable Speculative Write Combining" memory
- * as used by some video surface.
- * XXX It is really efficient only when SSE4.1 is available.
- */
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
-                         const uint8_t *src, size_t src_pitch,
-                         unsigned unaligned,
-                         unsigned width, unsigned height,
-                         unsigned cpu)
-{
-    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
-
-    ASM_SSE2(cpu, "mfence");
-    for (unsigned y = 0; y < height; y++) {
-        unsigned x;
-
-        for (x = 0; x < unaligned; x++)
-            dst[x] = src[x];
-
-#ifdef CAN_COMPILE_SSE4_1
-        if (cpu & CPU_CAPABILITY_SSE4_1) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
-            }
-        } else
-#endif
-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-            }
-        }
-#endif
-
-        for (; x < width; x++)
-            dst[x] = src[x];
-
-        src += src_pitch;
-        dst += dst_pitch;
-    }
-}
-
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
-                   const uint8_t *src, size_t src_pitch,
-                   unsigned width, unsigned height,
-                   unsigned cpu)
-{
-    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
-    ASM_SSE2(cpu, "mfence");
-
-    for (unsigned y = 0; y < height; y++) {
-        unsigned x = 0;
-        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
-
-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-            }
-        }
-#endif
-
-        for (; x < width; x++)
-            dst[x] = src[x];
-
-        src += src_pitch;
-        dst += dst_pitch;
-    }
-}
-
-static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
-                    uint8_t *dstv, size_t dstv_pitch,
-                    const uint8_t *src, size_t src_pitch,
-                    unsigned width, unsigned height, unsigned cpu)
-{
-    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
-                                1, 3, 5, 7, 9, 11, 13, 15 };
-    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
-                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
-
-    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
-    ASM_SSE2(cpu, "mfence");
-
-    for (unsigned y = 0; y < height; y++) {
-        unsigned x = 0;
-
-#define LOAD64 \
-    "movdqa  0(%[src]), %%xmm0\n" \
-    "movdqa 16(%[src]), %%xmm1\n" \
-    "movdqa 32(%[src]), %%xmm2\n" \
-    "movdqa 48(%[src]), %%xmm3\n"
-
-#define STORE2X32 \
-    "movq   %%xmm0,  0(%[dst1])\n" \
-    "movq   %%xmm1,  8(%[dst1])\n" \
-    "movhpd %%xmm0,  0(%[dst2])\n" \
-    "movhpd %%xmm1,  8(%[dst2])\n" \
-    "movq   %%xmm2, 16(%[dst1])\n" \
-    "movq   %%xmm3, 24(%[dst1])\n" \
-    "movhpd %%xmm2, 16(%[dst2])\n" \
-    "movhpd %%xmm3, 24(%[dst2])\n"
-
-#ifdef CAN_COMPILE_SSSE3
-        if (cpu & CPU_CAPABILITY_SSSE3) {
-            for (x = 0; x < (width & ~31); x += 32) {
-                asm volatile (
-                    "movdqu (%[shuffle]), %%xmm7\n"
-                    LOAD64
-                    "pshufb  %%xmm7, %%xmm0\n"
-                    "pshufb  %%xmm7, %%xmm1\n"
-                    "pshufb  %%xmm7, %%xmm2\n"
-                    "pshufb  %%xmm7, %%xmm3\n"
-                    STORE2X32
-                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
-            }
-        } else
-#endif
-#ifdef CAN_COMPILE_SSE2
-        if (cpu & CPU_CAPABILITY_SSE2) {
-            for (x = 0; x < (width & ~31); x += 32) {
-                asm volatile (
-                    "movdqu (%[mask]), %%xmm7\n"
-                    LOAD64
-                    "movdqa   %%xmm0, %%xmm4\n"
-                    "movdqa   %%xmm1, %%xmm5\n"
-                    "movdqa   %%xmm2, %%xmm6\n"
-                    "psrlw    $8,     %%xmm0\n"
-                    "psrlw    $8,     %%xmm1\n"
-                    "pand     %%xmm7, %%xmm4\n"
-                    "pand     %%xmm7, %%xmm5\n"
-                    "pand     %%xmm7, %%xmm6\n"
-                    "packuswb %%xmm4, %%xmm0\n"
-                    "packuswb %%xmm5, %%xmm1\n"
-                    "pand     %%xmm3, %%xmm7\n"
-                    "psrlw    $8,     %%xmm2\n"
-                    "psrlw    $8,     %%xmm3\n"
-                    "packuswb %%xmm6, %%xmm2\n"
-                    "packuswb %%xmm7, %%xmm3\n"
-                    STORE2X32
-                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
-            }
-        }
-#endif
-#undef STORE2X32
-#undef LOAD64
-
-        for (; x < width; x++) {
-            dstu[x] = src[2*x+0];
-            dstv[x] = src[2*x+1];
-        }
-        src  += src_pitch;
-        dstu += dstu_pitch;
-        dstv += dstv_pitch;
-    }
-}
-
-static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
-                      uint8_t *cache, size_t cache_size,
-                      unsigned width, unsigned height,
-                      unsigned cpu)
-{
-    const unsigned w16 = (width+15) & ~15;
-    const unsigned hstep = cache_size / w16;
-    assert(hstep > 0);
-
-    for (unsigned y = 0; y < height; y += hstep) {
-        const unsigned unaligned = (intptr_t)src & 0x0f;
-        const unsigned hblock =  __MIN(hstep, height - y);
-
-        /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w16,
-                     src, src_pitch,
-                     unaligned,
-                     width, hblock, cpu);
-
-        /* Copy from our cache to the destination */
-        Copy2d(dst, dst_pitch,
-               cache, w16,
-               width, hblock, cpu);
-
-        /* */
-        src += src_pitch * hblock;
-        dst += dst_pitch * hblock;
-    }
-
-    ASM_SSE2(cpu, "mfence");
-}
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
-                        uint8_t *dstv, size_t dstv_pitch,
-                        const uint8_t *src, size_t src_pitch,
-                        uint8_t *cache, size_t cache_size,
-                        unsigned width, unsigned height,
-                        unsigned cpu)
-{
-    const unsigned w2_16 = (2*width+15) & ~15;
-    const unsigned hstep = cache_size / w2_16;
-    assert(hstep > 0);
-
-    for (unsigned y = 0; y < height; y += hstep) {
-        const unsigned unaligned = (intptr_t)src & 0x0f;
-        const unsigned hblock =  __MIN(hstep, height - y);
-
-        /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w2_16,
-                     src, src_pitch,
-                     unaligned,
-                     2*width, hblock, cpu);
-
-        /* Copy from our cache to the destination */
-        SplitUV(dstu, dstu_pitch,
-                dstv, dstv_pitch,
-                cache, w2_16,
-                width, hblock, cpu);
-
-        /* */
-        src  += src_pitch  * hblock;
-        dstu += dstu_pitch * hblock;
-        dstv += dstv_pitch * hblock;
-    }
-
-    ASM_SSE2(cpu, "mfence");
-}
-
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-              src->pBits, src->Pitch,
-              cache, cache_size,
-              width, height, cpu);
-    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
-                dst->p[1].p_pixels, dst->p[1].i_pitch,
-                (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
-                cache, cache_size,
-                width/2, height/2, cpu);
-
-    ASM_SSE2(cpu, "emms");
-}
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
-                         uint8_t *cache, size_t cache_size,
-                         unsigned width, unsigned height)
-{
-    const unsigned cpu = vlc_CPU();
-
-    /* */
-    for (unsigned n = 0, offset = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                  (const uint8_t*)src->pBits + offset, src->Pitch/d,
-                  cache, cache_size,
-                  width/d, height/d, cpu);
-        offset += (src->Pitch/d) * (height/d);
-    }
-    ASM_SSE2(cpu, "emms");
-}
-#undef ASM_SSE2
-#undef COPY64
-
 #else
 
 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
 {