#include "avcodec.h"
#include "va.h"
+#include "copy.h"
#ifdef HAVE_AVCODEC_DXVA2
#include <shlwapi.h>
#include <d3d9.h>
-/* FIXME */
-#define CoTaskMemFree(x)
-
/* */
#define DXVA2_E_NOT_INITIALIZED MAKE_HRESULT(1, 4, 4096)
#define DXVA2_E_NEW_VIDEO_DEVICE MAKE_HRESULT(1, 4, 4097)
static const GUID DXVA2_ModeH264_F = {
0x1b81be69, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
};
+static const GUID DXVADDI_Intel_ModeH264_A = {
+ 0x604F8E64, 0x4951,0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
+};
+static const GUID DXVADDI_Intel_ModeH264_C = {
+ 0x604F8E66,0x4951, 0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
+};
+static const GUID DXVADDI_Intel_ModeH264_E = {
+ 0x604F8E68,0x4951, 0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
+};
static const GUID DXVA2_ModeWMV8_A = {
0x1b81be80, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
};
0x1b81beA3, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
};
+static const GUID DXVA_NoEncrypt = {
+ 0x1b81bed0, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
+};
+
/* */
typedef struct {
const char *name;
} dxva2_mode_t;
/* XXX Preferred modes must come first */
static const dxva2_mode_t dxva2_modes[] = {
- { "DXVA2_ModeMPEG2_VLD", &DXVA2_ModeMPEG2_VLD, 0 },
+ { "DXVA2_ModeMPEG2_VLD", &DXVA2_ModeMPEG2_VLD, CODEC_ID_MPEG2VIDEO },
{ "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp, 0 },
{ "DXVA2_ModeMPEG2_IDCT", &DXVA2_ModeMPEG2_IDCT, 0 },
- { "H.264 variable-length decoder (VLD), FGT", &DXVA2_ModeH264_F, CODEC_ID_H264 },
- { "H.264 VLD, no FGT", &DXVA2_ModeH264_E, CODEC_ID_H264 },
- { "H.264 IDCT, FGT", &DXVA2_ModeH264_D, 0, },
- { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0, },
- { "H.264 MoComp, FGT", &DXVA2_ModeH264_B, 0, },
- { "H.264 motion compensation (MoComp), no FGT", &DXVA2_ModeH264_A, 0, },
+ { "H.264 variable-length decoder (VLD), FGT", &DXVA2_ModeH264_F, CODEC_ID_H264 },
+ { "H.264 VLD, no FGT", &DXVA2_ModeH264_E, CODEC_ID_H264 },
+ { "H.264 VLD, no FGT (Intel)", &DXVADDI_Intel_ModeH264_E, CODEC_ID_H264 },
+ { "H.264 IDCT, FGT", &DXVA2_ModeH264_D, 0 },
+ { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0 },
+ { "H.264 inverse discrete cosine transform (IDCT), no FGT (Intel)", &DXVADDI_Intel_ModeH264_C, 0 },
+ { "H.264 MoComp, FGT", &DXVA2_ModeH264_B, 0 },
+ { "H.264 motion compensation (MoComp), no FGT", &DXVA2_ModeH264_A, 0 },
+ { "H.264 motion compensation (MoComp), no FGT (Intel)", &DXVADDI_Intel_ModeH264_A, 0 },
{ "Windows Media Video 8 MoComp", &DXVA2_ModeWMV8_B, 0 },
{ "Windows Media Video 8 post processing", &DXVA2_ModeWMV8_A, 0 },
{ "Windows Media Video 9 MoComp", &DXVA2_ModeWMV9_B, 0 },
{ "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 },
- { "VC-1 VLD", &DXVA2_ModeVC1_D, 0 },
+ { "VC-1 VLD", &DXVA2_ModeVC1_D, CODEC_ID_VC1 },
+ { "VC-1 VLD", &DXVA2_ModeVC1_D, CODEC_ID_WMV3 },
{ "VC-1 IDCT", &DXVA2_ModeVC1_C, 0 },
{ "VC-1 MoComp", &DXVA2_ModeVC1_B, 0 },
{ "VC-1 post processing", &DXVA2_ModeVC1_A, 0 },
}
/* */
-#define VLC_CODEC_NV12 VLC_FOURCC('N','V','1','2') /* TODO move to vlc_fourcc.h */
typedef struct {
const char *name;
D3DFORMAT format;
/* */
vlc_object_t *log;
int codec_id;
+ int width;
+ int height;
/* DLL */
HINSTANCE hd3d9_dll;
/* Option conversion */
D3DFORMAT output;
- uint8_t *surface_cache_base;
- uint8_t *surface_cache;
- size_t surface_cache_size;
+ copy_cache_t surface_cache;
/* */
struct dxva_context hw;
static void DxCreateVideoConversion(vlc_va_dxva2_t *);
static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height);
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height);
-
/* */
static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
int width, int height)
{
vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
- if (va->surface_width == width &&
- va->surface_height == height)
+ if (va->width == width && va->height == height && va->decoder)
goto ok;
/* */
return VLC_EGENERIC;
/* */
va->hw.decoder = va->decoder;
- if (va->codec_id == CODEC_ID_H264)
- va->hw.cfg = &va->cfg;
+ va->hw.cfg = &va->cfg;
va->hw.surface_count = va->surface_count;
va->hw.surface = va->hw_surface;
for (unsigned i = 0; i < va->surface_count; i++)
vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
- if (!va->surface_cache)
+ if (!va->surface_cache.buffer)
return VLC_EGENERIC;
/* */
}
if (va->render == MAKEFOURCC('Y','V','1','2')) {
- CopyFromYv12(picture, &lock,
- va->surface_cache, va->surface_cache_size,
- va->surface_width, va->surface_height);
+ uint8_t *plane[3] = {
+ lock.pBits,
+ (uint8_t*)lock.pBits + lock.Pitch * va->surface_height,
+ (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+ + (lock.Pitch/2) * (va->surface_height/2)
+ };
+ size_t pitch[3] = {
+ lock.Pitch,
+ lock.Pitch / 2,
+ lock.Pitch / 2,
+ };
+ CopyFromYv12(picture, plane, pitch,
+ va->width, va->height,
+ &va->surface_cache);
} else {
assert(va->render == MAKEFOURCC('N','V','1','2'));
- CopyFromNv12(picture, &lock,
- va->surface_cache, va->surface_cache_size,
- va->surface_width, va->surface_height);
+ uint8_t *plane[2] = {
+ lock.pBits,
+ (uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+ };
+ size_t pitch[2] = {
+ lock.Pitch,
+ lock.Pitch,
+ };
+ CopyFromNv12(picture, plane, pitch,
+ va->width, va->height,
+ &va->surface_cache);
}
/* */
*/
/* Builds a printable description of the Direct3D adapter stored in va->d3dai.
 *
 * The adapter's VendorId is looked up in a small local table to obtain a
 * vendor name; "Unknown" is used when no table entry matches.
 *
 * Returns the string produced by asprintf(), or NULL when asprintf() fails.
 * Presumably the caller is responsible for freeing the result -- confirm at
 * the call site.
 */
static char *DxDescribe(vlc_va_dxva2_t *va)
{
+ static const struct {
+ unsigned id;
+ char name[32];
+ } vendors [] = {
+ { 0x1002, "ATI" },
+ { 0x10DE, "NVIDIA" },
+ { 0x8086, "Intel" },
+ { 0x5333, "S3 Graphics" },
/* An id of 0 terminates the table; the lookup loop below stops on it. */
+ { 0, "" }
+ };
D3DADAPTER_IDENTIFIER9 *id = &va->d3dai;
+
/* Linear search of the vendor table; the first matching id wins. */
+ const char *vendor = "Unknown";
+ for (int i = 0; vendors[i].id != 0; i++) {
+ if (vendors[i].id == id->VendorId) {
+ vendor = vendors[i].name;
+ break;
+ }
+ }
+
char *description;
/* The "%.*s" precision bounds the read of Description to the size of the
 * field, so the text is printed safely even if it is not NUL-terminated.
 *
 * NOTE(review): the '*' precision consumes an int argument, but sizeof
 * yields a size_t; an (int) cast on the sizeof expression looks necessary.
 * VendorId, DeviceId and Revision are printed with %d -- confirm that their
 * declared types actually match that conversion.
 */
- if (asprintf(&description, "DXVA2 (%.*s, vendor %d, device %d, revision %d)",
+ if (asprintf(&description, "DXVA2 (%.*s, vendor %d(%s), device %d, revision %d)",
sizeof(id->Description), id->Description,
- id->VendorId, id->DeviceId, id->Revision) < 0)
+ id->VendorId, vendor, id->DeviceId, id->Revision) < 0)
return NULL;
return description;
}
msg_Dbg(va->log, "DxCreateVideoDecoder id %d %dx%d",
codec_id, fmt->i_width, fmt->i_height);
+ va->width = fmt->i_width;
+ va->height = fmt->i_height;
+
/* Allocates all surfaces needed for the decoder */
+ va->surface_width = (fmt->i_width + 15) & ~15;
+ va->surface_height = (fmt->i_height + 15) & ~15;
switch (codec_id) {
case CODEC_ID_H264:
va->surface_count = 16 + 1;
}
LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT];
if (FAILED(IDirectXVideoDecoderService_CreateSurface(va->vs,
- fmt->i_width,
- fmt->i_height,
+ va->surface_width,
+ va->surface_height,
va->surface_count - 1,
va->render,
D3DPOOL_DEFAULT,
surface->refcount = 0;
surface->order = 0;
}
- va->surface_width = fmt->i_width;
- va->surface_height = fmt->i_height;
msg_Dbg(va->log, "IDirectXVideoAccelerationService_CreateSurface succeed with %d surfaces (%dx%d)",
va->surface_count, fmt->i_width, fmt->i_height);
msg_Dbg(va->log, "we got %d decoder configurations", cfg_count);
/* Select the best decoder configuration */
- bool has_cfg = false;
+ int cfg_score = 0;
for (unsigned i = 0; i < cfg_count; i++) {
const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i];
i, cfg->ConfigBitstreamRaw);
/* */
- if ((!has_cfg && cfg->ConfigBitstreamRaw == 1) ||
- (codec_id == CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2)) {
+ int score;
+ if (cfg->ConfigBitstreamRaw == 1)
+ score = 1;
+ else if (codec_id == CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2)
+ score = 2;
+ else
+ continue;
+ if (IsEqualGUID(&cfg->guidConfigBitstreamEncryption, &DXVA_NoEncrypt))
+ score += 16;
+
+ if (cfg_score < score) {
va->cfg = *cfg;
- has_cfg = true;
+ cfg_score = score;
}
}
CoTaskMemFree(cfg_list);
- if (!has_cfg) {
+ if (cfg_score <= 0) {
msg_Err(va->log, "Failed to find a supported decoder configuration");
return VLC_EGENERIC;
}
va->output = va->render;
break;
}
- va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
- va->surface_cache_base = malloc(16 + va->surface_cache_size);
- va->surface_cache = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
+ CopyInitCache(&va->surface_cache, va->surface_width);
}
static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
{
- free(va->surface_cache_base);
- va->surface_cache_base = NULL;
- va->surface_cache = NULL;
- va->surface_cache_size = 0;
-}
-
-/* Copy 64 bytes from srcp to dsp loading data with the SSE>=2 instruction load and
- * storing data with the SSE>=2 instruction store.
- */
-#define COPY64(dstp, srcp, load, store) \
- asm volatile ( \
- load " 0(%[src]), %%xmm1\n" \
- load " 16(%[src]), %%xmm2\n" \
- load " 32(%[src]), %%xmm3\n" \
- load " 48(%[src]), %%xmm4\n" \
- store " %%xmm1, 0(%[dst])\n" \
- store " %%xmm2, 16(%[dst])\n" \
- store " %%xmm3, 32(%[dst])\n" \
- store " %%xmm4, 48(%[dst])\n" \
- : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
-
-/* Execute the instruction op only if SSE2 is supported. */
-#ifdef CAN_COMPILE_SSE2
-# define ASM_SSE2(cpu, op) do { \
- if (cpu & CPU_CAPABILITY_SSE2) \
- asm volatile (op); \
- } while (0)
-#else
-# define ASM_SSE2(cpu, op)
-#endif
-
-/* Optimized copy from "Uncacheable Speculative Write Combining" memory
- * as used by some video surface.
- * XXX It is really efficient only when SSE4.1 is available.
- */
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned unaligned,
- unsigned width, unsigned height,
- unsigned cpu)
-{
- assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
-
- ASM_SSE2(cpu, "mfence");
- for (unsigned y = 0; y < height; y++) {
- unsigned x;
-
- for (x = 0; x < unaligned; x++)
- dst[x] = src[x];
-
-#ifdef CAN_COMPILE_SSE4_1
- if (cpu & CPU_CAPABILITY_SSE4_1) {
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
- }
- } else
-#endif
-#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqa");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
- }
-#endif
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
+ CopyCleanCache(&va->surface_cache);
}
-
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height,
- unsigned cpu)
-{
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
- ASM_SSE2(cpu, "mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
- bool unaligned = ((intptr_t)dst & 0x0f) != 0;
-
-#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movntdq");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
- }
-#endif
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
-}
-
-static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, unsigned cpu)
-{
- const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
- 1, 3, 5, 7, 9, 11, 13, 15 };
- const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
- 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
-
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
- ASM_SSE2(cpu, "mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
-#define LOAD64 \
- "movdqa 0(%[src]), %%xmm0\n" \
- "movdqa 16(%[src]), %%xmm1\n" \
- "movdqa 32(%[src]), %%xmm2\n" \
- "movdqa 48(%[src]), %%xmm3\n"
-
-#define STORE2X32 \
- "movq %%xmm0, 0(%[dst1])\n" \
- "movq %%xmm1, 8(%[dst1])\n" \
- "movhpd %%xmm0, 0(%[dst2])\n" \
- "movhpd %%xmm1, 8(%[dst2])\n" \
- "movq %%xmm2, 16(%[dst1])\n" \
- "movq %%xmm3, 24(%[dst1])\n" \
- "movhpd %%xmm2, 16(%[dst2])\n" \
- "movhpd %%xmm3, 24(%[dst2])\n"
-
-#ifdef CAN_COMPILE_SSSE3
- if (cpu & CPU_CAPABILITY_SSSE3) {
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[shuffle]), %%xmm7\n"
- LOAD64
- "pshufb %%xmm7, %%xmm0\n"
- "pshufb %%xmm7, %%xmm1\n"
- "pshufb %%xmm7, %%xmm2\n"
- "pshufb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
- }
- } else
-#endif
-#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[mask]), %%xmm7\n"
- LOAD64
- "movdqa %%xmm0, %%xmm4\n"
- "movdqa %%xmm1, %%xmm5\n"
- "movdqa %%xmm2, %%xmm6\n"
- "psrlw $8, %%xmm0\n"
- "psrlw $8, %%xmm1\n"
- "pand %%xmm7, %%xmm4\n"
- "pand %%xmm7, %%xmm5\n"
- "pand %%xmm7, %%xmm6\n"
- "packuswb %%xmm4, %%xmm0\n"
- "packuswb %%xmm5, %%xmm1\n"
- "pand %%xmm3, %%xmm7\n"
- "psrlw $8, %%xmm2\n"
- "psrlw $8, %%xmm3\n"
- "packuswb %%xmm6, %%xmm2\n"
- "packuswb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
- }
- }
-#endif
-#undef STORE2X32
-#undef LOAD64
-
- for (; x < width; x++) {
- dstu[x] = src[2*x+0];
- dstv[x] = src[2*x+1];
- }
- src += src_pitch;
- dstu += dstu_pitch;
- dstv += dstv_pitch;
- }
-}
-
-static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height,
- unsigned cpu)
-{
- const unsigned w16 = (width+15) & ~15;
- const unsigned hstep = cache_size / w16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned unaligned = (intptr_t)src & 0x0f;
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16,
- src, src_pitch,
- unaligned,
- width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- Copy2d(dst, dst_pitch,
- cache, w16,
- width, hblock, cpu);
-
- /* */
- src += src_pitch * hblock;
- dst += dst_pitch * hblock;
- }
-
- ASM_SSE2(cpu, "mfence");
-}
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height,
- unsigned cpu)
-{
- const unsigned w2_16 = (2*width+15) & ~15;
- const unsigned hstep = cache_size / w2_16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned unaligned = (intptr_t)src & 0x0f;
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w2_16,
- src, src_pitch,
- unaligned,
- 2*width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- SplitUV(dstu, dstu_pitch,
- dstv, dstv_pitch,
- cache, w2_16,
- width, hblock, cpu);
-
- /* */
- src += src_pitch * hblock;
- dstu += dstu_pitch * hblock;
- dstv += dstv_pitch * hblock;
- }
-
- ASM_SSE2(cpu, "mfence");
-}
-
-static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height)
-{
- const unsigned cpu = vlc_CPU();
-
- /* */
- CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src->pBits, src->Pitch,
- cache, cache_size,
- width, height, cpu);
- SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
- dst->p[1].p_pixels, dst->p[1].i_pitch,
- (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
- cache, cache_size,
- width/2, height/2, cpu);
-
- ASM_SSE2(cpu, "emms");
-}
-static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height)
-{
- const unsigned cpu = vlc_CPU();
-
- /* */
- for (unsigned n = 0, offset = 0; n < 3; n++) {
- const unsigned d = n > 0 ? 2 : 1;
- CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
- (const uint8_t*)src->pBits + offset, src->Pitch/d,
- cache, cache_size,
- width/d, height/d, cpu);
- offset += (src->Pitch/d) * (height/d);
- }
- ASM_SSE2(cpu, "emms");
-}
-#undef ASM_SSE2
-#undef COPY64
-
#else
vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
{