]> git.sesse.net Git - vlc/blob - modules/codec/avcodec/dxva2.c
Do not use types not existing in official dxva2api.h header.
[vlc] / modules / codec / avcodec / dxva2.c
1 /*****************************************************************************
2  * va.c: Video Acceleration helpers
3  *****************************************************************************
4  * Copyright (C) 2009 Geoffroy Couprie
5  * Copyright (C) 2009 Laurent Aimar
6  * $Id$
7  *
8  * Authors: Geoffroy Couprie <geal@videolan.org>
9  *          Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
24  *****************************************************************************/
25
26 #ifdef HAVE_CONFIG_H
27 # include "config.h"
28 #endif
29
30 #include <vlc_common.h>
31 #include <vlc_picture.h>
32 #include <vlc_fourcc.h>
33 #include <vlc_cpu.h>
34 #include <assert.h>
35
36 #ifdef HAVE_LIBAVCODEC_AVCODEC_H
37 #   include <libavcodec/avcodec.h>
38 #   ifdef HAVE_AVCODEC_DXVA2
39 #       define DXVA2API_USE_BITFIELDS
40 #       include <libavcodec/dxva2.h>
41 #   endif
42 #elif defined(HAVE_FFMPEG_AVCODEC_H)
43 #   include <ffmpeg/avcodec.h>
44 #else
45 #   include <avcodec.h>
46 #endif
47
48 #include "avcodec.h"
49 #include "va.h"
50
51 #ifdef HAVE_AVCODEC_DXVA2
52
53 #include <windows.h>
54 #include <windowsx.h>
55 #include <ole2.h>
56 #include <commctrl.h>
57 #include <shlwapi.h>
58 #include <d3d9.h>
59
/* FIXME: CoTaskMemFree() is stubbed out to nothing in this file, so every
 * CoTaskMemFree(x) call below expands to an empty statement. */
#define CoTaskMemFree(x)

/* DXVA2 specific HRESULT values (severity 1, facility 4) */
#define DXVA2_E_NOT_INITIALIZED     MAKE_HRESULT(1, 4, 0x1000)
#define DXVA2_E_NEW_VIDEO_DEVICE    MAKE_HRESULT(1, 4, 0x1001)
#define DXVA2_E_VIDEO_DEVICE_LOCKED MAKE_HRESULT(1, 4, 0x1002)
#define DXVA2_E_NOT_AVAILABLE       MAKE_HRESULT(1, 4, 0x1003)
68
69 static const GUID DXVA2_ModeMPEG2_MoComp = {
70     0xe6a9f44b, 0x61b0, 0x4563, {0x9e,0xa4,0x63,0xd2,0xa3,0xc6,0xfe,0x66}
71 };
72 static const GUID DXVA2_ModeMPEG2_IDCT = {
73   0xbf22ad00, 0x03ea, 0x4690, {0x80,0x77,0x47,0x33,0x46,0x20,0x9b,0x7e}
74 };
75 static const GUID DXVA2_ModeMPEG2_VLD = {
76    0xee27417f, 0x5e28, 0x4e65, {0xbe,0xea,0x1d,0x26,0xb5,0x08,0xad,0xc9}
77 };
78
79 static const GUID DXVA2_ModeH264_A = {
80     0x1b81be64, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
81 };
82 static const GUID DXVA2_ModeH264_B = {
83     0x1b81be65, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
84 };
85 static const GUID DXVA2_ModeH264_C = {
86     0x1b81be66, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
87 };
88 static const GUID DXVA2_ModeH264_D = {
89     0x1b81be67, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
90 };
91 static const GUID DXVA2_ModeH264_E = {
92     0x1b81be68, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
93 };
94 static const GUID DXVA2_ModeH264_F = {
95     0x1b81be69, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
96 };
97 static const GUID DXVA2_ModeWMV8_A = {
98     0x1b81be80, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
99 };
100 static const GUID DXVA2_ModeWMV8_B = {
101     0x1b81be81, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
102 };
103 static const GUID DXVA2_ModeWMV9_A = {
104     0x1b81be90, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
105 };
106 static const GUID DXVA2_ModeWMV9_B = {
107     0x1b81be91, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
108 };
109 static const GUID DXVA2_ModeWMV9_C = {
110     0x1b81be94, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
111 };
112
113 static const GUID DXVA2_ModeVC1_A = {
114     0x1b81beA0, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
115 };
116 static const GUID DXVA2_ModeVC1_B = {
117     0x1b81beA1, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
118 };
119 static const GUID DXVA2_ModeVC1_C = {
120     0x1b81beA2, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
121 };
122 static const GUID DXVA2_ModeVC1_D = {
123     0x1b81beA3, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
124 };
125
126 /* */
127 typedef struct {
128     const char   *name;
129     const GUID   *guid;
130     int          codec;
131 } dxva2_mode_t;
132 /* XXX Prefered modes must come first */
133 static const dxva2_mode_t dxva2_modes[] = {
134     { "DXVA2_ModeMPEG2_VLD",    &DXVA2_ModeMPEG2_VLD,     0 },
135     { "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp,  0 },
136     { "DXVA2_ModeMPEG2_IDCT",   &DXVA2_ModeMPEG2_IDCT,    0 },
137
138     { "H.264 variable-length decoder (VLD), FGT",               &DXVA2_ModeH264_F, CODEC_ID_H264 },
139     { "H.264 VLD, no FGT",                                      &DXVA2_ModeH264_E, CODEC_ID_H264 },
140     { "H.264 IDCT, FGT",                                        &DXVA2_ModeH264_D, 0,            },
141     { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0,            },
142     { "H.264 MoComp, FGT",                                      &DXVA2_ModeH264_B, 0,            },
143     { "H.264 motion compensation (MoComp), no FGT",             &DXVA2_ModeH264_A, 0,            },
144
145     { "Windows Media Video 8 MoComp",           &DXVA2_ModeWMV8_B, 0 },
146     { "Windows Media Video 8 post processing",  &DXVA2_ModeWMV8_A, 0 },
147
148     {  "Windows Media Video 9 IDCT",            &DXVA2_ModeWMV9_C, 0 },
149     {  "Windows Media Video 9 MoComp",          &DXVA2_ModeWMV9_B, 0 },
150     {  "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 },
151
152     { "VC-1 VLD",             &DXVA2_ModeVC1_D, 0 },
153     { "VC-1 IDCT",            &DXVA2_ModeVC1_C, 0 },
154     { "VC-1 MoComp",          &DXVA2_ModeVC1_B, 0 },
155     { "VC-1 post processing", &DXVA2_ModeVC1_A, 0 },
156
157     { NULL, NULL, 0 }
158 };
159
160 static const dxva2_mode_t *Dxva2FindMode(const GUID *guid)
161 {
162     for (unsigned i = 0; dxva2_modes[i].name; i++) {
163         if (IsEqualGUID(dxva2_modes[i].guid, guid))
164             return &dxva2_modes[i];
165     }
166     return NULL;
167 }
168
169 /* */
170 #define VLC_CODEC_NV12 VLC_FOURCC('N','V','1','2') /* TODO move to vlc_fourcc.h */
171 typedef struct {
172     const char   *name;
173     D3DFORMAT    format;
174     vlc_fourcc_t codec;
175 } d3d_format_t;
176 /* XXX Prefered format must come first */
177 static const d3d_format_t d3d_formats[] = {
178     { "YV12",   MAKEFOURCC('Y','V','1','2'),    VLC_CODEC_YV12 },
179     { "NV12",   MAKEFOURCC('N','V','1','2'),    VLC_CODEC_NV12 },
180
181     { NULL, 0, 0 }
182 };
183
184 static const d3d_format_t *D3dFindFormat(D3DFORMAT format)
185 {
186     for (unsigned i = 0; d3d_formats[i].name; i++) {
187         if (d3d_formats[i].format == format)
188             return &d3d_formats[i];
189     }
190     return NULL;
191 }
192
193 static const GUID IID_IDirectXVideoDecoderService = {
194     0xfc51a551, 0xd5e7, 0x11d9, {0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02}
195 };
196 static const GUID IID_IDirectXVideoAccelerationService = {
197     0xfc51a550, 0xd5e7, 0x11d9, {0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02}
198 };
199
200 /* */
201 typedef struct {
202     LPDIRECT3DSURFACE9 d3d;
203     int                refcount;
204     unsigned int       order;
205 } vlc_va_surface_t;
206
207 #define VA_DXVA2_MAX_SURFACE_COUNT (64)
208 typedef struct
209 {
210     /* */
211     vlc_va_t va;
212
213     /* */
214     vlc_object_t *log;
215     int          codec_id;
216
217     /* DLL */
218         HINSTANCE             hd3d9_dll;
219     HINSTANCE             hdxva2_dll;
220
221     /* Direct3D */
222         D3DPRESENT_PARAMETERS d3dpp;
223         LPDIRECT3D9           d3dobj;
224         LPDIRECT3DDEVICE9     d3ddev;
225
226     /* Device manager */
227     UINT                     token;
228     IDirect3DDeviceManager9  *devmng;
229     HANDLE                   device;
230
231     /* Video service */
232     IDirectXVideoDecoderService  *vs;
233     GUID                         input;
234     D3DFORMAT                    render;
235
236     /* Video decoder */
237     DXVA2_ConfigPictureDecode    cfg;
238     IDirectXVideoDecoder         *decoder;
239
240     /* Option conversion */
241     D3DFORMAT                    output;
242     uint8_t                      *surface_cache_base;
243     uint8_t                      *surface_cache;
244     size_t                       surface_cache_size;
245
246     /* */
247     struct dxva_context hw;
248
249     /* */
250     unsigned     surface_count;
251     unsigned     surface_order;
252     int          surface_width;
253     int          surface_height;
254     vlc_fourcc_t surface_chroma;
255
256     vlc_va_surface_t surface[VA_DXVA2_MAX_SURFACE_COUNT];
257     LPDIRECT3DSURFACE9 hw_surface[VA_DXVA2_MAX_SURFACE_COUNT];
258 } vlc_va_dxva2_t;
259
260 /* */
261 static vlc_va_dxva2_t *vlc_va_dxva2_Get(void *external)
262 {
263     assert(external == (void*)(&((vlc_va_dxva2_t*)external)->va));
264     return external;
265 }
266
267 /* */
268 static int D3dCreateDevice(vlc_va_dxva2_t *);
269 static void D3dDestroyDevice(vlc_va_dxva2_t *);
270 static char *DxDescribe(vlc_va_dxva2_t *);
271
272 static int D3dCreateDeviceManager(vlc_va_dxva2_t *);
273 static void D3dDestroyDeviceManager(vlc_va_dxva2_t *);
274
275 static int DxCreateVideoService(vlc_va_dxva2_t *);
276 static void DxDestroyVideoService(vlc_va_dxva2_t *);
277 static int DxFindVideoServiceConversion(vlc_va_dxva2_t *, GUID *input, D3DFORMAT *output);
278
279 static int DxCreateVideoDecoder(vlc_va_dxva2_t *,
280                                 int codec_id, const video_format_t *);
281 static void DxDestroyVideoDecoder(vlc_va_dxva2_t *);
282 static int DxResetVideoDecoder(vlc_va_dxva2_t *);
283
284 static void DxCreateVideoConversion(vlc_va_dxva2_t *);
285 static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
286
287 static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
288                          uint8_t *cache, size_t cache_size,
289                          unsigned width, unsigned height);
290 static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
291                          uint8_t *cache, size_t cache_size,
292                          unsigned width, unsigned height);
293
294 /* */
295 static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
296                  int width, int height)
297 {
298     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
299
300     if (va->surface_width  == width &&
301         va->surface_height == height)
302         goto ok;
303
304     /* */
305     DxDestroyVideoConversion(va);
306     DxDestroyVideoDecoder(va);
307
308     *hw = NULL;
309     *chroma = 0;
310     if (width <= 0 || height <= 0)
311         return VLC_EGENERIC;
312
313     /* FIXME transmit a video_format_t by VaSetup directly */
314     video_format_t fmt;
315     memset(&fmt, 0, sizeof(fmt));
316     fmt.i_width = width;
317     fmt.i_height = height;
318
319     if (DxCreateVideoDecoder(va, va->codec_id, &fmt))
320         return VLC_EGENERIC;
321     /* */
322     va->hw.decoder = va->decoder;
323     if (va->codec_id == CODEC_ID_H264)
324         va->hw.cfg = &va->cfg;
325     va->hw.surface_count = va->surface_count;
326     va->hw.surface = va->hw_surface;
327     for (unsigned i = 0; i < va->surface_count; i++)
328         va->hw.surface[i] = va->surface[i].d3d;
329
330     /* */
331     DxCreateVideoConversion(va);
332
333     /* */
334 ok:
335     *hw = &va->hw;
336     const d3d_format_t *output = D3dFindFormat(va->output);
337     *chroma = output->codec;
338
339     return VLC_SUCCESS;
340 }
341
342 static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
343 {
344     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
345     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
346
347     if (!va->surface_cache)
348         return VLC_EGENERIC;
349
350     /* */
351     assert(va->output == MAKEFOURCC('Y','V','1','2'));
352
353     /* */
354     D3DLOCKED_RECT lock;
355     if (FAILED(IDirect3DSurface9_LockRect(d3d, &lock, NULL, D3DLOCK_READONLY))) {
356         msg_Err(va->log, "Failed to lock surface");
357         return VLC_EGENERIC;
358     }
359
360     if (va->render == MAKEFOURCC('Y','V','1','2')) {
361         CopyFromYv12(picture, &lock,
362                      va->surface_cache, va->surface_cache_size,
363                      va->surface_width, va->surface_height);
364     } else {
365         assert(va->render == MAKEFOURCC('N','V','1','2'));
366         CopyFromNv12(picture, &lock,
367                      va->surface_cache, va->surface_cache_size,
368                      va->surface_width, va->surface_height);
369     }
370
371     /* */
372     IDirect3DSurface9_UnlockRect(d3d);
373     return VLC_SUCCESS;
374 }
375 /* FIXME it is nearly common with VAAPI */
376 static int Get(vlc_va_t *external, AVFrame *ff)
377 {
378     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
379
380     /* Check the device */
381     HRESULT hr = IDirect3DDeviceManager9_TestDevice(va->devmng, va->device);
382     if (hr == DXVA2_E_NEW_VIDEO_DEVICE) {
383         if (DxResetVideoDecoder(va))
384             return VLC_EGENERIC;
385     } else if (FAILED(hr)) {
386         msg_Err(va->log, "IDirect3DDeviceManager9_TestDevice %u", (unsigned)hr);
387         return VLC_EGENERIC;
388     }
389
390     /* Grab an unused surface, in case none are, try the oldest
391      * XXX using the oldest is a workaround in case a problem happens with ffmpeg */
392     unsigned i, old;
393     for (i = 0, old = 0; i < va->surface_count; i++) {
394         vlc_va_surface_t *surface = &va->surface[i];
395
396         if (!surface->refcount)
397             break;
398
399         if (surface->order < va->surface[old].order)
400             old = i;
401     }
402     if (i >= va->surface_count)
403         i = old;
404
405     vlc_va_surface_t *surface = &va->surface[i];
406
407     surface->refcount = 1;
408     surface->order = va->surface_order++;
409
410     /* */
411     for (int i = 0; i < 4; i++) {
412         ff->data[i] = NULL;
413         ff->linesize[i] = 0;
414
415         if (i == 0 || i == 3)
416             ff->data[i] = (void*)surface->d3d;/* Yummie */
417     }
418     return VLC_SUCCESS;
419 }
420 static void Release(vlc_va_t *external, AVFrame *ff)
421 {
422     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
423     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
424
425     for (unsigned i = 0; i < va->surface_count; i++) {
426         vlc_va_surface_t *surface = &va->surface[i];
427
428         if (surface->d3d == d3d)
429             surface->refcount--;
430     }
431 }
432 static void Close(vlc_va_t *external)
433 {
434     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
435
436     DxDestroyVideoConversion(va);
437     DxDestroyVideoDecoder(va);
438     DxDestroyVideoService(va);
439     D3dDestroyDeviceManager(va);
440     D3dDestroyDevice(va);
441
442     if (va->hdxva2_dll)
443         FreeLibrary(va->hdxva2_dll);
444     if (va->hd3d9_dll)
445         FreeLibrary(va->hd3d9_dll);
446
447     free(va->va.description);
448     free(va);
449 }
450
451 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
452 {
453     vlc_va_dxva2_t *va = calloc(1, sizeof(*va));
454     if (!va)
455         return NULL;
456
457     /* */
458     va->log = log;
459     va->codec_id = codec_id;
460
461     /* Load dll*/
462     va->hd3d9_dll = LoadLibrary(TEXT("D3D9.DLL"));
463     if (!va->hd3d9_dll) {
464         msg_Warn(va->log, "cannot load d3d9.dll");
465         goto error;
466     }
467     va->hdxva2_dll = LoadLibrary(TEXT("DXVA2.DLL"));
468     if (!va->hdxva2_dll) {
469         msg_Warn(va->log, "cannot load dxva2.dll");
470         goto error;
471     }
472     msg_Dbg(va->log, "DLLs loaded");
473
474     /* */
475     if (D3dCreateDevice(va)) {
476         msg_Err(va->log, "Failed to create Direct3D device");
477         goto error;
478     }
479     msg_Dbg(va->log, "D3dCreateDevice succeed");
480
481     if (D3dCreateDeviceManager(va)) {
482         msg_Err(va->log, "D3dCreateDeviceManager failed");
483         goto error;
484     }
485
486     if (DxCreateVideoService(va)) {
487         msg_Err(va->log, "DxCreateVideoService failed");
488         goto error;
489     }
490
491     /* */
492     if (DxFindVideoServiceConversion(va, &va->input, &va->render)) {
493         msg_Err(va->log, "DxFindVideoServiceConversion failed");
494         goto error;
495     }
496
497     /* TODO print the hardware name/vendor for debugging purposes */
498     va->va.description = DxDescribe(va);
499     va->va.setup   = Setup;
500     va->va.get     = Get;
501     va->va.release = Release;
502     va->va.extract = Extract;
503     va->va.close   = Close;
504     return &va->va;
505
506 error:
507     Close(&va->va);
508     return NULL;
509 }
510 /* */
511
512 /**
513  * It creates a Direct3D device usable for DXVA 2
514  */
515 static int D3dCreateDevice(vlc_va_dxva2_t *va)
516 {
517     /* */
518     LPDIRECT3D9 (WINAPI *Create9)(UINT SDKVersion);
519     Create9 = (void *)GetProcAddress(va->hd3d9_dll,
520                                      TEXT("Direct3DCreate9"));
521     if (!Create9) {
522         msg_Err(va->log, "Cannot locate reference to Direct3DCreate9 ABI in DLL");
523         return VLC_EGENERIC;
524     }
525
526     /* */
527     LPDIRECT3D9 d3dobj;
528     d3dobj = Create9(D3D_SDK_VERSION);
529     if (!d3dobj) {
530         msg_Err(va->log, "Direct3DCreate9 failed");
531         return VLC_EGENERIC;
532     }
533     va->d3dobj = d3dobj;
534
535     /* */
536     D3DPRESENT_PARAMETERS *d3dpp = &va->d3dpp;
537     ZeroMemory(d3dpp, sizeof(*d3dpp));
538     d3dpp->Flags                  = D3DPRESENTFLAG_VIDEO;
539     d3dpp->Windowed               = TRUE;
540     d3dpp->hDeviceWindow          = NULL;
541     d3dpp->SwapEffect             = D3DSWAPEFFECT_DISCARD;
542     d3dpp->MultiSampleType        = D3DMULTISAMPLE_NONE;
543     d3dpp->PresentationInterval   = D3DPRESENT_INTERVAL_DEFAULT;
544     d3dpp->BackBufferCount        = 0;                  /* FIXME what to put here */
545     d3dpp->BackBufferFormat       = D3DFMT_X8R8G8B8;    /* FIXME what to put here */
546     d3dpp->BackBufferWidth        = 0;
547     d3dpp->BackBufferHeight       = 0;
548     d3dpp->EnableAutoDepthStencil = FALSE;
549
550     /* Direct3D needs a HWND to create a device, even without using ::Present
551     this HWND is used to alert Direct3D when there's a change of focus window.
552     For now, use GetShellWindow, as it looks harmless */
553     LPDIRECT3DDEVICE9 d3ddev;
554     if (FAILED(IDirect3D9_CreateDevice(d3dobj, D3DADAPTER_DEFAULT,
555                                        D3DDEVTYPE_HAL, GetShellWindow(),
556                                        D3DCREATE_SOFTWARE_VERTEXPROCESSING |
557                                        D3DCREATE_MULTITHREADED,
558                                        d3dpp, &d3ddev))) {
559         msg_Err(va->log, "IDirect3D9_CreateDevice failed");
560         return VLC_EGENERIC;
561     }
562     va->d3ddev = d3ddev;
563
564     return VLC_SUCCESS;
565 }
566 /**
567  * It releases a Direct3D device and its resources.
568  */
569 static void D3dDestroyDevice(vlc_va_dxva2_t *va)
570 {
571     if (va->d3ddev)
572         IDirect3DDevice9_Release(va->d3ddev);
573     if (va->d3dobj)
574         IDirect3D9_Release(va->d3dobj);
575 }
576 /**
577  * It describes our Direct3D object
578  */
579 static char *DxDescribe(vlc_va_dxva2_t *va)
580 {
581     D3DADAPTER_IDENTIFIER9 id;
582     ZeroMemory(&id, sizeof(id));
583
584     if (FAILED(IDirect3D9_GetAdapterIdentifier(va->d3dobj,
585                                                D3DADAPTER_DEFAULT, 0, &id)))
586         return strdup("DXVA2 (unknown)");
587
588     char *description;
589     if (asprintf(&description, "DXVA2 (%.*s, vendor %d, device %d, revision %d)",
590                  sizeof(id.Description), id.Description,
591                  id.VendorId, id.DeviceId, id.Revision) < 0)
592         return NULL;
593     return description;
594 }
595
596 /**
597  * It creates a Direct3D device manager
598  */
599 static int D3dCreateDeviceManager(vlc_va_dxva2_t *va)
600 {
601     HRESULT (WINAPI *CreateDeviceManager9)(UINT *pResetToken,
602                                            IDirect3DDeviceManager9 **);
603     CreateDeviceManager9 =
604       (void *)GetProcAddress(va->hdxva2_dll,
605                              TEXT("DXVA2CreateDirect3DDeviceManager9"));
606
607     if (!CreateDeviceManager9) {
608         msg_Err(va->log, "cannot load function\n");
609         return VLC_EGENERIC;
610     }
611     msg_Dbg(va->log, "OurDirect3DCreateDeviceManager9 Success!");
612
613     UINT token;
614     IDirect3DDeviceManager9 *devmng;
615     if (FAILED(CreateDeviceManager9(&token, &devmng))) {
616         msg_Err(va->log, " OurDirect3DCreateDeviceManager9 failed");
617         return VLC_EGENERIC;
618     }
619     va->token  = token;
620     va->devmng = devmng;
621     msg_Info(va->log, "obtained IDirect3DDeviceManager9");
622
623     HRESULT hr = IDirect3DDeviceManager9_ResetDevice(devmng, va->d3ddev, token);
624     if (FAILED(hr)) {
625         msg_Err(va->log, "IDirect3DDeviceManager9_ResetDevice failed: %08x", (unsigned)hr);
626         return VLC_EGENERIC;
627     }
628     return VLC_SUCCESS;
629 }
630 /**
631  * It destroys a Direct3D device manager
632  */
633 static void D3dDestroyDeviceManager(vlc_va_dxva2_t *va)
634 {
635     if (va->devmng)
636         IDirect3DDeviceManager9_Release(va->devmng);
637 }
638
639 /**
640  * It creates a DirectX video service
641  */
642 static int DxCreateVideoService(vlc_va_dxva2_t *va)
643 {
644     HRESULT (WINAPI *CreateVideoService)(IDirect3DDevice9 *,
645                                          REFIID riid,
646                                          void **ppService);
647     CreateVideoService =
648       (void *)GetProcAddress(va->hdxva2_dll,
649                              TEXT("DXVA2CreateVideoService"));
650
651     if (!CreateVideoService) {
652         msg_Err(va->log, "cannot load function\n");
653         return 4;
654     }
655     msg_Info(va->log, "DXVA2CreateVideoService Success!");
656
657     HRESULT hr;
658
659     HANDLE device;
660     hr = IDirect3DDeviceManager9_OpenDeviceHandle(va->devmng, &device);
661     if (FAILED(hr)) {
662         msg_Err(va->log, "OpenDeviceHandle failed");
663         return VLC_EGENERIC;
664     }
665     va->device = device;
666
667     IDirectXVideoDecoderService *vs;
668     hr = IDirect3DDeviceManager9_GetVideoService(va->devmng, device,
669                                                  &IID_IDirectXVideoDecoderService,
670                                                  &vs);
671     if (FAILED(hr)) {
672         msg_Err(va->log, "GetVideoService failed");
673         return VLC_EGENERIC;
674     }
675     va->vs = vs;
676
677     return VLC_SUCCESS;
678 }
679 /**
680  * It destroys a DirectX video service
681  */
682 static void DxDestroyVideoService(vlc_va_dxva2_t *va)
683 {
684     if (va->device)
685         IDirect3DDeviceManager9_CloseDeviceHandle(va->devmng, va->device);
686     if (va->vs)
687         IDirectXVideoDecoderService_Release(va->vs);
688 }
689 /**
690  * Find the best suited decoder mode GUID and render format.
691  */
692 static int DxFindVideoServiceConversion(vlc_va_dxva2_t *va, GUID *input, D3DFORMAT *output)
693 {
694     /* Retreive supported modes from the decoder service */
695     UINT input_count = 0;
696     GUID *input_list = NULL;
697     if (FAILED(IDirectXVideoDecoderService_GetDecoderDeviceGuids(va->vs,
698                                                                  &input_count,
699                                                                  &input_list))) {
700         msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderDeviceGuids failed");
701         return VLC_EGENERIC;
702     }
703     for (unsigned i = 0; i < input_count; i++) {
704         const GUID *g = &input_list[i];
705         const dxva2_mode_t *mode = Dxva2FindMode(g);
706         if (mode) {
707             msg_Dbg(va->log, "- '%s' is supported by hardware", mode->name);
708         } else {
709             msg_Warn(va->log, "- Unknown GUID = %08X-%04x-%04x-XXXX",
710                      (unsigned)g->Data1, g->Data2, g->Data3);
711         }
712     }
713
714     /* Try all supported mode by our priority */
715     for (unsigned i = 0; dxva2_modes[i].name; i++) {
716         const dxva2_mode_t *mode = &dxva2_modes[i];
717         if (!mode->codec || mode->codec != va->codec_id)
718             continue;
719
720         /* */
721         bool is_suported = false;
722         for (const GUID *g = &input_list[0]; !is_suported && g < &input_list[input_count]; g++) {
723             is_suported = IsEqualGUID(mode->guid, g);
724         }
725         if (!is_suported)
726             continue;
727
728         /* */
729         msg_Dbg(va->log, "Trying to use '%s' as input", mode->name);
730         UINT      output_count = 0;
731         D3DFORMAT *output_list = NULL;
732         if (FAILED(IDirectXVideoDecoderService_GetDecoderRenderTargets(va->vs, mode->guid,
733                                                                        &output_count,
734                                                                        &output_list))) {
735             msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderRenderTargets failed");
736             continue;
737         }
738         for (unsigned j = 0; j < output_count; j++) {
739             const D3DFORMAT f = output_list[j];
740             const d3d_format_t *format = D3dFindFormat(f);
741             if (format) {
742                 msg_Dbg(va->log, "%s is supported for output", format->name);
743             } else {
744                 msg_Dbg(va->log, "%d is supported for output (%4.4s)", f, (const char*)&f);
745             }
746         }
747
748         /* */
749         for (unsigned j = 0; d3d_formats[j].name; j++) {
750             const d3d_format_t *format = &d3d_formats[j];
751
752             /* */
753             bool is_suported = false;
754             for (unsigned k = 0; !is_suported && k < output_count; k++) {
755                 is_suported = format->format == output_list[k];
756             }
757             if (!is_suported)
758                 continue;
759
760             /* We have our solution */
761             msg_Dbg(va->log, "Using '%s' to decode to '%s'", mode->name, format->name);
762             *input  = *mode->guid;
763             *output = format->format;
764             CoTaskMemFree(output_list);
765             CoTaskMemFree(input_list);
766             return VLC_SUCCESS;
767         }
768         CoTaskMemFree(output_list);
769     }
770     CoTaskMemFree(input_list);
771     return VLC_EGENERIC;
772 }
773
774 /**
775  * It creates a DXVA2 decoder using the given video format
776  */
777 static int DxCreateVideoDecoder(vlc_va_dxva2_t *va,
778                                 int codec_id, const video_format_t *fmt)
779 {
780     /* */
781     msg_Dbg(va->log, "DxCreateVideoDecoder id %d %dx%d",
782             codec_id, fmt->i_width, fmt->i_height);
783
784     /* Allocates all surfaces needed for the decoder */
785     switch (codec_id) {
786     case CODEC_ID_H264:
787         va->surface_count = 16 + 1;
788         break;
789     default:
790         va->surface_count = 2 + 1;
791         break;
792     }
793     LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT];
794     if (FAILED(IDirectXVideoDecoderService_CreateSurface(va->vs,
795                                                          fmt->i_width,
796                                                          fmt->i_height,
797                                                          va->surface_count - 1,
798                                                          va->render,
799                                                          D3DPOOL_DEFAULT,
800                                                          0,
801                                                          DXVA2_VideoDecoderRenderTarget,
802                                                          surface_list,
803                                                          NULL))) {
804         msg_Err(va->log, "IDirectXVideoAccelerationService_CreateSurface failed\n");
805         va->surface_count = 0;
806         return VLC_EGENERIC;
807     }
808     for (unsigned i = 0; i < va->surface_count; i++) {
809         vlc_va_surface_t *surface = &va->surface[i];
810         surface->d3d = surface_list[i];
811         surface->refcount = 0;
812         surface->order = 0;
813     }
814     va->surface_width  = fmt->i_width;
815     va->surface_height = fmt->i_height;
816     msg_Dbg(va->log, "IDirectXVideoAccelerationService_CreateSurface succeed with %d surfaces (%dx%d)",
817             va->surface_count, fmt->i_width, fmt->i_height);
818
819     /* */
820     DXVA2_VideoDesc dsc;
821     ZeroMemory(&dsc, sizeof(dsc));
822     dsc.SampleWidth     = fmt->i_width;
823     dsc.SampleHeight    = fmt->i_height;
824     dsc.Format          = va->render;
825     if (fmt->i_frame_rate > 0 && fmt->i_frame_rate_base > 0) {
826         dsc.InputSampleFreq.Numerator   = fmt->i_frame_rate;
827         dsc.InputSampleFreq.Denominator = fmt->i_frame_rate_base;
828     } else {
829         dsc.InputSampleFreq.Numerator   = 0;
830         dsc.InputSampleFreq.Denominator = 0;
831     }
832     dsc.OutputFrameFreq = dsc.InputSampleFreq;
833     dsc.UABProtectionLevel = FALSE;
834     dsc.Reserved = 0;
835
836     /* FIXME I am unsure we can let unknown everywhere */
837     DXVA2_ExtendedFormat *ext = &dsc.SampleFormat;
838     ext->SampleFormat = 0;//DXVA2_SampleUnknown;
839     ext->VideoChromaSubsampling = 0;//DXVA2_VideoChromaSubsampling_Unknown;
840     ext->NominalRange = 0;//DXVA2_NominalRange_Unknown;
841     ext->VideoTransferMatrix = 0;//DXVA2_VideoTransferMatrix_Unknown;
842     ext->VideoLighting = 0;//DXVA2_VideoLighting_Unknown;
843     ext->VideoPrimaries = 0;//DXVA2_VideoPrimaries_Unknown;
844     ext->VideoTransferFunction = 0;//DXVA2_VideoTransFunc_Unknown;
845
846     /* List all configurations available for the decoder */
847     UINT                      cfg_count = 0;
848     DXVA2_ConfigPictureDecode *cfg_list = NULL;
849     if (FAILED(IDirectXVideoDecoderService_GetDecoderConfigurations(va->vs,
850                                                                     &va->input,
851                                                                     &dsc,
852                                                                     NULL,
853                                                                     &cfg_count,
854                                                                     &cfg_list))) {
855         msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderConfigurations failed\n");
856         return VLC_EGENERIC;
857     }
858     msg_Dbg(va->log, "we got %d decoder configurations", cfg_count);
859
860     /* Select the best decoder configuration */
861     bool has_cfg = false;
862     for (unsigned i = 0; i < cfg_count; i++) {
863         const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i];
864
865         /* */
866         msg_Dbg(va->log, "configuration[%d] ConfigBitstreamRaw %d",
867                 i, cfg->ConfigBitstreamRaw);
868
869         /* */
870         if ((!has_cfg && cfg->ConfigBitstreamRaw == 1) ||
871             (codec_id == CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2)) {
872             va->cfg = *cfg;
873             has_cfg = true;
874         }
875     }
876     CoTaskMemFree(cfg_list);
877     if (!has_cfg) {
878         msg_Err(va->log, "Failed to find a supported decoder configuration");
879         return VLC_EGENERIC;
880     }
881
882     /* Create the decoder */
883     IDirectXVideoDecoder *decoder;
884     if (FAILED(IDirectXVideoDecoderService_CreateVideoDecoder(va->vs,
885                                                               &va->input,
886                                                               &dsc,
887                                                               &va->cfg,
888                                                               surface_list,
889                                                               va->surface_count,
890                                                               &decoder))) {
891         msg_Err(va->log, "IDirectXVideoDecoderService_CreateVideoDecoder failed\n");
892         return VLC_EGENERIC;
893     }
894     va->decoder = decoder;
895     msg_Dbg(va->log, "IDirectXVideoDecoderService_CreateVideoDecoder succeed");
896     return VLC_SUCCESS;
897 }
898 static void DxDestroyVideoDecoder(vlc_va_dxva2_t *va)
899 {
900     if (va->decoder)
901         IDirectXVideoDecoder_Release(va->decoder);
902     va->decoder = NULL;
903
904     for (unsigned i = 0; i < va->surface_count; i++)
905         IDirect3DSurface9_Release(va->surface[i].d3d);
906     va->surface_count = 0;
907 }
908 static int DxResetVideoDecoder(vlc_va_dxva2_t *va)
909 {
910     msg_Err(va->log, "DxResetVideoDecoder unimplemented");
911     return VLC_EGENERIC;
912 }
913
914 static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
915 {
916     switch (va->render) {
917     case MAKEFOURCC('N','V','1','2'):
918         va->output = MAKEFOURCC('Y','V','1','2');
919         break;
920     default:
921         va->output = va->render;
922         break;
923     }
924     va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
925     va->surface_cache_base = malloc(16 + va->surface_cache_size);
926     va->surface_cache      = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
927 }
928 static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
929 {
930     free(va->surface_cache_base);
931     va->surface_cache_base = NULL;
932     va->surface_cache      = NULL;
933     va->surface_cache_size = 0;
934 }
935
/* Copy 64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction `load` and storing it with the SSE>=2 instruction `store`
 * (both given as string literals, e.g. "movdqa", "movdqu", "movntdqa").
 *
 * The four 16-byte chunks transit through xmm1..xmm4; these registers are
 * declared as clobbered so the compiler never keeps live values in them
 * across the statement. The "memory" clobber tells the compiler that the
 * asm reads and writes memory it cannot see through the operands.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) \
        : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
950
/* Execute the instruction op (a string literal such as "mfence") only if
 * SSE2 support was compiled in and the cpu capability mask reports it.
 *
 * The asm carries a "memory" clobber so that the compiler does not move
 * loads/stores across it; without it a fence instruction would order the
 * CPU but not the compiler. When SSE2 cannot be compiled the macro still
 * expands to a single statement that merely evaluates cpu.
 */
#ifdef CAN_COMPILE_SSE2
#   define ASM_SSE2(cpu, op) do {              \
        if ((cpu) & CPU_CAPABILITY_SSE2)       \
            asm volatile (op : : : "memory");  \
    } while (0)
#else
#   define ASM_SSE2(cpu, op) do { (void)(cpu); } while (0)
#endif
960
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * dst/dst_pitch: destination rows; dst and dst_pitch must be multiples of 16.
 * src/src_pitch: source rows, any alignment.
 * unaligned:     kept for interface compatibility only; the number of
 *                leading bytes to copy is recomputed for every row (see
 *                below) instead of trusting the caller's value.
 * width/height:  size of the area to copy, in bytes and rows.
 * cpu:           CPU capability mask used to pick the SSE variant.
 *
 * The SSE loads used here (movdqa/movntdqa) fault on addresses that are
 * not 16-byte aligned, so each row starts with a byte-wise copy of the
 * bytes needed to bring src up to the next 16-byte boundary. That count is
 * (-src) & 15, not src & 15, and it is clamped to width so a very narrow
 * row is never over-copied.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    (void)unaligned;

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Bytes to copy one by one before &src[x] is 16-byte aligned */
        const unsigned head = __MIN((unsigned)(-(uintptr_t)src & 0x0f), width);
        unsigned x;

        for (x = 0; x < head; x++)
            dst[x] = src[x];

        /* dst is aligned, so &dst[x] is aligned only when no head bytes
         * were copied; otherwise unaligned stores must be used. */
#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        /* Remaining bytes (or the whole row when no SSE path was taken) */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
1010
/* Copy a width x height byte area from an aligned source (src and
 * src_pitch must be multiples of 16) to an arbitrary destination.
 *
 * With SSE2, rows are moved 64 bytes at a time: non-temporal stores
 * (movntdq) when the destination row is 16-byte aligned, unaligned stores
 * (movdqu) otherwise. Whatever is left is copied byte by byte.
 */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned row = 0; row < height; row++, src += src_pitch, dst += dst_pitch) {
        unsigned col = 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            /* Alignment is re-evaluated for each row */
            const bool dst_aligned = ((intptr_t)dst & 0x0f) == 0;

            if (dst_aligned) {
                while (col+63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movntdq");
                    col += 64;
                }
            } else {
                while (col+63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movdqu");
                    col += 64;
                }
            }
        }
#endif

        /* Tail of the row, or the full row without SSE2 */
        while (col < width) {
            dst[col] = src[col];
            col++;
        }
    }
}
1043
/* De-interleave a packed two-component chroma plane into two planes.
 *
 * For every output position x of every row, src[2*x] goes to dstu[x] and
 * src[2*x+1] goes to dstv[x]. src and src_pitch must be multiples of 16
 * (aligned loads are used); width is counted in output samples per plane.
 *
 * Both SIMD paths handle 32 output samples (64 source bytes) per
 * iteration and leave the remainder of the row to the scalar loop:
 *  - SSSE3: pshufb gathers the even bytes in the low half and the odd
 *    bytes in the high half of each register.
 *  - SSE2: the odd bytes (psrlw) end up in the low half and the even
 *    bytes (pand with the mask) in the high half after packuswb, which is
 *    why dst1/dst2 are bound the other way round for that path.
 *
 * The asm statements list the xmm registers they modify as clobbers so
 * the compiler never keeps live values in them across the statements.
 */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

/* Load 64 source bytes into xmm0..xmm3 (aligned loads) */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low halves of xmm0..xmm3 to dst1 and the high halves to dst2 */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                      "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        /* Scalar tail (or the whole row when no SIMD path was taken) */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
1130
/* Copy one plane of width x height bytes from src to dst, going through
 * the 16-byte aligned scratch buffer `cache` of `cache_size` bytes.
 *
 * The plane is processed in bands of as many lines as fit in the cache
 * (each cached line occupies width rounded up to 16 bytes): a band is
 * first pulled into the cache with CopyFromUswc(), then written to the
 * destination with Copy2d().
 *
 * An empty plane is a no-op; this also keeps the band-height division
 * from dividing by zero. The cache must be able to hold at least one
 * line; this is asserted, and additionally checked so that a build with
 * NDEBUG returns instead of looping forever on a zero band height.
 */
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    if (width == 0 || height == 0)
        return;

    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);
    if (hstep == 0)
        return;

    for (unsigned y = 0; y < height; y += hstep) {
        /* Number of leading bytes before src reaches a 16-byte boundary */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     unaligned,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* Advance to the next band */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
/* Split a packed two-component chroma plane into two separate planes,
 * going through the 16-byte aligned scratch buffer `cache`.
 *
 * width/height are the dimensions of each output plane, so a source line
 * is 2*width bytes long. The source is processed in bands of as many
 * lines as fit in the cache (each cached line occupies 2*width rounded up
 * to 16 bytes): a band is pulled into the cache with CopyFromUswc(), then
 * de-interleaved into dstu/dstv with SplitUV().
 *
 * An empty plane is a no-op; this also keeps the band-height division
 * from dividing by zero. The cache must be able to hold at least one
 * line; this is asserted, and additionally checked so that a build with
 * NDEBUG returns instead of looping forever on a zero band height.
 */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    if (width == 0 || height == 0)
        return;

    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);
    if (hstep == 0)
        return;

    for (unsigned y = 0; y < height; y += hstep) {
        /* Number of leading bytes before src reaches a 16-byte boundary */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        const unsigned hblock =  __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     unaligned,
                     2*width, hblock, cpu);

        /* De-interleave from our cache to the destination planes */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* Advance to the next band */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
1197
1198 static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
1199                          uint8_t *cache, size_t cache_size,
1200                          unsigned width, unsigned height)
1201 {
1202     const unsigned cpu = vlc_CPU();
1203
1204     /* */
1205     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
1206               src->pBits, src->Pitch,
1207               cache, cache_size,
1208               width, height, cpu);
1209     SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
1210                 dst->p[1].p_pixels, dst->p[1].i_pitch,
1211                 (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
1212                 cache, cache_size,
1213                 width/2, height/2, cpu);
1214
1215     ASM_SSE2(cpu, "emms");
1216 }
1217 static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
1218                          uint8_t *cache, size_t cache_size,
1219                          unsigned width, unsigned height)
1220 {
1221     const unsigned cpu = vlc_CPU();
1222
1223     /* */
1224     for (unsigned n = 0, offset = 0; n < 3; n++) {
1225         const unsigned d = n > 0 ? 2 : 1;
1226         CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
1227                   (const uint8_t*)src->pBits + offset, src->Pitch/d,
1228                   cache, cache_size,
1229                   width/d, height/d, cpu);
1230         offset += (src->Pitch/d) * (height/d);
1231     }
1232     ASM_SSE2(cpu, "emms");
1233 }
1234 #undef ASM_SSE2
1235 #undef COPY64
1236
1237 #else
1238 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
1239 {
1240     (void)log;
1241     (void)codec_id;
1242     return NULL;
1243 }
1244 #endif