]> git.sesse.net Git - vlc/blob - modules/codec/avcodec/dxva2.c
Moved out avcodec SampleFormat -> VLC formats to its own function.
[vlc] / modules / codec / avcodec / dxva2.c
1 /*****************************************************************************
2  * dxva2.c: Video Acceleration helpers (DXVA2)
3  *****************************************************************************
4  * Copyright (C) 2009 Geoffroy Couprie
5  * Copyright (C) 2009 Laurent Aimar
6  * $Id$
7  *
8  * Authors: Geoffroy Couprie <geal@videolan.org>
9  *          Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
24  *****************************************************************************/
25
26 #ifdef HAVE_CONFIG_H
27 # include "config.h"
28 #endif
29
30 #include <vlc_common.h>
31 #include <vlc_picture.h>
32 #include <vlc_fourcc.h>
33 #include <vlc_cpu.h>
34 #include <assert.h>
35
36 #ifdef HAVE_LIBAVCODEC_AVCODEC_H
37 #   include <libavcodec/avcodec.h>
38 #   ifdef HAVE_AVCODEC_DXVA2
39 #       define DXVA2API_USE_BITFIELDS
40 #       include <libavcodec/dxva2.h>
41 #   endif
42 #elif defined(HAVE_FFMPEG_AVCODEC_H)
43 #   include <ffmpeg/avcodec.h>
44 #else
45 #   include <avcodec.h>
46 #endif
47
48 #include "avcodec.h"
49 #include "va.h"
50
51 #ifdef HAVE_AVCODEC_DXVA2
52
53 #include <windows.h>
54 #include <windowsx.h>
55 #include <ole2.h>
56 #include <commctrl.h>
57 #include <shlwapi.h>
58 #include <d3d9.h>
59
/* FIXME: CoTaskMemFree() is replaced here by a macro that expands to nothing,
 * so every CoTaskMemFree(...) call in this file is a no-op and the buffer
 * passed to it is not released. */
#define CoTaskMemFree(x)

/* HRESULT values used by the DXVA2 device manager, built by hand with
 * MAKE_HRESULT(severity, facility, code). */
#define DXVA2_E_NOT_INITIALIZED     MAKE_HRESULT(1, 4, 4096)
#define DXVA2_E_NEW_VIDEO_DEVICE    MAKE_HRESULT(1, 4, 4097)
#define DXVA2_E_VIDEO_DEVICE_LOCKED MAKE_HRESULT(1, 4, 4098)
#define DXVA2_E_NOT_AVAILABLE       MAKE_HRESULT(1, 4, 4099)
68
69 static const GUID DXVA2_ModeMPEG2_MoComp = {
70     0xe6a9f44b, 0x61b0, 0x4563, {0x9e,0xa4,0x63,0xd2,0xa3,0xc6,0xfe,0x66}
71 };
72 static const GUID DXVA2_ModeMPEG2_IDCT = {
73   0xbf22ad00, 0x03ea, 0x4690, {0x80,0x77,0x47,0x33,0x46,0x20,0x9b,0x7e}
74 };
75 static const GUID DXVA2_ModeMPEG2_VLD = {
76    0xee27417f, 0x5e28, 0x4e65, {0xbe,0xea,0x1d,0x26,0xb5,0x08,0xad,0xc9}
77 };
78
79 static const GUID DXVA2_ModeH264_A = {
80     0x1b81be64, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
81 };
82 static const GUID DXVA2_ModeH264_B = {
83     0x1b81be65, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
84 };
85 static const GUID DXVA2_ModeH264_C = {
86     0x1b81be66, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
87 };
88 static const GUID DXVA2_ModeH264_D = {
89     0x1b81be67, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
90 };
91 static const GUID DXVA2_ModeH264_E = {
92     0x1b81be68, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
93 };
94 static const GUID DXVA2_ModeH264_F = {
95     0x1b81be69, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
96 };
97 static const GUID DXVADDI_Intel_ModeH264_A = {
98     0x604F8E64, 0x4951,0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
99 };
100 static const GUID DXVADDI_Intel_ModeH264_C = {
101     0x604F8E66,0x4951, 0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
102 };
103 static const GUID DXVADDI_Intel_ModeH264_E = {
104     0x604F8E68,0x4951, 0x4c54, {0x88,0xFE,0xAB,0xD2,0x5C,0x15,0xB3,0xD6}
105 };
106 static const GUID DXVA2_ModeWMV8_A = {
107     0x1b81be80, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
108 };
109 static const GUID DXVA2_ModeWMV8_B = {
110     0x1b81be81, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
111 };
112 static const GUID DXVA2_ModeWMV9_A = {
113     0x1b81be90, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
114 };
115 static const GUID DXVA2_ModeWMV9_B = {
116     0x1b81be91, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
117 };
118 static const GUID DXVA2_ModeWMV9_C = {
119     0x1b81be94, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
120 };
121
122 static const GUID DXVA2_ModeVC1_A = {
123     0x1b81beA0, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
124 };
125 static const GUID DXVA2_ModeVC1_B = {
126     0x1b81beA1, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
127 };
128 static const GUID DXVA2_ModeVC1_C = {
129     0x1b81beA2, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
130 };
131 static const GUID DXVA2_ModeVC1_D = {
132     0x1b81beA3, 0xa0c7,0x11d3, {0xb9,0x84,0x00,0xc0,0x4f,0x2e,0x73,0xc5}
133 };
134
135 /* */
136 typedef struct {
137     const char   *name;
138     const GUID   *guid;
139     int          codec;
140 } dxva2_mode_t;
141 /* XXX Prefered modes must come first */
142 static const dxva2_mode_t dxva2_modes[] = {
143     { "DXVA2_ModeMPEG2_VLD",    &DXVA2_ModeMPEG2_VLD,     0 },
144     { "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp,  0 },
145     { "DXVA2_ModeMPEG2_IDCT",   &DXVA2_ModeMPEG2_IDCT,    0 },
146
147     { "H.264 variable-length decoder (VLD), FGT",               &DXVA2_ModeH264_F,         CODEC_ID_H264 },
148     { "H.264 VLD, no FGT",                                      &DXVA2_ModeH264_E,         CODEC_ID_H264 },
149     { "H.264 VLD, no FGT (Intel)",                              &DXVADDI_Intel_ModeH264_E, CODEC_ID_H264 },
150     { "H.264 IDCT, FGT",                                        &DXVA2_ModeH264_D,         0             },
151     { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C,         0             },
152     { "H.264 inverse discrete cosine transform (IDCT), no FGT (Intel)", &DXVADDI_Intel_ModeH264_C, 0     },
153     { "H.264 MoComp, FGT",                                      &DXVA2_ModeH264_B,         0             },
154     { "H.264 motion compensation (MoComp), no FGT",             &DXVA2_ModeH264_A,         0             },
155     { "H.264 motion compensation (MoComp), no FGT (Intel)",     &DXVADDI_Intel_ModeH264_A, 0             },
156
157     { "Windows Media Video 8 MoComp",           &DXVA2_ModeWMV8_B, 0 },
158     { "Windows Media Video 8 post processing",  &DXVA2_ModeWMV8_A, 0 },
159
160     {  "Windows Media Video 9 IDCT",            &DXVA2_ModeWMV9_C, 0 },
161     {  "Windows Media Video 9 MoComp",          &DXVA2_ModeWMV9_B, 0 },
162     {  "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 },
163
164     { "VC-1 VLD",             &DXVA2_ModeVC1_D, CODEC_ID_VC1 },
165     { "VC-1 VLD",             &DXVA2_ModeVC1_D, CODEC_ID_WMV3 },
166     { "VC-1 IDCT",            &DXVA2_ModeVC1_C, 0 },
167     { "VC-1 MoComp",          &DXVA2_ModeVC1_B, 0 },
168     { "VC-1 post processing", &DXVA2_ModeVC1_A, 0 },
169
170     { NULL, NULL, 0 }
171 };
172
173 static const dxva2_mode_t *Dxva2FindMode(const GUID *guid)
174 {
175     for (unsigned i = 0; dxva2_modes[i].name; i++) {
176         if (IsEqualGUID(dxva2_modes[i].guid, guid))
177             return &dxva2_modes[i];
178     }
179     return NULL;
180 }
181
182 /* */
183 #define VLC_CODEC_NV12 VLC_FOURCC('N','V','1','2') /* TODO move to vlc_fourcc.h */
184 typedef struct {
185     const char   *name;
186     D3DFORMAT    format;
187     vlc_fourcc_t codec;
188 } d3d_format_t;
189 /* XXX Prefered format must come first */
190 static const d3d_format_t d3d_formats[] = {
191     { "YV12",   MAKEFOURCC('Y','V','1','2'),    VLC_CODEC_YV12 },
192     { "NV12",   MAKEFOURCC('N','V','1','2'),    VLC_CODEC_NV12 },
193
194     { NULL, 0, 0 }
195 };
196
197 static const d3d_format_t *D3dFindFormat(D3DFORMAT format)
198 {
199     for (unsigned i = 0; d3d_formats[i].name; i++) {
200         if (d3d_formats[i].format == format)
201             return &d3d_formats[i];
202     }
203     return NULL;
204 }
205
206 static const GUID IID_IDirectXVideoDecoderService = {
207     0xfc51a551, 0xd5e7, 0x11d9, {0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02}
208 };
209 static const GUID IID_IDirectXVideoAccelerationService = {
210     0xfc51a550, 0xd5e7, 0x11d9, {0xaf,0x55,0x00,0x05,0x4e,0x43,0xff,0x02}
211 };
212
213 /* */
214 typedef struct {
215     LPDIRECT3DSURFACE9 d3d;
216     int                refcount;
217     unsigned int       order;
218 } vlc_va_surface_t;
219
220 #define VA_DXVA2_MAX_SURFACE_COUNT (64)
221 typedef struct
222 {
223     /* */
224     vlc_va_t va;
225
226     /* */
227     vlc_object_t *log;
228     int          codec_id;
229
230     /* DLL */
231         HINSTANCE             hd3d9_dll;
232     HINSTANCE             hdxva2_dll;
233
234     /* Direct3D */
235     D3DPRESENT_PARAMETERS  d3dpp;
236     LPDIRECT3D9            d3dobj;
237     D3DADAPTER_IDENTIFIER9 d3dai;
238     LPDIRECT3DDEVICE9      d3ddev;
239
240     /* Device manager */
241     UINT                     token;
242     IDirect3DDeviceManager9  *devmng;
243     HANDLE                   device;
244
245     /* Video service */
246     IDirectXVideoDecoderService  *vs;
247     GUID                         input;
248     D3DFORMAT                    render;
249
250     /* Video decoder */
251     DXVA2_ConfigPictureDecode    cfg;
252     IDirectXVideoDecoder         *decoder;
253
254     /* Option conversion */
255     D3DFORMAT                    output;
256     uint8_t                      *surface_cache_base;
257     uint8_t                      *surface_cache;
258     size_t                       surface_cache_size;
259
260     /* */
261     struct dxva_context hw;
262
263     /* */
264     unsigned     surface_count;
265     unsigned     surface_order;
266     int          surface_width;
267     int          surface_height;
268     vlc_fourcc_t surface_chroma;
269
270     vlc_va_surface_t surface[VA_DXVA2_MAX_SURFACE_COUNT];
271     LPDIRECT3DSURFACE9 hw_surface[VA_DXVA2_MAX_SURFACE_COUNT];
272 } vlc_va_dxva2_t;
273
274 /* */
275 static vlc_va_dxva2_t *vlc_va_dxva2_Get(void *external)
276 {
277     assert(external == (void*)(&((vlc_va_dxva2_t*)external)->va));
278     return external;
279 }
280
281 /* */
282 static int D3dCreateDevice(vlc_va_dxva2_t *);
283 static void D3dDestroyDevice(vlc_va_dxva2_t *);
284 static char *DxDescribe(vlc_va_dxva2_t *);
285
286 static int D3dCreateDeviceManager(vlc_va_dxva2_t *);
287 static void D3dDestroyDeviceManager(vlc_va_dxva2_t *);
288
289 static int DxCreateVideoService(vlc_va_dxva2_t *);
290 static void DxDestroyVideoService(vlc_va_dxva2_t *);
291 static int DxFindVideoServiceConversion(vlc_va_dxva2_t *, GUID *input, D3DFORMAT *output);
292
293 static int DxCreateVideoDecoder(vlc_va_dxva2_t *,
294                                 int codec_id, const video_format_t *);
295 static void DxDestroyVideoDecoder(vlc_va_dxva2_t *);
296 static int DxResetVideoDecoder(vlc_va_dxva2_t *);
297
298 static void DxCreateVideoConversion(vlc_va_dxva2_t *);
299 static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
300
301 static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
302                          uint8_t *cache, size_t cache_size,
303                          unsigned width, unsigned height);
304 static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
305                          uint8_t *cache, size_t cache_size,
306                          unsigned width, unsigned height);
307
308 /* */
309 static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
310                  int width, int height)
311 {
312     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
313
314     if (va->surface_width  == width &&
315         va->surface_height == height)
316         goto ok;
317
318     /* */
319     DxDestroyVideoConversion(va);
320     DxDestroyVideoDecoder(va);
321
322     *hw = NULL;
323     *chroma = 0;
324     if (width <= 0 || height <= 0)
325         return VLC_EGENERIC;
326
327     /* FIXME transmit a video_format_t by VaSetup directly */
328     video_format_t fmt;
329     memset(&fmt, 0, sizeof(fmt));
330     fmt.i_width = width;
331     fmt.i_height = height;
332
333     if (DxCreateVideoDecoder(va, va->codec_id, &fmt))
334         return VLC_EGENERIC;
335     /* */
336     va->hw.decoder = va->decoder;
337     va->hw.cfg = &va->cfg;
338     va->hw.surface_count = va->surface_count;
339     va->hw.surface = va->hw_surface;
340     for (unsigned i = 0; i < va->surface_count; i++)
341         va->hw.surface[i] = va->surface[i].d3d;
342
343     /* */
344     DxCreateVideoConversion(va);
345
346     /* */
347 ok:
348     *hw = &va->hw;
349     const d3d_format_t *output = D3dFindFormat(va->output);
350     *chroma = output->codec;
351
352     return VLC_SUCCESS;
353 }
354
355 static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
356 {
357     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
358     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
359
360     if (!va->surface_cache)
361         return VLC_EGENERIC;
362
363     /* */
364     assert(va->output == MAKEFOURCC('Y','V','1','2'));
365
366     /* */
367     D3DLOCKED_RECT lock;
368     if (FAILED(IDirect3DSurface9_LockRect(d3d, &lock, NULL, D3DLOCK_READONLY))) {
369         msg_Err(va->log, "Failed to lock surface");
370         return VLC_EGENERIC;
371     }
372
373     if (va->render == MAKEFOURCC('Y','V','1','2')) {
374         CopyFromYv12(picture, &lock,
375                      va->surface_cache, va->surface_cache_size,
376                      va->surface_width, va->surface_height);
377     } else {
378         assert(va->render == MAKEFOURCC('N','V','1','2'));
379         CopyFromNv12(picture, &lock,
380                      va->surface_cache, va->surface_cache_size,
381                      va->surface_width, va->surface_height);
382     }
383
384     /* */
385     IDirect3DSurface9_UnlockRect(d3d);
386     return VLC_SUCCESS;
387 }
388 /* FIXME it is nearly common with VAAPI */
389 static int Get(vlc_va_t *external, AVFrame *ff)
390 {
391     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
392
393     /* Check the device */
394     HRESULT hr = IDirect3DDeviceManager9_TestDevice(va->devmng, va->device);
395     if (hr == DXVA2_E_NEW_VIDEO_DEVICE) {
396         if (DxResetVideoDecoder(va))
397             return VLC_EGENERIC;
398     } else if (FAILED(hr)) {
399         msg_Err(va->log, "IDirect3DDeviceManager9_TestDevice %u", (unsigned)hr);
400         return VLC_EGENERIC;
401     }
402
403     /* Grab an unused surface, in case none are, try the oldest
404      * XXX using the oldest is a workaround in case a problem happens with ffmpeg */
405     unsigned i, old;
406     for (i = 0, old = 0; i < va->surface_count; i++) {
407         vlc_va_surface_t *surface = &va->surface[i];
408
409         if (!surface->refcount)
410             break;
411
412         if (surface->order < va->surface[old].order)
413             old = i;
414     }
415     if (i >= va->surface_count)
416         i = old;
417
418     vlc_va_surface_t *surface = &va->surface[i];
419
420     surface->refcount = 1;
421     surface->order = va->surface_order++;
422
423     /* */
424     for (int i = 0; i < 4; i++) {
425         ff->data[i] = NULL;
426         ff->linesize[i] = 0;
427
428         if (i == 0 || i == 3)
429             ff->data[i] = (void*)surface->d3d;/* Yummie */
430     }
431     return VLC_SUCCESS;
432 }
433 static void Release(vlc_va_t *external, AVFrame *ff)
434 {
435     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
436     LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
437
438     for (unsigned i = 0; i < va->surface_count; i++) {
439         vlc_va_surface_t *surface = &va->surface[i];
440
441         if (surface->d3d == d3d)
442             surface->refcount--;
443     }
444 }
445 static void Close(vlc_va_t *external)
446 {
447     vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
448
449     DxDestroyVideoConversion(va);
450     DxDestroyVideoDecoder(va);
451     DxDestroyVideoService(va);
452     D3dDestroyDeviceManager(va);
453     D3dDestroyDevice(va);
454
455     if (va->hdxva2_dll)
456         FreeLibrary(va->hdxva2_dll);
457     if (va->hd3d9_dll)
458         FreeLibrary(va->hd3d9_dll);
459
460     free(va->va.description);
461     free(va);
462 }
463
464 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
465 {
466     vlc_va_dxva2_t *va = calloc(1, sizeof(*va));
467     if (!va)
468         return NULL;
469
470     /* */
471     va->log = log;
472     va->codec_id = codec_id;
473
474     /* Load dll*/
475     va->hd3d9_dll = LoadLibrary(TEXT("D3D9.DLL"));
476     if (!va->hd3d9_dll) {
477         msg_Warn(va->log, "cannot load d3d9.dll");
478         goto error;
479     }
480     va->hdxva2_dll = LoadLibrary(TEXT("DXVA2.DLL"));
481     if (!va->hdxva2_dll) {
482         msg_Warn(va->log, "cannot load dxva2.dll");
483         goto error;
484     }
485     msg_Dbg(va->log, "DLLs loaded");
486
487     /* */
488     if (D3dCreateDevice(va)) {
489         msg_Err(va->log, "Failed to create Direct3D device");
490         goto error;
491     }
492     msg_Dbg(va->log, "D3dCreateDevice succeed");
493
494     if (D3dCreateDeviceManager(va)) {
495         msg_Err(va->log, "D3dCreateDeviceManager failed");
496         goto error;
497     }
498
499     if (DxCreateVideoService(va)) {
500         msg_Err(va->log, "DxCreateVideoService failed");
501         goto error;
502     }
503
504     /* */
505     if (DxFindVideoServiceConversion(va, &va->input, &va->render)) {
506         msg_Err(va->log, "DxFindVideoServiceConversion failed");
507         goto error;
508     }
509
510     /* TODO print the hardware name/vendor for debugging purposes */
511     va->va.description = DxDescribe(va);
512     va->va.setup   = Setup;
513     va->va.get     = Get;
514     va->va.release = Release;
515     va->va.extract = Extract;
516     va->va.close   = Close;
517     return &va->va;
518
519 error:
520     Close(&va->va);
521     return NULL;
522 }
523 /* */
524
525 /**
526  * It creates a Direct3D device usable for DXVA 2
527  */
528 static int D3dCreateDevice(vlc_va_dxva2_t *va)
529 {
530     /* */
531     LPDIRECT3D9 (WINAPI *Create9)(UINT SDKVersion);
532     Create9 = (void *)GetProcAddress(va->hd3d9_dll,
533                                      TEXT("Direct3DCreate9"));
534     if (!Create9) {
535         msg_Err(va->log, "Cannot locate reference to Direct3DCreate9 ABI in DLL");
536         return VLC_EGENERIC;
537     }
538
539     /* */
540     LPDIRECT3D9 d3dobj;
541     d3dobj = Create9(D3D_SDK_VERSION);
542     if (!d3dobj) {
543         msg_Err(va->log, "Direct3DCreate9 failed");
544         return VLC_EGENERIC;
545     }
546     va->d3dobj = d3dobj;
547
548     /* */
549     D3DADAPTER_IDENTIFIER9 *d3dai = &va->d3dai;
550     if (FAILED(IDirect3D9_GetAdapterIdentifier(va->d3dobj,
551                                                D3DADAPTER_DEFAULT, 0, d3dai))) {
552         msg_Warn(va->log, "IDirect3D9_GetAdapterIdentifier failed");
553         ZeroMemory(d3dai, sizeof(*d3dai));
554     }
555
556     /* */
557     D3DPRESENT_PARAMETERS *d3dpp = &va->d3dpp;
558     ZeroMemory(d3dpp, sizeof(*d3dpp));
559     d3dpp->Flags                  = D3DPRESENTFLAG_VIDEO;
560     d3dpp->Windowed               = TRUE;
561     d3dpp->hDeviceWindow          = NULL;
562     d3dpp->SwapEffect             = D3DSWAPEFFECT_DISCARD;
563     d3dpp->MultiSampleType        = D3DMULTISAMPLE_NONE;
564     d3dpp->PresentationInterval   = D3DPRESENT_INTERVAL_DEFAULT;
565     d3dpp->BackBufferCount        = 0;                  /* FIXME what to put here */
566     d3dpp->BackBufferFormat       = D3DFMT_X8R8G8B8;    /* FIXME what to put here */
567     d3dpp->BackBufferWidth        = 0;
568     d3dpp->BackBufferHeight       = 0;
569     d3dpp->EnableAutoDepthStencil = FALSE;
570
571     /* Direct3D needs a HWND to create a device, even without using ::Present
572     this HWND is used to alert Direct3D when there's a change of focus window.
573     For now, use GetShellWindow, as it looks harmless */
574     LPDIRECT3DDEVICE9 d3ddev;
575     if (FAILED(IDirect3D9_CreateDevice(d3dobj, D3DADAPTER_DEFAULT,
576                                        D3DDEVTYPE_HAL, GetShellWindow(),
577                                        D3DCREATE_SOFTWARE_VERTEXPROCESSING |
578                                        D3DCREATE_MULTITHREADED,
579                                        d3dpp, &d3ddev))) {
580         msg_Err(va->log, "IDirect3D9_CreateDevice failed");
581         return VLC_EGENERIC;
582     }
583     va->d3ddev = d3ddev;
584
585     return VLC_SUCCESS;
586 }
587 /**
588  * It releases a Direct3D device and its resources.
589  */
590 static void D3dDestroyDevice(vlc_va_dxva2_t *va)
591 {
592     if (va->d3ddev)
593         IDirect3DDevice9_Release(va->d3ddev);
594     if (va->d3dobj)
595         IDirect3D9_Release(va->d3dobj);
596 }
597 /**
598  * It describes our Direct3D object
599  */
600 static char *DxDescribe(vlc_va_dxva2_t *va)
601 {
602     static const struct {
603         unsigned id;
604         char     name[32];
605     } vendors [] = {
606         { 0x1002, "ATI" },
607         { 0x10DE, "NVIDIA" },
608         { 0x8086, "Intel" },
609         { 0x5333, "S3 Graphics" },
610         { 0, "" }
611     };
612     D3DADAPTER_IDENTIFIER9 *id = &va->d3dai;
613
614     const char *vendor = "Unknown";
615     for (int i = 0; vendors[i].id != 0; i++) {
616         if (vendors[i].id == id->VendorId) {
617             vendor = vendors[i].name;
618             break;
619         }
620     }
621
622     char *description;
623     if (asprintf(&description, "DXVA2 (%.*s, vendor %d(%s), device %d, revision %d)",
624                  sizeof(id->Description), id->Description,
625                  id->VendorId, vendor, id->DeviceId, id->Revision) < 0)
626         return NULL;
627     return description;
628 }
629
630 /**
631  * It creates a Direct3D device manager
632  */
633 static int D3dCreateDeviceManager(vlc_va_dxva2_t *va)
634 {
635     HRESULT (WINAPI *CreateDeviceManager9)(UINT *pResetToken,
636                                            IDirect3DDeviceManager9 **);
637     CreateDeviceManager9 =
638       (void *)GetProcAddress(va->hdxva2_dll,
639                              TEXT("DXVA2CreateDirect3DDeviceManager9"));
640
641     if (!CreateDeviceManager9) {
642         msg_Err(va->log, "cannot load function\n");
643         return VLC_EGENERIC;
644     }
645     msg_Dbg(va->log, "OurDirect3DCreateDeviceManager9 Success!");
646
647     UINT token;
648     IDirect3DDeviceManager9 *devmng;
649     if (FAILED(CreateDeviceManager9(&token, &devmng))) {
650         msg_Err(va->log, " OurDirect3DCreateDeviceManager9 failed");
651         return VLC_EGENERIC;
652     }
653     va->token  = token;
654     va->devmng = devmng;
655     msg_Info(va->log, "obtained IDirect3DDeviceManager9");
656
657     HRESULT hr = IDirect3DDeviceManager9_ResetDevice(devmng, va->d3ddev, token);
658     if (FAILED(hr)) {
659         msg_Err(va->log, "IDirect3DDeviceManager9_ResetDevice failed: %08x", (unsigned)hr);
660         return VLC_EGENERIC;
661     }
662     return VLC_SUCCESS;
663 }
664 /**
665  * It destroys a Direct3D device manager
666  */
667 static void D3dDestroyDeviceManager(vlc_va_dxva2_t *va)
668 {
669     if (va->devmng)
670         IDirect3DDeviceManager9_Release(va->devmng);
671 }
672
673 /**
674  * It creates a DirectX video service
675  */
676 static int DxCreateVideoService(vlc_va_dxva2_t *va)
677 {
678     HRESULT (WINAPI *CreateVideoService)(IDirect3DDevice9 *,
679                                          REFIID riid,
680                                          void **ppService);
681     CreateVideoService =
682       (void *)GetProcAddress(va->hdxva2_dll,
683                              TEXT("DXVA2CreateVideoService"));
684
685     if (!CreateVideoService) {
686         msg_Err(va->log, "cannot load function\n");
687         return 4;
688     }
689     msg_Info(va->log, "DXVA2CreateVideoService Success!");
690
691     HRESULT hr;
692
693     HANDLE device;
694     hr = IDirect3DDeviceManager9_OpenDeviceHandle(va->devmng, &device);
695     if (FAILED(hr)) {
696         msg_Err(va->log, "OpenDeviceHandle failed");
697         return VLC_EGENERIC;
698     }
699     va->device = device;
700
701     IDirectXVideoDecoderService *vs;
702     hr = IDirect3DDeviceManager9_GetVideoService(va->devmng, device,
703                                                  &IID_IDirectXVideoDecoderService,
704                                                  &vs);
705     if (FAILED(hr)) {
706         msg_Err(va->log, "GetVideoService failed");
707         return VLC_EGENERIC;
708     }
709     va->vs = vs;
710
711     return VLC_SUCCESS;
712 }
713 /**
714  * It destroys a DirectX video service
715  */
716 static void DxDestroyVideoService(vlc_va_dxva2_t *va)
717 {
718     if (va->device)
719         IDirect3DDeviceManager9_CloseDeviceHandle(va->devmng, va->device);
720     if (va->vs)
721         IDirectXVideoDecoderService_Release(va->vs);
722 }
723 /**
724  * Find the best suited decoder mode GUID and render format.
725  */
726 static int DxFindVideoServiceConversion(vlc_va_dxva2_t *va, GUID *input, D3DFORMAT *output)
727 {
728     /* Retreive supported modes from the decoder service */
729     UINT input_count = 0;
730     GUID *input_list = NULL;
731     if (FAILED(IDirectXVideoDecoderService_GetDecoderDeviceGuids(va->vs,
732                                                                  &input_count,
733                                                                  &input_list))) {
734         msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderDeviceGuids failed");
735         return VLC_EGENERIC;
736     }
737     for (unsigned i = 0; i < input_count; i++) {
738         const GUID *g = &input_list[i];
739         const dxva2_mode_t *mode = Dxva2FindMode(g);
740         if (mode) {
741             msg_Dbg(va->log, "- '%s' is supported by hardware", mode->name);
742         } else {
743             msg_Warn(va->log, "- Unknown GUID = %08X-%04x-%04x-XXXX",
744                      (unsigned)g->Data1, g->Data2, g->Data3);
745         }
746     }
747
748     /* Try all supported mode by our priority */
749     for (unsigned i = 0; dxva2_modes[i].name; i++) {
750         const dxva2_mode_t *mode = &dxva2_modes[i];
751         if (!mode->codec || mode->codec != va->codec_id)
752             continue;
753
754         /* */
755         bool is_suported = false;
756         for (const GUID *g = &input_list[0]; !is_suported && g < &input_list[input_count]; g++) {
757             is_suported = IsEqualGUID(mode->guid, g);
758         }
759         if (!is_suported)
760             continue;
761
762         /* */
763         msg_Dbg(va->log, "Trying to use '%s' as input", mode->name);
764         UINT      output_count = 0;
765         D3DFORMAT *output_list = NULL;
766         if (FAILED(IDirectXVideoDecoderService_GetDecoderRenderTargets(va->vs, mode->guid,
767                                                                        &output_count,
768                                                                        &output_list))) {
769             msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderRenderTargets failed");
770             continue;
771         }
772         for (unsigned j = 0; j < output_count; j++) {
773             const D3DFORMAT f = output_list[j];
774             const d3d_format_t *format = D3dFindFormat(f);
775             if (format) {
776                 msg_Dbg(va->log, "%s is supported for output", format->name);
777             } else {
778                 msg_Dbg(va->log, "%d is supported for output (%4.4s)", f, (const char*)&f);
779             }
780         }
781
782         /* */
783         for (unsigned j = 0; d3d_formats[j].name; j++) {
784             const d3d_format_t *format = &d3d_formats[j];
785
786             /* */
787             bool is_suported = false;
788             for (unsigned k = 0; !is_suported && k < output_count; k++) {
789                 is_suported = format->format == output_list[k];
790             }
791             if (!is_suported)
792                 continue;
793
794             /* We have our solution */
795             msg_Dbg(va->log, "Using '%s' to decode to '%s'", mode->name, format->name);
796             *input  = *mode->guid;
797             *output = format->format;
798             CoTaskMemFree(output_list);
799             CoTaskMemFree(input_list);
800             return VLC_SUCCESS;
801         }
802         CoTaskMemFree(output_list);
803     }
804     CoTaskMemFree(input_list);
805     return VLC_EGENERIC;
806 }
807
808 /**
809  * It creates a DXVA2 decoder using the given video format
810  */
811 static int DxCreateVideoDecoder(vlc_va_dxva2_t *va,
812                                 int codec_id, const video_format_t *fmt)
813 {
814     /* */
815     msg_Dbg(va->log, "DxCreateVideoDecoder id %d %dx%d",
816             codec_id, fmt->i_width, fmt->i_height);
817
818     /* Allocates all surfaces needed for the decoder */
819     switch (codec_id) {
820     case CODEC_ID_H264:
821         va->surface_count = 16 + 1;
822         break;
823     default:
824         va->surface_count = 2 + 1;
825         break;
826     }
827     LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT];
828     if (FAILED(IDirectXVideoDecoderService_CreateSurface(va->vs,
829                                                          fmt->i_width,
830                                                          fmt->i_height,
831                                                          va->surface_count - 1,
832                                                          va->render,
833                                                          D3DPOOL_DEFAULT,
834                                                          0,
835                                                          DXVA2_VideoDecoderRenderTarget,
836                                                          surface_list,
837                                                          NULL))) {
838         msg_Err(va->log, "IDirectXVideoAccelerationService_CreateSurface failed\n");
839         va->surface_count = 0;
840         return VLC_EGENERIC;
841     }
842     for (unsigned i = 0; i < va->surface_count; i++) {
843         vlc_va_surface_t *surface = &va->surface[i];
844         surface->d3d = surface_list[i];
845         surface->refcount = 0;
846         surface->order = 0;
847     }
848     va->surface_width  = fmt->i_width;
849     va->surface_height = fmt->i_height;
850     msg_Dbg(va->log, "IDirectXVideoAccelerationService_CreateSurface succeed with %d surfaces (%dx%d)",
851             va->surface_count, fmt->i_width, fmt->i_height);
852
853     /* */
854     DXVA2_VideoDesc dsc;
855     ZeroMemory(&dsc, sizeof(dsc));
856     dsc.SampleWidth     = fmt->i_width;
857     dsc.SampleHeight    = fmt->i_height;
858     dsc.Format          = va->render;
859     if (fmt->i_frame_rate > 0 && fmt->i_frame_rate_base > 0) {
860         dsc.InputSampleFreq.Numerator   = fmt->i_frame_rate;
861         dsc.InputSampleFreq.Denominator = fmt->i_frame_rate_base;
862     } else {
863         dsc.InputSampleFreq.Numerator   = 0;
864         dsc.InputSampleFreq.Denominator = 0;
865     }
866     dsc.OutputFrameFreq = dsc.InputSampleFreq;
867     dsc.UABProtectionLevel = FALSE;
868     dsc.Reserved = 0;
869
870     /* FIXME I am unsure we can let unknown everywhere */
871     DXVA2_ExtendedFormat *ext = &dsc.SampleFormat;
872     ext->SampleFormat = 0;//DXVA2_SampleUnknown;
873     ext->VideoChromaSubsampling = 0;//DXVA2_VideoChromaSubsampling_Unknown;
874     ext->NominalRange = 0;//DXVA2_NominalRange_Unknown;
875     ext->VideoTransferMatrix = 0;//DXVA2_VideoTransferMatrix_Unknown;
876     ext->VideoLighting = 0;//DXVA2_VideoLighting_Unknown;
877     ext->VideoPrimaries = 0;//DXVA2_VideoPrimaries_Unknown;
878     ext->VideoTransferFunction = 0;//DXVA2_VideoTransFunc_Unknown;
879
880     /* List all configurations available for the decoder */
881     UINT                      cfg_count = 0;
882     DXVA2_ConfigPictureDecode *cfg_list = NULL;
883     if (FAILED(IDirectXVideoDecoderService_GetDecoderConfigurations(va->vs,
884                                                                     &va->input,
885                                                                     &dsc,
886                                                                     NULL,
887                                                                     &cfg_count,
888                                                                     &cfg_list))) {
889         msg_Err(va->log, "IDirectXVideoDecoderService_GetDecoderConfigurations failed\n");
890         return VLC_EGENERIC;
891     }
892     msg_Dbg(va->log, "we got %d decoder configurations", cfg_count);
893
894     /* Select the best decoder configuration */
895     bool has_cfg = false;
896     for (unsigned i = 0; i < cfg_count; i++) {
897         const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i];
898
899         /* */
900         msg_Dbg(va->log, "configuration[%d] ConfigBitstreamRaw %d",
901                 i, cfg->ConfigBitstreamRaw);
902
903         /* */
904         if ((!has_cfg && cfg->ConfigBitstreamRaw == 1) ||
905             (codec_id == CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2)) {
906             va->cfg = *cfg;
907             has_cfg = true;
908         }
909     }
910     CoTaskMemFree(cfg_list);
911     if (!has_cfg) {
912         msg_Err(va->log, "Failed to find a supported decoder configuration");
913         return VLC_EGENERIC;
914     }
915
916     /* Create the decoder */
917     IDirectXVideoDecoder *decoder;
918     if (FAILED(IDirectXVideoDecoderService_CreateVideoDecoder(va->vs,
919                                                               &va->input,
920                                                               &dsc,
921                                                               &va->cfg,
922                                                               surface_list,
923                                                               va->surface_count,
924                                                               &decoder))) {
925         msg_Err(va->log, "IDirectXVideoDecoderService_CreateVideoDecoder failed\n");
926         return VLC_EGENERIC;
927     }
928     va->decoder = decoder;
929     msg_Dbg(va->log, "IDirectXVideoDecoderService_CreateVideoDecoder succeed");
930     return VLC_SUCCESS;
931 }
932 static void DxDestroyVideoDecoder(vlc_va_dxva2_t *va)
933 {
934     if (va->decoder)
935         IDirectXVideoDecoder_Release(va->decoder);
936     va->decoder = NULL;
937
938     for (unsigned i = 0; i < va->surface_count; i++)
939         IDirect3DSurface9_Release(va->surface[i].d3d);
940     va->surface_count = 0;
941 }
942 static int DxResetVideoDecoder(vlc_va_dxva2_t *va)
943 {
944     msg_Err(va->log, "DxResetVideoDecoder unimplemented");
945     return VLC_EGENERIC;
946 }
947
948 static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
949 {
950     switch (va->render) {
951     case MAKEFOURCC('N','V','1','2'):
952         va->output = MAKEFOURCC('Y','V','1','2');
953         break;
954     default:
955         va->output = va->render;
956         break;
957     }
958     va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
959     va->surface_cache_base = malloc(16 + va->surface_cache_size);
960     va->surface_cache      = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
961 }
962 static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
963 {
964     free(va->surface_cache_base);
965     va->surface_cache_base = NULL;
966     va->surface_cache      = NULL;
967     va->surface_cache_size = 0;
968 }
969
/* Copy 64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction `load` and storing it with the SSE>=2 instruction `store`
 * (both given as string literals, e.g. "movdqa").
 *
 * The four XMM registers used as temporaries are declared as clobbered so
 * that the compiler never keeps live values in them across the statement.
 */
#define COPY64(dstp, srcp, load, store) \
    __asm__ __volatile__ (              \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) \
        : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
984
/* Execute the instruction op (a string literal) only if the capability mask
 * cpu reports SSE2. When SSE2 code cannot be compiled, the macro still
 * expands to a single well-formed statement that does nothing, so it can be
 * used safely as the body of an unbraced if/else.
 */
#ifdef CAN_COMPILE_SSE2
#   define ASM_SSE2(cpu, op) do {             \
        if ((cpu) & CPU_CAPABILITY_SSE2)      \
            __asm__ __volatile__ (op);        \
    } while (0)
#else
#   define ASM_SSE2(cpu, op) do { (void)(cpu); } while (0)
#endif
994
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * dst must be 16-byte aligned and dst_pitch a multiple of 16 (asserted).
 * src may have any alignment: for every row, the bytes preceding the next
 * 16-byte boundary of src are copied one by one, so that the vector loads
 * (movntdqa/movdqa, which need an aligned memory operand) always start on
 * an aligned source address. The remaining tail is copied bytewise.
 *
 * The unaligned parameter is kept for interface compatibility only. The
 * callers pass the low 4 bits of src, which is not the number of bytes
 * needed to reach alignment, so the head length is recomputed here from
 * src itself, row by row (src_pitch need not be a multiple of 16).
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    (void)unaligned;

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Bytes to copy before src + x sits on a 16-byte boundary,
         * never more than the row itself. */
        unsigned head = (unsigned)((16 - ((uintptr_t)src & 0x0f)) & 0x0f);
        if (head > width)
            head = width;

        unsigned x;

        for (x = 0; x < head; x++)
            dst[x] = src[x];

        /* After a non-empty head, dst + x is no longer 16-byte aligned:
         * unaligned stores must be used. */
#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
1044
/* Copy a width x height byte rectangle from src to dst.
 *
 * src must be 16-byte aligned with a pitch that is a multiple of 16
 * (asserted). dst may have any alignment; it is tested row by row to choose
 * between a non-temporal aligned store and an unaligned store. Whatever the
 * 64-byte vector loop does not cover is copied one byte at a time.
 */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned row = 0; row < height; row++, src += src_pitch, dst += dst_pitch) {
        unsigned col = 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            const bool dst_aligned = ((intptr_t)dst & 0x0f) == 0;

            if (dst_aligned) {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movntdq");
                    col += 64;
                }
            } else {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movdqu");
                    col += 64;
                }
            }
        }
#endif

        /* Scalar tail (or the whole row when SSE2 is not usable) */
        while (col < width) {
            dst[col] = src[col];
            col++;
        }
    }
}
1077
/* De-interleave a plane of byte pairs: for every row and every x < width,
 *     dstu[x] = src[2*x + 0]
 *     dstv[x] = src[2*x + 1]
 *
 * src must be 16-byte aligned with a pitch that is a multiple of 16
 * (asserted). The vector paths handle 32 output samples (64 source bytes)
 * per iteration; the remainder of each row is done by the scalar loop.
 *
 * Every XMM register written by the inline assembly is listed as clobbered
 * so the compiler does not keep live values in them across the statements.
 */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
#ifdef CAN_COMPILE_SSSE3
    /* pshufb control: even source bytes to the low half, odd to the high */
    static const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                       1, 3, 5, 7, 9, 11, 13, 15 };
#endif
#ifdef CAN_COMPILE_SSE2
    /* Keeps the low (even) byte of every 16-bit word */
    static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                    0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
#endif

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Low 8 bytes of each register go to dst1, high 8 bytes to dst2 */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                __asm__ __volatile__ (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                /* packuswb leaves the odd source bytes in the low half and
                 * the even ones in the high half, hence dst1/dst2 are bound
                 * to dstv/dstu here (the reverse of the SSSE3 path). */
                __asm__ __volatile__ (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                      "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        /* Scalar tail (or the whole row when no vector path is usable) */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
1164
/* Copy a width x height byte plane from src to dst, going through the
 * 16-byte aligned scratch buffer cache (cache_size bytes).
 *
 * The plane is processed in blocks of hstep rows, hstep being the number of
 * rows (each padded to a multiple of 16 bytes) that fit in the cache: every
 * block is first pulled into the cache with CopyFromUswc() and then written
 * to dst with Copy2d().
 *
 * An empty plane returns immediately (this also avoids dividing by a zero
 * row size). When there is no cache, or it cannot hold a single row, the
 * plane is copied directly, byte by byte, instead of looping forever with a
 * zero step.
 */
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    if (width == 0 || height == 0)
        return;

    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache != NULL ? cache_size / w16 : 0;

    if (hstep == 0) {
        /* Uncached fallback: slow but always correct */
        for (unsigned y = 0; y < height; y++) {
            for (unsigned x = 0; x < width; x++)
                dst[x] = src[x];
            src += src_pitch;
            dst += dst_pitch;
        }
    } else {
        for (unsigned y = 0; y < height; y += hstep) {
            const unsigned unaligned = (intptr_t)src & 0x0f;
            const unsigned hblock =  __MIN(hstep, height - y);

            /* Copy a bunch of line into our cache */
            CopyFromUswc(cache, w16,
                         src, src_pitch,
                         unaligned,
                         width, hblock, cpu);

            /* Copy from our cache to the destination */
            Copy2d(dst, dst_pitch,
                   cache, w16,
                   width, hblock, cpu);

            /* */
            src += src_pitch * hblock;
            dst += dst_pitch * hblock;
        }
    }

    ASM_SSE2(cpu, "mfence");
}
/* De-interleave a plane of byte pairs (width pairs per row, height rows)
 * from src into dstu (first byte of each pair) and dstv (second byte),
 * going through the 16-byte aligned scratch buffer cache (cache_size bytes).
 *
 * Rows are processed in blocks of hstep rows, hstep being the number of
 * rows of 2*width bytes (padded to a multiple of 16) that fit in the cache:
 * every block is pulled into the cache with CopyFromUswc() and then split
 * with SplitUV().
 *
 * An empty plane returns immediately (this also avoids dividing by a zero
 * row size). When there is no cache, or it cannot hold a single row, the
 * split is done directly, byte by byte, instead of looping forever with a
 * zero step.
 */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    if (width == 0 || height == 0)
        return;

    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache != NULL ? cache_size / w2_16 : 0;

    if (hstep == 0) {
        /* Uncached fallback: slow but always correct */
        for (unsigned y = 0; y < height; y++) {
            for (unsigned x = 0; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    } else {
        for (unsigned y = 0; y < height; y += hstep) {
            const unsigned unaligned = (intptr_t)src & 0x0f;
            const unsigned hblock =  __MIN(hstep, height - y);

            /* Copy a bunch of line into our cache */
            CopyFromUswc(cache, w2_16,
                         src, src_pitch,
                         unaligned,
                         2*width, hblock, cpu);

            /* Copy from our cache to the destination */
            SplitUV(dstu, dstu_pitch,
                    dstv, dstv_pitch,
                    cache, w2_16,
                    width, hblock, cpu);

            /* */
            src  += src_pitch  * hblock;
            dstu += dstu_pitch * hblock;
            dstv += dstv_pitch * hblock;
        }
    }

    ASM_SSE2(cpu, "mfence");
}
1231
1232 static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
1233                          uint8_t *cache, size_t cache_size,
1234                          unsigned width, unsigned height)
1235 {
1236     const unsigned cpu = vlc_CPU();
1237
1238     /* */
1239     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
1240               src->pBits, src->Pitch,
1241               cache, cache_size,
1242               width, height, cpu);
1243     SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
1244                 dst->p[1].p_pixels, dst->p[1].i_pitch,
1245                 (const uint8_t*)src->pBits + src->Pitch * height, src->Pitch,
1246                 cache, cache_size,
1247                 width/2, height/2, cpu);
1248
1249     ASM_SSE2(cpu, "emms");
1250 }
1251 static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
1252                          uint8_t *cache, size_t cache_size,
1253                          unsigned width, unsigned height)
1254 {
1255     const unsigned cpu = vlc_CPU();
1256
1257     /* */
1258     for (unsigned n = 0, offset = 0; n < 3; n++) {
1259         const unsigned d = n > 0 ? 2 : 1;
1260         CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
1261                   (const uint8_t*)src->pBits + offset, src->Pitch/d,
1262                   cache, cache_size,
1263                   width/d, height/d, cpu);
1264         offset += (src->Pitch/d) * (height/d);
1265     }
1266     ASM_SSE2(cpu, "emms");
1267 }
1268 #undef ASM_SSE2
1269 #undef COPY64
1270
1271 #else
1272 vlc_va_t *vlc_va_NewDxva2(vlc_object_t *log, int codec_id)
1273 {
1274     (void)log;
1275     (void)codec_id;
1276     return NULL;
1277 }
1278 #endif