]> git.sesse.net Git - ffmpeg/blobdiff - libavutil/hwcontext_vulkan.c
lavfi/lavfutils: switch to the new decoding API
[ffmpeg] / libavutil / hwcontext_vulkan.c
index 0fe88e6b9f7315010c5207b1589ef4e12cd602f5..a98ea916e7710716f9af902dc23251be7f35cca5 100644 (file)
 #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x)
 #endif
 
+typedef struct VulkanQueueCtx {
+    VkFence fence;
+    VkQueue queue;
+    int was_synchronous;
+
+    /* Buffer dependencies */
+    AVBufferRef **buf_deps;
+    int nb_buf_deps;
+    int buf_deps_alloc_size;
+} VulkanQueueCtx;
+
 typedef struct VulkanExecCtx {
     VkCommandPool pool;
-    VkCommandBuffer buf;
-    VkQueue queue;
-    VkFence fence;
+    VkCommandBuffer *bufs;
+    VulkanQueueCtx *queues;
+    int nb_queues;
+    int cur_queue_idx;
 } VulkanExecCtx;
 
 typedef struct VulkanDevicePriv {
     /* Properties */
-    VkPhysicalDeviceProperties props;
+    VkPhysicalDeviceProperties2 props;
     VkPhysicalDeviceMemoryProperties mprops;
+    VkPhysicalDeviceExternalMemoryHostPropertiesEXT hprops;
+
+    /* Queues */
+    uint32_t qfs[3];
+    int num_qfs;
 
     /* Debug callback */
     VkDebugUtilsMessengerEXT debug_ctx;
 
-    /* Image uploading */
-    VulkanExecCtx cmd;
-
     /* Extensions */
     uint64_t extensions;
 
@@ -70,7 +84,12 @@ typedef struct VulkanDevicePriv {
 } VulkanDevicePriv;
 
 typedef struct VulkanFramesPriv {
-    VulkanExecCtx cmd;
+    /* Image conversions */
+    VulkanExecCtx conv_ctx;
+
+    /* Image transfers */
+    VulkanExecCtx upload_ctx;
+    VulkanExecCtx download_ctx;
 } VulkanFramesPriv;
 
 typedef struct AVVkFrameInternal {
@@ -85,6 +104,16 @@ typedef struct AVVkFrameInternal {
 #endif
 } AVVkFrameInternal;
 
+#define GET_QUEUE_COUNT(hwctx, graph, comp, tx) (                   \
+    graph ?  hwctx->nb_graphics_queues :                            \
+    comp  ? (hwctx->nb_comp_queues ?                                \
+             hwctx->nb_comp_queues : hwctx->nb_graphics_queues) :   \
+    tx    ? (hwctx->nb_tx_queues ? hwctx->nb_tx_queues :            \
+             (hwctx->nb_comp_queues ?                               \
+              hwctx->nb_comp_queues : hwctx->nb_graphics_queues)) : \
+    0                                                               \
+)
+
 #define VK_LOAD_PFN(inst, name) PFN_##name pfn_##name = (PFN_##name)           \
                                               vkGetInstanceProcAddr(inst, #name)
 
@@ -98,9 +127,13 @@ typedef struct AVVkFrameInternal {
         list = av_realloc_array(list, sizeof(*list), ++count);                 \
         if (!list) {                                                           \
             err = AVERROR(ENOMEM);                                             \
-            goto end;                                                          \
+            goto fail;                                                         \
+        }                                                                      \
+        list[count - 1] = av_strdup(val);                                      \
+        if (!list[count - 1]) {                                                \
+            err = AVERROR(ENOMEM);                                             \
+            goto fail;                                                         \
         }                                                                      \
-        list[count - 1] = val;                                                 \
     } while(0)
 
 static const struct {
@@ -176,9 +209,9 @@ enum VulkanExtensions {
     EXT_DRM_MODIFIER_FLAGS     = 1ULL <<  1, /* VK_EXT_image_drm_format_modifier */
     EXT_EXTERNAL_FD_MEMORY     = 1ULL <<  2, /* VK_KHR_external_memory_fd */
     EXT_EXTERNAL_FD_SEM        = 1ULL <<  3, /* VK_KHR_external_semaphore_fd */
+    EXT_EXTERNAL_HOST_MEMORY   = 1ULL <<  4, /* VK_EXT_external_memory_host */
 
-    EXT_OPTIONAL               = 1ULL << 62,
-    EXT_REQUIRED               = 1ULL << 63,
+    EXT_NO_FLAG                = 1ULL << 63,
 };
 
 typedef struct VulkanOptExtension {
@@ -195,6 +228,7 @@ static const VulkanOptExtension optional_device_exts[] = {
     { VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,          EXT_EXTERNAL_DMABUF_MEMORY, },
     { VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,        EXT_DRM_MODIFIER_FLAGS,     },
     { VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,            EXT_EXTERNAL_FD_SEM,        },
+    { VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,             EXT_EXTERNAL_HOST_MEMORY,   },
 };
 
 /* Converts return values to strings */
@@ -261,7 +295,7 @@ static VkBool32 vk_dbg_callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity,
     return 0;
 }
 
-static int check_extensions(AVHWDeviceContext *ctx, int dev,
+static int check_extensions(AVHWDeviceContext *ctx, int dev, AVDictionary *opts,
                             const char * const **dst, uint32_t *num, int debug)
 {
     const char *tstr;
@@ -273,6 +307,8 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
     const char *mod;
     int optional_exts_num;
     uint32_t sup_ext_count;
+    char *user_exts_str = NULL;
+    AVDictionaryEntry *user_exts;
     VkExtensionProperties *sup_ext;
     const VulkanOptExtension *optional_exts;
 
@@ -280,6 +316,14 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
         mod = "instance";
         optional_exts = optional_instance_exts;
         optional_exts_num = FF_ARRAY_ELEMS(optional_instance_exts);
+        user_exts = av_dict_get(opts, "instance_extensions", NULL, 0);
+        if (user_exts) {
+            user_exts_str = av_strdup(user_exts->value);
+            if (!user_exts_str) {
+                err = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
         vkEnumerateInstanceExtensionProperties(NULL, &sup_ext_count, NULL);
         sup_ext = av_malloc_array(sup_ext_count, sizeof(VkExtensionProperties));
         if (!sup_ext)
@@ -289,6 +333,14 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
         mod = "device";
         optional_exts = optional_device_exts;
         optional_exts_num = FF_ARRAY_ELEMS(optional_device_exts);
+        user_exts = av_dict_get(opts, "device_extensions", NULL, 0);
+        if (user_exts) {
+            user_exts_str = av_strdup(user_exts->value);
+            if (!user_exts_str) {
+                err = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
         vkEnumerateDeviceExtensionProperties(hwctx->phys_dev, NULL,
                                              &sup_ext_count, NULL);
         sup_ext = av_malloc_array(sup_ext_count, sizeof(VkExtensionProperties));
@@ -299,9 +351,7 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
     }
 
     for (int i = 0; i < optional_exts_num; i++) {
-        int req = optional_exts[i].flag & EXT_REQUIRED;
         tstr = optional_exts[i].name;
-
         found = 0;
         for (int j = 0; j < sup_ext_count; j++) {
             if (!strcmp(tstr, sup_ext[j].extensionName)) {
@@ -309,20 +359,11 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
                 break;
             }
         }
-        if (!found) {
-            int lvl = req ? AV_LOG_ERROR : AV_LOG_VERBOSE;
-            av_log(ctx, lvl, "Extension \"%s\" not found!\n", tstr);
-            if (req) {
-                err = AVERROR(EINVAL);
-                goto end;
-            }
+        if (!found)
             continue;
-        }
-        if (!req)
-            p->extensions |= optional_exts[i].flag;
 
         av_log(ctx, AV_LOG_VERBOSE, "Using %s extension \"%s\"\n", mod, tstr);
-
+        p->extensions |= optional_exts[i].flag;
         ADD_VAL_TO_LIST(extension_names, extensions_found, tstr);
     }
 
@@ -336,19 +377,50 @@ static int check_extensions(AVHWDeviceContext *ctx, int dev,
             }
         }
         if (found) {
+            av_log(ctx, AV_LOG_VERBOSE, "Using %s extension \"%s\"\n", mod, tstr);
             ADD_VAL_TO_LIST(extension_names, extensions_found, tstr);
         } else {
             av_log(ctx, AV_LOG_ERROR, "Debug extension \"%s\" not found!\n",
                    tstr);
             err = AVERROR(EINVAL);
-            goto end;
+            goto fail;
+        }
+    }
+
+    if (user_exts_str) {
+        char *save, *token = av_strtok(user_exts_str, "+", &save);
+        while (token) {
+            found = 0;
+            for (int j = 0; j < sup_ext_count; j++) {
+                if (!strcmp(token, sup_ext[j].extensionName)) {
+                    found = 1;
+                    break;
+                }
+            }
+            if (found) {
+                av_log(ctx, AV_LOG_VERBOSE, "Using %s extension \"%s\"\n", mod, token);
+                ADD_VAL_TO_LIST(extension_names, extensions_found, token);
+            } else {
+                av_log(ctx, AV_LOG_WARNING, "%s extension \"%s\" not found, excluding.\n",
+                       mod, token);
+            }
+            token = av_strtok(NULL, "+", &save);
         }
     }
 
     *dst = extension_names;
     *num = extensions_found;
 
-end:
+    av_free(user_exts_str);
+    av_free(sup_ext);
+    return 0;
+
+fail:
+    if (extension_names)
+        for (int i = 0; i < extensions_found; i++)
+            av_free((void *)extension_names[i]);
+    av_free(extension_names);
+    av_free(user_exts_str);
     av_free(sup_ext);
     return err;
 }
@@ -376,13 +448,13 @@ static int create_instance(AVHWDeviceContext *ctx, AVDictionary *opts)
     };
 
     /* Check for present/missing extensions */
-    err = check_extensions(ctx, 0, &inst_props.ppEnabledExtensionNames,
+    err = check_extensions(ctx, 0, opts, &inst_props.ppEnabledExtensionNames,
                            &inst_props.enabledExtensionCount, debug_mode);
     if (err < 0)
         return err;
 
     if (debug_mode) {
-        static const char *layers[] = { "VK_LAYER_LUNARG_standard_validation" };
+        static const char *layers[] = { "VK_LAYER_KHRONOS_validation" };
         inst_props.ppEnabledLayerNames = layers;
         inst_props.enabledLayerCount = FF_ARRAY_ELEMS(layers);
     }
@@ -390,13 +462,13 @@ static int create_instance(AVHWDeviceContext *ctx, AVDictionary *opts)
     /* Try to create the instance */
     ret = vkCreateInstance(&inst_props, hwctx->alloc, &hwctx->inst);
 
-    /* Free used memory */
-    av_free((void *)inst_props.ppEnabledExtensionNames);
-
     /* Check for errors */
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Instance creation failure: %s\n",
                vk_ret2str(ret));
+        for (int i = 0; i < inst_props.enabledExtensionCount; i++)
+            av_free((void *)inst_props.ppEnabledExtensionNames[i]);
+        av_free((void *)inst_props.ppEnabledExtensionNames);
         return AVERROR_EXTERNAL;
     }
 
@@ -419,6 +491,9 @@ static int create_instance(AVHWDeviceContext *ctx, AVDictionary *opts)
                                            hwctx->alloc, &p->debug_ctx);
     }
 
+    hwctx->enabled_inst_extensions = inst_props.ppEnabledExtensionNames;
+    hwctx->nb_enabled_inst_extensions = inst_props.enabledExtensionCount;
+
     return 0;
 }
 
@@ -451,7 +526,6 @@ static int find_device(AVHWDeviceContext *ctx, VulkanDeviceSelection *select)
     VkPhysicalDevice *devices = NULL;
     VkPhysicalDeviceIDProperties *idp = NULL;
     VkPhysicalDeviceProperties2 *prop = NULL;
-    VulkanDevicePriv *p = ctx->internal->priv;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
     ret = vkEnumeratePhysicalDevices(hwctx->inst, &num, NULL);
@@ -555,10 +629,9 @@ static int find_device(AVHWDeviceContext *ctx, VulkanDeviceSelection *select)
     }
 
 end:
-    if (choice > -1) {
-        p->dev_is_nvidia = (prop[choice].properties.vendorID == 0x10de);
+    if (choice > -1)
         hwctx->phys_dev = devices[choice];
-    }
+
     av_free(devices);
     av_free(prop);
     av_free(idp);
@@ -569,6 +642,7 @@ end:
 static int search_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd)
 {
     uint32_t num;
+    float *weights;
     VkQueueFamilyProperties *qs = NULL;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
     int graph_index = -1, comp_index = -1, tx_index = -1;
@@ -607,48 +681,63 @@ static int search_queue_families(AVHWDeviceContext *ctx, VkDeviceCreateInfo *cd)
                  (i != comp_index), tx_index)
 
 #undef SEARCH_FLAGS
-#define QF_FLAGS(flags)                                                        \
-    ((flags) & VK_QUEUE_GRAPHICS_BIT      ) ? "(graphics) " : "",              \
-    ((flags) & VK_QUEUE_COMPUTE_BIT       ) ? "(compute) "  : "",              \
-    ((flags) & VK_QUEUE_TRANSFER_BIT      ) ? "(transfer) " : "",              \
-    ((flags) & VK_QUEUE_SPARSE_BINDING_BIT) ? "(sparse) "   : ""
-
-    av_log(ctx, AV_LOG_VERBOSE, "Using queue family %i for graphics, "
-           "flags: %s%s%s%s\n", graph_index, QF_FLAGS(qs[graph_index].queueFlags));
-
+#define ADD_QUEUE(fidx, graph, comp, tx)                                                 \
+    av_log(ctx, AV_LOG_VERBOSE, "Using queue family %i (total queues: %i) for %s%s%s\n", \
+           fidx, qs[fidx].queueCount, graph ? "graphics " : "",                          \
+           comp ? "compute " : "", tx ? "transfers " : "");                              \
+    av_log(ctx, AV_LOG_VERBOSE, "    QF %i flags: %s%s%s%s\n", fidx,                     \
+           ((qs[fidx].queueFlags) & VK_QUEUE_GRAPHICS_BIT) ? "(graphics) " : "",         \
+           ((qs[fidx].queueFlags) & VK_QUEUE_COMPUTE_BIT) ? "(compute) " : "",           \
+           ((qs[fidx].queueFlags) & VK_QUEUE_TRANSFER_BIT) ? "(transfers) " : "",        \
+           ((qs[fidx].queueFlags) & VK_QUEUE_SPARSE_BINDING_BIT) ? "(sparse) " : "");    \
+    pc[cd->queueCreateInfoCount].queueFamilyIndex = fidx;                                \
+    pc[cd->queueCreateInfoCount].queueCount = qs[fidx].queueCount;                       \
+    weights = av_malloc(qs[fidx].queueCount * sizeof(float));                            \
+    pc[cd->queueCreateInfoCount].pQueuePriorities = weights;                             \
+    if (!weights)                                                                        \
+        goto fail;                                                                       \
+    for (int i = 0; i < qs[fidx].queueCount; i++)                                        \
+        weights[i] = 1.0f;                                                               \
+    cd->queueCreateInfoCount++;
+
+    ADD_QUEUE(graph_index, 1, comp_index < 0, tx_index < 0 && comp_index < 0)
     hwctx->queue_family_index      = graph_index;
-    hwctx->queue_family_tx_index   = graph_index;
     hwctx->queue_family_comp_index = graph_index;
-
-    pc[cd->queueCreateInfoCount++].queueFamilyIndex = graph_index;
+    hwctx->queue_family_tx_index   = graph_index;
+    hwctx->nb_graphics_queues      = qs[graph_index].queueCount;
 
     if (comp_index != -1) {
-        av_log(ctx, AV_LOG_VERBOSE, "Using queue family %i for compute, "
-               "flags: %s%s%s%s\n", comp_index, QF_FLAGS(qs[comp_index].queueFlags));
-        hwctx->queue_family_tx_index                    = comp_index;
-        hwctx->queue_family_comp_index                  = comp_index;
-        pc[cd->queueCreateInfoCount++].queueFamilyIndex = comp_index;
+        ADD_QUEUE(comp_index, 0, 1, tx_index < 0)
+        hwctx->queue_family_tx_index   = comp_index;
+        hwctx->queue_family_comp_index = comp_index;
+        hwctx->nb_comp_queues          = qs[comp_index].queueCount;
     }
 
     if (tx_index != -1) {
-        av_log(ctx, AV_LOG_VERBOSE, "Using queue family %i for transfers, "
-               "flags: %s%s%s%s\n", tx_index, QF_FLAGS(qs[tx_index].queueFlags));
-        hwctx->queue_family_tx_index                    = tx_index;
-        pc[cd->queueCreateInfoCount++].queueFamilyIndex = tx_index;
+        ADD_QUEUE(tx_index, 0, 0, 1)
+        hwctx->queue_family_tx_index = tx_index;
+        hwctx->nb_tx_queues          = qs[tx_index].queueCount;
     }
 
-#undef QF_FLAGS
-
+#undef ADD_QUEUE
     av_free(qs);
 
     return 0;
+
+fail:
+    av_freep(&pc[0].pQueuePriorities);
+    av_freep(&pc[1].pQueuePriorities);
+    av_freep(&pc[2].pQueuePriorities);
+    av_free(qs);
+
+    return AVERROR(ENOMEM);
 }
 
-static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
-                           int queue_family_index)
+static int create_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd,
+                           int queue_family_index, int num_queues)
 {
     VkResult ret;
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
+    AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
 
     VkCommandPoolCreateInfo cqueue_create = {
         .sType              = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
@@ -658,54 +747,199 @@ static int create_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd,
     VkCommandBufferAllocateInfo cbuf_create = {
         .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
         .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
-        .commandBufferCount = 1,
+        .commandBufferCount = num_queues,
     };
 
-    VkFenceCreateInfo fence_spawn = {
-        .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
-    };
+    cmd->nb_queues = num_queues;
 
-    ret = vkCreateFence(hwctx->act_dev, &fence_spawn,
-                        hwctx->alloc, &cmd->fence);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to create frame fence: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
+    cmd->queues = av_mallocz(num_queues * sizeof(*cmd->queues));
+    if (!cmd->queues)
+        return AVERROR(ENOMEM);
+
+    cmd->bufs = av_mallocz(num_queues * sizeof(*cmd->bufs));
+    if (!cmd->bufs)
+        return AVERROR(ENOMEM);
 
+    /* Create command pool */
     ret = vkCreateCommandPool(hwctx->act_dev, &cqueue_create,
                               hwctx->alloc, &cmd->pool);
     if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Command pool creation failure: %s\n",
+        av_log(hwfc, AV_LOG_ERROR, "Command pool creation failure: %s\n",
                vk_ret2str(ret));
         return AVERROR_EXTERNAL;
     }
 
     cbuf_create.commandPool = cmd->pool;
 
-    ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, &cmd->buf);
+    /* Allocate command buffer */
+    ret = vkAllocateCommandBuffers(hwctx->act_dev, &cbuf_create, cmd->bufs);
     if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Command buffer alloc failure: %s\n",
+        av_log(hwfc, AV_LOG_ERROR, "Command buffer alloc failure: %s\n",
                vk_ret2str(ret));
         return AVERROR_EXTERNAL;
     }
 
-    vkGetDeviceQueue(hwctx->act_dev, cqueue_create.queueFamilyIndex, 0,
-                     &cmd->queue);
+    for (int i = 0; i < num_queues; i++) {
+        VulkanQueueCtx *q = &cmd->queues[i];
+        vkGetDeviceQueue(hwctx->act_dev, queue_family_index, i, &q->queue);
+        q->was_synchronous = 1;
+    }
 
     return 0;
 }
 
-static void free_exec_ctx(AVHWDeviceContext *ctx, VulkanExecCtx *cmd)
+static void free_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd)
 {
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
+    AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
 
-    if (cmd->fence)
-        vkDestroyFence(hwctx->act_dev, cmd->fence, hwctx->alloc);
-    if (cmd->buf)
-        vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, 1, &cmd->buf);
+    /* Make sure all queues have finished executing */
+    for (int i = 0; i < cmd->nb_queues; i++) {
+        VulkanQueueCtx *q = &cmd->queues[i];
+
+        if (q->fence && !q->was_synchronous) {
+            vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+            vkResetFences(hwctx->act_dev, 1, &q->fence);
+        }
+
+        /* Free the fence */
+        if (q->fence)
+            vkDestroyFence(hwctx->act_dev, q->fence, hwctx->alloc);
+
+        /* Free buffer dependencies */
+        for (int j = 0; j < q->nb_buf_deps; j++)
+            av_buffer_unref(&q->buf_deps[j]);
+        av_free(q->buf_deps);
+    }
+
+    if (cmd->bufs)
+        vkFreeCommandBuffers(hwctx->act_dev, cmd->pool, cmd->nb_queues, cmd->bufs);
     if (cmd->pool)
         vkDestroyCommandPool(hwctx->act_dev, cmd->pool, hwctx->alloc);
+
+    av_freep(&cmd->bufs);
+    av_freep(&cmd->queues);
+}
+
+static VkCommandBuffer get_buf_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd)
+{
+    return cmd->bufs[cmd->cur_queue_idx];
+}
+
+static void unref_exec_ctx_deps(AVHWFramesContext *hwfc, VulkanExecCtx *cmd)
+{
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+
+    for (int j = 0; j < q->nb_buf_deps; j++)
+        av_buffer_unref(&q->buf_deps[j]);
+    q->nb_buf_deps = 0;
+}
+
+static int wait_start_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd)
+{
+    VkResult ret;
+    AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+
+    VkCommandBufferBeginInfo cmd_start = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+    };
+
+    /* Create the fence and don't wait for it initially */
+    if (!q->fence) {
+        VkFenceCreateInfo fence_spawn = {
+            .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+        };
+        ret = vkCreateFence(hwctx->act_dev, &fence_spawn, hwctx->alloc,
+                            &q->fence);
+        if (ret != VK_SUCCESS) {
+            av_log(hwfc, AV_LOG_ERROR, "Failed to queue frame fence: %s\n",
+                   vk_ret2str(ret));
+            return AVERROR_EXTERNAL;
+        }
+    } else if (!q->was_synchronous) {
+        vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+        vkResetFences(hwctx->act_dev, 1, &q->fence);
+    }
+
+    /* Discard queue dependencies */
+    unref_exec_ctx_deps(hwfc, cmd);
+
+    ret = vkBeginCommandBuffer(cmd->bufs[cmd->cur_queue_idx], &cmd_start);
+    if (ret != VK_SUCCESS) {
+        av_log(hwfc, AV_LOG_ERROR, "Unable to init command buffer: %s\n",
+               vk_ret2str(ret));
+        return AVERROR_EXTERNAL;
+    }
+
+    return 0;
+}
+
+static int add_buf_dep_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd,
+                                AVBufferRef * const *deps, int nb_deps)
+{
+    AVBufferRef **dst;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+
+    if (!deps || !nb_deps)
+        return 0;
+
+    dst = av_fast_realloc(q->buf_deps, &q->buf_deps_alloc_size,
+                          (q->nb_buf_deps + nb_deps) * sizeof(*dst));
+    if (!dst)
+        goto err;
+
+    q->buf_deps = dst;
+
+    for (int i = 0; i < nb_deps; i++) {
+        q->buf_deps[q->nb_buf_deps] = av_buffer_ref(deps[i]);
+        if (!q->buf_deps[q->nb_buf_deps])
+            goto err;
+        q->nb_buf_deps++;
+    }
+
+    return 0;
+
+err:
+    unref_exec_ctx_deps(hwfc, cmd);
+    return AVERROR(ENOMEM);
+}
+
+static int submit_exec_ctx(AVHWFramesContext *hwfc, VulkanExecCtx *cmd,
+                           VkSubmitInfo *s_info, int synchronous)
+{
+    VkResult ret;
+    VulkanQueueCtx *q = &cmd->queues[cmd->cur_queue_idx];
+
+    ret = vkEndCommandBuffer(cmd->bufs[cmd->cur_queue_idx]);
+    if (ret != VK_SUCCESS) {
+        av_log(hwfc, AV_LOG_ERROR, "Unable to finish command buffer: %s\n",
+               vk_ret2str(ret));
+        unref_exec_ctx_deps(hwfc, cmd);
+        return AVERROR_EXTERNAL;
+    }
+
+    s_info->pCommandBuffers = &cmd->bufs[cmd->cur_queue_idx];
+    s_info->commandBufferCount = 1;
+
+    ret = vkQueueSubmit(q->queue, 1, s_info, q->fence);
+    if (ret != VK_SUCCESS) {
+        unref_exec_ctx_deps(hwfc, cmd);
+        return AVERROR_EXTERNAL;
+    }
+
+    q->was_synchronous = synchronous;
+
+    if (synchronous) {
+        AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
+        vkWaitForFences(hwctx->act_dev, 1, &q->fence, VK_TRUE, UINT64_MAX);
+        vkResetFences(hwctx->act_dev, 1, &q->fence);
+        unref_exec_ctx_deps(hwfc, cmd);
+    } else { /* Rotate queues */
+        cmd->cur_queue_idx = (cmd->cur_queue_idx + 1) % cmd->nb_queues;
+    }
+
+    return 0;
 }
 
 static void vulkan_device_free(AVHWDeviceContext *ctx)
@@ -713,8 +947,6 @@ static void vulkan_device_free(AVHWDeviceContext *ctx)
     VulkanDevicePriv *p = ctx->internal->priv;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
 
-    free_exec_ctx(ctx, &p->cmd);
-
     vkDestroyDevice(hwctx->act_dev, hwctx->alloc);
 
     if (p->debug_ctx) {
@@ -724,6 +956,14 @@ static void vulkan_device_free(AVHWDeviceContext *ctx)
     }
 
     vkDestroyInstance(hwctx->inst, hwctx->alloc);
+
+    for (int i = 0; i < hwctx->nb_enabled_inst_extensions; i++)
+        av_free((void *)hwctx->enabled_inst_extensions[i]);
+    av_free((void *)hwctx->enabled_inst_extensions);
+
+    for (int i = 0; i < hwctx->nb_enabled_dev_extensions; i++)
+        av_free((void *)hwctx->enabled_dev_extensions[i]);
+    av_free((void *)hwctx->enabled_dev_extensions);
 }
 
 static int vulkan_device_create_internal(AVHWDeviceContext *ctx,
@@ -735,24 +975,21 @@ static int vulkan_device_create_internal(AVHWDeviceContext *ctx,
     AVDictionaryEntry *opt_d;
     VulkanDevicePriv *p = ctx->internal->priv;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
+    VkPhysicalDeviceFeatures dev_features = { 0 };
     VkDeviceQueueCreateInfo queue_create_info[3] = {
-        {   .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-            .pQueuePriorities = (float []){ 1.0f },
-            .queueCount       = 1, },
-        {   .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-            .pQueuePriorities = (float []){ 1.0f },
-            .queueCount       = 1, },
-        {   .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-            .pQueuePriorities = (float []){ 1.0f },
-            .queueCount       = 1, },
+        { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, },
+        { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, },
+        { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, },
     };
 
     VkDeviceCreateInfo dev_info = {
         .sType                = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext                = &hwctx->device_features,
         .pQueueCreateInfos    = queue_create_info,
         .queueCreateInfoCount = 0,
     };
 
+    hwctx->device_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
     ctx->free = vulkan_device_free;
 
     /* Create an instance if not given one */
@@ -763,32 +1000,39 @@ static int vulkan_device_create_internal(AVHWDeviceContext *ctx,
     if ((err = find_device(ctx, dev_select)))
         goto end;
 
-    vkGetPhysicalDeviceProperties(hwctx->phys_dev, &p->props);
-    av_log(ctx, AV_LOG_VERBOSE, "Using device: %s\n", p->props.deviceName);
-    av_log(ctx, AV_LOG_VERBOSE, "Alignments:\n");
-    av_log(ctx, AV_LOG_VERBOSE, "    optimalBufferCopyOffsetAlignment:   %li\n",
-           p->props.limits.optimalBufferCopyOffsetAlignment);
-    av_log(ctx, AV_LOG_VERBOSE, "    optimalBufferCopyRowPitchAlignment: %li\n",
-           p->props.limits.optimalBufferCopyRowPitchAlignment);
-    av_log(ctx, AV_LOG_VERBOSE, "    minMemoryMapAlignment:              %li\n",
-           p->props.limits.minMemoryMapAlignment);
+    vkGetPhysicalDeviceFeatures(hwctx->phys_dev, &dev_features);
+#define COPY_FEATURE(DST, NAME) (DST).features.NAME = dev_features.NAME;
+    COPY_FEATURE(hwctx->device_features, shaderImageGatherExtended)
+    COPY_FEATURE(hwctx->device_features, fragmentStoresAndAtomics)
+    COPY_FEATURE(hwctx->device_features, vertexPipelineStoresAndAtomics)
+    COPY_FEATURE(hwctx->device_features, shaderInt64)
+#undef COPY_FEATURE
 
     /* Search queue family */
     if ((err = search_queue_families(ctx, &dev_info)))
         goto end;
 
-    if ((err = check_extensions(ctx, 1, &dev_info.ppEnabledExtensionNames,
-                                &dev_info.enabledExtensionCount, 0)))
+    if ((err = check_extensions(ctx, 1, opts, &dev_info.ppEnabledExtensionNames,
+                                &dev_info.enabledExtensionCount, 0))) {
+        av_free((void *)queue_create_info[0].pQueuePriorities);
+        av_free((void *)queue_create_info[1].pQueuePriorities);
+        av_free((void *)queue_create_info[2].pQueuePriorities);
         goto end;
+    }
 
     ret = vkCreateDevice(hwctx->phys_dev, &dev_info, hwctx->alloc,
                          &hwctx->act_dev);
 
-    av_free((void *)dev_info.ppEnabledExtensionNames);
+    av_free((void *)queue_create_info[0].pQueuePriorities);
+    av_free((void *)queue_create_info[1].pQueuePriorities);
+    av_free((void *)queue_create_info[2].pQueuePriorities);
 
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Device creation failure: %s\n",
                vk_ret2str(ret));
+        for (int i = 0; i < dev_info.enabledExtensionCount; i++)
+            av_free((void *)dev_info.ppEnabledExtensionNames[i]);
+        av_free((void *)dev_info.ppEnabledExtensionNames);
         err = AVERROR_EXTERNAL;
         goto end;
     }
@@ -798,17 +1042,50 @@ static int vulkan_device_create_internal(AVHWDeviceContext *ctx,
     if (opt_d)
         p->use_linear_images = strtol(opt_d->value, NULL, 10);
 
+    hwctx->enabled_dev_extensions = dev_info.ppEnabledExtensionNames;
+    hwctx->nb_enabled_dev_extensions = dev_info.enabledExtensionCount;
+
 end:
     return err;
 }
 
 static int vulkan_device_init(AVHWDeviceContext *ctx)
 {
-    int err;
     uint32_t queue_num;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
     VulkanDevicePriv *p = ctx->internal->priv;
 
+    /* Set device extension flags */
+    for (int i = 0; i < hwctx->nb_enabled_dev_extensions; i++) {
+        for (int j = 0; j < FF_ARRAY_ELEMS(optional_device_exts); j++) {
+            if (!strcmp(hwctx->enabled_dev_extensions[i],
+                        optional_device_exts[j].name)) {
+                av_log(ctx, AV_LOG_VERBOSE, "Using device extension %s\n",
+                       hwctx->enabled_dev_extensions[i]);
+                p->extensions |= optional_device_exts[j].flag;
+                break;
+            }
+        }
+    }
+
+    p->props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+    p->props.pNext = &p->hprops;
+    p->hprops.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT;
+
+    vkGetPhysicalDeviceProperties2(hwctx->phys_dev, &p->props);
+    av_log(ctx, AV_LOG_VERBOSE, "Using device: %s\n",
+           p->props.properties.deviceName);
+    av_log(ctx, AV_LOG_VERBOSE, "Alignments:\n");
+    av_log(ctx, AV_LOG_VERBOSE, "    optimalBufferCopyRowPitchAlignment: %li\n",
+           p->props.properties.limits.optimalBufferCopyRowPitchAlignment);
+    av_log(ctx, AV_LOG_VERBOSE, "    minMemoryMapAlignment:              %li\n",
+           p->props.properties.limits.minMemoryMapAlignment);
+    if (p->extensions & EXT_EXTERNAL_HOST_MEMORY)
+        av_log(ctx, AV_LOG_VERBOSE, "    minImportedHostPointerAlignment:    %li\n",
+               p->hprops.minImportedHostPointerAlignment);
+
+    p->dev_is_nvidia = (p->props.properties.vendorID == 0x10de);
+
     vkGetPhysicalDeviceQueueFamilyProperties(hwctx->phys_dev, &queue_num, NULL);
     if (!queue_num) {
         av_log(ctx, AV_LOG_ERROR, "Failed to get queues!\n");
@@ -828,10 +1105,13 @@ if (n >= queue_num) {
 
 #undef CHECK_QUEUE
 
-    /* Create exec context - if there's something invalid this will error out */
-    err = create_exec_ctx(ctx, &p->cmd, hwctx->queue_family_tx_index);
-    if (err)
-        return err;
+    p->qfs[p->num_qfs++] = hwctx->queue_family_index;
+    if ((hwctx->queue_family_tx_index != hwctx->queue_family_index) &&
+        (hwctx->queue_family_tx_index != hwctx->queue_family_comp_index))
+        p->qfs[p->num_qfs++] = hwctx->queue_family_tx_index;
+    if ((hwctx->queue_family_comp_index != hwctx->queue_family_index) &&
+        (hwctx->queue_family_comp_index != hwctx->queue_family_tx_index))
+        p->qfs[p->num_qfs++] = hwctx->queue_family_comp_index;
 
     /* Get device capabilities */
     vkGetPhysicalDeviceMemoryProperties(hwctx->phys_dev, &p->mprops);
@@ -856,7 +1136,8 @@ static int vulkan_device_create(AVHWDeviceContext *ctx, const char *device,
 }
 
 static int vulkan_device_derive(AVHWDeviceContext *ctx,
-                                AVHWDeviceContext *src_ctx, int flags)
+                                AVHWDeviceContext *src_ctx,
+                                AVDictionary *opts, int flags)
 {
     av_unused VulkanDeviceSelection dev_select = { 0 };
 
@@ -880,7 +1161,7 @@ static int vulkan_device_derive(AVHWDeviceContext *ctx,
         if (strstr(vendor, "AMD"))
             dev_select.vendor_id = 0x1002;
 
-        return vulkan_device_create_internal(ctx, &dev_select, NULL, flags);
+        return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
     }
 #endif
     case AV_HWDEVICE_TYPE_DRM: {
@@ -898,7 +1179,7 @@ static int vulkan_device_derive(AVHWDeviceContext *ctx,
 
         drmFreeDevice(&drm_dev_info);
 
-        return vulkan_device_create_internal(ctx, &dev_select, NULL, flags);
+        return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
     }
 #endif
 #if CONFIG_CUDA
@@ -917,7 +1198,7 @@ static int vulkan_device_derive(AVHWDeviceContext *ctx,
 
         dev_select.has_uuid = 1;
 
-        return vulkan_device_create_internal(ctx, &dev_select, NULL, flags);
+        return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
     }
 #endif
     default:
@@ -959,8 +1240,8 @@ static int vulkan_frames_get_constraints(AVHWDeviceContext *ctx,
 
     constraints->min_width  = 0;
     constraints->min_height = 0;
-    constraints->max_width  = p->props.limits.maxImageDimension2D;
-    constraints->max_height = p->props.limits.maxImageDimension2D;
+    constraints->max_width  = p->props.properties.limits.maxImageDimension2D;
+    constraints->max_height = p->props.properties.limits.maxImageDimension2D;
 
     constraints->valid_hw_formats = av_malloc_array(2, sizeof(enum AVPixelFormat));
     if (!constraints->valid_hw_formats)
@@ -973,7 +1254,7 @@ static int vulkan_frames_get_constraints(AVHWDeviceContext *ctx,
 }
 
 static int alloc_mem(AVHWDeviceContext *ctx, VkMemoryRequirements *req,
-                     VkMemoryPropertyFlagBits req_flags, void *alloc_extension,
+                     VkMemoryPropertyFlagBits req_flags, const void *alloc_extension,
                      VkMemoryPropertyFlagBits *mem_flags, VkDeviceMemory *mem)
 {
     VkResult ret;
@@ -981,16 +1262,11 @@ static int alloc_mem(AVHWDeviceContext *ctx, VkMemoryRequirements *req,
     VulkanDevicePriv *p = ctx->internal->priv;
     AVVulkanDeviceContext *dev_hwctx = ctx->hwctx;
     VkMemoryAllocateInfo alloc_info = {
-        .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
-        .pNext           = alloc_extension,
+        .sType          = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .pNext          = alloc_extension,
+        .allocationSize = req->size,
     };
 
-    /* Align if we need to */
-    if (req_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
-        req->size = FFALIGN(req->size, p->props.limits.minMemoryMapAlignment);
-
-    alloc_info.allocationSize = req->size;
-
     /* The vulkan spec requires memory types to be sorted in the "optimal"
      * order, so the first matching type we find will be the best/fastest one */
     for (int i = 0; i < p->mprops.memoryTypeCount; i++) {
@@ -1082,6 +1358,7 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
     int err;
     VkResult ret;
     AVHWDeviceContext *ctx = hwfc->device_ctx;
+    VulkanDevicePriv *p = ctx->internal->priv;
     const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
     VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { { 0 } };
 
@@ -1107,6 +1384,10 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
 
         vkGetImageMemoryRequirements2(hwctx->act_dev, &req_desc, &req);
 
+        if (f->tiling == VK_IMAGE_TILING_LINEAR)
+            req.memoryRequirements.size = FFALIGN(req.memoryRequirements.size,
+                                                  p->props.properties.limits.minMemoryMapAlignment);
+
         /* In case the implementation prefers/requires dedicated allocation */
         use_ded_mem = ded_req.prefersDedicatedAllocation |
                       ded_req.requiresDedicatedAllocation;
@@ -1139,33 +1420,56 @@ static int alloc_bind_mem(AVHWFramesContext *hwfc, AVVkFrame *f,
     return 0;
 }
 
-static int prepare_frame(AVHWFramesContext *hwfc, AVVkFrame *frame)
+enum PrepMode {
+    PREP_MODE_WRITE,
+    PREP_MODE_RO_SHADER,
+    PREP_MODE_EXTERNAL_EXPORT,
+};
+
+static int prepare_frame(AVHWFramesContext *hwfc, VulkanExecCtx *ectx,
+                         AVVkFrame *frame, enum PrepMode pmode)
 {
-    VkResult ret;
-    AVHWDeviceContext *ctx = hwfc->device_ctx;
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
-    VulkanFramesPriv *s = hwfc->internal->priv;
+    int err;
+    uint32_t dst_qf;
+    VkImageLayout new_layout;
+    VkAccessFlags new_access;
     const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
 
     VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
 
-    VkCommandBufferBeginInfo cmd_start = {
-        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
-        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
-    };
-
     VkSubmitInfo s_info = {
         .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .commandBufferCount   = 1,
-        .pCommandBuffers      = &s->cmd.buf,
-
         .pSignalSemaphores    = frame->sem,
         .signalSemaphoreCount = planes,
     };
 
-    ret = vkBeginCommandBuffer(s->cmd.buf, &cmd_start);
-    if (ret != VK_SUCCESS)
-        return AVERROR_EXTERNAL;
+    VkPipelineStageFlagBits wait_st[AV_NUM_DATA_POINTERS];
+    for (int i = 0; i < planes; i++)
+        wait_st[i] = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+
+    switch (pmode) {
+    case PREP_MODE_WRITE:
+        new_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+        new_access = VK_ACCESS_TRANSFER_WRITE_BIT;
+        dst_qf     = VK_QUEUE_FAMILY_IGNORED;
+        break;
+    case PREP_MODE_RO_SHADER:
+        new_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
+        new_access = VK_ACCESS_TRANSFER_READ_BIT;
+        dst_qf     = VK_QUEUE_FAMILY_IGNORED;
+        break;
+    case PREP_MODE_EXTERNAL_EXPORT:
+        new_layout = VK_IMAGE_LAYOUT_GENERAL;
+        new_access = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
+        dst_qf     = VK_QUEUE_FAMILY_EXTERNAL_KHR;
+        s_info.pWaitSemaphores = frame->sem;
+        s_info.pWaitDstStageMask = wait_st;
+        s_info.waitSemaphoreCount = planes;
+        break;
+    }
+
+    if ((err = wait_start_exec_ctx(hwfc, ectx)))
+        return err;
 
     /* Change the image layout to something more optimal for writes.
      * This also signals the newly created semaphore, making it usable
@@ -1173,11 +1477,11 @@ static int prepare_frame(AVHWFramesContext *hwfc, AVVkFrame *frame)
     for (int i = 0; i < planes; i++) {
         img_bar[i].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
         img_bar[i].srcAccessMask = 0x0;
-        img_bar[i].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+        img_bar[i].dstAccessMask = new_access;
         img_bar[i].oldLayout = frame->layout[i];
-        img_bar[i].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+        img_bar[i].newLayout = new_layout;
         img_bar[i].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        img_bar[i].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        img_bar[i].dstQueueFamilyIndex = dst_qf;
         img_bar[i].image = frame->img[i];
         img_bar[i].subresourceRange.levelCount = 1;
         img_bar[i].subresourceRange.layerCount = 1;
@@ -1187,23 +1491,12 @@ static int prepare_frame(AVHWFramesContext *hwfc, AVVkFrame *frame)
         frame->access[i] = img_bar[i].dstAccessMask;
     }
 
-    vkCmdPipelineBarrier(s->cmd.buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
-                         0, NULL, 0, NULL, planes, img_bar);
-
-    ret = vkEndCommandBuffer(s->cmd.buf);
-    if (ret != VK_SUCCESS)
-        return AVERROR_EXTERNAL;
-
-    ret = vkQueueSubmit(s->cmd.queue, 1, &s_info, s->cmd.fence);
-    if (ret != VK_SUCCESS) {
-        return AVERROR_EXTERNAL;
-    } else {
-        vkWaitForFences(hwctx->act_dev, 1, &s->cmd.fence, VK_TRUE, UINT64_MAX);
-        vkResetFences(hwctx->act_dev, 1, &s->cmd.fence);
-    }
+    vkCmdPipelineBarrier(get_buf_exec_ctx(hwfc, ectx),
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         0, 0, NULL, 0, NULL, planes, img_bar);
 
-    return 0;
+    return submit_exec_ctx(hwfc, ectx, &s_info, 0);
 }
 
 static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame,
@@ -1244,21 +1537,24 @@ static int create_frame(AVHWFramesContext *hwfc, AVVkFrame **frame,
         const int p_h = i > 0 ? AV_CEIL_RSHIFT(h, desc->log2_chroma_h) : h;
 
         VkImageCreateInfo image_create_info = {
-            .sType         = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
-            .pNext         = create_pnext,
-            .imageType     = VK_IMAGE_TYPE_2D,
-            .format        = img_fmts[i],
-            .extent.width  = p_w,
-            .extent.height = p_h,
-            .extent.depth  = 1,
-            .mipLevels     = 1,
-            .arrayLayers   = 1,
-            .flags         = VK_IMAGE_CREATE_ALIAS_BIT,
-            .tiling        = tiling,
-            .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
-            .usage         = usage,
-            .sharingMode   = VK_SHARING_MODE_EXCLUSIVE,
-            .samples       = VK_SAMPLE_COUNT_1_BIT,
+            .sType                 = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .pNext                 = create_pnext,
+            .imageType             = VK_IMAGE_TYPE_2D,
+            .format                = img_fmts[i],
+            .extent.width          = p_w,
+            .extent.height         = p_h,
+            .extent.depth          = 1,
+            .mipLevels             = 1,
+            .arrayLayers           = 1,
+            .flags                 = VK_IMAGE_CREATE_ALIAS_BIT,
+            .tiling                = tiling,
+            .initialLayout         = VK_IMAGE_LAYOUT_UNDEFINED,
+            .usage                 = usage,
+            .samples               = VK_SAMPLE_COUNT_1_BIT,
+            .pQueueFamilyIndices   = p->qfs,
+            .queueFamilyIndexCount = p->num_qfs,
+            .sharingMode           = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+                                                      VK_SHARING_MODE_EXCLUSIVE,
         };
 
         ret = vkCreateImage(hwctx->act_dev, &image_create_info,
@@ -1340,6 +1636,7 @@ static AVBufferRef *vulkan_pool_alloc(void *opaque, int size)
     AVHWFramesContext *hwfc = opaque;
     AVVulkanFramesContext *hwctx = hwfc->hwctx;
     VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
+    VulkanFramesPriv *fp = hwfc->internal->priv;
     VkExportMemoryAllocateInfo eminfo[AV_NUM_DATA_POINTERS];
     VkExternalMemoryHandleTypeFlags e = 0x0;
 
@@ -1371,7 +1668,7 @@ static AVBufferRef *vulkan_pool_alloc(void *opaque, int size)
     if (err)
         goto fail;
 
-    err = prepare_frame(hwfc, f);
+    err = prepare_frame(hwfc, &fp->conv_ctx, f, PREP_MODE_WRITE);
     if (err)
         goto fail;
 
@@ -1391,7 +1688,9 @@ static void vulkan_frames_uninit(AVHWFramesContext *hwfc)
 {
     VulkanFramesPriv *fp = hwfc->internal->priv;
 
-    free_exec_ctx(hwfc->device_ctx, &fp->cmd);
+    free_exec_ctx(hwfc, &fp->conv_ctx);
+    free_exec_ctx(hwfc, &fp->upload_ctx);
+    free_exec_ctx(hwfc, &fp->download_ctx);
 }
 
 static int vulkan_frames_init(AVHWFramesContext *hwfc)
@@ -1403,39 +1702,58 @@ static int vulkan_frames_init(AVHWFramesContext *hwfc)
     AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
     VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
 
-    if (hwfc->pool)
-        return 0;
-
     /* Default pool flags */
     hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
                     VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
 
-    hwctx->usage |= DEFAULT_USAGE_FLAGS;
+    if (!hwctx->usage)
+        hwctx->usage = DEFAULT_USAGE_FLAGS;
 
-    err = create_exec_ctx(hwfc->device_ctx, &fp->cmd,
-                          dev_hwctx->queue_family_tx_index);
+    err = create_exec_ctx(hwfc, &fp->conv_ctx,
+                          dev_hwctx->queue_family_comp_index,
+                          GET_QUEUE_COUNT(dev_hwctx, 0, 1, 0));
     if (err)
-        return err;
+        goto fail;
+
+    err = create_exec_ctx(hwfc, &fp->upload_ctx,
+                          dev_hwctx->queue_family_tx_index,
+                          GET_QUEUE_COUNT(dev_hwctx, 0, 0, 1));
+    if (err)
+        goto fail;
+
+    err = create_exec_ctx(hwfc, &fp->download_ctx,
+                          dev_hwctx->queue_family_tx_index, 1);
+    if (err)
+        goto fail;
 
     /* Test to see if allocation will fail */
     err = create_frame(hwfc, &f, hwctx->tiling, hwctx->usage,
                        hwctx->create_pnext);
-    if (err) {
-        free_exec_ctx(hwfc->device_ctx, &p->cmd);
-        return err;
-    }
+    if (err)
+        goto fail;
 
     vulkan_frame_free(hwfc, (uint8_t *)f);
 
-    hwfc->internal->pool_internal = av_buffer_pool_init2(sizeof(AVVkFrame),
-                                                         hwfc, vulkan_pool_alloc,
-                                                         NULL);
-    if (!hwfc->internal->pool_internal) {
-        free_exec_ctx(hwfc->device_ctx, &p->cmd);
-        return AVERROR(ENOMEM);
+    /* If user did not specify a pool, hwfc->pool will be set to the internal one
+     * in hwcontext.c just after this gets called */
+    if (!hwfc->pool) {
+        hwfc->internal->pool_internal = av_buffer_pool_init2(sizeof(AVVkFrame),
+                                                             hwfc, vulkan_pool_alloc,
+                                                             NULL);
+        if (!hwfc->internal->pool_internal) {
+            err = AVERROR(ENOMEM);
+            goto fail;
+        }
     }
 
     return 0;
+
+fail:
+    free_exec_ctx(hwfc, &fp->conv_ctx);
+    free_exec_ctx(hwfc, &fp->upload_ctx);
+    free_exec_ctx(hwfc, &fp->download_ctx);
+
+    return err;
 }
 
 static int vulkan_get_buffer(AVHWFramesContext *hwfc, AVFrame *frame)
@@ -1640,25 +1958,22 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
     int err = 0;
     VkResult ret;
     AVVkFrame *f;
+    int bind_counts = 0;
     AVHWDeviceContext *ctx = hwfc->device_ctx;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
     VulkanDevicePriv *p = ctx->internal->priv;
-    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
+    VulkanFramesPriv *fp = hwfc->internal->priv;
+    AVVulkanFramesContext *frames_hwctx = hwfc->hwctx;
     const AVPixFmtDescriptor *fmt_desc = av_pix_fmt_desc_get(hwfc->sw_format);
     const int has_modifiers = p->extensions & EXT_DRM_MODIFIER_FLAGS;
-    VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS];
-    VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS];
+    VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS] = { 0 };
+    VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS] = { 0 };
+    VkBindImagePlaneMemoryInfo plane_info[AV_NUM_DATA_POINTERS] = { 0 };
     VkExternalMemoryHandleTypeFlagBits htype = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
 
     VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdPropertiesKHR);
 
     for (int i = 0; i < desc->nb_layers; i++) {
-        if (desc->layers[i].nb_planes > 1) {
-            av_log(ctx, AV_LOG_ERROR, "Cannot import DMABUFS with more than 1 "
-                                      "plane per layer!\n");
-            return AVERROR(EINVAL);
-        }
-
         if (drm_to_vulkan_fmt(desc->layers[i].format) == VK_FORMAT_UNDEFINED) {
             av_log(ctx, AV_LOG_ERROR, "Unsupported DMABUF layer format %#08x!\n",
                    desc->layers[i].format);
@@ -1672,46 +1987,16 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
         goto fail;
     }
 
-    for (int i = 0; i < desc->nb_objects; i++) {
-        VkMemoryFdPropertiesKHR fdmp = {
-            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
-        };
-        VkMemoryRequirements req = {
-            .size = desc->objects[i].size,
-        };
-        VkImportMemoryFdInfoKHR idesc = {
-            .sType      = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
-            .handleType = htype,
-            .fd         = desc->objects[i].fd,
-        };
-
-        ret = pfn_vkGetMemoryFdPropertiesKHR(hwctx->act_dev, htype,
-                                             desc->objects[i].fd, &fdmp);
-        if (ret != VK_SUCCESS) {
-            av_log(hwfc, AV_LOG_ERROR, "Failed to get FD properties: %s\n",
-                   vk_ret2str(ret));
-            err = AVERROR_EXTERNAL;
-            goto fail;
-        }
-
-        req.memoryTypeBits = fdmp.memoryTypeBits;
-
-        err = alloc_mem(ctx, &req, 0x0, &idesc, &f->flags, &f->mem[i]);
-        if (err)
-            return err;
-
-        f->size[i] = desc->objects[i].size;
-    }
-
     f->tiling = has_modifiers ? VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT :
                 desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ?
                 VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
 
     for (int i = 0; i < desc->nb_layers; i++) {
+        const int planes = desc->layers[i].nb_planes;
         VkImageDrmFormatModifierExplicitCreateInfoEXT drm_info = {
             .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
             .drmFormatModifier = desc->objects[0].format_modifier,
-            .drmFormatModifierPlaneCount = desc->layers[i].nb_planes,
+            .drmFormatModifierPlaneCount = planes,
             .pPlaneLayouts = (const VkSubresourceLayout *)&plane_data,
         };
 
@@ -1729,24 +2014,27 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
         const int p_h = i > 0 ? AV_CEIL_RSHIFT(hwfc->height, fmt_desc->log2_chroma_h) : hwfc->height;
 
         VkImageCreateInfo image_create_info = {
-            .sType         = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
-            .pNext         = &einfo,
-            .imageType     = VK_IMAGE_TYPE_2D,
-            .format        = drm_to_vulkan_fmt(desc->layers[i].format),
-            .extent.width  = p_w,
-            .extent.height = p_h,
-            .extent.depth  = 1,
-            .mipLevels     = 1,
-            .arrayLayers   = 1,
-            .flags         = VK_IMAGE_CREATE_ALIAS_BIT,
-            .tiling        = f->tiling,
-            .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, /* specs say so */
-            .usage         = DEFAULT_USAGE_FLAGS,
-            .sharingMode   = VK_SHARING_MODE_EXCLUSIVE,
-            .samples       = VK_SAMPLE_COUNT_1_BIT,
+            .sType                 = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .pNext                 = &einfo,
+            .imageType             = VK_IMAGE_TYPE_2D,
+            .format                = drm_to_vulkan_fmt(desc->layers[i].format),
+            .extent.width          = p_w,
+            .extent.height         = p_h,
+            .extent.depth          = 1,
+            .mipLevels             = 1,
+            .arrayLayers           = 1,
+            .flags                 = VK_IMAGE_CREATE_ALIAS_BIT,
+            .tiling                = f->tiling,
+            .initialLayout         = VK_IMAGE_LAYOUT_UNDEFINED, /* specs say so */
+            .usage                 = frames_hwctx->usage,
+            .samples               = VK_SAMPLE_COUNT_1_BIT,
+            .pQueueFamilyIndices   = p->qfs,
+            .queueFamilyIndexCount = p->num_qfs,
+            .sharingMode           = p->num_qfs > 1 ? VK_SHARING_MODE_CONCURRENT :
+                                                      VK_SHARING_MODE_EXCLUSIVE,
         };
 
-        for (int j = 0; j < desc->layers[i].nb_planes; j++) {
+        for (int j = 0; j < planes; j++) {
             plane_data[j].offset     = desc->layers[i].planes[j].offset;
             plane_data[j].rowPitch   = desc->layers[i].planes[j].pitch;
             plane_data[j].size       = 0; /* The specs say so for all 3 */
@@ -1775,37 +2063,123 @@ static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **f
         /* We'd import a semaphore onto the one we created using
          * vkImportSemaphoreFdKHR but unfortunately neither DRM nor VAAPI
          * offer us anything we could import and sync with, so instead
-         * leave the semaphore unsignalled and enjoy the validation spam. */
+         * just signal the semaphore we created. */
 
         f->layout[i] = image_create_info.initialLayout;
         f->access[i] = 0x0;
+    }
 
-        /* TODO: Fix to support more than 1 plane per layer */
-        bind_info[i].sType  = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
-        bind_info[i].pNext  = NULL;
-        bind_info[i].image  = f->img[i];
-        bind_info[i].memory = f->mem[desc->layers[i].planes[0].object_index];
-        bind_info[i].memoryOffset = desc->layers[i].planes[0].offset;
+    for (int i = 0; i < desc->nb_objects; i++) {
+        int use_ded_mem = 0;
+        VkMemoryFdPropertiesKHR fdmp = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
+        };
+        VkMemoryRequirements req = {
+            .size = desc->objects[i].size,
+        };
+        VkImportMemoryFdInfoKHR idesc = {
+            .sType      = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+            .handleType = htype,
+            .fd         = dup(desc->objects[i].fd),
+        };
+        VkMemoryDedicatedAllocateInfo ded_alloc = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+            .pNext = &idesc,
+        };
+
+        ret = pfn_vkGetMemoryFdPropertiesKHR(hwctx->act_dev, htype,
+                                             idesc.fd, &fdmp);
+        if (ret != VK_SUCCESS) {
+            av_log(hwfc, AV_LOG_ERROR, "Failed to get FD properties: %s\n",
+                   vk_ret2str(ret));
+            err = AVERROR_EXTERNAL;
+            close(idesc.fd);
+            goto fail;
+        }
+
+        req.memoryTypeBits = fdmp.memoryTypeBits;
+
+        /* Dedicated allocation only makes sense if there's a one to one mapping
+         * between images and the memory backing them, so only check in this
+         * case. */
+        if (desc->nb_layers == desc->nb_objects) {
+            VkImageMemoryRequirementsInfo2 req_desc = {
+                .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+                .image = f->img[i],
+            };
+            VkMemoryDedicatedRequirements ded_req = {
+                .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
+            };
+            VkMemoryRequirements2 req2 = {
+                .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+                .pNext = &ded_req,
+            };
+
+            vkGetImageMemoryRequirements2(hwctx->act_dev, &req_desc, &req2);
+
+            use_ded_mem = ded_req.prefersDedicatedAllocation |
+                          ded_req.requiresDedicatedAllocation;
+            if (use_ded_mem)
+                ded_alloc.image = f->img[i];
+        }
+
+        err = alloc_mem(ctx, &req, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+                        use_ded_mem ? &ded_alloc : ded_alloc.pNext,
+                        &f->flags, &f->mem[i]);
+        if (err) {
+            close(idesc.fd);
+            return err;
+        }
+
+        f->size[i] = desc->objects[i].size;
+    }
+
+    for (int i = 0; i < desc->nb_layers; i++) {
+        const int planes = desc->layers[i].nb_planes;
+        const int signal_p = has_modifiers && (planes > 1);
+        for (int j = 0; j < planes; j++) {
+            VkImageAspectFlagBits aspect = j == 0 ? VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT :
+                                           j == 1 ? VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT :
+                                                    VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT;
+
+            plane_info[bind_counts].sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO;
+            plane_info[bind_counts].planeAspect = aspect;
+
+            bind_info[bind_counts].sType  = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
+            bind_info[bind_counts].pNext  = signal_p ? &plane_info[bind_counts] : NULL;
+            bind_info[bind_counts].image  = f->img[i];
+            bind_info[bind_counts].memory = f->mem[desc->layers[i].planes[j].object_index];
+            bind_info[bind_counts].memoryOffset = desc->layers[i].planes[j].offset;
+            bind_counts++;
+        }
     }
 
     /* Bind the allocated memory to the images */
-    ret = vkBindImageMemory2(hwctx->act_dev, planes, bind_info);
+    ret = vkBindImageMemory2(hwctx->act_dev, bind_counts, bind_info);
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Failed to bind memory: %s\n",
                vk_ret2str(ret));
         return AVERROR_EXTERNAL;
     }
 
+    /* NOTE: This is completely uneccesary and unneeded once we can import
+     * semaphores from DRM. Otherwise we have to activate the semaphores.
+     * We're reusing the exec context that's also used for uploads/downloads. */
+    err = prepare_frame(hwfc, &fp->conv_ctx, f, PREP_MODE_RO_SHADER);
+    if (err)
+        goto fail;
+
     *frame = f;
 
     return 0;
 
 fail:
-    for (int i = 0; i < planes; i++) {
+    for (int i = 0; i < desc->nb_layers; i++) {
         vkDestroyImage(hwctx->act_dev, f->img[i], hwctx->alloc);
-        vkFreeMemory(hwctx->act_dev, f->mem[i], hwctx->alloc);
         vkDestroySemaphore(hwctx->act_dev, f->sem[i], hwctx->alloc);
     }
+    for (int i = 0; i < desc->nb_objects; i++)
+        vkFreeMemory(hwctx->act_dev, f->mem[i], hwctx->alloc);
 
     av_free(f);
 
@@ -2151,6 +2525,7 @@ static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
     VkResult ret;
     AVVkFrame *f = (AVVkFrame *)src->data[0];
     VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
+    VulkanFramesPriv *fp = hwfc->internal->priv;
     AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
     const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
     VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
@@ -2162,6 +2537,10 @@ static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
     if (!drm_desc)
         return AVERROR(ENOMEM);
 
+    err = prepare_frame(hwfc, &fp->conv_ctx, f, PREP_MODE_EXTERNAL_EXPORT);
+    if (err < 0)
+        goto end;
+
     err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &vulkan_unmap_to_drm, drm_desc);
     if (err < 0)
         goto end;
@@ -2218,7 +2597,7 @@ static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
 
         drm_desc->layers[i].planes[0].object_index = FFMIN(i, drm_desc->nb_objects - 1);
 
-        if (f->tiling != VK_IMAGE_TILING_OPTIMAL)
+        if (f->tiling == VK_IMAGE_TILING_OPTIMAL)
             continue;
 
         vkGetImageSubresourceLayout(hwctx->act_dev, f->img[i], &sub, &layout);
@@ -2292,26 +2671,31 @@ typedef struct ImageBuffer {
     VkBuffer buf;
     VkDeviceMemory mem;
     VkMemoryPropertyFlagBits flags;
+    int mapped_mem;
 } ImageBuffer;
 
-static void free_buf(AVHWDeviceContext *ctx, ImageBuffer *buf)
+static void free_buf(void *opaque, uint8_t *data)
 {
+    AVHWDeviceContext *ctx = opaque;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
-    if (!buf)
-        return;
+    ImageBuffer *vkbuf = (ImageBuffer *)data;
+
+    if (vkbuf->buf)
+        vkDestroyBuffer(hwctx->act_dev, vkbuf->buf, hwctx->alloc);
+    if (vkbuf->mem)
+        vkFreeMemory(hwctx->act_dev, vkbuf->mem, hwctx->alloc);
 
-    vkDestroyBuffer(hwctx->act_dev, buf->buf, hwctx->alloc);
-    vkFreeMemory(hwctx->act_dev, buf->mem, hwctx->alloc);
+    av_free(data);
 }
 
-static int create_buf(AVHWDeviceContext *ctx, ImageBuffer *buf, int height,
-                      int *stride, VkBufferUsageFlags usage,
+static int create_buf(AVHWDeviceContext *ctx, AVBufferRef **buf, size_t imp_size,
+                      int height, int *stride, VkBufferUsageFlags usage,
                       VkMemoryPropertyFlagBits flags, void *create_pnext,
                       void *alloc_pnext)
 {
     int err;
     VkResult ret;
-    VkMemoryRequirements req;
+    int use_ded_mem;
     AVVulkanDeviceContext *hwctx = ctx->hwctx;
     VulkanDevicePriv *p = ctx->internal->priv;
 
@@ -2322,34 +2706,77 @@ static int create_buf(AVHWDeviceContext *ctx, ImageBuffer *buf, int height,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
     };
 
-    *stride = FFALIGN(*stride, p->props.limits.optimalBufferCopyRowPitchAlignment);
-    buf_spawn.size = height*(*stride);
+    VkBufferMemoryRequirementsInfo2 req_desc = {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
+    };
+    VkMemoryDedicatedAllocateInfo ded_alloc = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
+        .pNext = alloc_pnext,
+    };
+    VkMemoryDedicatedRequirements ded_req = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
+    };
+    VkMemoryRequirements2 req = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+        .pNext = &ded_req,
+    };
+
+    ImageBuffer *vkbuf = av_mallocz(sizeof(*vkbuf));
+    if (!vkbuf)
+        return AVERROR(ENOMEM);
+
+    vkbuf->mapped_mem = !!imp_size;
 
-    ret = vkCreateBuffer(hwctx->act_dev, &buf_spawn, NULL, &buf->buf);
+    if (!vkbuf->mapped_mem) {
+        *stride = FFALIGN(*stride, p->props.properties.limits.optimalBufferCopyRowPitchAlignment);
+        buf_spawn.size = height*(*stride);
+        buf_spawn.size = FFALIGN(buf_spawn.size, p->props.properties.limits.minMemoryMapAlignment);
+    } else {
+        buf_spawn.size = imp_size;
+    }
+
+    ret = vkCreateBuffer(hwctx->act_dev, &buf_spawn, NULL, &vkbuf->buf);
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Failed to create buffer: %s\n",
                vk_ret2str(ret));
         return AVERROR_EXTERNAL;
     }
 
-    vkGetBufferMemoryRequirements(hwctx->act_dev, buf->buf, &req);
+    req_desc.buffer = vkbuf->buf;
 
-    err = alloc_mem(ctx, &req, flags, alloc_pnext, &buf->flags, &buf->mem);
+    vkGetBufferMemoryRequirements2(hwctx->act_dev, &req_desc, &req);
+
+    /* In case the implementation prefers/requires dedicated allocation */
+    use_ded_mem = ded_req.prefersDedicatedAllocation |
+                  ded_req.requiresDedicatedAllocation;
+    if (use_ded_mem)
+        ded_alloc.buffer = vkbuf->buf;
+
+    err = alloc_mem(ctx, &req.memoryRequirements, flags,
+                    use_ded_mem ? &ded_alloc : (void *)ded_alloc.pNext,
+                    &vkbuf->flags, &vkbuf->mem);
     if (err)
         return err;
 
-    ret = vkBindBufferMemory(hwctx->act_dev, buf->buf, buf->mem, 0);
+    ret = vkBindBufferMemory(hwctx->act_dev, vkbuf->buf, vkbuf->mem, 0);
     if (ret != VK_SUCCESS) {
         av_log(ctx, AV_LOG_ERROR, "Failed to bind memory to buffer: %s\n",
                vk_ret2str(ret));
-        free_buf(ctx, buf);
+        free_buf(ctx, (uint8_t *)vkbuf);
         return AVERROR_EXTERNAL;
     }
 
+    *buf = av_buffer_create((uint8_t *)vkbuf, sizeof(*vkbuf), free_buf, ctx, 0);
+    if (!(*buf)) {
+        free_buf(ctx, (uint8_t *)vkbuf);
+        return AVERROR(ENOMEM);
+    }
+
     return 0;
 }
 
-static int map_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf, uint8_t *mem[],
+/* Skips mapping of host mapped buffers but still invalidates them */
+static int map_buffers(AVHWDeviceContext *ctx, AVBufferRef **bufs, uint8_t *mem[],
                        int nb_buffers, int invalidate)
 {
     VkResult ret;
@@ -2358,7 +2785,11 @@ static int map_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf, uint8_t *mem[],
     int invalidate_count = 0;
 
     for (int i = 0; i < nb_buffers; i++) {
-        ret = vkMapMemory(hwctx->act_dev, buf[i].mem, 0,
+        ImageBuffer *vkbuf = (ImageBuffer *)bufs[i]->data;
+        if (vkbuf->mapped_mem)
+            continue;
+
+        ret = vkMapMemory(hwctx->act_dev, vkbuf->mem, 0,
                           VK_WHOLE_SIZE, 0, (void **)&mem[i]);
         if (ret != VK_SUCCESS) {
             av_log(ctx, AV_LOG_ERROR, "Failed to map buffer memory: %s\n",
@@ -2371,12 +2802,13 @@ static int map_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf, uint8_t *mem[],
         return 0;
 
     for (int i = 0; i < nb_buffers; i++) {
+        ImageBuffer *vkbuf = (ImageBuffer *)bufs[i]->data;
         const VkMappedMemoryRange ival_buf = {
             .sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
-            .memory = buf[i].mem,
+            .memory = vkbuf->mem,
             .size   = VK_WHOLE_SIZE,
         };
-        if (buf[i].flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+        if (vkbuf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
             continue;
         invalidate_ctx[invalidate_count++] = ival_buf;
     }
@@ -2392,7 +2824,7 @@ static int map_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf, uint8_t *mem[],
     return 0;
 }
 
-static int unmap_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf,
+static int unmap_buffers(AVHWDeviceContext *ctx, AVBufferRef **bufs,
                          int nb_buffers, int flush)
 {
     int err = 0;
@@ -2403,12 +2835,13 @@ static int unmap_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf,
 
     if (flush) {
         for (int i = 0; i < nb_buffers; i++) {
+            ImageBuffer *vkbuf = (ImageBuffer *)bufs[i]->data;
             const VkMappedMemoryRange flush_buf = {
                 .sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
-                .memory = buf[i].mem,
+                .memory = vkbuf->mem,
                 .size   = VK_WHOLE_SIZE,
             };
-            if (buf[i].flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+            if (vkbuf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
                 continue;
             flush_ctx[flush_count++] = flush_buf;
         }
@@ -2423,35 +2856,37 @@ static int unmap_buffers(AVHWDeviceContext *ctx, ImageBuffer *buf,
         }
     }
 
-    for (int i = 0; i < nb_buffers; i++)
-        vkUnmapMemory(hwctx->act_dev, buf[i].mem);
+    for (int i = 0; i < nb_buffers; i++) {
+        ImageBuffer *vkbuf = (ImageBuffer *)bufs[i]->data;
+        if (vkbuf->mapped_mem)
+            continue;
+
+        vkUnmapMemory(hwctx->act_dev, vkbuf->mem);
+    }
 
     return err;
 }
 
-static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
-                              ImageBuffer *buffer, const int *buf_stride, int w,
+static int transfer_image_buf(AVHWFramesContext *hwfc, const AVFrame *f,
+                              AVBufferRef **bufs, const int *buf_stride, int w,
                               int h, enum AVPixelFormat pix_fmt, int to_buf)
 {
-    VkResult ret;
-    AVVulkanDeviceContext *hwctx = ctx->hwctx;
-    VulkanDevicePriv *s = ctx->internal->priv;
+    int err;
+    AVVkFrame *frame = (AVVkFrame *)f->data[0];
+    VulkanFramesPriv *fp = hwfc->internal->priv;
+
+    int bar_num = 0;
     VkPipelineStageFlagBits sem_wait_dst[AV_NUM_DATA_POINTERS];
 
     const int planes = av_pix_fmt_count_planes(pix_fmt);
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
 
-    VkCommandBufferBeginInfo cmd_start = {
-        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
-        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
-    };
-
     VkImageMemoryBarrier img_bar[AV_NUM_DATA_POINTERS] = { 0 };
+    VulkanExecCtx *ectx = to_buf ? &fp->download_ctx : &fp->upload_ctx;
+    VkCommandBuffer cmd_buf = get_buf_exec_ctx(hwfc, ectx);
 
     VkSubmitInfo s_info = {
         .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .commandBufferCount   = 1,
-        .pCommandBuffers      = &s->cmd.buf,
         .pSignalSemaphores    = frame->sem,
         .pWaitSemaphores      = frame->sem,
         .pWaitDstStageMask    = sem_wait_dst,
@@ -2459,41 +2894,48 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
         .waitSemaphoreCount   = planes,
     };
 
-    ret = vkBeginCommandBuffer(s->cmd.buf, &cmd_start);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to init command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
+    if ((err = wait_start_exec_ctx(hwfc, ectx)))
+        return err;
 
     /* Change the image layout to something more optimal for transfers */
     for (int i = 0; i < planes; i++) {
-        img_bar[i].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-        img_bar[i].srcAccessMask = 0x0;
-        img_bar[i].dstAccessMask = to_buf ? VK_ACCESS_TRANSFER_READ_BIT :
+        VkImageLayout new_layout = to_buf ? VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL :
+                                            VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+        VkAccessFlags new_access = to_buf ? VK_ACCESS_TRANSFER_READ_BIT :
                                             VK_ACCESS_TRANSFER_WRITE_BIT;
-        img_bar[i].oldLayout = frame->layout[i];
-        img_bar[i].newLayout = to_buf ? VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL :
-                                        VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-        img_bar[i].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        img_bar[i].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-        img_bar[i].image = frame->img[i];
-        img_bar[i].subresourceRange.levelCount = 1;
-        img_bar[i].subresourceRange.layerCount = 1;
-        img_bar[i].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
 
         sem_wait_dst[i] = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 
-        frame->layout[i] = img_bar[i].newLayout;
-        frame->access[i] = img_bar[i].dstAccessMask;
+        /* If the layout matches and we have read access skip the barrier */
+        if ((frame->layout[i] == new_layout) && (frame->access[i] & new_access))
+            continue;
+
+        img_bar[bar_num].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+        img_bar[bar_num].srcAccessMask = 0x0;
+        img_bar[bar_num].dstAccessMask = new_access;
+        img_bar[bar_num].oldLayout = frame->layout[i];
+        img_bar[bar_num].newLayout = new_layout;
+        img_bar[bar_num].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        img_bar[bar_num].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        img_bar[bar_num].image = frame->img[i];
+        img_bar[bar_num].subresourceRange.levelCount = 1;
+        img_bar[bar_num].subresourceRange.layerCount = 1;
+        img_bar[bar_num].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+
+        frame->layout[i] = img_bar[bar_num].newLayout;
+        frame->access[i] = img_bar[bar_num].dstAccessMask;
+
+        bar_num++;
     }
 
-    vkCmdPipelineBarrier(s->cmd.buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
-                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
-                         0, NULL, 0, NULL, planes, img_bar);
+    if (bar_num)
+        vkCmdPipelineBarrier(cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                             VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
+                             0, NULL, 0, NULL, bar_num, img_bar);
 
     /* Schedule a copy for each plane */
     for (int i = 0; i < planes; i++) {
+        ImageBuffer *vkbuf = (ImageBuffer *)bufs[i]->data;
         const int p_w = i > 0 ? AV_CEIL_RSHIFT(w, desc->log2_chroma_w) : w;
         const int p_h = i > 0 ? AV_CEIL_RSHIFT(h, desc->log2_chroma_h) : h;
         VkBufferImageCopy buf_reg = {
@@ -2511,40 +2953,35 @@ static int transfer_image_buf(AVHWDeviceContext *ctx, AVVkFrame *frame,
         };
 
         if (to_buf)
-            vkCmdCopyImageToBuffer(s->cmd.buf, frame->img[i], frame->layout[i],
-                                   buffer[i].buf, 1, &buf_reg);
+            vkCmdCopyImageToBuffer(cmd_buf, frame->img[i], frame->layout[i],
+                                   vkbuf->buf, 1, &buf_reg);
         else
-            vkCmdCopyBufferToImage(s->cmd.buf, buffer[i].buf, frame->img[i],
+            vkCmdCopyBufferToImage(cmd_buf, vkbuf->buf, frame->img[i],
                                    frame->layout[i], 1, &buf_reg);
     }
 
-    ret = vkEndCommandBuffer(s->cmd.buf);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to finish command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
-    }
-
-    /* Wait for the download/upload to finish if uploading, otherwise the
-     * semaphore will take care of synchronization when uploading */
-    ret = vkQueueSubmit(s->cmd.queue, 1, &s_info, s->cmd.fence);
-    if (ret != VK_SUCCESS) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to submit command buffer: %s\n",
-               vk_ret2str(ret));
-        return AVERROR_EXTERNAL;
+    /* When uploading, do this asynchronously if the source is refcounted by
+     * keeping the buffers as a submission dependency.
+     * The hwcontext is guaranteed to not be freed until all frames are freed
+     * in the frames_unint function.
+     * When downloading to buffer, do this synchronously and wait for the
+     * queue submission to finish executing */
+    if (!to_buf) {
+        int ref;
+        for (ref = 0; ref < AV_NUM_DATA_POINTERS; ref++) {
+            if (!f->buf[ref])
+                break;
+            if ((err = add_buf_dep_exec_ctx(hwfc, ectx, &f->buf[ref], 1)))
+                return err;
+        }
+        if (ref && (err = add_buf_dep_exec_ctx(hwfc, ectx, bufs, planes)))
+            return err;
+        return submit_exec_ctx(hwfc, ectx, &s_info, !ref);
     } else {
-        vkWaitForFences(hwctx->act_dev, 1, &s->cmd.fence, VK_TRUE, UINT64_MAX);
-        vkResetFences(hwctx->act_dev, 1, &s->cmd.fence);
+        return submit_exec_ctx(hwfc, ectx, &s_info,    1);
     }
-
-    return 0;
 }
 
-/* Technically we can use VK_EXT_external_memory_host to upload and download,
- * however the alignment requirements make this unfeasible as both the pointer
- * and the size of each plane need to be aligned to the minimum alignment
- * requirement, which on all current implementations (anv, radv) is 4096.
- * If the requirement gets relaxed (unlikely) this can easily be implemented. */
 static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst,
                                          const AVFrame *src)
 {
@@ -2552,9 +2989,12 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst,
     AVFrame tmp;
     AVVkFrame *f = (AVVkFrame *)dst->data[0];
     AVHWDeviceContext *dev_ctx = hwfc->device_ctx;
-    ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } };
+    AVBufferRef *bufs[AV_NUM_DATA_POINTERS] = { 0 };
     const int planes = av_pix_fmt_count_planes(src->format);
     int log2_chroma = av_pix_fmt_desc_get(src->format)->log2_chroma_h;
+    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
+    int host_mapped[AV_NUM_DATA_POINTERS] = { 0 };
+    int map_host = p->extensions & EXT_EXTERNAL_HOST_MEMORY;
 
     if ((src->format != AV_PIX_FMT_NONE && !av_vkfmt_from_pixfmt(src->format))) {
         av_log(hwfc, AV_LOG_ERROR, "Unsupported source pixel format!\n");
@@ -2574,43 +3014,69 @@ static int vulkan_transfer_data_from_mem(AVHWFramesContext *hwfc, AVFrame *dst,
 
         err = vulkan_map_frame_to_mem(hwfc, map, dst, AV_HWFRAME_MAP_WRITE);
         if (err)
-            goto end;
+            return err;
 
         err = av_frame_copy(map, src);
         av_frame_free(&map);
-        goto end;
+        return err;
     }
 
     /* Create buffers */
     for (int i = 0; i < planes; i++) {
         int h = src->height;
         int p_height = i > 0 ? AV_CEIL_RSHIFT(h, log2_chroma) : h;
+        size_t p_size = FFALIGN(FFABS(src->linesize[i]) * p_height,
+                                p->hprops.minImportedHostPointerAlignment);
 
-        tmp.linesize[i] = src->linesize[i];
-        err = create_buf(dev_ctx, &buf[i], p_height,
-                         &tmp.linesize[i], VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
-                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, NULL, NULL);
+        VkImportMemoryHostPointerInfoEXT import_desc = {
+            .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+            .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
+            .pHostPointer = src->data[i],
+        };
+
+        /* We can only map images with positive stride and alignment appropriate
+         * for the device. */
+        host_mapped[i] = map_host && src->linesize[i] > 0 &&
+                         !(((uintptr_t)import_desc.pHostPointer) %
+                           p->hprops.minImportedHostPointerAlignment);
+        p_size = host_mapped[i] ? p_size : 0;
+
+        tmp.linesize[i] = FFABS(src->linesize[i]);
+        err = create_buf(dev_ctx, &bufs[i], p_size, p_height, &tmp.linesize[i],
+                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, NULL,
+                         host_mapped[i] ? &import_desc : NULL);
         if (err)
             goto end;
     }
 
     /* Map, copy image to buffer, unmap */
-    if ((err = map_buffers(dev_ctx, buf, tmp.data, planes, 0)))
+    if ((err = map_buffers(dev_ctx, bufs, tmp.data, planes, 0)))
         goto end;
 
-    av_image_copy(tmp.data, tmp.linesize, (const uint8_t **)src->data,
-                  src->linesize, src->format, src->width, src->height);
+    for (int i = 0; i < planes; i++) {
+        int h = src->height;
+        int p_height = i > 0 ? AV_CEIL_RSHIFT(h, log2_chroma) : h;
+
+        if (host_mapped[i])
+            continue;
+
+        av_image_copy_plane(tmp.data[i], tmp.linesize[i],
+                            (const uint8_t *)src->data[i], src->linesize[i],
+                            FFMIN(tmp.linesize[i], FFABS(src->linesize[i])),
+                            p_height);
+    }
 
-    if ((err = unmap_buffers(dev_ctx, buf, planes, 1)))
+    if ((err = unmap_buffers(dev_ctx, bufs, planes, 1)))
         goto end;
 
     /* Copy buffers to image */
-    err = transfer_image_buf(dev_ctx, f, buf, tmp.linesize,
+    err = transfer_image_buf(hwfc, dst, bufs, tmp.linesize,
                              src->width, src->height, src->format, 0);
 
 end:
     for (int i = 0; i < planes; i++)
-        free_buf(dev_ctx, &buf[i]);
+        av_buffer_unref(&bufs[i]);
 
     return err;
 }
@@ -2712,9 +3178,12 @@ static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame *dst,
     AVFrame tmp;
     AVVkFrame *f = (AVVkFrame *)src->data[0];
     AVHWDeviceContext *dev_ctx = hwfc->device_ctx;
-    ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } };
+    AVBufferRef *bufs[AV_NUM_DATA_POINTERS] = { 0 };
     const int planes = av_pix_fmt_count_planes(dst->format);
     int log2_chroma = av_pix_fmt_desc_get(dst->format)->log2_chroma_h;
+    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
+    int host_mapped[AV_NUM_DATA_POINTERS] = { 0 };
+    int map_host = p->extensions & EXT_EXTERNAL_HOST_MEMORY;
 
     if (dst->width > hwfc->width || dst->height > hwfc->height)
         return AVERROR(EINVAL);
@@ -2740,30 +3209,58 @@ static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame *dst,
     for (int i = 0; i < planes; i++) {
         int h = dst->height;
         int p_height = i > 0 ? AV_CEIL_RSHIFT(h, log2_chroma) : h;
+        size_t p_size = FFALIGN(FFABS(dst->linesize[i]) * p_height,
+                                p->hprops.minImportedHostPointerAlignment);
+
+        VkImportMemoryHostPointerInfoEXT import_desc = {
+            .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+            .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
+            .pHostPointer = dst->data[i],
+        };
+
+        /* We can only map images with positive stride and alignment appropriate
+         * for the device. */
+        host_mapped[i] = map_host && dst->linesize[i] > 0 &&
+                         !(((uintptr_t)import_desc.pHostPointer) %
+                           p->hprops.minImportedHostPointerAlignment);
+        p_size = host_mapped[i] ? p_size : 0;
 
-        tmp.linesize[i] = dst->linesize[i];
-        err = create_buf(dev_ctx, &buf[i], p_height,
+        tmp.linesize[i] = FFABS(dst->linesize[i]);
+        err = create_buf(dev_ctx, &bufs[i], p_size, p_height,
                          &tmp.linesize[i], VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, NULL, NULL);
+                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, NULL,
+                         host_mapped[i] ? &import_desc : NULL);
+        if (err)
+            goto end;
     }
 
     /* Copy image to buffer */
-    if ((err = transfer_image_buf(dev_ctx, f, buf, tmp.linesize,
+    if ((err = transfer_image_buf(hwfc, src, bufs, tmp.linesize,
                                   dst->width, dst->height, dst->format, 1)))
         goto end;
 
     /* Map, copy buffer to frame, unmap */
-    if ((err = map_buffers(dev_ctx, buf, tmp.data, planes, 1)))
+    if ((err = map_buffers(dev_ctx, bufs, tmp.data, planes, 1)))
         goto end;
 
-    av_image_copy(dst->data, dst->linesize, (const uint8_t **)tmp.data,
-                  tmp.linesize, dst->format, dst->width, dst->height);
+    for (int i = 0; i < planes; i++) {
+        int h = dst->height;
+        int p_height = i > 0 ? AV_CEIL_RSHIFT(h, log2_chroma) : h;
+
+        if (host_mapped[i])
+            continue;
+
+        av_image_copy_plane(dst->data[i], dst->linesize[i],
+                            (const uint8_t *)tmp.data[i], tmp.linesize[i],
+                            FFMIN(tmp.linesize[i], FFABS(dst->linesize[i])),
+                            p_height);
+    }
 
-    err = unmap_buffers(dev_ctx, buf, planes, 0);
+    err = unmap_buffers(dev_ctx, bufs, planes, 0);
 
 end:
     for (int i = 0; i < planes; i++)
-        free_buf(dev_ctx, &buf[i]);
+        av_buffer_unref(&bufs[i]);
 
     return err;
 }
@@ -2788,6 +3285,12 @@ static int vulkan_transfer_data_from(AVHWFramesContext *hwfc, AVFrame *dst,
     }
 }
 
+static int vulkan_frames_derive_to(AVHWFramesContext *dst_fc,
+                                   AVHWFramesContext *src_fc, int flags)
+{
+    return vulkan_frames_init(dst_fc);
+}
+
 AVVkFrame *av_vk_frame_alloc(void)
 {
     return av_mallocz(sizeof(AVVkFrame));
@@ -2817,6 +3320,7 @@ const HWContextType ff_hwcontext_type_vulkan = {
 
     .map_to                 = vulkan_map_to,
     .map_from               = vulkan_map_from,
+    .frames_derive_to       = &vulkan_frames_derive_to,
 
     .pix_fmts = (const enum AVPixelFormat []) {
         AV_PIX_FMT_VULKAN,