From 4b4f249d5221a6456e0a3b725ff6463f14c24699 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 12 Jan 2016 01:51:19 +0100 Subject: [PATCH] Add some useful (pending) upstream patches that have not gone in yet. --- README | 7 + ...-support-for-persistent-device-memory.diff | 30 ++ ...-support-for-persistent-device-memory.diff | 212 +++++++++ ...-support-for-persistent-device-memory.diff | 404 ++++++++++++++++++ .../mesa-fix-locking-of-glsync-objects.diff | 321 ++++++++++++++ patches/zita-resampler-sse.diff | 213 +++++++++ 6 files changed, 1187 insertions(+) create mode 100644 patches/bmusb-add-support-for-persistent-device-memory.diff create mode 100644 patches/libusb-add-support-for-persistent-device-memory.diff create mode 100644 patches/linux-add-support-for-persistent-device-memory.diff create mode 100644 patches/mesa-fix-locking-of-glsync-objects.diff create mode 100644 patches/zita-resampler-sse.diff diff --git a/README b/README index b983777..56d7f2e 100644 --- a/README +++ b/README @@ -56,6 +56,13 @@ Nageru is in alpha stage. It currently needs: - Lua, for driving the theme engine. +The patches/ directory contains some patches for upstream software that help +Nageru performance and/or stability. They are all meant for upstream, but +probably will not be in by the time Nageru is released. All except the bmusb +patch are taken to be by Steinar H. Gunderson (i.e., my work +email, unlike Nageru itself and bmusb), and under the same license as the +projects they patch. + To start it, just hook up your requipment, type “make” and then “./nageru”. It is strongly recommended to have the rights to run at real-time priority; it will make the USB3 threads do so, which will make them a lot more stable.
diff --git a/patches/bmusb-add-support-for-persistent-device-memory.diff b/patches/bmusb-add-support-for-persistent-device-memory.diff new file mode 100644 index 0000000..7e5ac32 --- /dev/null +++ b/patches/bmusb-add-support-for-persistent-device-memory.diff @@ -0,0 +1,30 @@ +diff --git a/bmusb.cpp b/bmusb.cpp +index 2ea6407..16ec380 100644 +--- a/bmusb.cpp ++++ b/bmusb.cpp +@@ -1027,7 +1027,7 @@ void BMUSBCapture::configure_card() + // set up isochronous transfers for audio and video + for (int e = 3; e <= 4; ++e) { + //int num_transfers = (e == 3) ? 6 : 6; +- int num_transfers = 10; ++ int num_transfers = 6; + for (int i = 0; i < num_transfers; ++i) { + size_t buf_size; + int num_iso_pack, size; +@@ -1043,8 +1043,14 @@ void BMUSBCapture::configure_card() + num_iso_pack = 80; + buf_size = num_iso_pack * size; + } +- assert(size_t(num_iso_pack * size) <= buf_size); +- uint8_t *buf = new uint8_t[buf_size]; ++ int num_bytes = num_iso_pack * size; ++ assert(size_t(num_bytes) <= buf_size); ++ uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes); ++ if (buf == nullptr) { ++ fprintf(stderr, "Failed to allocate persistent DMA memory (probably missing kernel patch).\n"); ++ fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n"); ++ buf = new uint8_t[num_bytes]; ++ } + + xfr = libusb_alloc_transfer(num_iso_pack); + if (!xfr) { diff --git a/patches/libusb-add-support-for-persistent-device-memory.diff b/patches/libusb-add-support-for-persistent-device-memory.diff new file mode 100644 index 0000000..1642493 --- /dev/null +++ b/patches/libusb-add-support-for-persistent-device-memory.diff @@ -0,0 +1,212 @@ +From 66c3edf744415f117c3be95ae83f0bab6e128830 Mon Sep 17 00:00:00 2001 +From: "Steinar H. Gunderson" +Date: Wed, 9 Dec 2015 10:03:15 +0100 +Subject: [PATCH] Add support for persistent device memory. 
+ +Add a function to allocate memory belonging to a specific device, +so that the operating system can DMA straight into it for zerocopy, +and also avoid some clearing. Also, this allows up-front memory +allocation in the kernel at program startup; memory allocation is +otherwise done per-transfer, which can fail in a system where memory has become +fragmented over time. + +This mirrors new functionality going into Linux' USB stack (recently +reviewed and acked upstream); only Linux is supported as a backend +currently. +--- + libusb/core.c | 29 +++++++++++++++++++++++++++++ + libusb/libusb-1.0.def | 4 ++++ + libusb/libusb.h | 30 +++++++++++++++++++++++++++++- + libusb/libusbi.h | 10 ++++++++++ + libusb/os/linux_usbfs.c | 30 ++++++++++++++++++++++++++++++ + 5 files changed, 102 insertions(+), 1 deletion(-) + +diff --git a/libusb/core.c b/libusb/core.c +index 9c617fb..3eba85a 100644 +--- a/libusb/core.c ++++ b/libusb/core.c +@@ -351,6 +351,8 @@ if (cfg != desired) + * - libusb_control_transfer_get_setup() + * - libusb_cpu_to_le16() + * - libusb_detach_kernel_driver() ++ * - libusb_dev_mem_alloc() ++ * - libusb_dev_mem_free() + * - libusb_error_name() + * - libusb_event_handler_active() + * - libusb_event_handling_ok() +@@ -1805,6 +1807,33 @@ int API_EXPORTED libusb_free_streams(libusb_device_handle *dev, + return LIBUSB_ERROR_NOT_SUPPORTED; + } + ++API_EXPORTED unsigned char *libusb_dev_mem_alloc(libusb_device_handle *dev, ++ int length) ++{ ++ if (!dev->dev->attached) ++ return NULL; ++ ++ if (usbi_backend->dev_mem_alloc) ++ return usbi_backend->dev_mem_alloc(dev, length); ++ else ++ return NULL; ++} ++ ++/* Note: No current backends actually use the "dev" parameter; it is only there ++ * for any future, less lenient OSes.
++ */ ++int API_EXPORTED libusb_dev_mem_free(libusb_device_handle *dev, ++ unsigned char *buffer, int len) ++{ ++ if (!dev->dev->attached) ++ return LIBUSB_ERROR_NO_DEVICE; ++ ++ if (usbi_backend->dev_mem_free) ++ return usbi_backend->dev_mem_free(dev, buffer, len); ++ else ++ return LIBUSB_ERROR_NOT_SUPPORTED; ++} ++ + /** \ingroup dev + * Determine if a kernel driver is active on an interface. If a kernel driver + * is active, you cannot claim the interface, and libusb will be unable to +diff --git a/libusb/libusb-1.0.def b/libusb/libusb-1.0.def +index 538ad49..e040f4b 100644 +--- a/libusb/libusb-1.0.def ++++ b/libusb/libusb-1.0.def +@@ -20,6 +20,10 @@ EXPORTS + libusb_control_transfer@32 = libusb_control_transfer + libusb_detach_kernel_driver + libusb_detach_kernel_driver@8 = libusb_detach_kernel_driver ++ libusb_dev_mem_alloc ++ libusb_dev_mem_alloc@8 = libusb_dev_mem_alloc ++ libusb_dev_mem_free ++ libusb_dev_mem_free@12 = libusb_dev_mem_free + libusb_error_name + libusb_error_name@4 = libusb_error_name + libusb_event_handler_active +diff --git a/libusb/libusb.h b/libusb/libusb.h +index 513945f..5a84f5b 100644 +--- a/libusb/libusb.h ++++ b/libusb/libusb.h +@@ -141,7 +141,7 @@ typedef unsigned __int32 uint32_t; + * Internally, LIBUSB_API_VERSION is defined as follows: + * (libusb major << 24) | (libusb minor << 16) | (16 bit incremental) + */ +-#define LIBUSB_API_VERSION 0x01000104 ++#define LIBUSB_API_VERSION 0x01000105 + + /* The following is kept for compatibility, but will be deprecated in the future */ + #define LIBUSBX_API_VERSION LIBUSB_API_VERSION +@@ -1749,6 +1749,34 @@ int LIBUSB_CALL libusb_interrupt_transfer(libusb_device_handle *dev_handle, + unsigned char endpoint, unsigned char *data, int length, + int *actual_length, unsigned int timeout); + ++/** \ingroup asyncio ++ * Attempts to allocate a block of persistent DMA memory suitable for transfers ++ * against the given device. 
If successful, will return a block of memory ++ * that is suitable for use as "buffer" in \ref libusb_transfer against this ++ * device. Using this memory instead of regular memory means that the host ++ * controller can use DMA directly into the buffer to increase performance, and ++ * also that transfers can no longer fail due to kernel memory fragmentation. ++ * ++ * Note that this means you should not modify this memory (or even data on ++ * the same cache lines) when a transfer is in progress, although it is legal ++ * to have several transfers going on within the same memory block. ++ * ++ * Will return NULL on failure. Many systems do not support such zerocopy ++ * and will always return NULL. Memory allocated with this function must be ++ * freed with \ref libusb_dev_mem_free. ++ * ++ * Since version 1.0.21, \ref LIBUSB_API_VERSION >= 0x01000105 ++ * ++ * \param dev a device handle ++ * \param length size of desired data buffer ++ * \returns a pointer to the newly allocated memory, or NULL on failure ++ */ ++unsigned char *LIBUSB_CALL libusb_dev_mem_alloc(libusb_device_handle *dev, ++ int length); ++ ++int LIBUSB_CALL libusb_dev_mem_free(libusb_device_handle *dev, ++ unsigned char *buffer, int length); ++ + /** \ingroup desc + * Retrieve a descriptor from the default control pipe. + * This is a convenience function which formulates the appropriate control +diff --git a/libusb/libusbi.h b/libusb/libusbi.h +index f1afd99..66bdf46 100644 +--- a/libusb/libusbi.h ++++ b/libusb/libusbi.h +@@ -913,6 +913,16 @@ struct usbi_os_backend { + int (*free_streams)(struct libusb_device_handle *handle, + unsigned char *endpoints, int num_endpoints); + ++ /* Allocate persistent DMA memory for the given device, suitable for ++ * zerocopy. May return NULL on failure. Optional to implement. ++ */ ++ unsigned char *(*dev_mem_alloc)(struct libusb_device_handle *handle, ++ size_t len); ++ ++ /* Free memory allocated by dev_mem_alloc. 
*/ ++ int (*dev_mem_free)(struct libusb_device_handle *handle, ++ unsigned char *buffer, size_t len); ++ + /* Determine if a kernel driver is active on an interface. Optional. + * + * The presence of a kernel driver on an interface indicates that any +diff --git a/libusb/os/linux_usbfs.c b/libusb/os/linux_usbfs.c +index a63852f..a167084 100644 +--- a/libusb/os/linux_usbfs.c ++++ b/libusb/os/linux_usbfs.c +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1561,6 +1562,32 @@ static int op_free_streams(struct libusb_device_handle *handle, + endpoints, num_endpoints); + } + ++static unsigned char *op_dev_mem_alloc(struct libusb_device_handle *handle, ++ size_t len) ++{ ++ struct linux_device_handle_priv *hpriv = _device_handle_priv(handle); ++ unsigned char *buffer = (unsigned char *)mmap(NULL, len, ++ PROT_READ | PROT_WRITE, MAP_SHARED, hpriv->fd, 0); ++ if (buffer == MAP_FAILED) { ++ usbi_err(HANDLE_CTX(handle), "alloc dev mem failed errno %d", ++ errno); ++ return NULL; ++ } ++ return buffer; ++} ++ ++static int op_dev_mem_free(struct libusb_device_handle *handle, ++ unsigned char *buffer, size_t len) ++{ ++ if (munmap(buffer, len) != 0) { ++ usbi_err(HANDLE_CTX(handle), "free dev mem failed errno %d", ++ errno); ++ return LIBUSB_ERROR_OTHER; ++ } else { ++ return LIBUSB_SUCCESS; ++ } ++} ++ + static int op_kernel_driver_active(struct libusb_device_handle *handle, + int interface) + { +@@ -2682,6 +2709,9 @@ const struct usbi_os_backend linux_usbfs_backend = { + .alloc_streams = op_alloc_streams, + .free_streams = op_free_streams, + ++ .dev_mem_alloc = op_dev_mem_alloc, ++ .dev_mem_free = op_dev_mem_free, ++ + .kernel_driver_active = op_kernel_driver_active, + .detach_kernel_driver = op_detach_kernel_driver, + .attach_kernel_driver = op_attach_kernel_driver, +-- +2.6.4 + diff --git a/patches/linux-add-support-for-persistent-device-memory.diff b/patches/linux-add-support-for-persistent-device-memory.diff new file mode 
100644 index 0000000..a876b65 --- /dev/null +++ b/patches/linux-add-support-for-persistent-device-memory.diff @@ -0,0 +1,404 @@ +From 206b4c81b6b31d87c758748cdbc6d25e9c721ea1 Mon Sep 17 00:00:00 2001 +In-Reply-To: <20160106001143.GA1171@kroah.com> +References: <20160106001143.GA1171@kroah.com> +From: "Steinar H. Gunderson" +Date: Thu, 26 Nov 2015 01:19:13 +0100 +Subject: [PATCH v2] Add support for usbfs zerocopy. +To: Greg Kroah-Hartman +Cc: linux-usb@vger.kernel.org,linux-kernel@vger.kernel.org,stern@rowland.harvard.edu + +Add a new interface for userspace to preallocate memory that can be +used with usbfs. This gives two primary benefits: + + - Zerocopy; data no longer needs to be copied between the userspace + and the kernel, but can instead be read directly by the driver from + userspace's buffers. This works for all kinds of transfers (even if + nonsensical for control and interrupt transfers); isochronous also + no longer need to memset() the buffer to zero to avoid leaking kernel data. + + - Once the buffers are allocated, USB transfers can no longer fail due to + memory fragmentation; previously, long-running programs could run into + problems finding a large enough contiguous memory chunk, especially on + embedded systems or at high rates. + +Memory is allocated by using mmap() against the usbfs file descriptor, +and similarly deallocated by munmap(). Once memory has been allocated, +using it as pointers to a bulk or isochronous operation means you will +automatically get zerocopy behavior. Note that this also means you cannot +modify outgoing data until the transfer is complete. The same holds for +data on the same cache lines as incoming data; DMA modifying them at the +same time could lead to your changes being overwritten. + +There's a new capability USBDEVFS_CAP_MMAP that userspace can query to see +if the running kernel supports this functionality, if just trying mmap() is +not acceptable. 
+ +Largely based on a patch by Markus Rechberger with some updates. The original +patch can be found at: + + http://sundtek.de/support/devio_mmap_v0.4.diff + +Signed-off-by: Steinar H. Gunderson +Signed-off-by: Markus Rechberger +Acked-by: Alan Stern +--- + drivers/usb/core/devio.c | 227 +++++++++++++++++++++++++++++++++----- + include/uapi/linux/usbdevice_fs.h | 1 + + 2 files changed, 203 insertions(+), 25 deletions(-) + +diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c +index 38ae877c..0238c78 100644 +--- a/drivers/usb/core/devio.c ++++ b/drivers/usb/core/devio.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -69,6 +70,7 @@ struct usb_dev_state { + spinlock_t lock; /* protects the async urb lists */ + struct list_head async_pending; + struct list_head async_completed; ++ struct list_head memory_list; + wait_queue_head_t wait; /* wake up if a request completed */ + unsigned int discsignr; + struct pid *disc_pid; +@@ -79,6 +81,17 @@ struct usb_dev_state { + u32 disabled_bulk_eps; + }; + ++struct usb_memory { ++ struct list_head memlist; ++ int vma_use_count; ++ int urb_use_count; ++ u32 size; ++ void *mem; ++ dma_addr_t dma_handle; ++ unsigned long vm_start; ++ struct usb_dev_state *ps; ++}; ++ + struct async { + struct list_head asynclist; + struct usb_dev_state *ps; +@@ -89,6 +102,7 @@ struct async { + void __user *userbuffer; + void __user *userurb; + struct urb *urb; ++ struct usb_memory *usbm; + unsigned int mem_usage; + int status; + u32 secid; +@@ -157,6 +171,111 @@ static int connected(struct usb_dev_state *ps) + ps->dev->state != USB_STATE_NOTATTACHED); + } + ++static void dec_usb_memory_use_count(struct usb_memory *usbm, int *count) ++{ ++ struct usb_dev_state *ps = usbm->ps; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ps->lock, flags); ++ --*count; ++ if (usbm->urb_use_count == 0 && usbm->vma_use_count == 0) { ++ list_del(&usbm->memlist); ++ spin_unlock_irqrestore(&ps->lock, flags); ++ ++ 
usb_free_coherent(ps->dev, usbm->size, usbm->mem, ++ usbm->dma_handle); ++ usbfs_decrease_memory_usage( ++ usbm->size + sizeof(struct usb_memory)); ++ kfree(usbm); ++ } else { ++ spin_unlock_irqrestore(&ps->lock, flags); ++ } ++} ++ ++static void usbdev_vm_open(struct vm_area_struct *vma) ++{ ++ struct usb_memory *usbm = vma->vm_private_data; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&usbm->ps->lock, flags); ++ ++usbm->vma_use_count; ++ spin_unlock_irqrestore(&usbm->ps->lock, flags); ++} ++ ++static void usbdev_vm_close(struct vm_area_struct *vma) ++{ ++ struct usb_memory *usbm = vma->vm_private_data; ++ ++ dec_usb_memory_use_count(usbm, &usbm->vma_use_count); ++} ++ ++struct vm_operations_struct usbdev_vm_ops = { ++ .open = usbdev_vm_open, ++ .close = usbdev_vm_close ++}; ++ ++static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct usb_memory *usbm = NULL; ++ struct usb_dev_state *ps = file->private_data; ++ size_t size = vma->vm_end - vma->vm_start; ++ void *mem; ++ unsigned long flags; ++ dma_addr_t dma_handle; ++ int ret; ++ ++ ret = usbfs_increase_memory_usage(size + sizeof(struct usb_memory)); ++ if (ret) ++ goto error; ++ ++ usbm = kzalloc(sizeof(struct usb_memory), GFP_KERNEL); ++ if (!usbm) { ++ ret = -ENOMEM; ++ goto error_decrease_mem; ++ } ++ ++ mem = usb_alloc_coherent(ps->dev, size, GFP_USER, &dma_handle); ++ if (!mem) { ++ ret = -ENOMEM; ++ goto error_free_usbm; ++ } ++ ++ memset(mem, 0, size); ++ ++ usbm->mem = mem; ++ usbm->dma_handle = dma_handle; ++ usbm->size = size; ++ usbm->ps = ps; ++ usbm->vm_start = vma->vm_start; ++ usbm->vma_use_count = 1; ++ INIT_LIST_HEAD(&usbm->memlist); ++ ++ if (remap_pfn_range(vma, vma->vm_start, ++ virt_to_phys(usbm->mem) >> PAGE_SHIFT, ++ size, vma->vm_page_prot) < 0) { ++ dec_usb_memory_use_count(usbm, &usbm->vma_use_count); ++ return -EAGAIN; ++ } ++ ++ vma->vm_flags |= VM_IO; ++ vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP); ++ vma->vm_ops = &usbdev_vm_ops; ++ 
vma->vm_private_data = usbm; ++ ++ spin_lock_irqsave(&ps->lock, flags); ++ list_add_tail(&usbm->memlist, &ps->memory_list); ++ spin_unlock_irqrestore(&ps->lock, flags); ++ ++ return 0; ++ ++error_free_usbm: ++ kfree(usbm); ++error_decrease_mem: ++ usbfs_decrease_memory_usage(size + sizeof(struct usb_memory)); ++error: ++ return ret; ++} ++ + static loff_t usbdev_lseek(struct file *file, loff_t offset, int orig) + { + loff_t ret; +@@ -297,8 +416,13 @@ static void free_async(struct async *as) + if (sg_page(&as->urb->sg[i])) + kfree(sg_virt(&as->urb->sg[i])); + } ++ + kfree(as->urb->sg); +- kfree(as->urb->transfer_buffer); ++ if (as->usbm == NULL) ++ kfree(as->urb->transfer_buffer); ++ else ++ dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count); ++ + kfree(as->urb->setup_packet); + usb_free_urb(as->urb); + usbfs_decrease_memory_usage(as->mem_usage); +@@ -910,6 +1034,7 @@ static int usbdev_open(struct inode *inode, struct file *file) + INIT_LIST_HEAD(&ps->list); + INIT_LIST_HEAD(&ps->async_pending); + INIT_LIST_HEAD(&ps->async_completed); ++ INIT_LIST_HEAD(&ps->memory_list); + init_waitqueue_head(&ps->wait); + ps->discsignr = 0; + ps->disc_pid = get_pid(task_pid(current)); +@@ -962,6 +1087,7 @@ static int usbdev_release(struct inode *inode, struct file *file) + free_async(as); + as = async_getcompleted(ps); + } ++ + kfree(ps); + return 0; + } +@@ -1283,6 +1409,31 @@ static int proc_setconfig(struct usb_dev_state *ps, void __user *arg) + return status; + } + ++static struct usb_memory * ++find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb) ++{ ++ struct usb_memory *usbm = NULL, *iter; ++ unsigned long flags; ++ unsigned long uurb_start = (unsigned long)uurb->buffer; ++ ++ spin_lock_irqsave(&ps->lock, flags); ++ list_for_each_entry(iter, &ps->memory_list, memlist) { ++ if (uurb_start >= iter->vm_start && ++ uurb_start < iter->vm_start + iter->size) { ++ if (uurb->buffer_length > iter->vm_start + iter->size - ++ uurb_start) { ++ usbm = 
ERR_PTR(-EINVAL); ++ } else { ++ usbm = iter; ++ usbm->urb_use_count++; ++ } ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&ps->lock, flags); ++ return usbm; ++} ++ + static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb, + struct usbdevfs_iso_packet_desc __user *iso_frame_desc, + void __user *arg) +@@ -1439,6 +1590,19 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb + goto error; + } + ++ as->usbm = find_memory_area(ps, uurb); ++ if (IS_ERR(as->usbm)) { ++ ret = PTR_ERR(as->usbm); ++ as->usbm = NULL; ++ goto error; ++ } ++ ++ /* do not use SG buffers when memory mapped segments ++ * are in use ++ */ ++ if (as->usbm) ++ num_sgs = 0; ++ + u += sizeof(struct async) + sizeof(struct urb) + uurb->buffer_length + + num_sgs * sizeof(struct scatterlist); + ret = usbfs_increase_memory_usage(u); +@@ -1476,29 +1640,35 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb + totlen -= u; + } + } else if (uurb->buffer_length > 0) { +- as->urb->transfer_buffer = kmalloc(uurb->buffer_length, +- GFP_KERNEL); +- if (!as->urb->transfer_buffer) { +- ret = -ENOMEM; +- goto error; +- } ++ if (as->usbm) { ++ unsigned long uurb_start = (unsigned long)uurb->buffer; + +- if (!is_in) { +- if (copy_from_user(as->urb->transfer_buffer, +- uurb->buffer, +- uurb->buffer_length)) { +- ret = -EFAULT; ++ as->urb->transfer_buffer = as->usbm->mem + ++ (uurb_start - as->usbm->vm_start); ++ } else { ++ as->urb->transfer_buffer = kmalloc(uurb->buffer_length, ++ GFP_KERNEL); ++ if (!as->urb->transfer_buffer) { ++ ret = -ENOMEM; + goto error; + } +- } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) { +- /* +- * Isochronous input data may end up being +- * discontiguous if some of the packets are short. +- * Clear the buffer so that the gaps don't leak +- * kernel data to userspace. 
+- */ +- memset(as->urb->transfer_buffer, 0, +- uurb->buffer_length); ++ if (!is_in) { ++ if (copy_from_user(as->urb->transfer_buffer, ++ uurb->buffer, ++ uurb->buffer_length)) { ++ ret = -EFAULT; ++ goto error; ++ } ++ } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) { ++ /* ++ * Isochronous input data may end up being ++ * discontiguous if some of the packets are ++ * short. Clear the buffer so that the gaps ++ * don't leak kernel data to userspace. ++ */ ++ memset(as->urb->transfer_buffer, 0, ++ uurb->buffer_length); ++ } + } + } + as->urb->dev = ps->dev; +@@ -1545,10 +1715,14 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb + isopkt = NULL; + as->ps = ps; + as->userurb = arg; +- if (is_in && uurb->buffer_length > 0) ++ if (as->usbm) { ++ unsigned long uurb_start = (unsigned long)uurb->buffer; ++ ++ as->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; ++ as->urb->transfer_dma = as->usbm->dma_handle + ++ (uurb_start - as->usbm->vm_start); ++ } else if (is_in && uurb->buffer_length > 0) + as->userbuffer = uurb->buffer; +- else +- as->userbuffer = NULL; + as->signr = uurb->signr; + as->ifnum = ifnum; + as->pid = get_pid(task_pid(current)); +@@ -1604,6 +1778,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb + return 0; + + error: ++ if (as && as->usbm) ++ dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count); + kfree(isopkt); + kfree(dr); + if (as) +@@ -2047,7 +2223,7 @@ static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg) + __u32 caps; + + caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM | +- USBDEVFS_CAP_REAP_AFTER_DISCONNECT; ++ USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP; + if (!ps->dev->bus->no_stop_on_short) + caps |= USBDEVFS_CAP_BULK_CONTINUATION; + if (ps->dev->bus->sg_tablesize) +@@ -2373,6 +2549,7 @@ const struct file_operations usbdev_file_operations = { + #ifdef CONFIG_COMPAT + .compat_ioctl = usbdev_compat_ioctl, + #endif ++ 
.mmap = usbdev_mmap, + .open = usbdev_open, + .release = usbdev_release, + }; +diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h +index 019ba1e..ecbd176 100644 +--- a/include/uapi/linux/usbdevice_fs.h ++++ b/include/uapi/linux/usbdevice_fs.h +@@ -134,6 +134,7 @@ struct usbdevfs_hub_portinfo { + #define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04 + #define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08 + #define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10 ++#define USBDEVFS_CAP_MMAP 0x20 + + /* USBDEVFS_DISCONNECT_CLAIM flags & struct */ + +-- +2.1.4 + diff --git a/patches/mesa-fix-locking-of-glsync-objects.diff b/patches/mesa-fix-locking-of-glsync-objects.diff new file mode 100644 index 0000000..a1c880a --- /dev/null +++ b/patches/mesa-fix-locking-of-glsync-objects.diff @@ -0,0 +1,321 @@ +From 6e3d1880fa78a3a965cb7eb51ee12b1f785f84bb Mon Sep 17 00:00:00 2001 +From: "Steinar H. Gunderson" +Date: Tue, 1 Dec 2015 22:05:11 +0100 +Subject: [PATCH] Fix locking of GLsync objects. + +GLsync objects had a race condition when used from multiple threads +(which is the main point of the extension, really); it could be +validated as a sync object at the beginning of the function, and then +deleted by another thread before use, causing crashes. Fix this by +changing all casts from GLsync to struct gl_sync_object to a new +function _mesa_get_sync() that validates and increases the refcount. + +In a similar vein, validation itself uses _mesa_set_search(), which +requires synchronization -- it was called without a mutex held, causing +spurious error returns and other issues. Since _mesa_get_sync() now +takes the shared context mutex, this problem is also resolved. + +Signed-off-by: Steinar H. 
Gunderson +--- + src/mesa/main/objectlabel.c | 11 ++++-- + src/mesa/main/shared.c | 2 +- + src/mesa/main/syncobj.c | 89 ++++++++++++++++++++++++++------------------- + src/mesa/main/syncobj.h | 11 ++---- + 4 files changed, 64 insertions(+), 49 deletions(-) + +diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c +index 41f370c..b083c43 100644 +--- a/src/mesa/main/objectlabel.c ++++ b/src/mesa/main/objectlabel.c +@@ -288,7 +288,7 @@ void GLAPIENTRY + _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr; ++ struct gl_sync_object *syncObj = _mesa_get_sync(ctx, (GLsync) ptr, true); + const char *callerstr; + char **labelPtr; + +@@ -297,7 +297,7 @@ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label) + else + callerstr = "glObjectPtrLabelKHR"; + +- if (!_mesa_validate_sync(ctx, syncObj)) { ++ if (!syncObj) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)", + callerstr); + return; +@@ -306,6 +306,7 @@ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label) + labelPtr = &syncObj->Label; + + set_label(ctx, labelPtr, label, length, callerstr); ++ _mesa_unref_sync_object(ctx, syncObj, 1); + } + + void GLAPIENTRY +@@ -313,7 +314,7 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length, + GLchar *label) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr; ++ struct gl_sync_object *syncObj; + const char *callerstr; + char **labelPtr; + +@@ -328,7 +329,8 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length, + return; + } + +- if (!_mesa_validate_sync(ctx, syncObj)) { ++ syncObj = _mesa_get_sync(ctx, (GLsync) ptr, true); ++ if (!syncObj) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)", + callerstr); + return; +@@ -337,4 +339,5 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei
bufSize, GLsizei *length, + labelPtr = &syncObj->Label; + + copy_label(*labelPtr, label, length, bufSize); ++ _mesa_unref_sync_object(ctx, syncObj, 1); + } +diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c +index c37b31d..b9f7bb6 100644 +--- a/src/mesa/main/shared.c ++++ b/src/mesa/main/shared.c +@@ -338,7 +338,7 @@ free_shared_state(struct gl_context *ctx, struct gl_shared_state *shared) + struct set_entry *entry; + + set_foreach(shared->SyncObjects, entry) { +- _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key); ++ _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key, 1); + } + } + _mesa_set_destroy(shared->SyncObjects, NULL); +diff --git a/src/mesa/main/syncobj.c b/src/mesa/main/syncobj.c +index c1b2d3b..d1c6c06 100644 +--- a/src/mesa/main/syncobj.c ++++ b/src/mesa/main/syncobj.c +@@ -167,34 +167,42 @@ _mesa_free_sync_data(struct gl_context *ctx) + * - not in sync objects hash table + * - type is GL_SYNC_FENCE + * - not marked as deleted ++ * ++ * Returns the internal gl_sync_object pointer if the sync object is valid ++ * or NULL if it isn't. ++ * ++ * If "incRefCount" is true, the reference count is incremented, which is ++ * normally what you want; otherwise, a glDeleteSync from another thread ++ * could delete the sync object while you are still working on it. 
+ */ +-bool +-_mesa_validate_sync(struct gl_context *ctx, +- const struct gl_sync_object *syncObj) ++struct gl_sync_object * ++_mesa_get_sync(struct gl_context *ctx, GLsync sync, bool incRefCount) + { +- return (syncObj != NULL) ++ struct gl_sync_object *syncObj = (struct gl_sync_object *) sync; ++ mtx_lock(&ctx->Shared->Mutex); ++ if (syncObj != NULL + && _mesa_set_search(ctx->Shared->SyncObjects, syncObj) != NULL + && (syncObj->Type == GL_SYNC_FENCE) +- && !syncObj->DeletePending; +-} +- +- +-void +-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj) +-{ +- mtx_lock(&ctx->Shared->Mutex); +- syncObj->RefCount++; ++ && !syncObj->DeletePending) { ++ if (incRefCount) { ++ syncObj->RefCount++; ++ } ++ } else { ++ syncObj = NULL; ++ } + mtx_unlock(&ctx->Shared->Mutex); ++ return syncObj; + } + + + void +-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj) ++_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj, ++ int amount) + { + struct set_entry *entry; + + mtx_lock(&ctx->Shared->Mutex); +- syncObj->RefCount--; ++ syncObj->RefCount -= amount; + if (syncObj->RefCount == 0) { + entry = _mesa_set_search(ctx->Shared->SyncObjects, syncObj); + assert (entry != NULL); +@@ -212,10 +220,9 @@ GLboolean GLAPIENTRY + _mesa_IsSync(GLsync sync) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync; + ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_FALSE); + +- return _mesa_validate_sync(ctx, syncObj) ? GL_TRUE : GL_FALSE; ++ return _mesa_get_sync(ctx, sync, false) ? 
GL_TRUE : GL_FALSE; + } + + +@@ -223,7 +230,7 @@ void GLAPIENTRY + _mesa_DeleteSync(GLsync sync) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync; ++ struct gl_sync_object *syncObj; + + /* From the GL_ARB_sync spec: + * +@@ -235,16 +242,19 @@ _mesa_DeleteSync(GLsync sync) + return; + } + +- if (!_mesa_validate_sync(ctx, syncObj)) { ++ syncObj = _mesa_get_sync(ctx, sync, true); ++ if (!syncObj) { + _mesa_error(ctx, GL_INVALID_VALUE, "glDeleteSync (not a valid sync object)"); + return; + } + + /* If there are no client-waits or server-waits pending on this sync, delete +- * the underlying object. ++ * the underlying object. Note that we double-unref the object, as _mesa_get_sync ++ * above took an extra refcount to make sure the pointer is valid for us to ++ * manipulate. + */ + syncObj->DeletePending = GL_TRUE; +- _mesa_unref_sync_object(ctx, syncObj); ++ _mesa_unref_sync_object(ctx, syncObj, 2); + } + + +@@ -299,21 +309,20 @@ GLenum GLAPIENTRY + _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync; ++ struct gl_sync_object *syncObj; + GLenum ret; + ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_WAIT_FAILED); + +- if (!_mesa_validate_sync(ctx, syncObj)) { +- _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)"); +- return GL_WAIT_FAILED; +- } +- + if ((flags & ~GL_SYNC_FLUSH_COMMANDS_BIT) != 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync(flags=0x%x)", flags); + return GL_WAIT_FAILED; + } + +- _mesa_ref_sync_object(ctx, syncObj); ++ syncObj = _mesa_get_sync(ctx, sync, true); ++ if (!syncObj) { ++ _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)"); ++ return GL_WAIT_FAILED; ++ } + + /* From the GL_ARB_sync spec: + * +@@ -335,7 +344,7 @@ _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout) + } + } + +- 
_mesa_unref_sync_object(ctx, syncObj); ++ _mesa_unref_sync_object(ctx, syncObj, 1); + return ret; + } + +@@ -344,12 +353,7 @@ void GLAPIENTRY + _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync; +- +- if (!_mesa_validate_sync(ctx, syncObj)) { +- _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)"); +- return; +- } ++ struct gl_sync_object *syncObj; + + if (flags != 0) { + _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync(flags=0x%x)", flags); +@@ -362,7 +366,14 @@ _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout) + return; + } + ++ syncObj = _mesa_get_sync(ctx, sync, true); ++ if (!syncObj) { ++ _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)"); ++ return; ++ } ++ + ctx->Driver.ServerWaitSync(ctx, syncObj, flags, timeout); ++ _mesa_unref_sync_object(ctx, syncObj, 1); + } + + +@@ -371,11 +382,12 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, + GLint *values) + { + GET_CURRENT_CONTEXT(ctx); +- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync; ++ struct gl_sync_object *syncObj; + GLsizei size = 0; + GLint v[1]; + +- if (!_mesa_validate_sync(ctx, syncObj)) { ++ syncObj = _mesa_get_sync(ctx, sync, true); ++ if (!syncObj) { + _mesa_error(ctx, GL_INVALID_VALUE, "glGetSynciv (not a valid sync object)"); + return; + } +@@ -409,6 +421,7 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, + + default: + _mesa_error(ctx, GL_INVALID_ENUM, "glGetSynciv(pname=0x%x)\n", pname); ++ _mesa_unref_sync_object(ctx, syncObj, 1); + return; + } + +@@ -421,4 +434,6 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length, + if (length != NULL) { + *length = size; + } ++ ++ _mesa_unref_sync_object(ctx, syncObj, 1); + } +diff --git a/src/mesa/main/syncobj.h b/src/mesa/main/syncobj.h +index 5d510e8..e8dbded 100644 
+--- a/src/mesa/main/syncobj.h ++++ b/src/mesa/main/syncobj.h +@@ -47,15 +47,12 @@ _mesa_init_sync(struct gl_context *); + extern void + _mesa_free_sync_data(struct gl_context *); + +-extern void +-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj); ++struct gl_sync_object * ++_mesa_get_sync(struct gl_context *ctx, GLsync sync, bool incRefCount); + + extern void +-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj); +- +-extern bool +-_mesa_validate_sync(struct gl_context *ctx, +- const struct gl_sync_object *syncObj); ++_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj, ++ int amount); + + extern GLboolean GLAPIENTRY + _mesa_IsSync(GLsync sync); +-- +2.6.2 diff --git a/patches/zita-resampler-sse.diff b/patches/zita-resampler-sse.diff new file mode 100644 index 0000000..43cfef8 --- /dev/null +++ b/patches/zita-resampler-sse.diff @@ -0,0 +1,213 @@ +diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc +--- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200 ++++ zita-resampler-1.3.0/libs/resampler.cc 2015-11-15 12:27:42.764591015 +0100 +@@ -24,6 +24,10 @@ + #include + #include + ++#ifdef __SSE2__ ++#include ++#endif ++ + + static unsigned int gcd (unsigned int a, unsigned int b) + { +@@ -47,6 +51,45 @@ + return 1; + } + ++#ifdef __SSE2__ ++ ++static inline void calc_stereo_sample_sse (unsigned int hl, ++ float *c1, ++ float *c2, ++ float *q1, ++ float *q2, ++ float *out_data) ++{ ++ unsigned int i; ++ __m128 denorm, s, w1, w2; ++ ++ denorm = _mm_set1_ps (1e-20f); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ q2 -= 8; ++ ++ // s += *q1 * c1 [i]; ++ w1 = _mm_loadu_ps (&c1 [i]); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_unpacklo_ps (w1, w1))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1))); ++ ++ // s += *q2 * c2 [i]; ++ w2 = _mm_loadu_ps (&c2 [i]); ++ s = 
_mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3)))); ++ ++ q1 += 8; ++ } ++ s = _mm_sub_ps (s, denorm); ++ s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); ++ ++ // Writes two floats more than we want, but this is fine since out_count >= 2. ++ _mm_storeu_ps (out_data, s); ++} ++ ++#endif ++ + + Resampler::Resampler (void) : + _table (0), +@@ -213,18 +256,28 @@ + { + float *c1 = _table->_ctab + hl * ph; + float *c2 = _table->_ctab + hl * (np - ph); +- for (c = 0; c < _nchan; c++) ++#ifdef __SSE2__ ++ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) + { +- float *q1 = p1 + c; +- float *q2 = p2 + c; +- float s = 1e-20f; +- for (i = 0; i < hl; i++) ++ calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data); ++ out_data += 2; ++ } ++ else ++#endif ++ { ++ for (c = 0; c < _nchan; c++) + { +- q2 -= _nchan; +- s += *q1 * c1 [i] + *q2 * c2 [i]; +- q1 += _nchan; ++ float *q1 = p1 + c; ++ float *q2 = p2 + c; ++ float s = 1e-20f; ++ for (i = 0; i < hl; i++) ++ { ++ q2 -= _nchan; ++ s += *q1 * c1 [i] + *q2 * c2 [i]; ++ q1 += _nchan; ++ } ++ *out_data++ = s - 1e-20f; + } +- *out_data++ = s - 1e-20f; + } + } + else +@@ -260,4 +313,3 @@ + return 0; + } + +- +diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc +--- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200 ++++ zita-resampler-1.3.0/libs/vresampler.cc 2015-11-15 12:27:58.424544882 +0100 +@@ -25,6 +25,58 @@ + #include + + ++#ifdef __SSE2__ ++ ++#include ++ ++static inline void calc_stereo_sample_sse (int hl, ++ float b, ++ float *p1, ++ float *p2, ++ float *q1, ++ float *q2, ++ float *out_data) ++{ ++ int i; ++ __m128 denorm, bs, s, c1, c2, w1, w2; ++ ++ denorm = _mm_set1_ps (1e-25f); ++ bs = _mm_set1_ps (b); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ p2 -= 8; ++ ++ // 
_c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]); ++ w1 = _mm_loadu_ps (&q1 [i]); ++ w2 = _mm_loadu_ps (&q1 [i + hl]); ++ c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]); ++ w1 = _mm_loadu_ps (&q2 [i]); ++ w2 = _mm_loadu_ps (&q2 [i - hl]); ++ c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // s += *p1 * _c1 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_unpacklo_ps (c1, c1))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1))); ++ ++ // s += *p2 * _c2 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3)))); ++ ++ p1 += 8; ++ } ++ s = _mm_sub_ps (s, denorm); ++ s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); ++ ++ // Writes two floats more than we want, but this is fine since out_count >= 2. 
++ _mm_storeu_ps (out_data, s); ++} ++ ++#endif ++ ++ + VResampler::VResampler (void) : + _table (0), + _nchan (0), +@@ -212,23 +264,33 @@ + a = 1.0f - b; + q1 = _table->_ctab + hl * k; + q2 = _table->_ctab + hl * (np - k); +- for (i = 0; i < hl; i++) ++#ifdef __SSE2__ ++ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) + { +- _c1 [i] = a * q1 [i] + b * q1 [i + hl]; +- _c2 [i] = a * q2 [i] + b * q2 [i - hl]; ++ calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data); ++ out_data += 2; + } +- for (c = 0; c < _nchan; c++) ++ else ++#endif + { +- q1 = p1 + c; +- q2 = p2 + c; +- a = 1e-25f; +- for (i = 0; i < hl; i++) +- { +- q2 -= _nchan; +- a += *q1 * _c1 [i] + *q2 * _c2 [i]; +- q1 += _nchan; +- } +- *out_data++ = a - 1e-25f; ++ for (i = 0; i < hl; i++) ++ { ++ _c1 [i] = a * q1 [i] + b * q1 [i + hl]; ++ _c2 [i] = a * q2 [i] + b * q2 [i - hl]; ++ } ++ for (c = 0; c < _nchan; c++) ++ { ++ q1 = p1 + c; ++ q2 = p2 + c; ++ a = 1e-25f; ++ for (i = 0; i < hl; i++) ++ { ++ q2 -= _nchan; ++ a += *q1 * _c1 [i] + *q2 * _c2 [i]; ++ q1 += _nchan; ++ } ++ *out_data++ = a - 1e-25f; ++ } + } + } + else -- 2.39.2