- Lua, for driving the theme engine.
+The patches/ directory contains some patches for upstream software that help
+Nageru performance and/or stability. They are all meant for upstream, but
+probably will not be in by the time Nageru is released. All except the bmusb
+patch are taken to be by Steinar H. Gunderson <sesse@google.com> (i.e., my work
+email, unlike Nageru itself and bmusb), and under the same license as the
+projects they patch.
+
To start it, just hook up your equipment, type “make” and then “./nageru”.
It is strongly recommended to have the rights to run at real-time priority;
it will make the USB3 threads do so, which will make them a lot more stable.
--- /dev/null
+diff --git a/bmusb.cpp b/bmusb.cpp
+index 2ea6407..16ec380 100644
+--- a/bmusb.cpp
++++ b/bmusb.cpp
+@@ -1027,7 +1027,7 @@ void BMUSBCapture::configure_card()
+ // set up isochronous transfers for audio and video
+ for (int e = 3; e <= 4; ++e) {
+ //int num_transfers = (e == 3) ? 6 : 6;
+- int num_transfers = 10;
++ int num_transfers = 6;
+ for (int i = 0; i < num_transfers; ++i) {
+ size_t buf_size;
+ int num_iso_pack, size;
+@@ -1043,8 +1043,14 @@ void BMUSBCapture::configure_card()
+ num_iso_pack = 80;
+ buf_size = num_iso_pack * size;
+ }
+- assert(size_t(num_iso_pack * size) <= buf_size);
+- uint8_t *buf = new uint8_t[buf_size];
++ int num_bytes = num_iso_pack * size;
++ assert(size_t(num_bytes) <= buf_size);
++ uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
++ if (buf == nullptr) {
++ fprintf(stderr, "Failed to allocate persistent DMA memory (probably missing kernel patch).\n");
++ fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
++ buf = new uint8_t[num_bytes];
++ }
+
+ xfr = libusb_alloc_transfer(num_iso_pack);
+ if (!xfr) {
--- /dev/null
+From 66c3edf744415f117c3be95ae83f0bab6e128830 Mon Sep 17 00:00:00 2001\r
+From: "Steinar H. Gunderson" <sesse@google.com>\r
+Date: Wed, 9 Dec 2015 10:03:15 +0100\r
+Subject: [PATCH] Add support for persistent device memory.\r
+\r
+Add a function to allocate memory belonging to a specific device,\r
+so that the operating system can DMA straight into it for zerocopy,\r
+and also avoid some clearing. Also, this allows up-front memory\r
+allocation in the kernel at program startup; memory allocation is\r
+otherwise done per-transfer, which can fail in a system where memory has become\r
+fragmented over time.\r
+\r
+This mirrors new functionality going into Linux' USB stack (recently\r
+reviewed and acked upstream); only Linux is supported as a backend\r
+currently.\r
+---\r
+ libusb/core.c | 29 +++++++++++++++++++++++++++++\r
+ libusb/libusb-1.0.def | 4 ++++\r
+ libusb/libusb.h | 30 +++++++++++++++++++++++++++++-\r
+ libusb/libusbi.h | 10 ++++++++++\r
+ libusb/os/linux_usbfs.c | 30 ++++++++++++++++++++++++++++++\r
+ 5 files changed, 102 insertions(+), 1 deletion(-)\r
+\r
+diff --git a/libusb/core.c b/libusb/core.c\r
+index 9c617fb..3eba85a 100644\r
+--- a/libusb/core.c\r
++++ b/libusb/core.c\r
+@@ -351,6 +351,8 @@ if (cfg != desired)\r
+ * - libusb_control_transfer_get_setup()\r
+ * - libusb_cpu_to_le16()\r
+ * - libusb_detach_kernel_driver()\r
++ * - libusb_dev_mem_alloc()\r
++ * - libusb_dev_mem_free()\r
+ * - libusb_error_name()\r
+ * - libusb_event_handler_active()\r
+ * - libusb_event_handling_ok()\r
+@@ -1805,6 +1807,33 @@ int API_EXPORTED libusb_free_streams(libusb_device_handle *dev,\r
+ return LIBUSB_ERROR_NOT_SUPPORTED;\r
+ }\r
+ \r
++API_EXPORTED unsigned char *libusb_dev_mem_alloc(libusb_device_handle *dev,\r
++ int length)\r
++{\r
++ if (!dev->dev->attached)\r
++ return NULL;\r
++\r
++ if (usbi_backend->dev_mem_alloc)\r
++ return usbi_backend->dev_mem_alloc(dev, length);\r
++ else\r
++ return NULL;\r
++}\r
++\r
++/* Note: No current backends actually use the "dev" parameter; it is only there\r
++ * for any future, less lenient OSes.\r
++ */\r
++int API_EXPORTED libusb_dev_mem_free(libusb_device_handle *dev,\r
++ unsigned char *buffer, int len)\r
++{\r
++ if (!dev->dev->attached)\r
++ return LIBUSB_ERROR_NO_DEVICE;\r
++\r
++ if (usbi_backend->dev_mem_free)\r
++ return usbi_backend->dev_mem_free(dev, buffer, len);\r
++ else\r
++ return LIBUSB_ERROR_NOT_SUPPORTED;\r
++}\r
++\r
+ /** \ingroup dev\r
+ * Determine if a kernel driver is active on an interface. If a kernel driver\r
+ * is active, you cannot claim the interface, and libusb will be unable to\r
+diff --git a/libusb/libusb-1.0.def b/libusb/libusb-1.0.def\r
+index 538ad49..e040f4b 100644\r
+--- a/libusb/libusb-1.0.def\r
++++ b/libusb/libusb-1.0.def\r
+@@ -20,6 +20,10 @@ EXPORTS\r
+ libusb_control_transfer@32 = libusb_control_transfer\r
+ libusb_detach_kernel_driver\r
+ libusb_detach_kernel_driver@8 = libusb_detach_kernel_driver\r
++ libusb_dev_mem_alloc\r
++ libusb_dev_mem_alloc@8 = libusb_dev_mem_alloc\r
++ libusb_dev_mem_free\r
++ libusb_dev_mem_free@12 = libusb_dev_mem_free\r
+ libusb_error_name\r
+ libusb_error_name@4 = libusb_error_name\r
+ libusb_event_handler_active\r
+diff --git a/libusb/libusb.h b/libusb/libusb.h\r
+index 513945f..5a84f5b 100644\r
+--- a/libusb/libusb.h\r
++++ b/libusb/libusb.h\r
+@@ -141,7 +141,7 @@ typedef unsigned __int32 uint32_t;\r
+ * Internally, LIBUSB_API_VERSION is defined as follows:\r
+ * (libusb major << 24) | (libusb minor << 16) | (16 bit incremental)\r
+ */\r
+-#define LIBUSB_API_VERSION 0x01000104\r
++#define LIBUSB_API_VERSION 0x01000105\r
+ \r
+ /* The following is kept for compatibility, but will be deprecated in the future */\r
+ #define LIBUSBX_API_VERSION LIBUSB_API_VERSION\r
+@@ -1749,6 +1749,34 @@ int LIBUSB_CALL libusb_interrupt_transfer(libusb_device_handle *dev_handle,\r
+ unsigned char endpoint, unsigned char *data, int length,\r
+ int *actual_length, unsigned int timeout);\r
+ \r
++/** \ingroup asyncio\r
++ * Attempts to allocate a block of persistent DMA memory suitable for transfers\r
++ * against the given device. If successful, will return a block of memory\r
++ * that is suitable for use as "buffer" in \ref libusb_transfer against this\r
++ * device. Using this memory instead of regular memory means that the host\r
++ * controller can use DMA directly into the buffer to increase performance, and\r
++ * also that transfers can no longer fail due to kernel memory fragmentation.\r
++ *\r
++ * Note that this means you should not modify this memory (or even data on\r
++ * the same cache lines) when a transfer is in progress, although it is legal\r
++ * to have several transfers going on within the same memory block.\r
++ *\r
++ * Will return NULL on failure. Many systems do not support such zerocopy\r
++ * and will always return NULL. Memory allocated with this function must be\r
++ * freed with \ref libusb_dev_mem_free.\r
++ *\r
++ * Since version 1.0.21, \ref LIBUSB_API_VERSION >= 0x01000105\r
++ *\r
++ * \param dev a device handle\r
++ * \param length size of desired data buffer\r
++ * \returns a pointer to the newly allocated memory, or NULL on failure\r
++ */\r
++unsigned char *LIBUSB_CALL libusb_dev_mem_alloc(libusb_device_handle *dev,\r
++ int length);\r
++\r
++int LIBUSB_CALL libusb_dev_mem_free(libusb_device_handle *dev,\r
++ unsigned char *buffer, int length);\r
++\r
+ /** \ingroup desc\r
+ * Retrieve a descriptor from the default control pipe.\r
+ * This is a convenience function which formulates the appropriate control\r
+diff --git a/libusb/libusbi.h b/libusb/libusbi.h\r
+index f1afd99..66bdf46 100644\r
+--- a/libusb/libusbi.h\r
++++ b/libusb/libusbi.h\r
+@@ -913,6 +913,16 @@ struct usbi_os_backend {\r
+ int (*free_streams)(struct libusb_device_handle *handle,\r
+ unsigned char *endpoints, int num_endpoints);\r
+ \r
++ /* Allocate persistent DMA memory for the given device, suitable for\r
++ * zerocopy. May return NULL on failure. Optional to implement.\r
++ */\r
++ unsigned char *(*dev_mem_alloc)(struct libusb_device_handle *handle,\r
++ size_t len);\r
++\r
++ /* Free memory allocated by dev_mem_alloc. */\r
++ int (*dev_mem_free)(struct libusb_device_handle *handle,\r
++ unsigned char *buffer, size_t len);\r
++\r
+ /* Determine if a kernel driver is active on an interface. Optional.\r
+ *\r
+ * The presence of a kernel driver on an interface indicates that any\r
+diff --git a/libusb/os/linux_usbfs.c b/libusb/os/linux_usbfs.c\r
+index a63852f..a167084 100644\r
+--- a/libusb/os/linux_usbfs.c\r
++++ b/libusb/os/linux_usbfs.c\r
+@@ -33,6 +33,7 @@\r
+ #include <stdlib.h>\r
+ #include <string.h>\r
+ #include <sys/ioctl.h>\r
++#include <sys/mman.h>\r
+ #include <sys/stat.h>\r
+ #include <sys/types.h>\r
+ #include <sys/utsname.h>\r
+@@ -1561,6 +1562,32 @@ static int op_free_streams(struct libusb_device_handle *handle,\r
+ endpoints, num_endpoints);\r
+ }\r
+ \r
++static unsigned char *op_dev_mem_alloc(struct libusb_device_handle *handle,\r
++ size_t len)\r
++{\r
++ struct linux_device_handle_priv *hpriv = _device_handle_priv(handle);\r
++ unsigned char *buffer = (unsigned char *)mmap(NULL, len,\r
++ PROT_READ | PROT_WRITE, MAP_SHARED, hpriv->fd, 0);\r
++ if (buffer == MAP_FAILED) {\r
++ usbi_err(HANDLE_CTX(handle), "alloc dev mem failed errno %d",\r
++ errno);\r
++ return NULL;\r
++ }\r
++ return buffer;\r
++}\r
++\r
++static int op_dev_mem_free(struct libusb_device_handle *handle,\r
++ unsigned char *buffer, size_t len)\r
++{\r
++ if (munmap(buffer, len) != 0) {\r
++ usbi_err(HANDLE_CTX(handle), "free dev mem failed errno %d",\r
++ errno);\r
++ return LIBUSB_ERROR_OTHER;\r
++ } else {\r
++ return LIBUSB_SUCCESS;\r
++ }\r
++}\r
++\r
+ static int op_kernel_driver_active(struct libusb_device_handle *handle,\r
+ int interface)\r
+ {\r
+@@ -2682,6 +2709,9 @@ const struct usbi_os_backend linux_usbfs_backend = {\r
+ .alloc_streams = op_alloc_streams,\r
+ .free_streams = op_free_streams,\r
+ \r
++ .dev_mem_alloc = op_dev_mem_alloc,\r
++ .dev_mem_free = op_dev_mem_free,\r
++\r
+ .kernel_driver_active = op_kernel_driver_active,\r
+ .detach_kernel_driver = op_detach_kernel_driver,\r
+ .attach_kernel_driver = op_attach_kernel_driver,\r
+-- \r
+2.6.4\r
+\r
--- /dev/null
+From 206b4c81b6b31d87c758748cdbc6d25e9c721ea1 Mon Sep 17 00:00:00 2001
+In-Reply-To: <20160106001143.GA1171@kroah.com>
+References: <20160106001143.GA1171@kroah.com>
+From: "Steinar H. Gunderson" <sesse@google.com>
+Date: Thu, 26 Nov 2015 01:19:13 +0100
+Subject: [PATCH v2] Add support for usbfs zerocopy.
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: linux-usb@vger.kernel.org,linux-kernel@vger.kernel.org,stern@rowland.harvard.edu
+
+Add a new interface for userspace to preallocate memory that can be
+used with usbfs. This gives two primary benefits:
+
+ - Zerocopy; data no longer needs to be copied between the userspace
+ and the kernel, but can instead be read directly by the driver from
+ userspace's buffers. This works for all kinds of transfers (even if
+ nonsensical for control and interrupt transfers); isochronous also
+ no longer need to memset() the buffer to zero to avoid leaking kernel data.
+
+ - Once the buffers are allocated, USB transfers can no longer fail due to
+ memory fragmentation; previously, long-running programs could run into
+ problems finding a large enough contiguous memory chunk, especially on
+ embedded systems or at high rates.
+
+Memory is allocated by using mmap() against the usbfs file descriptor,
+and similarly deallocated by munmap(). Once memory has been allocated,
+using it as pointers to a bulk or isochronous operation means you will
+automatically get zerocopy behavior. Note that this also means you cannot
+modify outgoing data until the transfer is complete. The same holds for
+data on the same cache lines as incoming data; DMA modifying them at the
+same time could lead to your changes being overwritten.
+
+There's a new capability USBDEVFS_CAP_MMAP that userspace can query to see
+if the running kernel supports this functionality, if just trying mmap() is
+not acceptable.
+
+Largely based on a patch by Markus Rechberger with some updates. The original
+patch can be found at:
+
+ http://sundtek.de/support/devio_mmap_v0.4.diff
+
+Signed-off-by: Steinar H. Gunderson <sesse@google.com>
+Signed-off-by: Markus Rechberger <mrechberger@gmail.com>
+Acked-by: Alan Stern <stern@rowland.harvard.edu>
+---
+ drivers/usb/core/devio.c | 227 +++++++++++++++++++++++++++++++++-----
+ include/uapi/linux/usbdevice_fs.h | 1 +
+ 2 files changed, 203 insertions(+), 25 deletions(-)
+
+diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
+index 38ae877c..0238c78 100644
+--- a/drivers/usb/core/devio.c
++++ b/drivers/usb/core/devio.c
+@@ -50,6 +50,7 @@
+ #include <linux/user_namespace.h>
+ #include <linux/scatterlist.h>
+ #include <linux/uaccess.h>
++#include <linux/dma-mapping.h>
+ #include <asm/byteorder.h>
+ #include <linux/moduleparam.h>
+
+@@ -69,6 +70,7 @@ struct usb_dev_state {
+ spinlock_t lock; /* protects the async urb lists */
+ struct list_head async_pending;
+ struct list_head async_completed;
++ struct list_head memory_list;
+ wait_queue_head_t wait; /* wake up if a request completed */
+ unsigned int discsignr;
+ struct pid *disc_pid;
+@@ -79,6 +81,17 @@ struct usb_dev_state {
+ u32 disabled_bulk_eps;
+ };
+
++struct usb_memory {
++ struct list_head memlist;
++ int vma_use_count;
++ int urb_use_count;
++ u32 size;
++ void *mem;
++ dma_addr_t dma_handle;
++ unsigned long vm_start;
++ struct usb_dev_state *ps;
++};
++
+ struct async {
+ struct list_head asynclist;
+ struct usb_dev_state *ps;
+@@ -89,6 +102,7 @@ struct async {
+ void __user *userbuffer;
+ void __user *userurb;
+ struct urb *urb;
++ struct usb_memory *usbm;
+ unsigned int mem_usage;
+ int status;
+ u32 secid;
+@@ -157,6 +171,111 @@ static int connected(struct usb_dev_state *ps)
+ ps->dev->state != USB_STATE_NOTATTACHED);
+ }
+
++static void dec_usb_memory_use_count(struct usb_memory *usbm, int *count)
++{
++ struct usb_dev_state *ps = usbm->ps;
++ unsigned long flags;
++
++ spin_lock_irqsave(&ps->lock, flags);
++ --*count;
++ if (usbm->urb_use_count == 0 && usbm->vma_use_count == 0) {
++ list_del(&usbm->memlist);
++ spin_unlock_irqrestore(&ps->lock, flags);
++
++ usb_free_coherent(ps->dev, usbm->size, usbm->mem,
++ usbm->dma_handle);
++ usbfs_decrease_memory_usage(
++ usbm->size + sizeof(struct usb_memory));
++ kfree(usbm);
++ } else {
++ spin_unlock_irqrestore(&ps->lock, flags);
++ }
++}
++
++static void usbdev_vm_open(struct vm_area_struct *vma)
++{
++ struct usb_memory *usbm = vma->vm_private_data;
++ unsigned long flags;
++
++ spin_lock_irqsave(&usbm->ps->lock, flags);
++ ++usbm->vma_use_count;
++ spin_unlock_irqrestore(&usbm->ps->lock, flags);
++}
++
++static void usbdev_vm_close(struct vm_area_struct *vma)
++{
++ struct usb_memory *usbm = vma->vm_private_data;
++
++ dec_usb_memory_use_count(usbm, &usbm->vma_use_count);
++}
++
++struct vm_operations_struct usbdev_vm_ops = {
++ .open = usbdev_vm_open,
++ .close = usbdev_vm_close
++};
++
++static int usbdev_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ struct usb_memory *usbm = NULL;
++ struct usb_dev_state *ps = file->private_data;
++ size_t size = vma->vm_end - vma->vm_start;
++ void *mem;
++ unsigned long flags;
++ dma_addr_t dma_handle;
++ int ret;
++
++ ret = usbfs_increase_memory_usage(size + sizeof(struct usb_memory));
++ if (ret)
++ goto error;
++
++ usbm = kzalloc(sizeof(struct usb_memory), GFP_KERNEL);
++ if (!usbm) {
++ ret = -ENOMEM;
++ goto error_decrease_mem;
++ }
++
++ mem = usb_alloc_coherent(ps->dev, size, GFP_USER, &dma_handle);
++ if (!mem) {
++ ret = -ENOMEM;
++ goto error_free_usbm;
++ }
++
++ memset(mem, 0, size);
++
++ usbm->mem = mem;
++ usbm->dma_handle = dma_handle;
++ usbm->size = size;
++ usbm->ps = ps;
++ usbm->vm_start = vma->vm_start;
++ usbm->vma_use_count = 1;
++ INIT_LIST_HEAD(&usbm->memlist);
++
++ if (remap_pfn_range(vma, vma->vm_start,
++ virt_to_phys(usbm->mem) >> PAGE_SHIFT,
++ size, vma->vm_page_prot) < 0) {
++ dec_usb_memory_use_count(usbm, &usbm->vma_use_count);
++ return -EAGAIN;
++ }
++
++ vma->vm_flags |= VM_IO;
++ vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP);
++ vma->vm_ops = &usbdev_vm_ops;
++ vma->vm_private_data = usbm;
++
++ spin_lock_irqsave(&ps->lock, flags);
++ list_add_tail(&usbm->memlist, &ps->memory_list);
++ spin_unlock_irqrestore(&ps->lock, flags);
++
++ return 0;
++
++error_free_usbm:
++ kfree(usbm);
++error_decrease_mem:
++ usbfs_decrease_memory_usage(size + sizeof(struct usb_memory));
++error:
++ return ret;
++}
++
+ static loff_t usbdev_lseek(struct file *file, loff_t offset, int orig)
+ {
+ loff_t ret;
+@@ -297,8 +416,13 @@ static void free_async(struct async *as)
+ if (sg_page(&as->urb->sg[i]))
+ kfree(sg_virt(&as->urb->sg[i]));
+ }
++
+ kfree(as->urb->sg);
+- kfree(as->urb->transfer_buffer);
++ if (as->usbm == NULL)
++ kfree(as->urb->transfer_buffer);
++ else
++ dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count);
++
+ kfree(as->urb->setup_packet);
+ usb_free_urb(as->urb);
+ usbfs_decrease_memory_usage(as->mem_usage);
+@@ -910,6 +1034,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
+ INIT_LIST_HEAD(&ps->list);
+ INIT_LIST_HEAD(&ps->async_pending);
+ INIT_LIST_HEAD(&ps->async_completed);
++ INIT_LIST_HEAD(&ps->memory_list);
+ init_waitqueue_head(&ps->wait);
+ ps->discsignr = 0;
+ ps->disc_pid = get_pid(task_pid(current));
+@@ -962,6 +1087,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
+ free_async(as);
+ as = async_getcompleted(ps);
+ }
++
+ kfree(ps);
+ return 0;
+ }
+@@ -1283,6 +1409,31 @@ static int proc_setconfig(struct usb_dev_state *ps, void __user *arg)
+ return status;
+ }
+
++static struct usb_memory *
++find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb)
++{
++ struct usb_memory *usbm = NULL, *iter;
++ unsigned long flags;
++ unsigned long uurb_start = (unsigned long)uurb->buffer;
++
++ spin_lock_irqsave(&ps->lock, flags);
++ list_for_each_entry(iter, &ps->memory_list, memlist) {
++ if (uurb_start >= iter->vm_start &&
++ uurb_start < iter->vm_start + iter->size) {
++ if (uurb->buffer_length > iter->vm_start + iter->size -
++ uurb_start) {
++ usbm = ERR_PTR(-EINVAL);
++ } else {
++ usbm = iter;
++ usbm->urb_use_count++;
++ }
++ break;
++ }
++ }
++ spin_unlock_irqrestore(&ps->lock, flags);
++ return usbm;
++}
++
+ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb,
+ struct usbdevfs_iso_packet_desc __user *iso_frame_desc,
+ void __user *arg)
+@@ -1439,6 +1590,19 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
+ goto error;
+ }
+
++ as->usbm = find_memory_area(ps, uurb);
++ if (IS_ERR(as->usbm)) {
++ ret = PTR_ERR(as->usbm);
++ as->usbm = NULL;
++ goto error;
++ }
++
++ /* do not use SG buffers when memory mapped segments
++ * are in use
++ */
++ if (as->usbm)
++ num_sgs = 0;
++
+ u += sizeof(struct async) + sizeof(struct urb) + uurb->buffer_length +
+ num_sgs * sizeof(struct scatterlist);
+ ret = usbfs_increase_memory_usage(u);
+@@ -1476,29 +1640,35 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
+ totlen -= u;
+ }
+ } else if (uurb->buffer_length > 0) {
+- as->urb->transfer_buffer = kmalloc(uurb->buffer_length,
+- GFP_KERNEL);
+- if (!as->urb->transfer_buffer) {
+- ret = -ENOMEM;
+- goto error;
+- }
++ if (as->usbm) {
++ unsigned long uurb_start = (unsigned long)uurb->buffer;
+
+- if (!is_in) {
+- if (copy_from_user(as->urb->transfer_buffer,
+- uurb->buffer,
+- uurb->buffer_length)) {
+- ret = -EFAULT;
++ as->urb->transfer_buffer = as->usbm->mem +
++ (uurb_start - as->usbm->vm_start);
++ } else {
++ as->urb->transfer_buffer = kmalloc(uurb->buffer_length,
++ GFP_KERNEL);
++ if (!as->urb->transfer_buffer) {
++ ret = -ENOMEM;
+ goto error;
+ }
+- } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) {
+- /*
+- * Isochronous input data may end up being
+- * discontiguous if some of the packets are short.
+- * Clear the buffer so that the gaps don't leak
+- * kernel data to userspace.
+- */
+- memset(as->urb->transfer_buffer, 0,
+- uurb->buffer_length);
++ if (!is_in) {
++ if (copy_from_user(as->urb->transfer_buffer,
++ uurb->buffer,
++ uurb->buffer_length)) {
++ ret = -EFAULT;
++ goto error;
++ }
++ } else if (uurb->type == USBDEVFS_URB_TYPE_ISO) {
++ /*
++ * Isochronous input data may end up being
++ * discontiguous if some of the packets are
++ * short. Clear the buffer so that the gaps
++ * don't leak kernel data to userspace.
++ */
++ memset(as->urb->transfer_buffer, 0,
++ uurb->buffer_length);
++ }
+ }
+ }
+ as->urb->dev = ps->dev;
+@@ -1545,10 +1715,14 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
+ isopkt = NULL;
+ as->ps = ps;
+ as->userurb = arg;
+- if (is_in && uurb->buffer_length > 0)
++ if (as->usbm) {
++ unsigned long uurb_start = (unsigned long)uurb->buffer;
++
++ as->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
++ as->urb->transfer_dma = as->usbm->dma_handle +
++ (uurb_start - as->usbm->vm_start);
++ } else if (is_in && uurb->buffer_length > 0)
+ as->userbuffer = uurb->buffer;
+- else
+- as->userbuffer = NULL;
+ as->signr = uurb->signr;
+ as->ifnum = ifnum;
+ as->pid = get_pid(task_pid(current));
+@@ -1604,6 +1778,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
+ return 0;
+
+ error:
++ if (as && as->usbm)
++ dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count);
+ kfree(isopkt);
+ kfree(dr);
+ if (as)
+@@ -2047,7 +2223,7 @@ static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg)
+ __u32 caps;
+
+ caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM |
+- USBDEVFS_CAP_REAP_AFTER_DISCONNECT;
++ USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP;
+ if (!ps->dev->bus->no_stop_on_short)
+ caps |= USBDEVFS_CAP_BULK_CONTINUATION;
+ if (ps->dev->bus->sg_tablesize)
+@@ -2373,6 +2549,7 @@ const struct file_operations usbdev_file_operations = {
+ #ifdef CONFIG_COMPAT
+ .compat_ioctl = usbdev_compat_ioctl,
+ #endif
++ .mmap = usbdev_mmap,
+ .open = usbdev_open,
+ .release = usbdev_release,
+ };
+diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h
+index 019ba1e..ecbd176 100644
+--- a/include/uapi/linux/usbdevice_fs.h
++++ b/include/uapi/linux/usbdevice_fs.h
+@@ -134,6 +134,7 @@ struct usbdevfs_hub_portinfo {
+ #define USBDEVFS_CAP_NO_PACKET_SIZE_LIM 0x04
+ #define USBDEVFS_CAP_BULK_SCATTER_GATHER 0x08
+ #define USBDEVFS_CAP_REAP_AFTER_DISCONNECT 0x10
++#define USBDEVFS_CAP_MMAP 0x20
+
+ /* USBDEVFS_DISCONNECT_CLAIM flags & struct */
+
+--
+2.1.4
+
--- /dev/null
+From 6e3d1880fa78a3a965cb7eb51ee12b1f785f84bb Mon Sep 17 00:00:00 2001
+From: "Steinar H. Gunderson" <sesse@google.com>
+Date: Tue, 1 Dec 2015 22:05:11 +0100
+Subject: [PATCH] Fix locking of GLsync objects.
+
+GLsync objects had a race condition when used from multiple threads
+(which is the main point of the extension, really); it could be
+validated as a sync object at the beginning of the function, and then
+deleted by another thread before use, causing crashes. Fix this by
+changing all casts from GLsync to struct gl_sync_object to a new
+function _mesa_get_sync() that validates and increases the refcount.
+
+In a similar vein, validation itself uses _mesa_set_search(), which
+requires synchronization -- it was called without a mutex held, causing
+spurious error returns and other issues. Since _mesa_get_sync() now
+takes the shared context mutex, this problem is also resolved.
+
+Signed-off-by: Steinar H. Gunderson <sesse@google.com>
+---
+ src/mesa/main/objectlabel.c | 11 ++++--
+ src/mesa/main/shared.c | 2 +-
+ src/mesa/main/syncobj.c | 89 ++++++++++++++++++++++++++-------------------
+ src/mesa/main/syncobj.h | 11 ++----
+ 4 files changed, 64 insertions(+), 49 deletions(-)
+
+diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
+index 41f370c..b083c43 100644
+--- a/src/mesa/main/objectlabel.c
++++ b/src/mesa/main/objectlabel.c
+@@ -288,7 +288,7 @@ void GLAPIENTRY
+ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
+ struct gl_sync_object *syncObj = _mesa_get_sync(ctx, (GLsync) ptr, true);
+ const char *callerstr;
+ char **labelPtr;
+
+@@ -297,7 +297,7 @@ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
+ else
+ callerstr = "glObjectPtrLabelKHR";
+
+- if (!_mesa_validate_sync(ctx, syncObj)) {
++ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
+ callerstr);
+ return;
+@@ -306,6 +306,7 @@ _mesa_ObjectPtrLabel(const void *ptr, GLsizei length, const GLchar *label)
+ labelPtr = &syncObj->Label;
+
+ set_label(ctx, labelPtr, label, length, callerstr);
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ }
+
+ void GLAPIENTRY
+@@ -313,7 +314,7 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
+ GLchar *label)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) ptr;
++ struct gl_sync_object *syncObj;
+ const char *callerstr;
+ char **labelPtr;
+
+@@ -328,7 +329,8 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
+ return;
+ }
+
+- if (!_mesa_validate_sync(ctx, syncObj)) {
+ syncObj = _mesa_get_sync(ctx, (GLsync) ptr, true);
++ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "%s (not a valid sync object)",
+ callerstr);
+ return;
+@@ -337,4 +339,5 @@ _mesa_GetObjectPtrLabel(const void *ptr, GLsizei bufSize, GLsizei *length,
+ labelPtr = &syncObj->Label;
+
+ copy_label(*labelPtr, label, length, bufSize);
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ }
+diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
+index c37b31d..b9f7bb6 100644
+--- a/src/mesa/main/shared.c
++++ b/src/mesa/main/shared.c
+@@ -338,7 +338,7 @@ free_shared_state(struct gl_context *ctx, struct gl_shared_state *shared)
+ struct set_entry *entry;
+
+ set_foreach(shared->SyncObjects, entry) {
+- _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key);
++ _mesa_unref_sync_object(ctx, (struct gl_sync_object *) entry->key, 1);
+ }
+ }
+ _mesa_set_destroy(shared->SyncObjects, NULL);
+diff --git a/src/mesa/main/syncobj.c b/src/mesa/main/syncobj.c
+index c1b2d3b..d1c6c06 100644
+--- a/src/mesa/main/syncobj.c
++++ b/src/mesa/main/syncobj.c
+@@ -167,34 +167,42 @@ _mesa_free_sync_data(struct gl_context *ctx)
+ * - not in sync objects hash table
+ * - type is GL_SYNC_FENCE
+ * - not marked as deleted
++ *
++ * Returns the internal gl_sync_object pointer if the sync object is valid
++ * or NULL if it isn't.
++ *
++ * If "incRefCount" is true, the reference count is incremented, which is
++ * normally what you want; otherwise, a glDeleteSync from another thread
++ * could delete the sync object while you are still working on it.
+ */
+-bool
+-_mesa_validate_sync(struct gl_context *ctx,
+- const struct gl_sync_object *syncObj)
++struct gl_sync_object *
++_mesa_get_sync(struct gl_context *ctx, GLsync sync, bool incRefCount)
+ {
+- return (syncObj != NULL)
++ struct gl_sync_object *syncObj = (struct gl_sync_object *) sync;
++ mtx_lock(&ctx->Shared->Mutex);
++ if (syncObj != NULL
+ && _mesa_set_search(ctx->Shared->SyncObjects, syncObj) != NULL
+ && (syncObj->Type == GL_SYNC_FENCE)
+- && !syncObj->DeletePending;
+-}
+-
+-
+-void
+-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj)
+-{
+- mtx_lock(&ctx->Shared->Mutex);
+- syncObj->RefCount++;
++ && !syncObj->DeletePending) {
++ if (incRefCount) {
++ syncObj->RefCount++;
++ }
++ } else {
++ syncObj = NULL;
++ }
+ mtx_unlock(&ctx->Shared->Mutex);
++ return syncObj;
+ }
+
+
+ void
+-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj)
++_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj,
++ int amount)
+ {
+ struct set_entry *entry;
+
+ mtx_lock(&ctx->Shared->Mutex);
+- syncObj->RefCount--;
++ syncObj->RefCount -= amount;
+ if (syncObj->RefCount == 0) {
+ entry = _mesa_set_search(ctx->Shared->SyncObjects, syncObj);
+ assert (entry != NULL);
+@@ -212,10 +220,9 @@ GLboolean GLAPIENTRY
+ _mesa_IsSync(GLsync sync)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
+ ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_FALSE);
+
+- return _mesa_validate_sync(ctx, syncObj) ? GL_TRUE : GL_FALSE;
++ return _mesa_get_sync(ctx, sync, false) ? GL_TRUE : GL_FALSE;
+ }
+
+
+@@ -223,7 +230,7 @@ void GLAPIENTRY
+ _mesa_DeleteSync(GLsync sync)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
++ struct gl_sync_object *syncObj;
+
+ /* From the GL_ARB_sync spec:
+ *
+@@ -235,16 +242,19 @@ _mesa_DeleteSync(GLsync sync)
+ return;
+ }
+
+- if (!_mesa_validate_sync(ctx, syncObj)) {
++ syncObj = _mesa_get_sync(ctx, sync, true);
++ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glDeleteSync (not a valid sync object)");
+ return;
+ }
+
+ /* If there are no client-waits or server-waits pending on this sync, delete
+- * the underlying object.
++ * the underlying object. Note that we double-unref the object, as _mesa_get_sync
++ * above took an extra refcount to make sure the pointer is valid for us to
++ * manipulate.
+ */
+ syncObj->DeletePending = GL_TRUE;
+- _mesa_unref_sync_object(ctx, syncObj);
++ _mesa_unref_sync_object(ctx, syncObj, 2);
+ }
+
+
+@@ -299,21 +309,20 @@ GLenum GLAPIENTRY
+ _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
++ struct gl_sync_object *syncObj;
+ GLenum ret;
+ ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, GL_WAIT_FAILED);
+
+- if (!_mesa_validate_sync(ctx, syncObj)) {
+- _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
+- return GL_WAIT_FAILED;
+- }
+-
+ if ((flags & ~GL_SYNC_FLUSH_COMMANDS_BIT) != 0) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync(flags=0x%x)", flags);
+ return GL_WAIT_FAILED;
+ }
+
+- _mesa_ref_sync_object(ctx, syncObj);
++ syncObj = _mesa_get_sync(ctx, sync, true);
++ if (!syncObj) {
++ _mesa_error(ctx, GL_INVALID_VALUE, "glClientWaitSync (not a valid sync object)");
++ return GL_WAIT_FAILED;
++ }
+
+ /* From the GL_ARB_sync spec:
+ *
+@@ -335,7 +344,7 @@ _mesa_ClientWaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+ }
+ }
+
+- _mesa_unref_sync_object(ctx, syncObj);
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ return ret;
+ }
+
+@@ -344,12 +353,7 @@ void GLAPIENTRY
+ _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
+-
+- if (!_mesa_validate_sync(ctx, syncObj)) {
+- _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)");
+- return;
+- }
++ struct gl_sync_object *syncObj;
+
+ if (flags != 0) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync(flags=0x%x)", flags);
+@@ -362,7 +366,14 @@ _mesa_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
+ return;
+ }
+
++ syncObj = _mesa_get_sync(ctx, sync, true);
++ if (!syncObj) {
++ _mesa_error(ctx, GL_INVALID_VALUE, "glWaitSync (not a valid sync object)");
++ return;
++ }
++
+ ctx->Driver.ServerWaitSync(ctx, syncObj, flags, timeout);
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ }
+
+
+@@ -371,11 +382,12 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
+ GLint *values)
+ {
+ GET_CURRENT_CONTEXT(ctx);
+- struct gl_sync_object *const syncObj = (struct gl_sync_object *) sync;
++ struct gl_sync_object *syncObj;
+ GLsizei size = 0;
+ GLint v[1];
+
+- if (!_mesa_validate_sync(ctx, syncObj)) {
++ syncObj = _mesa_get_sync(ctx, sync, true);
++ if (!syncObj) {
+ _mesa_error(ctx, GL_INVALID_VALUE, "glGetSynciv (not a valid sync object)");
+ return;
+ }
+@@ -409,6 +421,7 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
+
+ default:
+ _mesa_error(ctx, GL_INVALID_ENUM, "glGetSynciv(pname=0x%x)\n", pname);
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ return;
+ }
+
+@@ -421,4 +434,6 @@ _mesa_GetSynciv(GLsync sync, GLenum pname, GLsizei bufSize, GLsizei *length,
+ if (length != NULL) {
+ *length = size;
+ }
++
++ _mesa_unref_sync_object(ctx, syncObj, 1);
+ }
+diff --git a/src/mesa/main/syncobj.h b/src/mesa/main/syncobj.h
+index 5d510e8..e8dbded 100644
+--- a/src/mesa/main/syncobj.h
++++ b/src/mesa/main/syncobj.h
+@@ -47,15 +47,12 @@ _mesa_init_sync(struct gl_context *);
+ extern void
+ _mesa_free_sync_data(struct gl_context *);
+
+-extern void
+-_mesa_ref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj);
++struct gl_sync_object *
++_mesa_get_sync(struct gl_context *ctx, GLsync sync, bool incRefCount);
+
+ extern void
+-_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj);
+-
+-extern bool
+-_mesa_validate_sync(struct gl_context *ctx,
+- const struct gl_sync_object *syncObj);
++_mesa_unref_sync_object(struct gl_context *ctx, struct gl_sync_object *syncObj,
++ int amount);
+
+ extern GLboolean GLAPIENTRY
+ _mesa_IsSync(GLsync sync);
+--
+2.6.2
--- /dev/null
+diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
+--- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200
++++ zita-resampler-1.3.0/libs/resampler.cc 2015-11-15 12:27:42.764591015 +0100
+@@ -24,6 +24,10 @@
+ #include <math.h>
+ #include <zita-resampler/resampler.h>
+
++#ifdef __SSE2__
++#include <xmmintrin.h>
++#endif
++
+
+ static unsigned int gcd (unsigned int a, unsigned int b)
+ {
+@@ -47,6 +51,45 @@
+ return 1;
+ }
+
++#ifdef __SSE2__
++
++static inline void calc_stereo_sample_sse (unsigned int hl,
++ float *c1,
++ float *c2,
++ float *q1,
++ float *q2,
++ float *out_data)
++{
++ unsigned int i;
++ __m128 denorm, s, w1, w2;
++
++ denorm = _mm_set1_ps (1e-20f);
++ s = denorm;
++ for (i = 0; i < hl; i += 4)
++ {
++ q2 -= 8;
++
++ // s += *q1 * c1 [i];
++ w1 = _mm_loadu_ps (&c1 [i]);
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_unpacklo_ps (w1, w1)));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
++
++ // s += *q2 * c2 [i];
++ w2 = _mm_loadu_ps (&c2 [i]);
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
++
++ q1 += 8;
++ }
++ s = _mm_sub_ps (s, denorm);
++ s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
++
++ // Writes two floats (eight bytes) more than we want, but this is fine since out_count >= 2.
++ _mm_storeu_ps (out_data, s);
++}
++
++#endif
++
+
+ Resampler::Resampler (void) :
+ _table (0),
+@@ -213,18 +256,28 @@
+ {
+ float *c1 = _table->_ctab + hl * ph;
+ float *c2 = _table->_ctab + hl * (np - ph);
+- for (c = 0; c < _nchan; c++)
++#ifdef __SSE2__
++ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
+ {
+- float *q1 = p1 + c;
+- float *q2 = p2 + c;
+- float s = 1e-20f;
+- for (i = 0; i < hl; i++)
++ calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
++ out_data += 2;
++ }
++ else
++#endif
++ {
++ for (c = 0; c < _nchan; c++)
+ {
+- q2 -= _nchan;
+- s += *q1 * c1 [i] + *q2 * c2 [i];
+- q1 += _nchan;
++ float *q1 = p1 + c;
++ float *q2 = p2 + c;
++ float s = 1e-20f;
++ for (i = 0; i < hl; i++)
++ {
++ q2 -= _nchan;
++ s += *q1 * c1 [i] + *q2 * c2 [i];
++ q1 += _nchan;
++ }
++ *out_data++ = s - 1e-20f;
+ }
+- *out_data++ = s - 1e-20f;
+ }
+ }
+ else
+@@ -260,4 +313,3 @@
+ return 0;
+ }
+
+-
+diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
+--- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200
++++ zita-resampler-1.3.0/libs/vresampler.cc 2015-11-15 12:27:58.424544882 +0100
+@@ -25,6 +25,58 @@
+ #include <zita-resampler/vresampler.h>
+
+
++#ifdef __SSE2__
++
++#include <xmmintrin.h>
++
++static inline void calc_stereo_sample_sse (int hl,
++ float b,
++ float *p1,
++ float *p2,
++ float *q1,
++ float *q2,
++ float *out_data)
++{
++ int i;
++ __m128 denorm, bs, s, c1, c2, w1, w2;
++
++ denorm = _mm_set1_ps (1e-25f);
++ bs = _mm_set1_ps (b);
++ s = denorm;
++ for (i = 0; i < hl; i += 4)
++ {
++ p2 -= 8;
++
++ // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++ w1 = _mm_loadu_ps (&q1 [i]);
++ w2 = _mm_loadu_ps (&q1 [i + hl]);
++ c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++ // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++ w1 = _mm_loadu_ps (&q2 [i]);
++ w2 = _mm_loadu_ps (&q2 [i - hl]);
++ c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++ // s += *p1 * _c1 [i];
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_unpacklo_ps (c1, c1)));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
++
++ // s += *p2 * _c2 [i];
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
++
++ p1 += 8;
++ }
++ s = _mm_sub_ps (s, denorm);
++ s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
++
++ // Writes two floats (eight bytes) more than we want, but this is fine since out_count >= 2.
++ _mm_storeu_ps (out_data, s);
++}
++
++#endif
++
++
+ VResampler::VResampler (void) :
+ _table (0),
+ _nchan (0),
+@@ -212,23 +264,33 @@
+ a = 1.0f - b;
+ q1 = _table->_ctab + hl * k;
+ q2 = _table->_ctab + hl * (np - k);
+- for (i = 0; i < hl; i++)
++#ifdef __SSE2__
++ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
+ {
+- _c1 [i] = a * q1 [i] + b * q1 [i + hl];
+- _c2 [i] = a * q2 [i] + b * q2 [i - hl];
++ calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
++ out_data += 2;
+ }
+- for (c = 0; c < _nchan; c++)
++ else
++#endif
+ {
+- q1 = p1 + c;
+- q2 = p2 + c;
+- a = 1e-25f;
+- for (i = 0; i < hl; i++)
+- {
+- q2 -= _nchan;
+- a += *q1 * _c1 [i] + *q2 * _c2 [i];
+- q1 += _nchan;
+- }
+- *out_data++ = a - 1e-25f;
++ for (i = 0; i < hl; i++)
++ {
++ _c1 [i] = a * q1 [i] + b * q1 [i + hl];
++ _c2 [i] = a * q2 [i] + b * q2 [i - hl];
++ }
++ for (c = 0; c < _nchan; c++)
++ {
++ q1 = p1 + c;
++ q2 = p2 + c;
++ a = 1e-25f;
++ for (i = 0; i < hl; i++)
++ {
++ q2 -= _nchan;
++ a += *q1 * _c1 [i] + *q2 * _c2 [i];
++ q1 += _nchan;
++ }
++ *out_data++ = a - 1e-25f;
++ }
+ }
+ }
+ else