git.sesse.net Git - bmusb/blobdiff - bmusb.cpp
Add a (commented-out) useful AVX2 debugging function.
[bmusb] / bmusb.cpp
index 4b5f75a6fb95f232293a0161d2db52cf4c0ceb85..78e6b412d1438cdd41354c671ca3c117aa371c57 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
 #include <string.h>
 #include <fcntl.h>
 #include <stdint.h>
+#include <assert.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
 #include <algorithm>
 #include <functional>
 #include <memory>
 #include <condition_variable>
 #include <thread>
 #include <stack>
+#include <atomic>
 #include "bmusb.h"
 
 using namespace std;
-
-static int current_register = 0;
-
-#define NUM_REGISTERS 60
-uint8_t register_file[NUM_REGISTERS];
+using namespace std::placeholders;
 
 #define WIDTH 1280
 #define HEIGHT 750  /* 30 lines ancillary data? */
@@ -44,19 +45,8 @@ uint8_t register_file[NUM_REGISTERS];
 
 FILE *audiofp;
 
-FrameAllocator::Frame current_video_frame;
-FrameAllocator::Frame current_audio_frame;
-
-struct QueuedFrame {
-       uint16_t timecode;
-       uint16_t format;
-       FrameAllocator::Frame frame;
-};
-
-mutex queue_lock;
-condition_variable queues_not_empty;
-deque<QueuedFrame> pending_video_frames;
-deque<QueuedFrame> pending_audio_frames;
+thread usb_thread;
+atomic<bool> should_quit;
 
 FrameAllocator::~FrameAllocator() {}
 
@@ -105,10 +95,6 @@ void MallocFrameAllocator::release_frame(Frame frame)
        freelist.push(unique_ptr<uint8_t[]>(frame.data));
 }
 
-FrameAllocator *video_frame_allocator = nullptr;
-FrameAllocator *audio_frame_allocator = nullptr;
-frame_callback_t frame_callback = nullptr;
-
 bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
 {
        if (a == b) {
@@ -121,7 +107,7 @@ bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
        }
 }
 
-void queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
+void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
 {
        if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
                printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
@@ -156,11 +142,11 @@ void dump_audio_block(uint8_t *audio_start, size_t audio_len)
        fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
 }
 
-void dequeue_thread()
+void BMUSBCapture::dequeue_thread()
 {
        for ( ;; ) {
                unique_lock<mutex> lock(queue_lock);
-               queues_not_empty.wait(lock, []{ return !pending_video_frames.empty() && !pending_audio_frames.empty(); });
+               queues_not_empty.wait(lock, [this]{ return !pending_video_frames.empty() && !pending_audio_frames.empty(); });
 
                uint16_t video_timecode = pending_video_frames.front().timecode;
                uint16_t audio_timecode = pending_audio_frames.front().timecode;
@@ -195,23 +181,7 @@ void dequeue_thread()
        }
 }
 
-void add_current_frame(const uint8_t *start, const uint8_t *end)
-{
-       if (current_video_frame.data == nullptr ||
-           current_video_frame.len > current_video_frame.size) return;
-       if (start == end) return;
-
-       int bytes = end - start;
-       if (current_video_frame.len + bytes > current_video_frame.size) {
-               printf("%d bytes overflow after last video frame\n", current_video_frame.len + bytes - current_video_frame.size);
-               //dump_frame();
-       } else {
-               memcpy(current_video_frame.data + current_video_frame.len, start, bytes);
-               current_video_frame.len += bytes;
-       }
-}
-
-void start_new_frame(const uint8_t *start)
+void BMUSBCapture::start_new_frame(const uint8_t *start)
 {
        uint16_t format = (start[3] << 8) | start[2];
        uint16_t timecode = (start[1] << 8) | start[0];
@@ -233,23 +203,7 @@ void start_new_frame(const uint8_t *start)
        //}
 }
 
-void add_current_audio(const uint8_t *start, const uint8_t *end)
-{
-       if (current_audio_frame.data == nullptr ||
-           current_audio_frame.len > current_audio_frame.size) return;
-       if (start == end) return;
-
-       int bytes = end - start;
-       if (current_audio_frame.len + bytes > current_audio_frame.size) {
-               printf("%d bytes overflow after last audio block\n", current_audio_frame.len + bytes - current_audio_frame.size);
-               //dump_audio_block();
-       } else {
-               memcpy(current_audio_frame.data + current_audio_frame.len, start, bytes);
-               current_audio_frame.len += bytes;
-       }
-}
-
-void start_new_audio_block(const uint8_t *start)
+void BMUSBCapture::start_new_audio_block(const uint8_t *start)
 {
        uint16_t format = (start[3] << 8) | start[2];
        uint16_t timecode = (start[1] << 8) | start[0];
@@ -262,10 +216,11 @@ void start_new_audio_block(const uint8_t *start)
        current_audio_frame = audio_frame_allocator->alloc_frame();
 }
 
+#if 0
 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
 {
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
-       for (int j = 0; j < pack->actual_length; j++) {
+       for (unsigned j = 0; j < pack->actual_length; j++) {
        //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
                printf("%02x", xfr->buffer[j + offset]);
                if ((j % 16) == 15)
@@ -276,11 +231,279 @@ static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_p
                        printf(" ");
        }
 }
+#endif
+
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       assert(n % 2 == 0);
+       uint8_t *dptr1 = dest1;
+       uint8_t *dptr2 = dest2;
+
+       for (size_t i = 0; i < n; i += 2) {
+               *dptr1++ = *src++;
+               *dptr2++ = *src++;
+       }
+}
+
+void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
+{
+       if (current_frame->data == nullptr ||
+           current_frame->len > current_frame->size ||
+           start == end) {
+               return;
+       }
+
+       int bytes = end - start;
+       if (current_frame->len + bytes > current_frame->size) {
+               printf("%d bytes overflow after last %s frame\n",
+                       int(current_frame->len + bytes - current_frame->size), frame_type_name);
+               //dump_frame();
+       } else {
+               if (current_frame->interleaved) {
+                       uint8_t *data = current_frame->data + current_frame->len / 2;
+                       uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
+                       if (current_frame->len % 2 == 1) {
+                               ++data;
+                               swap(data, data2);
+                       }
+                       if (bytes % 2 == 1) {
+                               *data++ = *start++;
+                               swap(data, data2);
+                               ++current_frame->len;
+                               --bytes;
+                       }
+                       memcpy_interleaved(data, data2, start, bytes);
+                       current_frame->len += bytes;
+               } else {
+                       memcpy(current_frame->data + current_frame->len, start, bytes);
+                       current_frame->len += bytes;
+               }
+       }
+}
+
+#ifdef __SSE2__
+
+#if 0
+void avx2_dump(const char *name, __m256i n)
+{
+       printf("%-10s:", name);
+       printf(" %02x", _mm256_extract_epi8(n, 0));
+       printf(" %02x", _mm256_extract_epi8(n, 1));
+       printf(" %02x", _mm256_extract_epi8(n, 2));
+       printf(" %02x", _mm256_extract_epi8(n, 3));
+       printf(" %02x", _mm256_extract_epi8(n, 4));
+       printf(" %02x", _mm256_extract_epi8(n, 5));
+       printf(" %02x", _mm256_extract_epi8(n, 6));
+       printf(" %02x", _mm256_extract_epi8(n, 7));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 8));
+       printf(" %02x", _mm256_extract_epi8(n, 9));
+       printf(" %02x", _mm256_extract_epi8(n, 10));
+       printf(" %02x", _mm256_extract_epi8(n, 11));
+       printf(" %02x", _mm256_extract_epi8(n, 12));
+       printf(" %02x", _mm256_extract_epi8(n, 13));
+       printf(" %02x", _mm256_extract_epi8(n, 14));
+       printf(" %02x", _mm256_extract_epi8(n, 15));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 16));
+       printf(" %02x", _mm256_extract_epi8(n, 17));
+       printf(" %02x", _mm256_extract_epi8(n, 18));
+       printf(" %02x", _mm256_extract_epi8(n, 19));
+       printf(" %02x", _mm256_extract_epi8(n, 20));
+       printf(" %02x", _mm256_extract_epi8(n, 21));
+       printf(" %02x", _mm256_extract_epi8(n, 22));
+       printf(" %02x", _mm256_extract_epi8(n, 23));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 24));
+       printf(" %02x", _mm256_extract_epi8(n, 25));
+       printf(" %02x", _mm256_extract_epi8(n, 26));
+       printf(" %02x", _mm256_extract_epi8(n, 27));
+       printf(" %02x", _mm256_extract_epi8(n, 28));
+       printf(" %02x", _mm256_extract_epi8(n, 29));
+       printf(" %02x", _mm256_extract_epi8(n, 30));
+       printf(" %02x", _mm256_extract_epi8(n, 31));
+       printf("\n");
+}
+#endif
+
+// Does a memcpy and memchr in one to reduce processing time.
+// Note that the benefit is somewhat limited if your L3 cache is small,
+// as you'll (unfortunately) spend most of the time loading the data
+// from main memory.
+//
+// Complicated cases are left to the slow path; the fast path copies only
+// up to the first instance of "sync_char" (usually stopping a bit before it).
+// This is fine, since 0x00 bytes shouldn't really show up in normal picture
+// data, and what we really need this for is the 00 00 ff ff marker in video data.
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+       if (current_frame->data == nullptr ||
+           current_frame->len > current_frame->size ||
+           start == limit) {
+               return start;
+       }
+       size_t orig_bytes = limit - start;
+       if (orig_bytes < 128) {
+               // Don't bother.
+               return start;
+       }
+
+       // Don't read more bytes than we can write.
+       limit = min(limit, start + (current_frame->size - current_frame->len));
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (start >= limit) {
+               return start;
+       }
+
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
+       if (aligned_start != start) {
+               const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
+               if (sync_start == nullptr) {
+                       add_to_frame(current_frame, "", start, aligned_start);
+               } else {
+                       add_to_frame(current_frame, "", start, sync_start);
+                       return sync_start;
+               }
+       }
+
+       // Make the length a multiple of 64.
+       if (current_frame->interleaved) {
+               if (((limit - aligned_start) % 64) != 0) {
+                       limit -= 32;
+               }
+               assert(((limit - aligned_start) % 64) == 0);
+       }
+
+#if __AVX2__
+       const __m256i needle = _mm256_set1_epi8(sync_char);
+
+       const __restrict __m256i *in = (const __m256i *)aligned_start;
+       if (current_frame->interleaved) {
+               __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m256i shuffle_cw = _mm256_set_epi8(
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+               while (in < (const __m256i *)limit) {
+                       // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+                       __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+                       __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
+                       __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+                       __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+                       __m256i found = _mm256_or_si256(found1, found2);
+
+                       data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+                       data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+               
+                       data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+                       data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+                       __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+                       __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       _mm256_storeu_si256(out2, hi);
+
+                       if (!_mm256_testz_si256(found, found)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
+               }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+               while (in < (const __m256i *)limit) {
+                       __m256i data = _mm256_load_si256(in);
+                       _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m256i found = _mm256_cmpeq_epi8(data, needle);
+                       if (!_mm256_testz_si256(found, found)) {
+                               break;
+                       }
+
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
+       }
+#else
+       const __m128i needle = _mm_set1_epi8(sync_char);
+
+       const __m128i *in = (const __m128i *)aligned_start;
+       if (current_frame->interleaved) {
+               __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+               while (in < (const __m128i *)limit) {
+                       __m128i data1 = _mm_load_si128(in);
+                       __m128i data2 = _mm_load_si128(in + 1);
+                       __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+                       __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+                       __m128i data1_hi = _mm_srli_epi16(data1, 8);
+                       __m128i data2_hi = _mm_srli_epi16(data2, 8);
+                       __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+                       _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+                       _mm_storeu_si128(out2, hi);
+                       __m128i found1 = _mm_cmpeq_epi8(data1, needle);
+                       __m128i found2 = _mm_cmpeq_epi8(data2, needle);
+                       if (!_mm_testz_si128(found1, found1) ||
+                           !_mm_testz_si128(found2, found2)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
+               }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+               while (in < (const __m128i *)limit) {
+                       __m128i data = _mm_load_si128(in);
+                       _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m128i found = _mm_cmpeq_epi8(data, needle);
+                       if (!_mm_testz_si128(found, found)) {
+                               break;
+                       }
+
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
+       }
+#endif
+
+       //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+
+       return (const uint8_t *)in;
+}
+#endif
 
-void decode_packs(const libusb_transfer *xfr, const char *sync_pattern, int sync_length, function<void(const uint8_t *start, const uint8_t *end)> add_callback, function<void(const uint8_t *start)> start_callback)
+void decode_packs(const libusb_transfer *xfr,
+                  const char *sync_pattern,
+                  int sync_length,
+                  FrameAllocator::Frame *current_frame,
+                  const char *frame_type_name,
+                  function<void(const uint8_t *start)> start_callback)
 {
        int offset = 0;
-       for (unsigned i = 0; i < xfr->num_iso_packets; i++) {
+       for (int i = 0; i < xfr->num_iso_packets; i++) {
                const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
 
                if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
@@ -289,19 +512,24 @@ void decode_packs(const libusb_transfer *xfr, const char *sync_pattern, int sync
 //exit(5);
                }
 
-               const unsigned char *iso_start = xfr->buffer + offset;
-               for (int iso_offset = 0; iso_offset < pack->actual_length; ) {  // Usually runs only one iteration.
-                       const unsigned char* start_next_frame = (const unsigned char *)memmem(iso_start + iso_offset, pack->actual_length - iso_offset, sync_pattern, sync_length);
+               const uint8_t *start = xfr->buffer + offset;
+               const uint8_t *limit = start + pack->actual_length;
+               while (start < limit) {  // Usually runs only one iteration.
+#ifdef __SSE2__
+                       start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
+                       if (start == limit) break;
+                       assert(start < limit);
+#endif
+
+                       const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
                        if (start_next_frame == nullptr) {
                                // add the rest of the buffer
-                               add_callback(iso_start + iso_offset, iso_start + pack->actual_length);
+                               add_to_frame(current_frame, frame_type_name, start, limit);
                                break;
                        } else {
-                               add_callback(iso_start + iso_offset, start_next_frame);
-                               start_callback(start_next_frame + sync_length);
-
-                               int suboffset = start_next_frame - iso_start;
-                               iso_offset = suboffset + sync_length;  // skip sync
+                               add_to_frame(current_frame, frame_type_name, start, start_next_frame);
+                               start = start_next_frame + sync_length;  // skip sync
+                               start_callback(start);
                        }
                }
 #if 0
@@ -311,7 +539,7 @@ void decode_packs(const libusb_transfer *xfr, const char *sync_pattern, int sync
        }
 }
 
-static void cb_xfr(struct libusb_transfer *xfr)
+void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
 {
        if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
                fprintf(stderr, "transfer status %d\n", xfr->status);
@@ -319,15 +547,18 @@ static void cb_xfr(struct libusb_transfer *xfr)
                exit(3);
        }
 
+       assert(xfr->user_data != nullptr);
+       BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
+
        if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
                if (xfr->endpoint == 0x84) {
-                       decode_packs(xfr, "DeckLinkAudioResyncT", 20, add_current_audio, start_new_audio_block);
+                       decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
                } else {
-                       decode_packs(xfr, "\x00\x00\xff\xff", 4, add_current_frame, start_new_frame);
+                       decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
                }
        }
        if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
-               const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
+               //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
                uint8_t *buf = libusb_control_transfer_get_data(xfr);
 #if 0
                if (setup->wIndex == 44) {
@@ -337,19 +568,19 @@ static void cb_xfr(struct libusb_transfer *xfr)
                                setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
                }
 #else
-               memcpy(register_file + current_register, buf, 4);
-               current_register = (current_register + 4) % NUM_REGISTERS;
-               if (current_register == 0) {
+               memcpy(usb->register_file + usb->current_register, buf, 4);
+               usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
+               if (usb->current_register == 0) {
                        // read through all of them
                        printf("register dump:");
-                       for (int i = 0; i < NUM_REGISTERS; i += 4) {
-                               printf(" 0x%02x%02x%02x%02x", register_file[i], register_file[i + 1], register_file[i + 2], register_file[i + 3]);
+                       for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
+                               printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
                        }
                        printf("\n");
                }
                libusb_fill_control_setup(xfr->buffer,
                    LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
-                       /*index=*/current_register, /*length=*/4);
+                       /*index=*/usb->current_register, /*length=*/4);
 #endif
        }
 
@@ -366,14 +597,13 @@ static void cb_xfr(struct libusb_transfer *xfr)
        }
 #endif
 
-end:
        if (libusb_submit_transfer(xfr) < 0) {
                fprintf(stderr, "error re-submitting URB\n");
                exit(1);
        }
 }
 
-void usb_thread()
+void BMUSBCapture::usb_thread_func()
 {
        printf("usb thread started\n");
 
@@ -383,39 +613,14 @@ void usb_thread()
        if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
                printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
        }
-       while (true) {
+       while (!should_quit) {
                int rc = libusb_handle_events(nullptr);
                if (rc != LIBUSB_SUCCESS)
                        break;
        }
 }
 
-FrameAllocator *get_video_frame_allocator()
-{
-       return video_frame_allocator;
-}
-
-void set_video_frame_allocator(FrameAllocator *allocator)
-{
-       video_frame_allocator = allocator;
-}
-
-FrameAllocator *get_audio_frame_allocator()
-{
-       return audio_frame_allocator;
-}
-
-void set_audio_frame_allocator(FrameAllocator *allocator)
-{
-       audio_frame_allocator = allocator;
-}
-
-void set_frame_callback(frame_callback_t callback)
-{
-       frame_callback = callback;
-}
-
-void start_bm_capture()
+void BMUSBCapture::configure_card()
 {
        if (video_frame_allocator == nullptr) {
                set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
@@ -423,11 +628,10 @@ void start_bm_capture()
        if (audio_frame_allocator == nullptr) {
                set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
        }
-       thread(dequeue_thread).detach();
+       thread(&BMUSBCapture::dequeue_thread, this).detach();
 
        int rc;
        struct libusb_transfer *xfr;
-       vector<libusb_transfer *> iso_xfrs;
 
        rc = libusb_init(nullptr);
        if (rc < 0) {
@@ -435,7 +639,9 @@ void start_bm_capture()
                exit(1);
        }
 
-       struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
+       //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
+       //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
+       struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
        if (!devh) {
                fprintf(stderr, "Error finding USB device\n");
                exit(1);
@@ -452,8 +658,8 @@ void start_bm_capture()
                printf("  interface %d\n", interface_number);
                const libusb_interface *interface = &config->interface[interface_number];
                for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
-                       printf("    alternate setting %d\n", altsetting);
                        const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
+                       printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
                        for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
                                const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
                                printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
@@ -523,6 +729,8 @@ void start_bm_capture()
        //
        //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
        //
+       //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
+       //
        //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
        //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
        //
@@ -532,6 +740,11 @@ void start_bm_capture()
        //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
        //    but the driver sets it to 0x8036802a at some point.
        //
+       //    all of this is on request 214/215. other requests (192, 219,
+       //    222, 223, 224) are used for firmware upgrade. Probably best to
+       //    stay out of it unless you know what you're doing.
+       //
+       //
        // register 16:
        // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
        //
@@ -551,59 +764,6 @@ void start_bm_capture()
        static const ctrl ctrls[] = {
                { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
                { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
-               { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
-               { LIBUSB_ENDPOINT_IN,  214,  4, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 20, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 28, 0 },
-               { LIBUSB_ENDPOINT_IN,  215, 32, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 36, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  216, 44, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 48, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 52, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },  // packet 354
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               // more...
-               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
-               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },  // wow, some kind of mode
 
                // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
                // capture (v210).
@@ -613,49 +773,13 @@ void start_bm_capture()
                // 0x3c000000 = composite video? (analog audio)
                // 0x3e000000 = s-video? (analog audio)
                { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
+               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
                //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
-
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0xffffffff },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0xffffffff },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x40404040 },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x40404040 },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x8036802a },
                { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
-               //{ LIBUSB_ENDPOINT_OUT, 215, 24, 0x13370001 },  // latch for frame start?
                { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
-               //{ LIBUSB_ENDPOINT_OUT, 215,  4, 0x00000000 },  // appears to have no e fect
-               //{ LIBUSB_ENDPOINT_OUT, 215,  8, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 20, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x00000000 },  // appears to have no effect
-#if 0
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215, 28 },
-               { LIBUSB_ENDPOINT_OUT, 215, 32 },
-               { LIBUSB_ENDPOINT_OUT, 215, 36 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-#endif
        };
 
-       for (int req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
+       for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
                uint32_t flipped = htonl(ctrls[req].data);
                static uint8_t value[4];
                memcpy(value, &flipped, sizeof(flipped));
@@ -705,6 +829,7 @@ void start_bm_capture()
            LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
                /*index=*/44, /*length=*/4);
        libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
+       xfr->user_data = this;
        libusb_submit_transfer(xfr);
 
        // set up an asynchronous transfer of register 24
@@ -716,6 +841,7 @@ void start_bm_capture()
            LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
                /*index=*/24, /*length=*/4);
        libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
+       xfr->user_data = this;
        libusb_submit_transfer(xfr);
 #endif
 
@@ -728,6 +854,7 @@ void start_bm_capture()
            LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
                /*index=*/current_register, /*length=*/4);
        libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
+       xfr->user_data = this;
        //libusb_submit_transfer(xfr);
 
        audiofp = fopen("audio.raw", "wb");
@@ -749,7 +876,7 @@ void start_bm_capture()
                                        size &= ~1023;
                                        size += 1024;
                                }
-                               num_iso_pack = (2 << 20) / size;  // 2 MB.
+                               num_iso_pack = (2 << 18) / size;  // 512 kB.
                                printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
                        } else {
                                size = 0xc0;
@@ -768,26 +895,28 @@ void start_bm_capture()
                        libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
                                num_iso_pack, cb_xfr, nullptr, 0);
                        libusb_set_iso_packet_lengths(xfr, size);
+                       xfr->user_data = this;
                        iso_xfrs.push_back(xfr);
                }
        }
+}
 
-       {
-               int i = 0;
-               for (libusb_transfer *xfr : iso_xfrs) {
-                       rc = libusb_submit_transfer(xfr);
-                       ++i;
-                       if (rc < 0) {
-                               //printf("num_bytes=%d\n", num_bytes);
-                               fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
-                                       xfr->endpoint, i, libusb_error_name(rc));
-                               exit(1);
-                       }
+void BMUSBCapture::start_bm_capture()
+{
+       printf("starting capture\n");
+       int i = 0;
+       for (libusb_transfer *xfr : iso_xfrs) {
+               printf("submitting transfer...\n");
+               int rc = libusb_submit_transfer(xfr);
+               ++i;
+               if (rc < 0) {
+                       //printf("num_bytes=%d\n", num_bytes);
+                       fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
+                               xfr->endpoint, i, libusb_error_name(rc));
+                       exit(1);
                }
        }
 
-       thread(usb_thread).detach();
-
 
 #if 0
        libusb_release_interface(devh, 0);
@@ -798,3 +927,15 @@ out:
        return rc;
 #endif
 }
+
+void BMUSBCapture::start_bm_thread()  // Clears the quit flag and launches the USB thread.
+{
+       should_quit = false;  // Reset first, so a "true" left behind by stop_bm_thread() does not carry over.
+       usb_thread = thread(&BMUSBCapture::usb_thread_func);  // No object argument is bound; NOTE(review): presumably usb_thread_func is static -- confirm in bmusb.h.
+}
+
+void BMUSBCapture::stop_bm_thread()  // Raises the quit flag, then waits for the USB thread (if one is running) to exit.
+{
+       should_quit = true;  // NOTE(review): the thread function is presumed to poll this flag -- it is not visible here.
+       if (usb_thread.joinable()) usb_thread.join();  // join() on a non-joinable std::thread throws std::system_error.
+}