git.sesse.net Git - bmusb/blob - bmusb.cpp
Fix issues with the queue locking up when there is no signal.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
36 #define WIDTH 1280
37 #define HEIGHT 750  /* 30 lines ancillary data? */
38 //#define WIDTH 1920
39 //#define HEIGHT 1125  /* ??? lines ancillary data? */
40 #define HEADER_SIZE 44
41 //#define HEADER_SIZE 0
42 #define AUDIO_HEADER_SIZE 4
43
44 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
45 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
46 #define FRAME_SIZE (8 << 20)
47
48 FILE *audiofp;
49
50 thread usb_thread;
51 atomic<bool> should_quit;
52
53 FrameAllocator::~FrameAllocator() {}
54
55 #define NUM_QUEUED_FRAMES 16
56 class MallocFrameAllocator : public FrameAllocator {
57 public:
58         MallocFrameAllocator(size_t frame_size);
59         Frame alloc_frame() override;
60         void release_frame(Frame frame) override;
61
62 private:
63         size_t frame_size;
64
65         mutex freelist_mutex;
66         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
67 };
68
69 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
70         : frame_size(frame_size)
71 {
72         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
73                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
74         }
75 }
76
77 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
78 {
79         Frame vf;
80         vf.owner = this;
81
82         unique_lock<mutex> lock(freelist_mutex);  // Meh.
83         if (freelist.empty()) {
84                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
85                         frame_size);
86         } else {
87                 vf.data = freelist.top().release();
88                 vf.size = frame_size;
89                 freelist.pop();  // Meh.
90         }
91         return vf;
92 }
93
94 void MallocFrameAllocator::release_frame(Frame frame)
95 {
96         if (frame.overflow > 0) {
97                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
98         }
99         unique_lock<mutex> lock(freelist_mutex);
100         freelist.push(unique_ptr<uint8_t[]>(frame.data));
101 }
102
// Returns true if <a> comes before <b> when 16-bit values are treated as
// positions on a circle: that is, when stepping forward from <a> reaches <b>
// in at least one and fewer than 0x8000 steps (modulo 0x10000).
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, reduced modulo 2^16 by the cast.
        const uint16_t forward_distance = static_cast<uint16_t>(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
114
115 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
116 {
117         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
118                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
119                         q->back().timecode, timecode);
120                 frame.owner->release_frame(frame);
121                 return;
122         }
123
124         QueuedFrame qf;
125         qf.format = format;
126         qf.timecode = timecode;
127         qf.frame = frame;
128
129         {
130                 unique_lock<mutex> lock(queue_lock);
131                 q->push_back(move(qf));
132         }
133         queues_not_empty.notify_one();  // might be spurious
134 }
135
136 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
137 {
138         FILE *fp = fopen(filename, "wb");
139         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
140                 printf("short write!\n");
141         }
142         fclose(fp);
143 }
144
145 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
146 {
147         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
148 }
149
150 void BMUSBCapture::dequeue_thread_func()
151 {
152         if (has_dequeue_callbacks) {
153                 dequeue_init_callback();
154         }
155         while (!dequeue_thread_should_quit) {
156                 unique_lock<mutex> lock(queue_lock);
157                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
158
159                 uint16_t video_timecode = pending_video_frames.front().timecode;
160                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
161                 if (video_timecode < audio_timecode) {
162                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
163                                 video_timecode);
164                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
165                         pending_video_frames.pop_front();
166                 } else if (audio_timecode < video_timecode) {
167                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
168                                 audio_timecode);
169                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
170                         pending_audio_frames.pop_front();
171                 } else {
172                         QueuedFrame video_frame = pending_video_frames.front();
173                         QueuedFrame audio_frame = pending_audio_frames.front();
174                         pending_audio_frames.pop_front();
175                         pending_video_frames.pop_front();
176                         lock.unlock();
177
178 #if 0
179                         char filename[255];
180                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
181                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
182                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
183 #endif
184
185                         frame_callback(video_timecode,
186                                        video_frame.frame, HEADER_SIZE, video_frame.format,
187                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
188                 }
189         }
190         if (has_dequeue_callbacks) {
191                 dequeue_cleanup_callback();
192         }
193 }
194
195 void BMUSBCapture::start_new_frame(const uint8_t *start)
196 {
197         uint16_t format = (start[3] << 8) | start[2];
198         uint16_t timecode = (start[1] << 8) | start[0];
199
200         if (current_video_frame.len > 0) {
201                 // If format is 0x0800 (no signal), add a fake (empty) audio
202                 // frame to get it out of the queue.
203                 // TODO: Figure out if there are other formats that come with
204                 // no audio, and treat them the same.
205                 if (format == 0x0800) {
206                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
207                         if (fake_audio_frame.data == nullptr) {
208                                 // Oh well, it's just a no-signal frame anyway.
209                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
210                                 current_video_frame.owner->release_frame(current_video_frame);
211                                 current_video_frame = video_frame_allocator->alloc_frame();
212                                 return;
213                         }
214                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
215                 }
216                 //dump_frame();
217                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
218         }
219         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
220         //      format, timecode,
221         //      //start[7], start[6], start[5], start[4],
222         //      read_current_frame, FRAME_SIZE);
223
224         current_video_frame = video_frame_allocator->alloc_frame();
225         //if (current_video_frame.data == nullptr) {
226         //      read_current_frame = -1;
227         //} else {
228         //      read_current_frame = 0;
229         //}
230 }
231
232 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
233 {
234         uint16_t format = (start[3] << 8) | start[2];
235         uint16_t timecode = (start[1] << 8) | start[0];
236         if (current_audio_frame.len > 0) {
237                 //dump_audio_block();
238                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
239         }
240         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
241         //      format, timecode, read_current_audio_block);
242         current_audio_frame = audio_frame_allocator->alloc_frame();
243 }
244
245 #if 0
246 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
247 {
248         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
249         for (unsigned j = 0; j < pack->actual_length; j++) {
250         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
251                 printf("%02x", xfr->buffer[j + offset]);
252                 if ((j % 16) == 15)
253                         printf("\n");
254                 else if ((j % 8) == 7)
255                         printf("  ");
256                 else
257                         printf(" ");
258         }
259 }
260 #endif
261
// De-interleaves <n> bytes from <src>: bytes at even offsets go to <dest1>,
// bytes at odd offsets go to <dest2>. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        // Number of (even, odd) byte pairs to split up.
        const size_t num_pairs = n / 2 + n % 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
273
274 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
275 {
276         if (current_frame->data == nullptr ||
277             current_frame->len > current_frame->size ||
278             start == end) {
279                 return;
280         }
281
282         int bytes = end - start;
283         if (current_frame->len + bytes > current_frame->size) {
284                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
285                 current_frame->len = current_frame->size;
286                 if (current_frame->overflow > 1048576) {
287                         printf("%d bytes overflow after last %s frame\n",
288                                 int(current_frame->overflow), frame_type_name);
289                         current_frame->overflow = 0;
290                 }
291                 //dump_frame();
292         } else {
293                 if (current_frame->interleaved) {
294                         uint8_t *data = current_frame->data + current_frame->len / 2;
295                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
296                         if (current_frame->len % 2 == 1) {
297                                 ++data;
298                                 swap(data, data2);
299                         }
300                         if (bytes % 2 == 1) {
301                                 *data++ = *start++;
302                                 swap(data, data2);
303                                 ++current_frame->len;
304                                 --bytes;
305                         }
306                         memcpy_interleaved(data, data2, start, bytes);
307                         current_frame->len += bytes;
308                 } else {
309                         memcpy(current_frame->data + current_frame->len, start, bytes);
310                         current_frame->len += bytes;
311                 }
312         }
313 }
314
315 #ifdef __SSE4_1__
316
317 #if 0
// Debugging aid (compiled out by the enclosing "#if 0"): prints <name>,
// left-justified in a ten-character field and followed by a colon, and then
// all 32 bytes of <n> in hex, lowest byte index first, with an extra space
// after every group of eight bytes.
// NOTE(review): presumably written out by hand rather than as a loop because
// the byte index has to be a compile-time constant -- confirm.
318 void avx2_dump(const char *name, __m256i n)
319 {
320         printf("%-10s:", name);
321         printf(" %02x", _mm256_extract_epi8(n, 0));
322         printf(" %02x", _mm256_extract_epi8(n, 1));
323         printf(" %02x", _mm256_extract_epi8(n, 2));
324         printf(" %02x", _mm256_extract_epi8(n, 3));
325         printf(" %02x", _mm256_extract_epi8(n, 4));
326         printf(" %02x", _mm256_extract_epi8(n, 5));
327         printf(" %02x", _mm256_extract_epi8(n, 6));
328         printf(" %02x", _mm256_extract_epi8(n, 7));
329         printf(" ");
330         printf(" %02x", _mm256_extract_epi8(n, 8));
331         printf(" %02x", _mm256_extract_epi8(n, 9));
332         printf(" %02x", _mm256_extract_epi8(n, 10));
333         printf(" %02x", _mm256_extract_epi8(n, 11));
334         printf(" %02x", _mm256_extract_epi8(n, 12));
335         printf(" %02x", _mm256_extract_epi8(n, 13));
336         printf(" %02x", _mm256_extract_epi8(n, 14));
337         printf(" %02x", _mm256_extract_epi8(n, 15));
338         printf(" ");
339         printf(" %02x", _mm256_extract_epi8(n, 16));
340         printf(" %02x", _mm256_extract_epi8(n, 17));
341         printf(" %02x", _mm256_extract_epi8(n, 18));
342         printf(" %02x", _mm256_extract_epi8(n, 19));
343         printf(" %02x", _mm256_extract_epi8(n, 20));
344         printf(" %02x", _mm256_extract_epi8(n, 21));
345         printf(" %02x", _mm256_extract_epi8(n, 22));
346         printf(" %02x", _mm256_extract_epi8(n, 23));
347         printf(" ");
348         printf(" %02x", _mm256_extract_epi8(n, 24));
349         printf(" %02x", _mm256_extract_epi8(n, 25));
350         printf(" %02x", _mm256_extract_epi8(n, 26));
351         printf(" %02x", _mm256_extract_epi8(n, 27));
352         printf(" %02x", _mm256_extract_epi8(n, 28));
353         printf(" %02x", _mm256_extract_epi8(n, 29));
354         printf(" %02x", _mm256_extract_epi8(n, 30));
355         printf(" %02x", _mm256_extract_epi8(n, 31));
356         printf("\n");
357 }
358 #endif
359
360 // Does a memcpy and memchr in one to reduce processing time.
361 // Note that the benefit is somewhat limited if your L3 cache is small,
362 // as you'll (unfortunately) spend most of the time loading the data
363 // from main memory.
364 //
365 // Complicated cases are left to the slow path; it basically stops copying
366 // up until the first instance of "sync_char" (usually a bit before, actually).
367 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
368 // data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Parameters:
//   current_frame  frame to append to; its <len> is advanced by the number of
//                  bytes consumed.
//   start, limit   the input range [start, limit).
//   sync_char      byte value that ends the fast path when it is seen.
//
// Returns a pointer to the first input byte that was NOT consumed; the caller
// is expected to handle [return value, limit) itself. <start> is returned
// unchanged (nothing consumed) if the frame has no buffer, if its length
// already exceeds its size, if the range is shorter than 128 bytes, or if no
// 32-byte-aligned end point lies beyond <start> once the range has been
// clipped to the space left in the frame.
369 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
370 {
371         if (current_frame->data == nullptr ||
372             current_frame->len > current_frame->size ||
373             start == limit) {
374                 return start;
375         }
376         size_t orig_bytes = limit - start;
377         if (orig_bytes < 128) {
378                 // Don't bother.
379                 return start;
380         }
381
382         // Don't read more bytes than we can write.
383         limit = min(limit, start + (current_frame->size - current_frame->len));
384
385         // Align end to 32 bytes.
386         limit = (const uint8_t *)(intptr_t(limit) & ~31);
387
388         if (start >= limit) {
389                 return start;
390         }
391
392         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        // This unaligned head goes through the regular add_to_frame(); if it
        // already contains sync_char, only the bytes before it are added and
        // we return pointing at the sync_char itself.
393         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
394         if (aligned_start != start) {
395                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
396                 if (sync_start == nullptr) {
397                         add_to_frame(current_frame, "", start, aligned_start);
398                 } else {
399                         add_to_frame(current_frame, "", start, sync_start);
400                         return sync_start;
401                 }
402         }
403
404         // Make the length a multiple of 64.
        // (The interleaved loops below read two 32-byte-aligned chunks' worth
        // of input per iteration in the AVX2 case, so an odd number of
        // 32-byte chunks would make them read past <limit>.)
405         if (current_frame->interleaved) {
406                 if (((limit - aligned_start) % 64) != 0) {
407                         limit -= 32;
408                 }
409                 assert(((limit - aligned_start) % 64) == 0);
410         }
411
412 #if __AVX2__
        // AVX2 version: works on 32-byte vectors.
413         const __m256i needle = _mm256_set1_epi8(sync_char);
414
415         const __restrict __m256i *in = (const __m256i *)aligned_start;
416         if (current_frame->interleaved) {
                // out1 receives the bytes at even input offsets, out2 the ones at
                // odd offsets. If an odd number of bytes has been stored so far,
                // the next byte belongs in <data2>, hence the swap.
417                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
418                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
419                 if (current_frame->len % 2 == 1) {
420                         swap(out1, out2);
421                 }
422
423                 __m256i shuffle_cw = _mm256_set_epi8(
424                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
425                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                // Each iteration consumes 64 input bytes and stores 32 bytes to
                // each of the two outputs.
426                 while (in < (const __m256i *)limit) {
427                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
428                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
429                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
430
431                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
432                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
433                         __m256i found = _mm256_or_si256(found1, found2);
434
435                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
436                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
437                 
438                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
439                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
440
441                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
442                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
443
444                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
445                         _mm256_storeu_si256(out2, hi);
446
                        // sync_char seen somewhere in these 64 bytes: leave without
                        // advancing <in>, so the whole chunk is left unconsumed
                        // (what was just stored is not counted in <len>).
447                         if (!_mm256_testz_si256(found, found)) {
448                                 break;
449                         }
450
451                         in += 2;
452                         ++out1;
453                         ++out2;
454                 }
455                 current_frame->len += (uint8_t *)in - aligned_start;
456         } else {
                // Non-interleaved: plain copy, 32 bytes per iteration.
457                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
458                 while (in < (const __m256i *)limit) {
459                         __m256i data = _mm256_load_si256(in);
460                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
461                         __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // As above: on a hit, the chunk stays unconsumed.
462                         if (!_mm256_testz_si256(found, found)) {
463                                 break;
464                         }
465
466                         ++in;
467                         ++out;
468                 }
469                 current_frame->len = (uint8_t *)out - current_frame->data;
470         }
471 #else
        // Non-AVX2 version: same structure as above, but on 16-byte vectors.
472         const __m128i needle = _mm_set1_epi8(sync_char);
473
474         const __m128i *in = (const __m128i *)aligned_start;
475         if (current_frame->interleaved) {
476                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
477                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
478                 if (current_frame->len % 2 == 1) {
479                         swap(out1, out2);
480                 }
481
482                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
                // Each iteration consumes 32 input bytes and stores 16 bytes to
                // each output. Every 16-bit element is split into its low byte
                // (masked) and its high byte (shifted down), and the two halves
                // are packed back down to bytes.
483                 while (in < (const __m128i *)limit) {
484                         __m128i data1 = _mm_load_si128(in);
485                         __m128i data2 = _mm_load_si128(in + 1);
486                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
487                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
488                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
489                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
490                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
491                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
492                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
493                         _mm_storeu_si128(out2, hi);
494                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
495                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
496                         if (!_mm_testz_si128(found1, found1) ||
497                             !_mm_testz_si128(found2, found2)) {
498                                 break;
499                         }
500
501                         in += 2;
502                         ++out1;
503                         ++out2;
504                 }
505                 current_frame->len += (uint8_t *)in - aligned_start;
506         } else {
507                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
508                 while (in < (const __m128i *)limit) {
509                         __m128i data = _mm_load_si128(in);
510                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
511                         __m128i found = _mm_cmpeq_epi8(data, needle);
512                         if (!_mm_testz_si128(found, found)) {
513                                 break;
514                         }
515
516                         ++in;
517                         ++out;
518                 }
519                 current_frame->len = (uint8_t *)out - current_frame->data;
520         }
521 #endif
522
523         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
524
        // First byte the vector loops did not consume.
525         return (const uint8_t *)in;
526 }
527 #endif
528
529 void decode_packs(const libusb_transfer *xfr,
530                   const char *sync_pattern,
531                   int sync_length,
532                   FrameAllocator::Frame *current_frame,
533                   const char *frame_type_name,
534                   function<void(const uint8_t *start)> start_callback)
535 {
536         int offset = 0;
537         for (int i = 0; i < xfr->num_iso_packets; i++) {
538                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
539
540                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
541                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
542                         continue;
543 //exit(5);
544                 }
545
546                 const uint8_t *start = xfr->buffer + offset;
547                 const uint8_t *limit = start + pack->actual_length;
548                 while (start < limit) {  // Usually runs only one iteration.
549 #ifdef __SSE4_1__
550                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
551                         if (start == limit) break;
552                         assert(start < limit);
553 #endif
554
555                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
556                         if (start_next_frame == nullptr) {
557                                 // add the rest of the buffer
558                                 add_to_frame(current_frame, frame_type_name, start, limit);
559                                 break;
560                         } else {
561                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
562                                 start = start_next_frame + sync_length;  // skip sync
563                                 start_callback(start);
564                         }
565                 }
566 #if 0
567                 dump_pack(xfr, offset, pack);
568 #endif
569                 offset += pack->length;
570         }
571 }
572
573 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
574 {
575         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
576                 fprintf(stderr, "transfer status %d\n", xfr->status);
577                 libusb_free_transfer(xfr);
578                 exit(3);
579         }
580
581         assert(xfr->user_data != nullptr);
582         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
583
584         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
585                 if (xfr->endpoint == 0x84) {
586                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
587                 } else {
588                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
589                 }
590         }
591         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
592                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
593                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
594 #if 0
595                 if (setup->wIndex == 44) {
596                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
597                 } else {
598                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
599                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
600                 }
601 #else
602                 memcpy(usb->register_file + usb->current_register, buf, 4);
603                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
604                 if (usb->current_register == 0) {
605                         // read through all of them
606                         printf("register dump:");
607                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
608                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
609                         }
610                         printf("\n");
611                 }
612                 libusb_fill_control_setup(xfr->buffer,
613                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
614                         /*index=*/usb->current_register, /*length=*/4);
615 #endif
616         }
617
618 #if 0
619         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
620         for (i = 0; i < xfr->actual_length; i++) {
621                 printf("%02x", xfr->buffer[i]);
622                 if (i % 16)
623                         printf("\n");
624                 else if (i % 8)
625                         printf("  ");
626                 else
627                         printf(" ");
628         }
629 #endif
630
631         if (libusb_submit_transfer(xfr) < 0) {
632                 fprintf(stderr, "error re-submitting URB\n");
633                 exit(1);
634         }
635 }
636
637 void BMUSBCapture::usb_thread_func()
638 {
639         sched_param param;
640         memset(&param, 0, sizeof(param));
641         param.sched_priority = 1;
642         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
643                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
644         }
645         while (!should_quit) {
646                 int rc = libusb_handle_events(nullptr);
647                 if (rc != LIBUSB_SUCCESS)
648                         break;
649         }
650 }
651
652 void BMUSBCapture::configure_card()
653 {
654         if (video_frame_allocator == nullptr) {
655                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
656         }
657         if (audio_frame_allocator == nullptr) {
658                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
659         }
660         dequeue_thread_should_quit = false;
661         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
662
663         int rc;
664         struct libusb_transfer *xfr;
665
666         rc = libusb_init(nullptr);
667         if (rc < 0) {
668                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
669                 exit(1);
670         }
671
672         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
673         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
674         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
675         if (!devh) {
676                 fprintf(stderr, "Error finding USB device\n");
677                 exit(1);
678         }
679
680         libusb_config_descriptor *config;
681         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
682         if (rc < 0) {
683                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
684                 exit(1);
685         }
686         printf("%d interface\n", config->bNumInterfaces);
687         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
688                 printf("  interface %d\n", interface_number);
689                 const libusb_interface *interface = &config->interface[interface_number];
690                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
691                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
692                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
693                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
694                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
695                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
696                         }
697                 }
698         }
699
700         rc = libusb_set_configuration(devh, /*configuration=*/1);
701         if (rc < 0) {
702                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
703                 exit(1);
704         }
705
706         rc = libusb_claim_interface(devh, 0);
707         if (rc < 0) {
708                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
709                 exit(1);
710         }
711
712         // Alternate setting 1 is output, alternate setting 2 is input.
713         // Card is reset when switching alternates, so the driver uses
714         // this “double switch” when it wants to reset.
715         //
716         // There's also alternate settings 3 and 4, which seem to be
717         // like 1 and 2 except they advertise less bandwidth needed.
718         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
719         if (rc < 0) {
720                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
721                 exit(1);
722         }
723         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
724         if (rc < 0) {
725                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
726                 exit(1);
727         }
728 #if 0
729         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
730         if (rc < 0) {
731                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
732                 exit(1);
733         }
734 #endif
735
736 #if 0
737         rc = libusb_claim_interface(devh, 3);
738         if (rc < 0) {
739                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
740                 exit(1);
741         }
742 #endif
743
744         // theories:
745         //   44 is some kind of timer register (first 16 bits count upwards)
746         //   24 is some sort of watchdog?
747         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
748         //      (or will go to 0x73c60010?), also seen 0x73c60100
749         //   12 also changes all the time, unclear why  
750         //   16 seems to be autodetected mode somehow
751         //      --    this is e00115e0 after reset?
752         //                    ed0115e0 after mode change [to output?]
753         //                    2d0015e0 after more mode change [to input]
754         //                    ed0115e0 after more mode change
755         //                    2d0015e0 after more mode change
756         //
757         //                    390115e0 seems to indicate we have signal
758         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
759         //
760         //                    200015e0 on startup
761         //         changes to 250115e0 when we sync to the signal
762         //
763         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
764         //
765         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
766         //
767         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
768         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
769         //
770         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
771         //    perhaps some of them are related to analog output?
772         //
773         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
774         //    but the driver sets it to 0x8036802a at some point.
775         //
776         //    all of this is on request 214/215. other requests (192, 219,
777         //    222, 223, 224) are used for firmware upgrade. Probably best to
778         //    stay out of it unless you know what you're doing.
779         //
780         //
781         // register 16:
782         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
783         //
784         // theories:
785         //   0x01 - stable signal
786         //   0x04 - deep color
787         //   0x08 - unknown (audio??)
788         //   0x20 - 720p??
789         //   0x30 - 576p??
790
791         struct ctrl {
792                 int endpoint;
793                 int request;
794                 int index;
795                 uint32_t data;
796         };
797         static const ctrl ctrls[] = {
798                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
799                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
800
801                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
802                 // capture (v210).
803                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
804                 // 0x10000000 = analog audio instead of embedded audio, it seems
805                 // 0x3a000000 = component video? (analog audio)
806                 // 0x3c000000 = composite video? (analog audio)
807                 // 0x3e000000 = s-video? (analog audio)
808                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
809                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
810                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
811                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
812                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
813         };
814
815         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
816                 uint32_t flipped = htonl(ctrls[req].data);
817                 static uint8_t value[4];
818                 memcpy(value, &flipped, sizeof(flipped));
819                 int size = sizeof(value);
820                 //if (ctrls[req].request == 215) size = 0;
821                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
822                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
823                 if (rc < 0) {
824                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
825                         exit(1);
826                 }
827                 
828                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
829                 for (int i = 0; i < rc; ++i) {
830                         printf("%02x", value[i]);
831                 }
832                 printf("\n");
833         }
834
835 #if 0
836         // DEBUG
837         for ( ;; ) {
838                 static int my_index = 0;
839                 static uint8_t value[4];
840                 int size = sizeof(value);
841                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
842                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
843                 if (rc < 0) {
844                         fprintf(stderr, "Error on control\n");
845                         exit(1);
846                 }
847                 printf("rc=%d index=%d: 0x", rc, my_index);
848                 for (int i = 0; i < rc; ++i) {
849                         printf("%02x", value[i]);
850                 }
851                 printf("\n");
852         }
853 #endif
854
855 #if 0
856         // set up an asynchronous transfer of the timer register
857         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
858         static int completed = 0;
859
860         xfr = libusb_alloc_transfer(0);
861         libusb_fill_control_setup(cmdbuf,
862             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
863                 /*index=*/44, /*length=*/4);
864         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
865         xfr->user_data = this;
866         libusb_submit_transfer(xfr);
867
868         // set up an asynchronous transfer of register 24
869         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
870         static int completed2 = 0;
871
872         xfr = libusb_alloc_transfer(0);
873         libusb_fill_control_setup(cmdbuf2,
874             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
875                 /*index=*/24, /*length=*/4);
876         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
877         xfr->user_data = this;
878         libusb_submit_transfer(xfr);
879 #endif
880
881         // set up an asynchronous transfer of the register dump
882         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
883         static int completed3 = 0;
884
885         xfr = libusb_alloc_transfer(0);
886         libusb_fill_control_setup(cmdbuf3,
887             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
888                 /*index=*/current_register, /*length=*/4);
889         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
890         xfr->user_data = this;
891         //libusb_submit_transfer(xfr);
892
893         audiofp = fopen("audio.raw", "wb");
894
895         // set up isochronous transfers for audio and video
896         for (int e = 3; e <= 4; ++e) {
897                 //int num_transfers = (e == 3) ? 6 : 6;
898                 int num_transfers = 6;
899                 for (int i = 0; i < num_transfers; ++i) {
900                         int num_iso_pack, size;
901                         if (e == 3) {
902                                 // Video seems to require isochronous packets scaled with the width; 
903                                 // seemingly six lines is about right, rounded up to the required 1kB
904                                 // multiple.
905                                 size = WIDTH * 2 * 6;
906                                 // Note that for 10-bit input, you'll need to increase size accordingly.
907                                 //size = size * 4 / 3;
908                                 if (size % 1024 != 0) {
909                                         size &= ~1023;
910                                         size += 1024;
911                                 }
912                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
913                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
914                         } else {
915                                 size = 0xc0;
916                                 num_iso_pack = 80;
917                         }
918                         int num_bytes = num_iso_pack * size;
919                         uint8_t *buf = new uint8_t[num_bytes];
920
921                         xfr = libusb_alloc_transfer(num_iso_pack);
922                         if (!xfr) {
923                                 fprintf(stderr, "oom\n");
924                                 exit(1);
925                         }
926
927                         int ep = LIBUSB_ENDPOINT_IN | e;
928                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
929                                 num_iso_pack, cb_xfr, nullptr, 0);
930                         libusb_set_iso_packet_lengths(xfr, size);
931                         xfr->user_data = this;
932                         iso_xfrs.push_back(xfr);
933                 }
934         }
935 }
936
937 void BMUSBCapture::start_bm_capture()
938 {
939         printf("starting capture\n");
940         int i = 0;
941         for (libusb_transfer *xfr : iso_xfrs) {
942                 printf("submitting transfer...\n");
943                 int rc = libusb_submit_transfer(xfr);
944                 ++i;
945                 if (rc < 0) {
946                         //printf("num_bytes=%d\n", num_bytes);
947                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
948                                 xfr->endpoint, i, libusb_error_name(rc));
949                         exit(1);
950                 }
951         }
952
953
954 #if 0
955         libusb_release_interface(devh, 0);
956 out:
957         if (devh)
958                 libusb_close(devh);
959         libusb_exit(nullptr);
960         return rc;
961 #endif
962 }
963
964 void BMUSBCapture::stop_dequeue_thread()
965 {
966         dequeue_thread_should_quit = true;
967         queues_not_empty.notify_all();
968         dequeue_thread.join();
969 }
970
971 void BMUSBCapture::start_bm_thread()
972 {
973         should_quit = false;
974         usb_thread = thread(&BMUSBCapture::usb_thread_func);
975 }
976
977 void BMUSBCapture::stop_bm_thread()
978 {
979         should_quit = true;
980         usb_thread.join();
981 }