]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Be less verbose on overflow.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
// Frame geometry constants. The original author's note suggests the extra
// lines in HEIGHT may be ancillary data (unconfirmed).
#define WIDTH 1280
#define HEIGHT 750  /* 30 lines ancillary data? */
// Alternative geometry, currently disabled:
//#define WIDTH 1920
//#define HEIGHT 1125  /* ??? lines ancillary data? */

// Number of bytes skipped at the start of each video frame / audio block
// when the payload is dumped or handed to the frame callback.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0
#define AUDIO_HEADER_SIZE 4

// Size of each video buffer. The exact per-format sizes are kept for
// reference; a fixed 8 MiB buffer is used instead.
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
#define FRAME_SIZE (8 * 1024 * 1024)
47
48 FILE *audiofp;
49
50 thread usb_thread;
51 atomic<bool> should_quit;
52
53 FrameAllocator::~FrameAllocator() {}
54
55 #define NUM_QUEUED_FRAMES 16
56 class MallocFrameAllocator : public FrameAllocator {
57 public:
58         MallocFrameAllocator(size_t frame_size);
59         Frame alloc_frame() override;
60         void release_frame(Frame frame) override;
61
62 private:
63         size_t frame_size;
64
65         mutex freelist_mutex;
66         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
67 };
68
69 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
70         : frame_size(frame_size)
71 {
72         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
73                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
74         }
75 }
76
77 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
78 {
79         Frame vf;
80         vf.owner = this;
81
82         unique_lock<mutex> lock(freelist_mutex);  // Meh.
83         if (freelist.empty()) {
84                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
85                         frame_size);
86         } else {
87                 vf.data = freelist.top().release();
88                 vf.size = frame_size;
89                 freelist.pop();  // Meh.
90         }
91         return vf;
92 }
93
94 void MallocFrameAllocator::release_frame(Frame frame)
95 {
96         if (frame.overflow > 0) {
97                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
98         }
99         unique_lock<mutex> lock(freelist_mutex);
100         freelist.push(unique_ptr<uint8_t[]>(frame.data));
101 }
102
// Returns true if <a> comes strictly before <b> when both are treated as
// 16-bit counters that wrap around: that is, if stepping forward from <a>
// reaches <b> in at least 1 and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 65536.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
114
115 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
116 {
117         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
118                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
119                         q->back().timecode, timecode);
120                 frame.owner->release_frame(frame);
121                 return;
122         }
123
124         QueuedFrame qf;
125         qf.format = format;
126         qf.timecode = timecode;
127         qf.frame = frame;
128
129         {
130                 unique_lock<mutex> lock(queue_lock);
131                 q->push_back(move(qf));
132         }
133         queues_not_empty.notify_one();  // might be spurious
134 }
135
136 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
137 {
138         FILE *fp = fopen(filename, "wb");
139         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
140                 printf("short write!\n");
141         }
142         fclose(fp);
143 }
144
145 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
146 {
147         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
148 }
149
150 void BMUSBCapture::dequeue_thread_func()
151 {
152         if (has_dequeue_callbacks) {
153                 dequeue_init_callback();
154         }
155         while (!dequeue_thread_should_quit) {
156                 unique_lock<mutex> lock(queue_lock);
157                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
158
159                 uint16_t video_timecode = pending_video_frames.front().timecode;
160                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
161                 if (video_timecode < audio_timecode) {
162                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
163                                 video_timecode);
164                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
165                         pending_video_frames.pop_front();
166                 } else if (audio_timecode < video_timecode) {
167                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
168                                 audio_timecode);
169                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
170                         pending_audio_frames.pop_front();
171                 } else {
172                         QueuedFrame video_frame = pending_video_frames.front();
173                         QueuedFrame audio_frame = pending_audio_frames.front();
174                         pending_audio_frames.pop_front();
175                         pending_video_frames.pop_front();
176                         lock.unlock();
177
178 #if 0
179                         char filename[255];
180                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
181                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
182                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
183 #endif
184
185                         frame_callback(video_timecode,
186                                        video_frame.frame, HEADER_SIZE, video_frame.format,
187                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
188                 }
189         }
190         if (has_dequeue_callbacks) {
191                 dequeue_cleanup_callback();
192         }
193 }
194
195 void BMUSBCapture::start_new_frame(const uint8_t *start)
196 {
197         uint16_t format = (start[3] << 8) | start[2];
198         uint16_t timecode = (start[1] << 8) | start[0];
199
200         if (current_video_frame.len > 0) {
201                 //dump_frame();
202                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
203         }
204         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
205         //      format, timecode,
206         //      //start[7], start[6], start[5], start[4],
207         //      read_current_frame, FRAME_SIZE);
208
209         current_video_frame = video_frame_allocator->alloc_frame();
210         //if (current_video_frame.data == nullptr) {
211         //      read_current_frame = -1;
212         //} else {
213         //      read_current_frame = 0;
214         //}
215 }
216
217 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
218 {
219         uint16_t format = (start[3] << 8) | start[2];
220         uint16_t timecode = (start[1] << 8) | start[0];
221         if (current_audio_frame.len > 0) {
222                 //dump_audio_block();
223                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
224         }
225         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
226         //      format, timecode, read_current_audio_block);
227         current_audio_frame = audio_frame_allocator->alloc_frame();
228 }
229
230 #if 0
231 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
232 {
233         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
234         for (unsigned j = 0; j < pack->actual_length; j++) {
235         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
236                 printf("%02x", xfr->buffer[j + offset]);
237                 if ((j % 16) == 15)
238                         printf("\n");
239                 else if ((j % 8) == 7)
240                         printf("  ");
241                 else
242                         printf(" ");
243         }
244 }
245 #endif
246
// Splits <n> bytes from <src> into two output streams: bytes at even
// offsets go to <dest1>, bytes at odd offsets go to <dest2>.
// <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        // Number of (even, odd) byte pairs to copy.
        const size_t pairs = n / 2 + n % 2;
        for (size_t k = 0; k < pairs; ++k) {
                dest1[k] = src[2 * k];
                dest2[k] = src[2 * k + 1];
        }
}
258
259 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
260 {
261         if (current_frame->data == nullptr ||
262             current_frame->len > current_frame->size ||
263             start == end) {
264                 return;
265         }
266
267         int bytes = end - start;
268         if (current_frame->len + bytes > current_frame->size) {
269                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
270                 current_frame->len = current_frame->size;
271                 if (current_frame->overflow > 1048576) {
272                         printf("%d bytes overflow after last %s frame\n",
273                                 int(current_frame->overflow), frame_type_name);
274                         current_frame->overflow = 0;
275                 }
276                 //dump_frame();
277         } else {
278                 if (current_frame->interleaved) {
279                         uint8_t *data = current_frame->data + current_frame->len / 2;
280                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
281                         if (current_frame->len % 2 == 1) {
282                                 ++data;
283                                 swap(data, data2);
284                         }
285                         if (bytes % 2 == 1) {
286                                 *data++ = *start++;
287                                 swap(data, data2);
288                                 ++current_frame->len;
289                                 --bytes;
290                         }
291                         memcpy_interleaved(data, data2, start, bytes);
292                         current_frame->len += bytes;
293                 } else {
294                         memcpy(current_frame->data + current_frame->len, start, bytes);
295                         current_frame->len += bytes;
296                 }
297         }
298 }
299
300 #ifdef __SSE4_1__
301
302 #if 0
// Debugging aid: prints <name> followed by the 32 bytes of <n> in hex,
// lowest byte first, with an extra space between groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int idx = 0; idx < 32; ++idx) {
                if (idx != 0 && idx % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[idx]);
        }
        printf("\n");
}
343 #endif
344
345 // Does a memcpy and memchr in one to reduce processing time.
346 // Note that the benefit is somewhat limited if your L3 cache is small,
347 // as you'll (unfortunately) spend most of the time loading the data
348 // from main memory.
349 //
350 // Complicated cases are left to the slow path; it basically stops copying
351 // up until the first instance of "sync_char" (usually a bit before, actually).
352 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
353 // data, and what we really need this for is the 00 00 ff ff marker in video data.
354 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
355 {
356         if (current_frame->data == nullptr ||
357             current_frame->len > current_frame->size ||
358             start == limit) {
359                 return start;
360         }
361         size_t orig_bytes = limit - start;
362         if (orig_bytes < 128) {
363                 // Don't bother.
364                 return start;
365         }
366
367         // Don't read more bytes than we can write.
368         limit = min(limit, start + (current_frame->size - current_frame->len));
369
370         // Align end to 32 bytes.
371         limit = (const uint8_t *)(intptr_t(limit) & ~31);
372
373         if (start >= limit) {
374                 return start;
375         }
376
377         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
378         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
379         if (aligned_start != start) {
380                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
381                 if (sync_start == nullptr) {
382                         add_to_frame(current_frame, "", start, aligned_start);
383                 } else {
384                         add_to_frame(current_frame, "", start, sync_start);
385                         return sync_start;
386                 }
387         }
388
389         // Make the length a multiple of 64.
390         if (current_frame->interleaved) {
391                 if (((limit - aligned_start) % 64) != 0) {
392                         limit -= 32;
393                 }
394                 assert(((limit - aligned_start) % 64) == 0);
395         }
396
397 #if __AVX2__
398         const __m256i needle = _mm256_set1_epi8(sync_char);
399
400         const __restrict __m256i *in = (const __m256i *)aligned_start;
401         if (current_frame->interleaved) {
402                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
403                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
404                 if (current_frame->len % 2 == 1) {
405                         swap(out1, out2);
406                 }
407
408                 __m256i shuffle_cw = _mm256_set_epi8(
409                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
410                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
411                 while (in < (const __m256i *)limit) {
412                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
413                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
414                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
415
416                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
417                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
418                         __m256i found = _mm256_or_si256(found1, found2);
419
420                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
421                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
422                 
423                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
424                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
425
426                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
427                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
428
429                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
430                         _mm256_storeu_si256(out2, hi);
431
432                         if (!_mm256_testz_si256(found, found)) {
433                                 break;
434                         }
435
436                         in += 2;
437                         ++out1;
438                         ++out2;
439                 }
440                 current_frame->len += (uint8_t *)in - aligned_start;
441         } else {
442                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
443                 while (in < (const __m256i *)limit) {
444                         __m256i data = _mm256_load_si256(in);
445                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
446                         __m256i found = _mm256_cmpeq_epi8(data, needle);
447                         if (!_mm256_testz_si256(found, found)) {
448                                 break;
449                         }
450
451                         ++in;
452                         ++out;
453                 }
454                 current_frame->len = (uint8_t *)out - current_frame->data;
455         }
456 #else
457         const __m128i needle = _mm_set1_epi8(sync_char);
458
459         const __m128i *in = (const __m128i *)aligned_start;
460         if (current_frame->interleaved) {
461                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
462                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
463                 if (current_frame->len % 2 == 1) {
464                         swap(out1, out2);
465                 }
466
467                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
468                 while (in < (const __m128i *)limit) {
469                         __m128i data1 = _mm_load_si128(in);
470                         __m128i data2 = _mm_load_si128(in + 1);
471                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
472                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
473                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
474                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
475                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
476                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
477                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
478                         _mm_storeu_si128(out2, hi);
479                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
480                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
481                         if (!_mm_testz_si128(found1, found1) ||
482                             !_mm_testz_si128(found2, found2)) {
483                                 break;
484                         }
485
486                         in += 2;
487                         ++out1;
488                         ++out2;
489                 }
490                 current_frame->len += (uint8_t *)in - aligned_start;
491         } else {
492                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
493                 while (in < (const __m128i *)limit) {
494                         __m128i data = _mm_load_si128(in);
495                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
496                         __m128i found = _mm_cmpeq_epi8(data, needle);
497                         if (!_mm_testz_si128(found, found)) {
498                                 break;
499                         }
500
501                         ++in;
502                         ++out;
503                 }
504                 current_frame->len = (uint8_t *)out - current_frame->data;
505         }
506 #endif
507
508         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
509
510         return (const uint8_t *)in;
511 }
512 #endif
513
514 void decode_packs(const libusb_transfer *xfr,
515                   const char *sync_pattern,
516                   int sync_length,
517                   FrameAllocator::Frame *current_frame,
518                   const char *frame_type_name,
519                   function<void(const uint8_t *start)> start_callback)
520 {
521         int offset = 0;
522         for (int i = 0; i < xfr->num_iso_packets; i++) {
523                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
524
525                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
526                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
527                         continue;
528 //exit(5);
529                 }
530
531                 const uint8_t *start = xfr->buffer + offset;
532                 const uint8_t *limit = start + pack->actual_length;
533                 while (start < limit) {  // Usually runs only one iteration.
534 #ifdef __SSE4_1__
535                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
536                         if (start == limit) break;
537                         assert(start < limit);
538 #endif
539
540                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
541                         if (start_next_frame == nullptr) {
542                                 // add the rest of the buffer
543                                 add_to_frame(current_frame, frame_type_name, start, limit);
544                                 break;
545                         } else {
546                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
547                                 start = start_next_frame + sync_length;  // skip sync
548                                 start_callback(start);
549                         }
550                 }
551 #if 0
552                 dump_pack(xfr, offset, pack);
553 #endif
554                 offset += pack->length;
555         }
556 }
557
558 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
559 {
560         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
561                 fprintf(stderr, "transfer status %d\n", xfr->status);
562                 libusb_free_transfer(xfr);
563                 exit(3);
564         }
565
566         assert(xfr->user_data != nullptr);
567         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
568
569         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
570                 if (xfr->endpoint == 0x84) {
571                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
572                 } else {
573                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
574                 }
575         }
576         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
577                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
578                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
579 #if 0
580                 if (setup->wIndex == 44) {
581                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
582                 } else {
583                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
584                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
585                 }
586 #else
587                 memcpy(usb->register_file + usb->current_register, buf, 4);
588                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
589                 if (usb->current_register == 0) {
590                         // read through all of them
591                         printf("register dump:");
592                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
593                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
594                         }
595                         printf("\n");
596                 }
597                 libusb_fill_control_setup(xfr->buffer,
598                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
599                         /*index=*/usb->current_register, /*length=*/4);
600 #endif
601         }
602
603 #if 0
604         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
605         for (i = 0; i < xfr->actual_length; i++) {
606                 printf("%02x", xfr->buffer[i]);
607                 if (i % 16)
608                         printf("\n");
609                 else if (i % 8)
610                         printf("  ");
611                 else
612                         printf(" ");
613         }
614 #endif
615
616         if (libusb_submit_transfer(xfr) < 0) {
617                 fprintf(stderr, "error re-submitting URB\n");
618                 exit(1);
619         }
620 }
621
622 void BMUSBCapture::usb_thread_func()
623 {
624         sched_param param;
625         memset(&param, 0, sizeof(param));
626         param.sched_priority = 1;
627         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
628                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
629         }
630         while (!should_quit) {
631                 int rc = libusb_handle_events(nullptr);
632                 if (rc != LIBUSB_SUCCESS)
633                         break;
634         }
635 }
636
637 void BMUSBCapture::configure_card()
638 {
639         if (video_frame_allocator == nullptr) {
640                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
641         }
642         if (audio_frame_allocator == nullptr) {
643                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
644         }
645         dequeue_thread_should_quit = false;
646         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
647
648         int rc;
649         struct libusb_transfer *xfr;
650
651         rc = libusb_init(nullptr);
652         if (rc < 0) {
653                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
654                 exit(1);
655         }
656
657         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
658         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
659         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
660         if (!devh) {
661                 fprintf(stderr, "Error finding USB device\n");
662                 exit(1);
663         }
664
665         libusb_config_descriptor *config;
666         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
667         if (rc < 0) {
668                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
669                 exit(1);
670         }
671         printf("%d interface\n", config->bNumInterfaces);
672         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
673                 printf("  interface %d\n", interface_number);
674                 const libusb_interface *interface = &config->interface[interface_number];
675                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
676                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
677                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
678                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
679                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
680                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
681                         }
682                 }
683         }
684
685         rc = libusb_set_configuration(devh, /*configuration=*/1);
686         if (rc < 0) {
687                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
688                 exit(1);
689         }
690
691         rc = libusb_claim_interface(devh, 0);
692         if (rc < 0) {
693                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
694                 exit(1);
695         }
696
697         // Alternate setting 1 is output, alternate setting 2 is input.
698         // Card is reset when switching alternates, so the driver uses
699         // this “double switch” when it wants to reset.
700         //
701         // There's also alternate settings 3 and 4, which seem to be
702         // like 1 and 2 except they advertise less bandwidth needed.
703         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
704         if (rc < 0) {
705                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
706                 exit(1);
707         }
708         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
709         if (rc < 0) {
710                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
711                 exit(1);
712         }
713 #if 0
714         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
715         if (rc < 0) {
716                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
717                 exit(1);
718         }
719 #endif
720
721 #if 0
722         rc = libusb_claim_interface(devh, 3);
723         if (rc < 0) {
724                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
725                 exit(1);
726         }
727 #endif
728
729         // theories:
730         //   44 is some kind of timer register (first 16 bits count upwards)
731         //   24 is some sort of watchdog?
732         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
733         //      (or will go to 0x73c60010?), also seen 0x73c60100
734         //   12 also changes all the time, unclear why  
735         //   16 seems to be autodetected mode somehow
736         //      --    this is e00115e0 after reset?
737         //                    ed0115e0 after mode change [to output?]
738         //                    2d0015e0 after more mode change [to input]
739         //                    ed0115e0 after more mode change
740         //                    2d0015e0 after more mode change
741         //
742         //                    390115e0 seems to indicate we have signal
743         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
744         //
745         //                    200015e0 on startup
746         //         changes to 250115e0 when we sync to the signal
747         //
748         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
749         //
750         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
751         //
752         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
753         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
754         //
755         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
756         //    perhaps some of them are related to analog output?
757         //
758         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
759         //    but the driver sets it to 0x8036802a at some point.
760         //
761         //    all of this is on request 214/215. other requests (192, 219,
762         //    222, 223, 224) are used for firmware upgrade. Probably best to
763         //    stay out of it unless you know what you're doing.
764         //
765         //
766         // register 16:
767         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
768         //
769         // theories:
770         //   0x01 - stable signal
771         //   0x04 - deep color
772         //   0x08 - unknown (audio??)
773         //   0x20 - 720p??
774         //   0x30 - 576p??
775
776         struct ctrl {
777                 int endpoint;
778                 int request;
779                 int index;
780                 uint32_t data;
781         };
782         static const ctrl ctrls[] = {
783                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
784                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
785
786                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
787                 // capture (v210).
788                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
789                 // 0x10000000 = analog audio instead of embedded audio, it seems
790                 // 0x3a000000 = component video? (analog audio)
791                 // 0x3c000000 = composite video? (analog audio)
792                 // 0x3e000000 = s-video? (analog audio)
793                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
794                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
795                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
796                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
797                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
798         };
799
800         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
801                 uint32_t flipped = htonl(ctrls[req].data);
802                 static uint8_t value[4];
803                 memcpy(value, &flipped, sizeof(flipped));
804                 int size = sizeof(value);
805                 //if (ctrls[req].request == 215) size = 0;
806                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
807                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
808                 if (rc < 0) {
809                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
810                         exit(1);
811                 }
812                 
813                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
814                 for (int i = 0; i < rc; ++i) {
815                         printf("%02x", value[i]);
816                 }
817                 printf("\n");
818         }
819
820 #if 0
821         // DEBUG
822         for ( ;; ) {
823                 static int my_index = 0;
824                 static uint8_t value[4];
825                 int size = sizeof(value);
826                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
827                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
828                 if (rc < 0) {
829                         fprintf(stderr, "Error on control\n");
830                         exit(1);
831                 }
832                 printf("rc=%d index=%d: 0x", rc, my_index);
833                 for (int i = 0; i < rc; ++i) {
834                         printf("%02x", value[i]);
835                 }
836                 printf("\n");
837         }
838 #endif
839
840 #if 0
841         // set up an asynchronous transfer of the timer register
842         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
843         static int completed = 0;
844
845         xfr = libusb_alloc_transfer(0);
846         libusb_fill_control_setup(cmdbuf,
847             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
848                 /*index=*/44, /*length=*/4);
849         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
850         xfr->user_data = this;
851         libusb_submit_transfer(xfr);
852
853         // set up an asynchronous transfer of register 24
854         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
855         static int completed2 = 0;
856
857         xfr = libusb_alloc_transfer(0);
858         libusb_fill_control_setup(cmdbuf2,
859             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
860                 /*index=*/24, /*length=*/4);
861         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
862         xfr->user_data = this;
863         libusb_submit_transfer(xfr);
864 #endif
865
866         // set up an asynchronous transfer of the register dump
867         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
868         static int completed3 = 0;
869
870         xfr = libusb_alloc_transfer(0);
871         libusb_fill_control_setup(cmdbuf3,
872             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
873                 /*index=*/current_register, /*length=*/4);
874         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
875         xfr->user_data = this;
876         //libusb_submit_transfer(xfr);
877
878         audiofp = fopen("audio.raw", "wb");
879
880         // set up isochronous transfers for audio and video
881         for (int e = 3; e <= 4; ++e) {
882                 //int num_transfers = (e == 3) ? 6 : 6;
883                 int num_transfers = 6;
884                 for (int i = 0; i < num_transfers; ++i) {
885                         int num_iso_pack, size;
886                         if (e == 3) {
887                                 // Video seems to require isochronous packets scaled with the width; 
888                                 // seemingly six lines is about right, rounded up to the required 1kB
889                                 // multiple.
890                                 size = WIDTH * 2 * 6;
891                                 // Note that for 10-bit input, you'll need to increase size accordingly.
892                                 //size = size * 4 / 3;
893                                 if (size % 1024 != 0) {
894                                         size &= ~1023;
895                                         size += 1024;
896                                 }
897                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
898                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
899                         } else {
900                                 size = 0xc0;
901                                 num_iso_pack = 80;
902                         }
903                         int num_bytes = num_iso_pack * size;
904                         uint8_t *buf = new uint8_t[num_bytes];
905
906                         xfr = libusb_alloc_transfer(num_iso_pack);
907                         if (!xfr) {
908                                 fprintf(stderr, "oom\n");
909                                 exit(1);
910                         }
911
912                         int ep = LIBUSB_ENDPOINT_IN | e;
913                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
914                                 num_iso_pack, cb_xfr, nullptr, 0);
915                         libusb_set_iso_packet_lengths(xfr, size);
916                         xfr->user_data = this;
917                         iso_xfrs.push_back(xfr);
918                 }
919         }
920 }
921
922 void BMUSBCapture::start_bm_capture()
923 {
924         printf("starting capture\n");
925         int i = 0;
926         for (libusb_transfer *xfr : iso_xfrs) {
927                 printf("submitting transfer...\n");
928                 int rc = libusb_submit_transfer(xfr);
929                 ++i;
930                 if (rc < 0) {
931                         //printf("num_bytes=%d\n", num_bytes);
932                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
933                                 xfr->endpoint, i, libusb_error_name(rc));
934                         exit(1);
935                 }
936         }
937
938
939 #if 0
940         libusb_release_interface(devh, 0);
941 out:
942         if (devh)
943                 libusb_close(devh);
944         libusb_exit(nullptr);
945         return rc;
946 #endif
947 }
948
949 void BMUSBCapture::stop_dequeue_thread()
950 {
951         dequeue_thread_should_quit = true;
952         queues_not_empty.notify_all();
953         dequeue_thread.join();
954 }
955
956 void BMUSBCapture::start_bm_thread()
957 {
958         should_quit = false;
959         usb_thread = thread(&BMUSBCapture::usb_thread_func);
960 }
961
962 void BMUSBCapture::stop_bm_thread()
963 {
964         should_quit = true;
965         usb_thread.join();
966 }