]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Add support for stopping the dequeue thread.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
// Nominal capture geometry. Within this part of the file these two are only
// referenced by the commented-out FRAME_SIZE formulas below.
#define WIDTH 1280
#define HEIGHT 750  /* 30 lines ancillary data? */
//#define WIDTH 1920
//#define HEIGHT 1125  /* ??? lines ancillary data? */
// Number of bytes at the start of a video frame that dump_frame() skips and
// that is passed to the frame callback as the video offset.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0
// Same idea for audio blocks: skipped by dump_audio_block() and passed to the
// frame callback as the audio offset.
#define AUDIO_HEADER_SIZE 4

//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
// Size in bytes (8 MiB) of each buffer in the default video frame allocator.
#define FRAME_SIZE (8 << 20)
47
48 FILE *audiofp;
49
50 thread usb_thread;
51 atomic<bool> should_quit;
52
53 FrameAllocator::~FrameAllocator() {}
54
55 #define NUM_QUEUED_FRAMES 8
56 class MallocFrameAllocator : public FrameAllocator {
57 public:
58         MallocFrameAllocator(size_t frame_size);
59         Frame alloc_frame() override;
60         void release_frame(Frame frame) override;
61
62 private:
63         size_t frame_size;
64
65         mutex freelist_mutex;
66         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
67 };
68
69 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
70         : frame_size(frame_size)
71 {
72         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
73                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
74         }
75 }
76
77 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
78 {
79         Frame vf;
80         vf.owner = this;
81
82         unique_lock<mutex> lock(freelist_mutex);  // Meh.
83         if (freelist.empty()) {
84                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
85                         frame_size);
86         } else {
87                 vf.data = freelist.top().release();
88                 vf.size = frame_size;
89                 freelist.pop();  // Meh.
90         }
91         return vf;
92 }
93
94 void MallocFrameAllocator::release_frame(Frame frame)
95 {
96         unique_lock<mutex> lock(freelist_mutex);
97         freelist.push(unique_ptr<uint8_t[]>(frame.data));
98 }
99
// Ordering for 16-bit counters that wrap around: returns true if <b> is ahead
// of <a> by at least 1 and less than 0x8000 steps (modulo 0x10000).
// Equal values compare as not-less.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, reduced modulo 2^16.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
111
112 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
113 {
114         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
115                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
116                         q->back().timecode, timecode);
117                 frame.owner->release_frame(frame);
118                 return;
119         }
120
121         QueuedFrame qf;
122         qf.format = format;
123         qf.timecode = timecode;
124         qf.frame = frame;
125
126         {
127                 unique_lock<mutex> lock(queue_lock);
128                 q->push_back(move(qf));
129         }
130         queues_not_empty.notify_one();  // might be spurious
131 }
132
133 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
134 {
135         FILE *fp = fopen(filename, "wb");
136         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
137                 printf("short write!\n");
138         }
139         fclose(fp);
140 }
141
142 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
143 {
144         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
145 }
146
147 void BMUSBCapture::dequeue_thread_func()
148 {
149         if (has_dequeue_callbacks) {
150                 dequeue_init_callback();
151         }
152         while (!dequeue_thread_should_quit) {
153                 unique_lock<mutex> lock(queue_lock);
154                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
155
156                 uint16_t video_timecode = pending_video_frames.front().timecode;
157                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
158                 if (video_timecode < audio_timecode) {
159                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
160                                 video_timecode);
161                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
162                         pending_video_frames.pop_front();
163                 } else if (audio_timecode < video_timecode) {
164                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
165                                 audio_timecode);
166                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
167                         pending_audio_frames.pop_front();
168                 } else {
169                         QueuedFrame video_frame = pending_video_frames.front();
170                         QueuedFrame audio_frame = pending_audio_frames.front();
171                         pending_audio_frames.pop_front();
172                         pending_video_frames.pop_front();
173                         lock.unlock();
174
175 #if 0
176                         char filename[255];
177                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
178                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
179                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
180 #endif
181
182                         frame_callback(video_timecode,
183                                        video_frame.frame, HEADER_SIZE, video_frame.format,
184                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
185                 }
186         }
187         if (has_dequeue_callbacks) {
188                 dequeue_cleanup_callback();
189         }
190 }
191
192 void BMUSBCapture::start_new_frame(const uint8_t *start)
193 {
194         uint16_t format = (start[3] << 8) | start[2];
195         uint16_t timecode = (start[1] << 8) | start[0];
196
197         if (current_video_frame.len > 0) {
198                 //dump_frame();
199                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
200         }
201         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
202         //      format, timecode,
203         //      //start[7], start[6], start[5], start[4],
204         //      read_current_frame, FRAME_SIZE);
205
206         current_video_frame = video_frame_allocator->alloc_frame();
207         //if (current_video_frame.data == nullptr) {
208         //      read_current_frame = -1;
209         //} else {
210         //      read_current_frame = 0;
211         //}
212 }
213
214 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
215 {
216         uint16_t format = (start[3] << 8) | start[2];
217         uint16_t timecode = (start[1] << 8) | start[0];
218         if (current_audio_frame.len > 0) {
219                 //dump_audio_block();
220                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
221         }
222         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
223         //      format, timecode, read_current_audio_block);
224         current_audio_frame = audio_frame_allocator->alloc_frame();
225 }
226
#if 0
// Debug helper (compiled out): hex-dumps the received bytes of one isochronous
// packet, 16 bytes per line with a wider gap after the eighth byte.
// <offset> is the packet's starting position within xfr->buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", xfr->buffer[pos + offset]);

                const char *separator = " ";
                if ((pos % 16) == 15) {
                        separator = "\n";
                } else if ((pos % 8) == 7) {
                        separator = "  ";
                }
                printf("%s", separator);
        }
}
#endif
243
// Splits <n> bytes from <src> across two destinations: bytes at even offsets
// go to <dest1>, bytes at odd offsets go to <dest2>, each packed contiguously.
// <n> must be even (asserted); each destination receives n / 2 bytes.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t pairs = n / 2;
        for (size_t pair = 0; pair < pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
255
256 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
257 {
258         if (current_frame->data == nullptr ||
259             current_frame->len > current_frame->size ||
260             start == end) {
261                 return;
262         }
263
264         int bytes = end - start;
265         if (current_frame->len + bytes > current_frame->size) {
266                 printf("%d bytes overflow after last %s frame\n",
267                         int(current_frame->len + bytes - current_frame->size), frame_type_name);
268                 //dump_frame();
269         } else {
270                 if (current_frame->interleaved) {
271                         uint8_t *data = current_frame->data + current_frame->len / 2;
272                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
273                         if (current_frame->len % 2 == 1) {
274                                 ++data;
275                                 swap(data, data2);
276                         }
277                         if (bytes % 2 == 1) {
278                                 *data++ = *start++;
279                                 swap(data, data2);
280                                 ++current_frame->len;
281                                 --bytes;
282                         }
283                         memcpy_interleaved(data, data2, start, bytes);
284                         current_frame->len += bytes;
285                 } else {
286                         memcpy(current_frame->data + current_frame->len, start, bytes);
287                         current_frame->len += bytes;
288                 }
289         }
290 }
291
292 #ifdef __SSE4_1__
293
#if 0
// Debug helper (compiled out): prints <name> followed by the 32 bytes of <n>
// in hex, lowest byte first, with an extra space between groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
336
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data
// from main memory.
//
// Complicated cases are left to the slow path; it basically stops copying
// up until the first instance of "sync_char" (usually a bit before, actually).
// This is fine, since 0x00 bytes shouldn't really show up in normal picture
// data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Parameters:
//   current_frame: frame to append to; its <len> is advanced by the number of
//                  bytes consumed.
//   start, limit:  input range [start, limit).
//   sync_char:     byte value that makes the copy stop.
//
// Returns a pointer to the first input byte that was NOT consumed; the caller
// is expected to continue from there with the slow path. Returns <start>
// unchanged when nothing was done (no buffer, frame already over its size,
// fewer than 128 input bytes, or no room for an aligned block).
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        if (current_frame->data == nullptr ||
            current_frame->len > current_frame->size ||
            start == limit) {
                return start;
        }
        size_t orig_bytes = limit - start;
        if (orig_bytes < 128) {
                // Don't bother.
                return start;
        }

        // Don't read more bytes than we can write.
        limit = min(limit, start + (current_frame->size - current_frame->len));

        // Align end to 32 bytes.
        limit = (const uint8_t *)(intptr_t(limit) & ~31);

        if (start >= limit) {
                return start;
        }

        // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        // If the sync byte shows up in this unaligned prefix, copy only up to
        // it and let the caller take over from there.
        const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
        if (aligned_start != start) {
                const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
                if (sync_start == nullptr) {
                        add_to_frame(current_frame, "", start, aligned_start);
                } else {
                        add_to_frame(current_frame, "", start, sync_start);
                        return sync_start;
                }
        }

        // Make the length a multiple of 64.
        // (The interleaved AVX2 loop below reads two 32-byte vectors per iteration.)
        if (current_frame->interleaved) {
                if (((limit - aligned_start) % 64) != 0) {
                        limit -= 32;
                }
                assert(((limit - aligned_start) % 64) == 0);
        }

#if __AVX2__
        const __m256i needle = _mm256_set1_epi8(sync_char);

        const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
                // out1 receives the bytes at even input offsets and out2 those at
                // odd offsets; if <len> is odd, the roles of the two planes are swapped.
                __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
                __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                // Within each 128-bit lane: even-indexed bytes to the low half,
                // odd-indexed bytes to the high half.
                __m256i shuffle_cw = _mm256_set_epi8(
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
                        // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
                        __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
                        __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp

                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
                        __m256i found = _mm256_or_si256(found1, found2);

                        data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
                        data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
                
                        data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
                        data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop

                        // lo = low 128 bits of data1 and data2, hi = their high 128 bits.
                        __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
                        __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);

                        _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
                        _mm256_storeu_si256(out2, hi);

                        // A sync byte somewhere in these 64 bytes: stop without
                        // advancing <in>, so this chunk does not count as consumed.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
                while (in < (const __m256i *)limit) {
                        __m256i data = _mm256_load_si256(in);
                        _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
                        __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // Sync byte in this 32-byte chunk: stop before counting it.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#else
        // SSE4.1 variant of the same two loops, working on 16-byte vectors.
        const __m128i needle = _mm_set1_epi8(sync_char);

        const __m128i *in = (const __m128i *)aligned_start;
        if (current_frame->interleaved) {
                __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
                __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
                while (in < (const __m128i *)limit) {
                        __m128i data1 = _mm_load_si128(in);
                        __m128i data2 = _mm_load_si128(in + 1);
                        // Split each 16-bit word into its low byte (even offset)
                        // and its high byte (odd offset), then pack each set into
                        // 16 contiguous bytes.
                        __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
                        __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
                        __m128i data1_hi = _mm_srli_epi16(data1, 8);
                        __m128i data2_hi = _mm_srli_epi16(data2, 8);
                        __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
                        _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
                        __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
                        _mm_storeu_si128(out2, hi);
                        __m128i found1 = _mm_cmpeq_epi8(data1, needle);
                        __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                        // Sync byte in these 32 bytes: stop without advancing <in>.
                        if (!_mm_testz_si128(found1, found1) ||
                            !_mm_testz_si128(found2, found2)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
                while (in < (const __m128i *)limit) {
                        __m128i data = _mm_load_si128(in);
                        _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
                        __m128i found = _mm_cmpeq_epi8(data, needle);
                        if (!_mm_testz_si128(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#endif

        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);

        return (const uint8_t *)in;
}
504 #endif
505
506 void decode_packs(const libusb_transfer *xfr,
507                   const char *sync_pattern,
508                   int sync_length,
509                   FrameAllocator::Frame *current_frame,
510                   const char *frame_type_name,
511                   function<void(const uint8_t *start)> start_callback)
512 {
513         int offset = 0;
514         for (int i = 0; i < xfr->num_iso_packets; i++) {
515                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
516
517                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
518                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
519                         continue;
520 //exit(5);
521                 }
522
523                 const uint8_t *start = xfr->buffer + offset;
524                 const uint8_t *limit = start + pack->actual_length;
525                 while (start < limit) {  // Usually runs only one iteration.
526 #ifdef __SSE4_1__
527                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
528                         if (start == limit) break;
529                         assert(start < limit);
530 #endif
531
532                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
533                         if (start_next_frame == nullptr) {
534                                 // add the rest of the buffer
535                                 add_to_frame(current_frame, frame_type_name, start, limit);
536                                 break;
537                         } else {
538                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
539                                 start = start_next_frame + sync_length;  // skip sync
540                                 start_callback(start);
541                         }
542                 }
543 #if 0
544                 dump_pack(xfr, offset, pack);
545 #endif
546                 offset += pack->length;
547         }
548 }
549
550 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
551 {
552         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
553                 fprintf(stderr, "transfer status %d\n", xfr->status);
554                 libusb_free_transfer(xfr);
555                 exit(3);
556         }
557
558         assert(xfr->user_data != nullptr);
559         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
560
561         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
562                 if (xfr->endpoint == 0x84) {
563                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
564                 } else {
565                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
566                 }
567         }
568         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
569                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
570                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
571 #if 0
572                 if (setup->wIndex == 44) {
573                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
574                 } else {
575                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
576                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
577                 }
578 #else
579                 memcpy(usb->register_file + usb->current_register, buf, 4);
580                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
581                 if (usb->current_register == 0) {
582                         // read through all of them
583                         printf("register dump:");
584                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
585                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
586                         }
587                         printf("\n");
588                 }
589                 libusb_fill_control_setup(xfr->buffer,
590                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
591                         /*index=*/usb->current_register, /*length=*/4);
592 #endif
593         }
594
595 #if 0
596         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
597         for (i = 0; i < xfr->actual_length; i++) {
598                 printf("%02x", xfr->buffer[i]);
599                 if (i % 16)
600                         printf("\n");
601                 else if (i % 8)
602                         printf("  ");
603                 else
604                         printf(" ");
605         }
606 #endif
607
608         if (libusb_submit_transfer(xfr) < 0) {
609                 fprintf(stderr, "error re-submitting URB\n");
610                 exit(1);
611         }
612 }
613
614 void BMUSBCapture::usb_thread_func()
615 {
616         sched_param param;
617         memset(&param, 0, sizeof(param));
618         param.sched_priority = 1;
619         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
620                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
621         }
622         while (!should_quit) {
623                 int rc = libusb_handle_events(nullptr);
624                 if (rc != LIBUSB_SUCCESS)
625                         break;
626         }
627 }
628
629 void BMUSBCapture::configure_card()
630 {
631         if (video_frame_allocator == nullptr) {
632                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
633         }
634         if (audio_frame_allocator == nullptr) {
635                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
636         }
637         dequeue_thread_should_quit = false;
638         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
639
640         int rc;
641         struct libusb_transfer *xfr;
642
643         rc = libusb_init(nullptr);
644         if (rc < 0) {
645                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
646                 exit(1);
647         }
648
649         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
650         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
651         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
652         if (!devh) {
653                 fprintf(stderr, "Error finding USB device\n");
654                 exit(1);
655         }
656
657         libusb_config_descriptor *config;
658         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
659         if (rc < 0) {
660                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
661                 exit(1);
662         }
663         printf("%d interface\n", config->bNumInterfaces);
664         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
665                 printf("  interface %d\n", interface_number);
666                 const libusb_interface *interface = &config->interface[interface_number];
667                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
668                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
669                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
670                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
671                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
672                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
673                         }
674                 }
675         }
676
677         rc = libusb_set_configuration(devh, /*configuration=*/1);
678         if (rc < 0) {
679                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
680                 exit(1);
681         }
682
683         rc = libusb_claim_interface(devh, 0);
684         if (rc < 0) {
685                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
686                 exit(1);
687         }
688
689         // Alternate setting 1 is output, alternate setting 2 is input.
690         // Card is reset when switching alternates, so the driver uses
691         // this “double switch” when it wants to reset.
692         //
693         // There's also alternate settings 3 and 4, which seem to be
694         // like 1 and 2 except they advertise less bandwidth needed.
695         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
696         if (rc < 0) {
697                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
698                 exit(1);
699         }
700         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
701         if (rc < 0) {
702                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
703                 exit(1);
704         }
705 #if 0
706         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
707         if (rc < 0) {
708                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
709                 exit(1);
710         }
711 #endif
712
713 #if 0
714         rc = libusb_claim_interface(devh, 3);
715         if (rc < 0) {
716                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
717                 exit(1);
718         }
719 #endif
720
721         // theories:
722         //   44 is some kind of timer register (first 16 bits count upwards)
723         //   24 is some sort of watchdog?
724         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
725         //      (or will go to 0x73c60010?), also seen 0x73c60100
726         //   12 also changes all the time, unclear why  
727         //   16 seems to be autodetected mode somehow
728         //      --    this is e00115e0 after reset?
729         //                    ed0115e0 after mode change [to output?]
730         //                    2d0015e0 after more mode change [to input]
731         //                    ed0115e0 after more mode change
732         //                    2d0015e0 after more mode change
733         //
734         //                    390115e0 seems to indicate we have signal
735         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
736         //
737         //                    200015e0 on startup
738         //         changes to 250115e0 when we sync to the signal
739         //
740         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
741         //
742         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
743         //
744         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
745         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
746         //
747         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
748         //    perhaps some of them are related to analog output?
749         //
750         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
751         //    but the driver sets it to 0x8036802a at some point.
752         //
753         //    all of this is on request 214/215. other requests (192, 219,
754         //    222, 223, 224) are used for firmware upgrade. Probably best to
755         //    stay out of it unless you know what you're doing.
756         //
757         //
758         // register 16:
759         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
760         //
761         // theories:
762         //   0x01 - stable signal
763         //   0x04 - deep color
764         //   0x08 - unknown (audio??)
765         //   0x20 - 720p??
766         //   0x30 - 576p??
767
768         struct ctrl {
769                 int endpoint;
770                 int request;
771                 int index;
772                 uint32_t data;
773         };
774         static const ctrl ctrls[] = {
775                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
776                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
777
778                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
779                 // capture (v210).
780                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
781                 // 0x10000000 = analog audio instead of embedded audio, it seems
782                 // 0x3a000000 = component video? (analog audio)
783                 // 0x3c000000 = composite video? (analog audio)
784                 // 0x3e000000 = s-video? (analog audio)
785                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
786                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
787                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
788                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
789                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
790         };
791
792         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
793                 uint32_t flipped = htonl(ctrls[req].data);
794                 static uint8_t value[4];
795                 memcpy(value, &flipped, sizeof(flipped));
796                 int size = sizeof(value);
797                 //if (ctrls[req].request == 215) size = 0;
798                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
799                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
800                 if (rc < 0) {
801                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
802                         exit(1);
803                 }
804                 
805                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
806                 for (int i = 0; i < rc; ++i) {
807                         printf("%02x", value[i]);
808                 }
809                 printf("\n");
810         }
811
812 #if 0
813         // DEBUG
814         for ( ;; ) {
815                 static int my_index = 0;
816                 static uint8_t value[4];
817                 int size = sizeof(value);
818                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
819                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
820                 if (rc < 0) {
821                         fprintf(stderr, "Error on control\n");
822                         exit(1);
823                 }
824                 printf("rc=%d index=%d: 0x", rc, my_index);
825                 for (int i = 0; i < rc; ++i) {
826                         printf("%02x", value[i]);
827                 }
828                 printf("\n");
829         }
830 #endif
831
832 #if 0
833         // set up an asynchronous transfer of the timer register
834         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
835         static int completed = 0;
836
837         xfr = libusb_alloc_transfer(0);
838         libusb_fill_control_setup(cmdbuf,
839             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
840                 /*index=*/44, /*length=*/4);
841         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
842         xfr->user_data = this;
843         libusb_submit_transfer(xfr);
844
845         // set up an asynchronous transfer of register 24
846         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
847         static int completed2 = 0;
848
849         xfr = libusb_alloc_transfer(0);
850         libusb_fill_control_setup(cmdbuf2,
851             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
852                 /*index=*/24, /*length=*/4);
853         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
854         xfr->user_data = this;
855         libusb_submit_transfer(xfr);
856 #endif
857
858         // set up an asynchronous transfer of the register dump
859         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
860         static int completed3 = 0;
861
862         xfr = libusb_alloc_transfer(0);
863         libusb_fill_control_setup(cmdbuf3,
864             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
865                 /*index=*/current_register, /*length=*/4);
866         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
867         xfr->user_data = this;
868         //libusb_submit_transfer(xfr);
869
870         audiofp = fopen("audio.raw", "wb");
871
872         // set up isochronous transfers for audio and video
873         for (int e = 3; e <= 4; ++e) {
874                 //int num_transfers = (e == 3) ? 6 : 6;
875                 int num_transfers = 6;
876                 for (int i = 0; i < num_transfers; ++i) {
877                         int num_iso_pack, size;
878                         if (e == 3) {
879                                 // Video seems to require isochronous packets scaled with the width; 
880                                 // seemingly six lines is about right, rounded up to the required 1kB
881                                 // multiple.
882                                 size = WIDTH * 2 * 6;
883                                 // Note that for 10-bit input, you'll need to increase size accordingly.
884                                 //size = size * 4 / 3;
885                                 if (size % 1024 != 0) {
886                                         size &= ~1023;
887                                         size += 1024;
888                                 }
889                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
890                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
891                         } else {
892                                 size = 0xc0;
893                                 num_iso_pack = 80;
894                         }
895                         int num_bytes = num_iso_pack * size;
896                         uint8_t *buf = new uint8_t[num_bytes];
897
898                         xfr = libusb_alloc_transfer(num_iso_pack);
899                         if (!xfr) {
900                                 fprintf(stderr, "oom\n");
901                                 exit(1);
902                         }
903
904                         int ep = LIBUSB_ENDPOINT_IN | e;
905                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
906                                 num_iso_pack, cb_xfr, nullptr, 0);
907                         libusb_set_iso_packet_lengths(xfr, size);
908                         xfr->user_data = this;
909                         iso_xfrs.push_back(xfr);
910                 }
911         }
912 }
913
914 void BMUSBCapture::start_bm_capture()
915 {
916         printf("starting capture\n");
917         int i = 0;
918         for (libusb_transfer *xfr : iso_xfrs) {
919                 printf("submitting transfer...\n");
920                 int rc = libusb_submit_transfer(xfr);
921                 ++i;
922                 if (rc < 0) {
923                         //printf("num_bytes=%d\n", num_bytes);
924                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
925                                 xfr->endpoint, i, libusb_error_name(rc));
926                         exit(1);
927                 }
928         }
929
930
931 #if 0
932         libusb_release_interface(devh, 0);
933 out:
934         if (devh)
935                 libusb_close(devh);
936         libusb_exit(nullptr);
937         return rc;
938 #endif
939 }
940
941 void BMUSBCapture::stop_dequeue_thread()
942 {
943         dequeue_thread_should_quit = true;
944         queues_not_empty.notify_all();
945         dequeue_thread.join();
946 }
947
948 void BMUSBCapture::start_bm_thread()
949 {
950         should_quit = false;
951         usb_thread = thread(&BMUSBCapture::usb_thread_func);
952 }
953
954 void BMUSBCapture::stop_bm_thread()
955 {
956         should_quit = true;
957         usb_thread.join();
958 }