]> git.sesse.net Git - nageru/blob - bmusb.cpp
Refcount the input frames directly instead of trying to free them after-the-fact...
[nageru] / bmusb.cpp
1 // TODO: Replace with linking to upstream bmusb.
2
3 // Intensity Shuttle USB3 prototype capture driver, v0.3
4 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
5 // (can do captures for hours at a time with no drops), except during startup
6 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
7 // Audio comes out as 8-channel 24-bit raw audio.
8
9 #include <assert.h>
10 #include <errno.h>
11 #include <libusb.h>
12 #include <netinet/in.h>
13 #include <sched.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef __SSE4_1__
19 #include <immintrin.h>
20 #endif
21 #include "bmusb.h"
22
23 #include <algorithm>
24 #include <atomic>
25 #include <condition_variable>
26 #include <cstddef>
27 #include <cstdint>
28 #include <deque>
29 #include <functional>
30 #include <memory>
31 #include <mutex>
32 #include <stack>
33 #include <thread>
34
35 using namespace std;
36 using namespace std::placeholders;
37
38 #define WIDTH 1280
39 #define HEIGHT 750  /* 30 lines ancillary data? */
40 //#define WIDTH 1920
41 //#define HEIGHT 1125  /* ??? lines ancillary data? */
42 #define HEADER_SIZE 44
43 //#define HEADER_SIZE 0
44 #define AUDIO_HEADER_SIZE 4
45
46 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
47 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
48 #define FRAME_SIZE (8 << 20)
49
50 FILE *audiofp;
51
52 thread usb_thread;
53 atomic<bool> should_quit;
54
55 FrameAllocator::~FrameAllocator() {}
56
57 #define NUM_QUEUED_FRAMES 16
58 class MallocFrameAllocator : public FrameAllocator {
59 public:
60         MallocFrameAllocator(size_t frame_size);
61         Frame alloc_frame() override;
62         void release_frame(Frame frame) override;
63
64 private:
65         size_t frame_size;
66
67         mutex freelist_mutex;
68         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
69 };
70
71 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
72         : frame_size(frame_size)
73 {
74         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
75                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
76         }
77 }
78
79 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
80 {
81         Frame vf;
82         vf.owner = this;
83
84         unique_lock<mutex> lock(freelist_mutex);  // Meh.
85         if (freelist.empty()) {
86                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
87                         frame_size);
88         } else {
89                 vf.data = freelist.top().release();
90                 vf.size = frame_size;
91                 freelist.pop();  // Meh.
92         }
93         return vf;
94 }
95
96 void MallocFrameAllocator::release_frame(Frame frame)
97 {
98         unique_lock<mutex> lock(freelist_mutex);
99         freelist.push(unique_ptr<uint8_t[]>(frame.data));
100 }
101
// Returns true if <a> comes strictly before <b> when both are treated as
// 16-bit counters that wrap around: that is, if stepping forward from <a>
// reaches <b> in at least 1 and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t distance = static_cast<uint16_t>(b - a);
        return distance != 0 && distance < 0x8000;
}
113
114 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
115 {
116         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
117                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
118                         q->back().timecode, timecode);
119                 frame.owner->release_frame(frame);
120                 return;
121         }
122
123         QueuedFrame qf;
124         qf.format = format;
125         qf.timecode = timecode;
126         qf.frame = frame;
127
128         {
129                 unique_lock<mutex> lock(queue_lock);
130                 q->push_back(move(qf));
131         }
132         queues_not_empty.notify_one();  // might be spurious
133 }
134
135 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
136 {
137         FILE *fp = fopen(filename, "wb");
138         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
139                 printf("short write!\n");
140         }
141         fclose(fp);
142 }
143
144 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
145 {
146         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
147 }
148
149 void BMUSBCapture::dequeue_thread()
150 {
151         for ( ;; ) {
152                 unique_lock<mutex> lock(queue_lock);
153                 queues_not_empty.wait(lock, [this]{ return !pending_video_frames.empty() && !pending_audio_frames.empty(); });
154
155                 uint16_t video_timecode = pending_video_frames.front().timecode;
156                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
157                 if (video_timecode < audio_timecode) {
158                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
159                                 video_timecode);
160                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
161                         pending_video_frames.pop_front();
162                 } else if (audio_timecode < video_timecode) {
163                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
164                                 audio_timecode);
165                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
166                         pending_audio_frames.pop_front();
167                 } else {
168                         QueuedFrame video_frame = pending_video_frames.front();
169                         QueuedFrame audio_frame = pending_audio_frames.front();
170                         pending_audio_frames.pop_front();
171                         pending_video_frames.pop_front();
172                         lock.unlock();
173
174 #if 0
175                         char filename[255];
176                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
177                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
178                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
179 #endif
180
181                         frame_callback(video_timecode,
182                                        video_frame.frame, HEADER_SIZE, video_frame.format,
183                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
184                 }
185         }
186 }
187
188 void BMUSBCapture::start_new_frame(const uint8_t *start)
189 {
190         uint16_t format = (start[3] << 8) | start[2];
191         uint16_t timecode = (start[1] << 8) | start[0];
192
193         if (current_video_frame.len > 0) {
194                 //dump_frame();
195                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
196         }
197         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
198         //      format, timecode,
199         //      //start[7], start[6], start[5], start[4],
200         //      read_current_frame, FRAME_SIZE);
201
202         current_video_frame = video_frame_allocator->alloc_frame();
203         //if (current_video_frame.data == nullptr) {
204         //      read_current_frame = -1;
205         //} else {
206         //      read_current_frame = 0;
207         //}
208 }
209
210 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
211 {
212         uint16_t format = (start[3] << 8) | start[2];
213         uint16_t timecode = (start[1] << 8) | start[0];
214         if (current_audio_frame.len > 0) {
215                 //dump_audio_block();
216                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
217         }
218         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
219         //      format, timecode, read_current_audio_block);
220         current_audio_frame = audio_frame_allocator->alloc_frame();
221 }
222
#if 0
// Debug helper (disabled): hex-dumps the received bytes of one isochronous
// packet, which starts <offset> bytes into the transfer buffer. Sixteen bytes
// are printed per line, with a wider gap after the eighth.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        for (unsigned j = 0; j < pack->actual_length; j++) {
        //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
                printf("%02x", xfr->buffer[j + offset]);
                if ((j % 16) == 15)
                        printf("\n");
                else if ((j % 8) == 7)
                        printf("  ");
                else
                        printf(" ");
        }
}
#endif
239
// Splits <n> bytes from <src> into two streams: bytes at even offsets go to
// <dest1> and bytes at odd offsets go to <dest2>, so each destination
// receives n/2 bytes. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
251
252 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
253 {
254         if (current_frame->data == nullptr ||
255             current_frame->len > current_frame->size ||
256             start == end) {
257                 return;
258         }
259
260         int bytes = end - start;
261         if (current_frame->len + bytes > current_frame->size) {
262                 printf("%d bytes overflow after last %s frame\n",
263                         int(current_frame->len + bytes - current_frame->size), frame_type_name);
264                 //dump_frame();
265         } else {
266                 if (current_frame->interleaved) {
267                         uint8_t *data = current_frame->data + current_frame->len / 2;
268                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
269                         if (current_frame->len % 2 == 1) {
270                                 ++data;
271                                 swap(data, data2);
272                         }
273                         if (bytes % 2 == 1) {
274                                 *data++ = *start++;
275                                 swap(data, data2);
276                                 ++current_frame->len;
277                                 --bytes;
278                         }
279                         memcpy_interleaved(data, data2, start, bytes);
280                         current_frame->len += bytes;
281                 } else {
282                         memcpy(current_frame->data + current_frame->len, start, bytes);
283                         current_frame->len += bytes;
284                 }
285         }
286 }
287
288 #ifdef __SSE4_1__
289
#if 0
// Debug helper (disabled): prints <name> followed by the 32 bytes of <n> in
// hex, lowest byte first, with an extra space after every eighth byte.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i > 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
332
333 // Does a memcpy and memchr in one to reduce processing time.
334 // Note that the benefit is somewhat limited if your L3 cache is small,
335 // as you'll (unfortunately) spend most of the time loading the data
336 // from main memory.
337 //
338 // Complicated cases are left to the slow path; it basically stops copying
339 // up until the first instance of "sync_char" (usually a bit before, actually).
340 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
341 // data, and what we really need this for is the 00 00 ff ff marker in video data.
342 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
343 {
344         if (current_frame->data == nullptr ||
345             current_frame->len > current_frame->size ||
346             start == limit) {
347                 return start;
348         }
349         size_t orig_bytes = limit - start;
350         if (orig_bytes < 128) {
351                 // Don't bother.
352                 return start;
353         }
354
355         // Don't read more bytes than we can write.
356         limit = min(limit, start + (current_frame->size - current_frame->len));
357
358         // Align end to 32 bytes.
359         limit = (const uint8_t *)(intptr_t(limit) & ~31);
360
361         if (start >= limit) {
362                 return start;
363         }
364
365         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
366         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
367         if (aligned_start != start) {
368                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
369                 if (sync_start == nullptr) {
370                         add_to_frame(current_frame, "", start, aligned_start);
371                 } else {
372                         add_to_frame(current_frame, "", start, sync_start);
373                         return sync_start;
374                 }
375         }
376
377         // Make the length a multiple of 64.
378         if (current_frame->interleaved) {
379                 if (((limit - aligned_start) % 64) != 0) {
380                         limit -= 32;
381                 }
382                 assert(((limit - aligned_start) % 64) == 0);
383         }
384
385 #if __AVX2__
386         const __m256i needle = _mm256_set1_epi8(sync_char);
387
388         const __restrict __m256i *in = (const __m256i *)aligned_start;
389         if (current_frame->interleaved) {
390                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
391                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
392                 if (current_frame->len % 2 == 1) {
393                         swap(out1, out2);
394                 }
395
396                 __m256i shuffle_cw = _mm256_set_epi8(
397                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
398                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
399                 while (in < (const __m256i *)limit) {
400                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
401                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
402                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
403
404                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
405                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
406                         __m256i found = _mm256_or_si256(found1, found2);
407
408                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
409                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
410                 
411                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
412                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
413
414                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
415                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
416
417                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
418                         _mm256_storeu_si256(out2, hi);
419
420                         if (!_mm256_testz_si256(found, found)) {
421                                 break;
422                         }
423
424                         in += 2;
425                         ++out1;
426                         ++out2;
427                 }
428                 current_frame->len += (uint8_t *)in - aligned_start;
429         } else {
430                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
431                 while (in < (const __m256i *)limit) {
432                         __m256i data = _mm256_load_si256(in);
433                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
434                         __m256i found = _mm256_cmpeq_epi8(data, needle);
435                         if (!_mm256_testz_si256(found, found)) {
436                                 break;
437                         }
438
439                         ++in;
440                         ++out;
441                 }
442                 current_frame->len = (uint8_t *)out - current_frame->data;
443         }
444 #else
445         const __m128i needle = _mm_set1_epi8(sync_char);
446
447         const __m128i *in = (const __m128i *)aligned_start;
448         if (current_frame->interleaved) {
449                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
450                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
451                 if (current_frame->len % 2 == 1) {
452                         swap(out1, out2);
453                 }
454
455                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
456                 while (in < (const __m128i *)limit) {
457                         __m128i data1 = _mm_load_si128(in);
458                         __m128i data2 = _mm_load_si128(in + 1);
459                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
460                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
461                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
462                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
463                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
464                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
465                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
466                         _mm_storeu_si128(out2, hi);
467                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
468                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
469                         if (!_mm_testz_si128(found1, found1) ||
470                             !_mm_testz_si128(found2, found2)) {
471                                 break;
472                         }
473
474                         in += 2;
475                         ++out1;
476                         ++out2;
477                 }
478                 current_frame->len += (uint8_t *)in - aligned_start;
479         } else {
480                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
481                 while (in < (const __m128i *)limit) {
482                         __m128i data = _mm_load_si128(in);
483                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
484                         __m128i found = _mm_cmpeq_epi8(data, needle);
485                         if (!_mm_testz_si128(found, found)) {
486                                 break;
487                         }
488
489                         ++in;
490                         ++out;
491                 }
492                 current_frame->len = (uint8_t *)out - current_frame->data;
493         }
494 #endif
495
496         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
497
498         return (const uint8_t *)in;
499 }
500 #endif
501
502 void decode_packs(const libusb_transfer *xfr,
503                   const char *sync_pattern,
504                   int sync_length,
505                   FrameAllocator::Frame *current_frame,
506                   const char *frame_type_name,
507                   function<void(const uint8_t *start)> start_callback)
508 {
509         int offset = 0;
510         for (int i = 0; i < xfr->num_iso_packets; i++) {
511                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
512
513                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
514                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
515                         continue;
516 //exit(5);
517                 }
518
519                 const uint8_t *start = xfr->buffer + offset;
520                 const uint8_t *limit = start + pack->actual_length;
521                 while (start < limit) {  // Usually runs only one iteration.
522 #ifdef __SSE4_1__
523                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
524                         if (start == limit) break;
525                         assert(start < limit);
526 #endif
527
528                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
529                         if (start_next_frame == nullptr) {
530                                 // add the rest of the buffer
531                                 add_to_frame(current_frame, frame_type_name, start, limit);
532                                 break;
533                         } else {
534                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
535                                 start = start_next_frame + sync_length;  // skip sync
536                                 start_callback(start);
537                         }
538                 }
539 #if 0
540                 dump_pack(xfr, offset, pack);
541 #endif
542                 offset += pack->length;
543         }
544 }
545
546 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
547 {
548         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
549                 fprintf(stderr, "transfer status %d\n", xfr->status);
550                 libusb_free_transfer(xfr);
551                 exit(3);
552         }
553
554         assert(xfr->user_data != nullptr);
555         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
556
557         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
558                 if (xfr->endpoint == 0x84) {
559                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
560                 } else {
561                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
562                 }
563         }
564         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
565                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
566                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
567 #if 0
568                 if (setup->wIndex == 44) {
569                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
570                 } else {
571                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
572                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
573                 }
574 #else
575                 memcpy(usb->register_file + usb->current_register, buf, 4);
576                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
577                 if (usb->current_register == 0) {
578                         // read through all of them
579                         printf("register dump:");
580                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
581                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
582                         }
583                         printf("\n");
584                 }
585                 libusb_fill_control_setup(xfr->buffer,
586                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
587                         /*index=*/usb->current_register, /*length=*/4);
588 #endif
589         }
590
591 #if 0
592         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
593         for (i = 0; i < xfr->actual_length; i++) {
594                 printf("%02x", xfr->buffer[i]);
595                 if (i % 16)
596                         printf("\n");
597                 else if (i % 8)
598                         printf("  ");
599                 else
600                         printf(" ");
601         }
602 #endif
603
604         if (libusb_submit_transfer(xfr) < 0) {
605                 fprintf(stderr, "error re-submitting URB\n");
606                 exit(1);
607         }
608 }
609
610 void BMUSBCapture::usb_thread_func()
611 {
612         sched_param param;
613         memset(&param, 0, sizeof(param));
614         param.sched_priority = 1;
615         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
616                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
617         }
618         while (!should_quit) {
619                 int rc = libusb_handle_events(nullptr);
620                 if (rc != LIBUSB_SUCCESS)
621                         break;
622         }
623 }
624
625 void BMUSBCapture::configure_card()
626 {
627         if (video_frame_allocator == nullptr) {
628                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
629         }
630         if (audio_frame_allocator == nullptr) {
631                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
632         }
633         thread(&BMUSBCapture::dequeue_thread, this).detach();
634
635         int rc;
636         struct libusb_transfer *xfr;
637
638         rc = libusb_init(nullptr);
639         if (rc < 0) {
640                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
641                 exit(1);
642         }
643
644         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
645         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
646         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
647         if (!devh) {
648                 fprintf(stderr, "Error finding USB device\n");
649                 exit(1);
650         }
651
652         libusb_config_descriptor *config;
653         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
654         if (rc < 0) {
655                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
656                 exit(1);
657         }
658         printf("%d interface\n", config->bNumInterfaces);
659         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
660                 printf("  interface %d\n", interface_number);
661                 const libusb_interface *interface = &config->interface[interface_number];
662                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
663                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
664                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
665                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
666                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
667                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
668                         }
669                 }
670         }
671
672         rc = libusb_set_configuration(devh, /*configuration=*/1);
673         if (rc < 0) {
674                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
675                 exit(1);
676         }
677
678         rc = libusb_claim_interface(devh, 0);
679         if (rc < 0) {
680                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
681                 exit(1);
682         }
683
684         // Alternate setting 1 is output, alternate setting 2 is input.
685         // Card is reset when switching alternates, so the driver uses
686         // this “double switch” when it wants to reset.
687         //
688         // There's also alternate settings 3 and 4, which seem to be
689         // like 1 and 2 except they advertise less bandwidth needed.
690         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
691         if (rc < 0) {
692                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
693                 exit(1);
694         }
695         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
696         if (rc < 0) {
697                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
698                 exit(1);
699         }
700 #if 0
701         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
702         if (rc < 0) {
703                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
704                 exit(1);
705         }
706 #endif
707
708 #if 0
709         rc = libusb_claim_interface(devh, 3);
710         if (rc < 0) {
711                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
712                 exit(1);
713         }
714 #endif
715
716         // theories:
717         //   44 is some kind of timer register (first 16 bits count upwards)
718         //   24 is some sort of watchdog?
719         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
720         //      (or will go to 0x73c60010?), also seen 0x73c60100
721         //   12 also changes all the time, unclear why  
722         //   16 seems to be autodetected mode somehow
723         //      --    this is e00115e0 after reset?
724         //                    ed0115e0 after mode change [to output?]
725         //                    2d0015e0 after more mode change [to input]
726         //                    ed0115e0 after more mode change
727         //                    2d0015e0 after more mode change
728         //
729         //                    390115e0 seems to indicate we have signal
730         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
731         //
732         //                    200015e0 on startup
733         //         changes to 250115e0 when we sync to the signal
734         //
735         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
736         //
737         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
738         //
739         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
740         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
741         //
742         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
743         //    perhaps some of them are related to analog output?
744         //
745         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
746         //    but the driver sets it to 0x8036802a at some point.
747         //
748         //    all of this is on request 214/215. other requests (192, 219,
749         //    222, 223, 224) are used for firmware upgrade. Probably best to
750         //    stay out of it unless you know what you're doing.
751         //
752         //
753         // register 16:
754         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
755         //
756         // theories:
757         //   0x01 - stable signal
758         //   0x04 - deep color
759         //   0x08 - unknown (audio??)
760         //   0x20 - 720p??
761         //   0x30 - 576p??
762
763         struct ctrl {
764                 int endpoint;
765                 int request;
766                 int index;
767                 uint32_t data;
768         };
769         static const ctrl ctrls[] = {
770                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
771                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
772
773                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
774                 // capture (v210).
775                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
776                 // 0x10000000 = analog audio instead of embedded audio, it seems
777                 // 0x3a000000 = component video? (analog audio)
778                 // 0x3c000000 = composite video? (analog audio)
779                 // 0x3e000000 = s-video? (analog audio)
780                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
781                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
782                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
783                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
784                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
785         };
786
787         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
788                 uint32_t flipped = htonl(ctrls[req].data);
789                 static uint8_t value[4];
790                 memcpy(value, &flipped, sizeof(flipped));
791                 int size = sizeof(value);
792                 //if (ctrls[req].request == 215) size = 0;
793                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
794                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
795                 if (rc < 0) {
796                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
797                         exit(1);
798                 }
799                 
800                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
801                 for (int i = 0; i < rc; ++i) {
802                         printf("%02x", value[i]);
803                 }
804                 printf("\n");
805         }
806
807 #if 0
808         // DEBUG
809         for ( ;; ) {
810                 static int my_index = 0;
811                 static uint8_t value[4];
812                 int size = sizeof(value);
813                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
814                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
815                 if (rc < 0) {
816                         fprintf(stderr, "Error on control\n");
817                         exit(1);
818                 }
819                 printf("rc=%d index=%d: 0x", rc, my_index);
820                 for (int i = 0; i < rc; ++i) {
821                         printf("%02x", value[i]);
822                 }
823                 printf("\n");
824         }
825 #endif
826
827 #if 0
828         // set up an asynchronous transfer of the timer register
829         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
830         static int completed = 0;
831
832         xfr = libusb_alloc_transfer(0);
833         libusb_fill_control_setup(cmdbuf,
834             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
835                 /*index=*/44, /*length=*/4);
836         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
837         xfr->user_data = this;
838         libusb_submit_transfer(xfr);
839
840         // set up an asynchronous transfer of register 24
841         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
842         static int completed2 = 0;
843
844         xfr = libusb_alloc_transfer(0);
845         libusb_fill_control_setup(cmdbuf2,
846             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
847                 /*index=*/24, /*length=*/4);
848         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
849         xfr->user_data = this;
850         libusb_submit_transfer(xfr);
851 #endif
852
853         // set up an asynchronous transfer of the register dump
854         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
855         static int completed3 = 0;
856
857         xfr = libusb_alloc_transfer(0);
858         libusb_fill_control_setup(cmdbuf3,
859             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
860                 /*index=*/current_register, /*length=*/4);
861         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
862         xfr->user_data = this;
863         //libusb_submit_transfer(xfr);
864
865         audiofp = fopen("audio.raw", "wb");
866
867         // set up isochronous transfers for audio and video
868         for (int e = 3; e <= 4; ++e) {
869                 //int num_transfers = (e == 3) ? 6 : 6;
870                 int num_transfers = 6;
871                 for (int i = 0; i < num_transfers; ++i) {
872                         int num_iso_pack, size;
873                         if (e == 3) {
874                                 // Video seems to require isochronous packets scaled with the width; 
875                                 // seemingly six lines is about right, rounded up to the required 1kB
876                                 // multiple.
877                                 size = WIDTH * 2 * 6;
878                                 // Note that for 10-bit input, you'll need to increase size accordingly.
879                                 //size = size * 4 / 3;
880                                 if (size % 1024 != 0) {
881                                         size &= ~1023;
882                                         size += 1024;
883                                 }
884                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
885                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
886                         } else {
887                                 size = 0xc0;
888                                 num_iso_pack = 80;
889                         }
890                         int num_bytes = num_iso_pack * size;
891                         uint8_t *buf = new uint8_t[num_bytes];
892
893                         xfr = libusb_alloc_transfer(num_iso_pack);
894                         if (!xfr) {
895                                 fprintf(stderr, "oom\n");
896                                 exit(1);
897                         }
898
899                         int ep = LIBUSB_ENDPOINT_IN | e;
900                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
901                                 num_iso_pack, cb_xfr, nullptr, 0);
902                         libusb_set_iso_packet_lengths(xfr, size);
903                         xfr->user_data = this;
904                         iso_xfrs.push_back(xfr);
905                 }
906         }
907 }
908
909 void BMUSBCapture::start_bm_capture()
910 {
911         printf("starting capture\n");
912         int i = 0;
913         for (libusb_transfer *xfr : iso_xfrs) {
914                 printf("submitting transfer...\n");
915                 int rc = libusb_submit_transfer(xfr);
916                 ++i;
917                 if (rc < 0) {
918                         //printf("num_bytes=%d\n", num_bytes);
919                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
920                                 xfr->endpoint, i, libusb_error_name(rc));
921                         exit(1);
922                 }
923         }
924
925
926 #if 0
927         libusb_release_interface(devh, 0);
928 out:
929         if (devh)
930                 libusb_close(devh);
931         libusb_exit(nullptr);
932         return rc;
933 #endif
934 }
935
936 void BMUSBCapture::start_bm_thread()
937 {
938         should_quit = false;
939         usb_thread = thread(&BMUSBCapture::usb_thread_func);
940 }
941
942 void BMUSBCapture::stop_bm_thread()
943 {
944         should_quit = true;
945         usb_thread.join();
946 }