]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Switch __SSE2__ defines over to __SSE4_1__, as we use ptest.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <libusb.h>
10 #include <arpa/inet.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <fcntl.h>
14 #include <stdint.h>
15 #include <assert.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include <algorithm>
20 #include <functional>
21 #include <memory>
22 #include <deque>
23 #include <utility>
24 #include <mutex>
25 #include <condition_variable>
26 #include <thread>
27 #include <stack>
28 #include <atomic>
29 #include "bmusb.h"
30
31 using namespace std;
32 using namespace std::placeholders;
33
34 #define WIDTH 1280
35 #define HEIGHT 750  /* 30 lines ancillary data? */
36 //#define WIDTH 1920
37 //#define HEIGHT 1125  /* ??? lines ancillary data? */
38 #define HEADER_SIZE 44
39 //#define HEADER_SIZE 0
40 #define AUDIO_HEADER_SIZE 4
41
42 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
43 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
44 #define FRAME_SIZE (8 << 20)
45
46 FILE *audiofp;
47
48 thread usb_thread;
49 atomic<bool> should_quit;
50
51 FrameAllocator::~FrameAllocator() {}
52
53 #define NUM_QUEUED_FRAMES 8
54 class MallocFrameAllocator : public FrameAllocator {
55 public:
56         MallocFrameAllocator(size_t frame_size);
57         Frame alloc_frame() override;
58         void release_frame(Frame frame) override;
59
60 private:
61         size_t frame_size;
62
63         mutex freelist_mutex;
64         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
65 };
66
67 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
68         : frame_size(frame_size)
69 {
70         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
71                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
72         }
73 }
74
75 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
76 {
77         Frame vf;
78         vf.owner = this;
79
80         unique_lock<mutex> lock(freelist_mutex);  // Meh.
81         if (freelist.empty()) {
82                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
83                         frame_size);
84         } else {
85                 vf.data = freelist.top().release();
86                 vf.size = frame_size;
87                 freelist.pop();  // Meh.
88         }
89         return vf;
90 }
91
92 void MallocFrameAllocator::release_frame(Frame frame)
93 {
94         unique_lock<mutex> lock(freelist_mutex);
95         freelist.push(unique_ptr<uint8_t[]>(frame.data));
96 }
97
// Returns true if <a> comes strictly before <b> on a 16-bit counter that
// wraps around, i.e. if stepping forward from <a> reaches <b> in
// 1..0x7fff steps. Equal values, and values exactly 0x8000 apart,
// compare as "not less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, modulo 2^16.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
109
110 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
111 {
112         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
113                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
114                         q->back().timecode, timecode);
115                 frame.owner->release_frame(frame);
116                 return;
117         }
118
119         QueuedFrame qf;
120         qf.format = format;
121         qf.timecode = timecode;
122         qf.frame = frame;
123
124         {
125                 unique_lock<mutex> lock(queue_lock);
126                 q->push_back(move(qf));
127         }
128         queues_not_empty.notify_one();  // might be spurious
129 }
130
131 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
132 {
133         FILE *fp = fopen(filename, "wb");
134         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
135                 printf("short write!\n");
136         }
137         fclose(fp);
138 }
139
140 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
141 {
142         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
143 }
144
145 void BMUSBCapture::dequeue_thread()
146 {
147         for ( ;; ) {
148                 unique_lock<mutex> lock(queue_lock);
149                 queues_not_empty.wait(lock, [this]{ return !pending_video_frames.empty() && !pending_audio_frames.empty(); });
150
151                 uint16_t video_timecode = pending_video_frames.front().timecode;
152                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
153                 if (video_timecode < audio_timecode) {
154                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
155                                 video_timecode);
156                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
157                         pending_video_frames.pop_front();
158                 } else if (audio_timecode < video_timecode) {
159                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
160                                 audio_timecode);
161                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
162                         pending_audio_frames.pop_front();
163                 } else {
164                         QueuedFrame video_frame = pending_video_frames.front();
165                         QueuedFrame audio_frame = pending_audio_frames.front();
166                         pending_audio_frames.pop_front();
167                         pending_video_frames.pop_front();
168                         lock.unlock();
169
170 #if 0
171                         char filename[255];
172                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
173                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
174                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
175 #endif
176
177                         frame_callback(video_timecode,
178                                        video_frame.frame, HEADER_SIZE, video_frame.format,
179                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
180                 }
181         }
182 }
183
184 void BMUSBCapture::start_new_frame(const uint8_t *start)
185 {
186         uint16_t format = (start[3] << 8) | start[2];
187         uint16_t timecode = (start[1] << 8) | start[0];
188
189         if (current_video_frame.len > 0) {
190                 //dump_frame();
191                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
192         }
193         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
194         //      format, timecode,
195         //      //start[7], start[6], start[5], start[4],
196         //      read_current_frame, FRAME_SIZE);
197
198         current_video_frame = video_frame_allocator->alloc_frame();
199         //if (current_video_frame.data == nullptr) {
200         //      read_current_frame = -1;
201         //} else {
202         //      read_current_frame = 0;
203         //}
204 }
205
206 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
207 {
208         uint16_t format = (start[3] << 8) | start[2];
209         uint16_t timecode = (start[1] << 8) | start[0];
210         if (current_audio_frame.len > 0) {
211                 //dump_audio_block();
212                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
213         }
214         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
215         //      format, timecode, read_current_audio_block);
216         current_audio_frame = audio_frame_allocator->alloc_frame();
217 }
218
#if 0
// Debugging aid (compiled out): hex-dumps the received bytes of one
// isochronous packet, 16 bytes per row with a wider gap after the 8th byte.
// <offset> is the position of the packet's data inside xfr->buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        const unsigned char *payload = xfr->buffer + offset;
        //for (int pos = 0; pos < min(pack->actual_length, 16u); pos++) {
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", payload[pos]);

                const bool end_of_row = (pos % 16) == 15;
                const bool end_of_half_row = (pos % 8) == 7;
                if (end_of_row) {
                        printf("\n");
                } else if (end_of_half_row) {
                        printf("  ");
                } else {
                        printf(" ");
                }
        }
}
#endif
235
// De-interleaving copy: the bytes at even positions of <src> go to <dest1>
// and the bytes at odd positions go to <dest2>, so each destination
// receives n/2 bytes. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t pos = 0; pos < n; pos += 2) {
                const size_t pair = pos / 2;
                dest1[pair] = src[pos];
                dest2[pair] = src[pos + 1];
        }
}
247
248 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
249 {
250         if (current_frame->data == nullptr ||
251             current_frame->len > current_frame->size ||
252             start == end) {
253                 return;
254         }
255
256         int bytes = end - start;
257         if (current_frame->len + bytes > current_frame->size) {
258                 printf("%d bytes overflow after last %s frame\n",
259                         int(current_frame->len + bytes - current_frame->size), frame_type_name);
260                 //dump_frame();
261         } else {
262                 if (current_frame->interleaved) {
263                         uint8_t *data = current_frame->data + current_frame->len / 2;
264                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
265                         if (current_frame->len % 2 == 1) {
266                                 ++data;
267                                 swap(data, data2);
268                         }
269                         if (bytes % 2 == 1) {
270                                 *data++ = *start++;
271                                 swap(data, data2);
272                                 ++current_frame->len;
273                                 --bytes;
274                         }
275                         memcpy_interleaved(data, data2, start, bytes);
276                         current_frame->len += bytes;
277                 } else {
278                         memcpy(current_frame->data + current_frame->len, start, bytes);
279                         current_frame->len += bytes;
280                 }
281         }
282 }
283
284 #ifdef __SSE4_1__
285
#if 0
// Debugging aid (compiled out): prints <name> followed by the 32 bytes of
// an AVX2 register in hex, lowest byte first, in four groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i > 0 && i % 8 == 0) {
                        printf(" ");  // Extra gap between groups of eight.
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
328
329 // Does a memcpy and memchr in one to reduce processing time.
330 // Note that the benefit is somewhat limited if your L3 cache is small,
331 // as you'll (unfortunately) spend most of the time loading the data
332 // from main memory.
333 //
334 // Complicated cases are left to the slow path; it basically stops copying
335 // up until the first instance of "sync_char" (usually a bit before, actually).
336 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
337 // data, and what we really need this for is the 00 00 ff ff marker in video data.
338 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
339 {
340         if (current_frame->data == nullptr ||
341             current_frame->len > current_frame->size ||
342             start == limit) {
343                 return start;
344         }
345         size_t orig_bytes = limit - start;
346         if (orig_bytes < 128) {
347                 // Don't bother.
348                 return start;
349         }
350
351         // Don't read more bytes than we can write.
352         limit = min(limit, start + (current_frame->size - current_frame->len));
353
354         // Align end to 32 bytes.
355         limit = (const uint8_t *)(intptr_t(limit) & ~31);
356
357         if (start >= limit) {
358                 return start;
359         }
360
361         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
362         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
363         if (aligned_start != start) {
364                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
365                 if (sync_start == nullptr) {
366                         add_to_frame(current_frame, "", start, aligned_start);
367                 } else {
368                         add_to_frame(current_frame, "", start, sync_start);
369                         return sync_start;
370                 }
371         }
372
373         // Make the length a multiple of 64.
374         if (current_frame->interleaved) {
375                 if (((limit - aligned_start) % 64) != 0) {
376                         limit -= 32;
377                 }
378                 assert(((limit - aligned_start) % 64) == 0);
379         }
380
381 #if __AVX2__
382         const __m256i needle = _mm256_set1_epi8(sync_char);
383
384         const __restrict __m256i *in = (const __m256i *)aligned_start;
385         if (current_frame->interleaved) {
386                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
387                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
388                 if (current_frame->len % 2 == 1) {
389                         swap(out1, out2);
390                 }
391
392                 __m256i shuffle_cw = _mm256_set_epi8(
393                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
394                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
395                 while (in < (const __m256i *)limit) {
396                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
397                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
398                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
399
400                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
401                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
402                         __m256i found = _mm256_or_si256(found1, found2);
403
404                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
405                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
406                 
407                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
408                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
409
410                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
411                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
412
413                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
414                         _mm256_storeu_si256(out2, hi);
415
416                         if (!_mm256_testz_si256(found, found)) {
417                                 break;
418                         }
419
420                         in += 2;
421                         ++out1;
422                         ++out2;
423                 }
424                 current_frame->len += (uint8_t *)in - aligned_start;
425         } else {
426                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
427                 while (in < (const __m256i *)limit) {
428                         __m256i data = _mm256_load_si256(in);
429                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
430                         __m256i found = _mm256_cmpeq_epi8(data, needle);
431                         if (!_mm256_testz_si256(found, found)) {
432                                 break;
433                         }
434
435                         ++in;
436                         ++out;
437                 }
438                 current_frame->len = (uint8_t *)out - current_frame->data;
439         }
440 #else
441         const __m128i needle = _mm_set1_epi8(sync_char);
442
443         const __m128i *in = (const __m128i *)aligned_start;
444         if (current_frame->interleaved) {
445                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
446                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
447                 if (current_frame->len % 2 == 1) {
448                         swap(out1, out2);
449                 }
450
451                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
452                 while (in < (const __m128i *)limit) {
453                         __m128i data1 = _mm_load_si128(in);
454                         __m128i data2 = _mm_load_si128(in + 1);
455                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
456                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
457                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
458                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
459                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
460                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
461                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
462                         _mm_storeu_si128(out2, hi);
463                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
464                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
465                         if (!_mm_testz_si128(found1, found1) ||
466                             !_mm_testz_si128(found2, found2)) {
467                                 break;
468                         }
469
470                         in += 2;
471                         ++out1;
472                         ++out2;
473                 }
474                 current_frame->len += (uint8_t *)in - aligned_start;
475         } else {
476                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
477                 while (in < (const __m128i *)limit) {
478                         __m128i data = _mm_load_si128(in);
479                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
480                         __m128i found = _mm_cmpeq_epi8(data, needle);
481                         if (!_mm_testz_si128(found, found)) {
482                                 break;
483                         }
484
485                         ++in;
486                         ++out;
487                 }
488                 current_frame->len = (uint8_t *)out - current_frame->data;
489         }
490 #endif
491
492         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
493
494         return (const uint8_t *)in;
495 }
496 #endif
497
498 void decode_packs(const libusb_transfer *xfr,
499                   const char *sync_pattern,
500                   int sync_length,
501                   FrameAllocator::Frame *current_frame,
502                   const char *frame_type_name,
503                   function<void(const uint8_t *start)> start_callback)
504 {
505         int offset = 0;
506         for (int i = 0; i < xfr->num_iso_packets; i++) {
507                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
508
509                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
510                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
511                         continue;
512 //exit(5);
513                 }
514
515                 const uint8_t *start = xfr->buffer + offset;
516                 const uint8_t *limit = start + pack->actual_length;
517                 while (start < limit) {  // Usually runs only one iteration.
518 #ifdef __SSE4_1__
519                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
520                         if (start == limit) break;
521                         assert(start < limit);
522 #endif
523
524                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
525                         if (start_next_frame == nullptr) {
526                                 // add the rest of the buffer
527                                 add_to_frame(current_frame, frame_type_name, start, limit);
528                                 break;
529                         } else {
530                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
531                                 start = start_next_frame + sync_length;  // skip sync
532                                 start_callback(start);
533                         }
534                 }
535 #if 0
536                 dump_pack(xfr, offset, pack);
537 #endif
538                 offset += pack->length;
539         }
540 }
541
542 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
543 {
544         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
545                 fprintf(stderr, "transfer status %d\n", xfr->status);
546                 libusb_free_transfer(xfr);
547                 exit(3);
548         }
549
550         assert(xfr->user_data != nullptr);
551         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
552
553         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
554                 if (xfr->endpoint == 0x84) {
555                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
556                 } else {
557                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
558                 }
559         }
560         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
561                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
562                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
563 #if 0
564                 if (setup->wIndex == 44) {
565                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
566                 } else {
567                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
568                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
569                 }
570 #else
571                 memcpy(usb->register_file + usb->current_register, buf, 4);
572                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
573                 if (usb->current_register == 0) {
574                         // read through all of them
575                         printf("register dump:");
576                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
577                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
578                         }
579                         printf("\n");
580                 }
581                 libusb_fill_control_setup(xfr->buffer,
582                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
583                         /*index=*/usb->current_register, /*length=*/4);
584 #endif
585         }
586
587 #if 0
588         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
589         for (i = 0; i < xfr->actual_length; i++) {
590                 printf("%02x", xfr->buffer[i]);
591                 if (i % 16)
592                         printf("\n");
593                 else if (i % 8)
594                         printf("  ");
595                 else
596                         printf(" ");
597         }
598 #endif
599
600         if (libusb_submit_transfer(xfr) < 0) {
601                 fprintf(stderr, "error re-submitting URB\n");
602                 exit(1);
603         }
604 }
605
606 void BMUSBCapture::usb_thread_func()
607 {
608         printf("usb thread started\n");
609
610         sched_param param;
611         memset(&param, 0, sizeof(param));
612         param.sched_priority = 1;
613         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
614                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
615         }
616         while (!should_quit) {
617                 int rc = libusb_handle_events(nullptr);
618                 if (rc != LIBUSB_SUCCESS)
619                         break;
620         }
621 }
622
623 void BMUSBCapture::configure_card()
624 {
625         if (video_frame_allocator == nullptr) {
626                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
627         }
628         if (audio_frame_allocator == nullptr) {
629                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
630         }
631         thread(&BMUSBCapture::dequeue_thread, this).detach();
632
633         int rc;
634         struct libusb_transfer *xfr;
635
636         rc = libusb_init(nullptr);
637         if (rc < 0) {
638                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
639                 exit(1);
640         }
641
642         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
643         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
644         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
645         if (!devh) {
646                 fprintf(stderr, "Error finding USB device\n");
647                 exit(1);
648         }
649
650         libusb_config_descriptor *config;
651         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
652         if (rc < 0) {
653                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
654                 exit(1);
655         }
656         printf("%d interface\n", config->bNumInterfaces);
657         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
658                 printf("  interface %d\n", interface_number);
659                 const libusb_interface *interface = &config->interface[interface_number];
660                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
661                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
662                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
663                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
664                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
665                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
666                         }
667                 }
668         }
669
670         rc = libusb_set_configuration(devh, /*configuration=*/1);
671         if (rc < 0) {
672                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
673                 exit(1);
674         }
675
676         rc = libusb_claim_interface(devh, 0);
677         if (rc < 0) {
678                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
679                 exit(1);
680         }
681
682         // Alternate setting 1 is output, alternate setting 2 is input.
683         // Card is reset when switching alternates, so the driver uses
684         // this “double switch” when it wants to reset.
685         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
686         if (rc < 0) {
687                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
688                 exit(1);
689         }
690         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
691         if (rc < 0) {
692                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
693                 exit(1);
694         }
695 #if 0
696         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
697         if (rc < 0) {
698                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
699                 exit(1);
700         }
701 #endif
702
703 #if 0
704         rc = libusb_claim_interface(devh, 3);
705         if (rc < 0) {
706                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
707                 exit(1);
708         }
709 #endif
710
711         // theories:
712         //   44 is some kind of timer register (first 16 bits count upwards)
713         //   24 is some sort of watchdog?
714         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
715         //      (or will go to 0x73c60010?), also seen 0x73c60100
716         //   12 also changes all the time, unclear why  
717         //   16 seems to be autodetected mode somehow
718         //      --    this is e00115e0 after reset?
719         //                    ed0115e0 after mode change [to output?]
720         //                    2d0015e0 after more mode change [to input]
721         //                    ed0115e0 after more mode change
722         //                    2d0015e0 after more mode change
723         //
724         //                    390115e0 seems to indicate we have signal
725         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
726         //
727         //                    200015e0 on startup
728         //         changes to 250115e0 when we sync to the signal
729         //
730         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
731         //
732         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
733         //
734         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
735         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
736         //
737         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
738         //    perhaps some of them are related to analog output?
739         //
740         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
741         //    but the driver sets it to 0x8036802a at some point.
742         //
743         //    all of this is on request 214/215. other requests (192, 219,
744         //    222, 223, 224) are used for firmware upgrade. Probably best to
745         //    stay out of it unless you know what you're doing.
746         //
747         //
748         // register 16:
749         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
750         //
751         // theories:
752         //   0x01 - stable signal
753         //   0x04 - deep color
754         //   0x08 - unknown (audio??)
755         //   0x20 - 720p??
756         //   0x30 - 576p??
757
758         struct ctrl {
759                 int endpoint;
760                 int request;
761                 int index;
762                 uint32_t data;
763         };
764         static const ctrl ctrls[] = {
765                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
766                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
767
768                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
769                 // capture (v210).
770                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
771                 // 0x10000000 = analog audio instead of embedded audio, it seems
772                 // 0x3a000000 = component video? (analog audio)
773                 // 0x3c000000 = composite video? (analog audio)
774                 // 0x3e000000 = s-video? (analog audio)
775                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
776                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
777                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
778                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
779                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
780         };
781
782         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
783                 uint32_t flipped = htonl(ctrls[req].data);
784                 static uint8_t value[4];
785                 memcpy(value, &flipped, sizeof(flipped));
786                 int size = sizeof(value);
787                 //if (ctrls[req].request == 215) size = 0;
788                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
789                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
790                 if (rc < 0) {
791                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
792                         exit(1);
793                 }
794                 
795                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
796                 for (int i = 0; i < rc; ++i) {
797                         printf("%02x", value[i]);
798                 }
799                 printf("\n");
800         }
801
802 #if 0
803         // DEBUG
804         for ( ;; ) {
805                 static int my_index = 0;
806                 static uint8_t value[4];
807                 int size = sizeof(value);
808                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
809                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
810                 if (rc < 0) {
811                         fprintf(stderr, "Error on control\n");
812                         exit(1);
813                 }
814                 printf("rc=%d index=%d: 0x", rc, my_index);
815                 for (int i = 0; i < rc; ++i) {
816                         printf("%02x", value[i]);
817                 }
818                 printf("\n");
819         }
820 #endif
821
822 #if 0
823         // set up an asynchronous transfer of the timer register
824         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
825         static int completed = 0;
826
827         xfr = libusb_alloc_transfer(0);
828         libusb_fill_control_setup(cmdbuf,
829             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
830                 /*index=*/44, /*length=*/4);
831         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
832         xfr->user_data = this;
833         libusb_submit_transfer(xfr);
834
835         // set up an asynchronous transfer of register 24
836         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
837         static int completed2 = 0;
838
839         xfr = libusb_alloc_transfer(0);
840         libusb_fill_control_setup(cmdbuf2,
841             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
842                 /*index=*/24, /*length=*/4);
843         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
844         xfr->user_data = this;
845         libusb_submit_transfer(xfr);
846 #endif
847
848         // set up an asynchronous transfer of the register dump
849         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
850         static int completed3 = 0;
851
852         xfr = libusb_alloc_transfer(0);
853         libusb_fill_control_setup(cmdbuf3,
854             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
855                 /*index=*/current_register, /*length=*/4);
856         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
857         xfr->user_data = this;
858         //libusb_submit_transfer(xfr);
859
860         audiofp = fopen("audio.raw", "wb");
861
862         // set up isochronous transfers for audio and video
863         for (int e = 3; e <= 4; ++e) {
864                 //int num_transfers = (e == 3) ? 6 : 6;
865                 int num_transfers = 6;
866                 for (int i = 0; i < num_transfers; ++i) {
867                         int num_iso_pack, size;
868                         if (e == 3) {
869                                 // Video seems to require isochronous packets scaled with the width; 
870                                 // seemingly six lines is about right, rounded up to the required 1kB
871                                 // multiple.
872                                 size = WIDTH * 2 * 6;
873                                 // Note that for 10-bit input, you'll need to increase size accordingly.
874                                 //size = size * 4 / 3;
875                                 if (size % 1024 != 0) {
876                                         size &= ~1023;
877                                         size += 1024;
878                                 }
879                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
880                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
881                         } else {
882                                 size = 0xc0;
883                                 num_iso_pack = 80;
884                         }
885                         int num_bytes = num_iso_pack * size;
886                         uint8_t *buf = new uint8_t[num_bytes];
887
888                         xfr = libusb_alloc_transfer(num_iso_pack);
889                         if (!xfr) {
890                                 fprintf(stderr, "oom\n");
891                                 exit(1);
892                         }
893
894                         int ep = LIBUSB_ENDPOINT_IN | e;
895                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
896                                 num_iso_pack, cb_xfr, nullptr, 0);
897                         libusb_set_iso_packet_lengths(xfr, size);
898                         xfr->user_data = this;
899                         iso_xfrs.push_back(xfr);
900                 }
901         }
902 }
903
904 void BMUSBCapture::start_bm_capture()
905 {
906         printf("starting capture\n");
907         int i = 0;
908         for (libusb_transfer *xfr : iso_xfrs) {
909                 printf("submitting transfer...\n");
910                 int rc = libusb_submit_transfer(xfr);
911                 ++i;
912                 if (rc < 0) {
913                         //printf("num_bytes=%d\n", num_bytes);
914                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
915                                 xfr->endpoint, i, libusb_error_name(rc));
916                         exit(1);
917                 }
918         }
919
920
921 #if 0
922         libusb_release_interface(devh, 0);
923 out:
924         if (devh)
925                 libusb_close(devh);
926         libusb_exit(nullptr);
927         return rc;
928 #endif
929 }
930
931 void BMUSBCapture::start_bm_thread()
932 {
933         should_quit = false;
934         usb_thread = thread(&BMUSBCapture::usb_thread_func);
935 }
936
937 void BMUSBCapture::stop_bm_thread()
938 {
939         should_quit = true;
940         usb_thread.join();
941 }